Blame - mm/migrate.c - SHIFTPHONES/mainline/linux

blob: cf25b00f03c8e90a2880ab5febadf523ccbc7266 [file] [log] [blame]

Greg Kroah-Hartman	b244131	2017-11-01 15:07:57 +0100	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	2	/*
Hugh Dickins	14e0f9b	2015-11-05 18:49:43 -0800	[diff] [blame]	3	* Memory Migration functionality - linux/mm/migrate.c
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	4	*
				5	* Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
				6	*
				7	* Page migration was first developed in the context of the memory hotplug
				8	* project. The main authors of the migration code are:
				9	*
				10	* IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
				11	* Hirokazu Takahashi <taka@valinux.co.jp>
				12	* Dave Hansen <haveblue@us.ibm.com>
Christoph Lameter	cde5353	2008-07-04 09:59:22 -0700	[diff] [blame]	13	* Christoph Lameter
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	14	*/
				15
				16	#include <linux/migrate.h>
Paul Gortmaker	b95f1b31	2011-10-16 02:01:52 -0400	[diff] [blame]	17	#include <linux/export.h>
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	18	#include <linux/swap.h>
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	19	#include <linux/swapops.h>
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	20	#include <linux/pagemap.h>
Christoph Lameter	e23ca00	2006-04-10 22:52:57 -0700	[diff] [blame]	21	#include <linux/buffer_head.h>
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	22	#include <linux/mm_inline.h>
Pavel Emelyanov	b488893	2007-10-18 23:40:14 -0700	[diff] [blame]	23	#include <linux/nsproxy.h>
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	24	#include <linux/pagevec.h>
Hugh Dickins	e9995ef	2009-12-14 17:59:31 -0800	[diff] [blame]	25	#include <linux/ksm.h>
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	26	#include <linux/rmap.h>
				27	#include <linux/topology.h>
				28	#include <linux/cpu.h>
				29	#include <linux/cpuset.h>
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	30	#include <linux/writeback.h>
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	31	#include <linux/mempolicy.h>
				32	#include <linux/vmalloc.h>
David Quigley	86c3a76	2006-06-23 02:04:02 -0700	[diff] [blame]	33	#include <linux/security.h>
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	34	#include <linux/backing-dev.h>
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	35	#include <linux/compaction.h>
Adrian Bunk	4f5ca26	2008-07-23 21:27:02 -0700	[diff] [blame]	36	#include <linux/syscalls.h>
Dominik Brodowski	7addf44	2018-03-17 16:08:03 +0100	[diff] [blame]	37	#include <linux/compat.h>
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	38	#include <linux/hugetlb.h>
Aneesh Kumar K.V	8e6ac7f	2012-07-31 16:42:27 -0700	[diff] [blame]	39	#include <linux/hugetlb_cgroup.h>
Tejun Heo	5a0e3ad	2010-03-24 17:04:11 +0900	[diff] [blame]	40	#include <linux/gfp.h>
Christoph Hellwig	a520110	2019-08-28 16:19:53 +0200	[diff] [blame]	41	#include <linux/pagewalk.h>
Jérôme Glisse	df6ad69	2017-09-08 16:12:24 -0700	[diff] [blame]	42	#include <linux/pfn_t.h>
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	43	#include <linux/memremap.h>
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	44	#include <linux/userfaultfd_k.h>
Rafael Aquini	bf6bddf1	2012-12-11 16:02:42 -0800	[diff] [blame]	45	#include <linux/balloon_compaction.h>
Mel Gorman	f714f4f	2013-12-18 17:08:33 -0800	[diff] [blame]	46	#include <linux/mmu_notifier.h>
Vladimir Davydov	33c3fc7	2015-09-09 15:35:45 -0700	[diff] [blame]	47	#include <linux/page_idle.h>
Vlastimil Babka	d435edc	2016-03-15 14:56:15 -0700	[diff] [blame]	48	#include <linux/page_owner.h>
Ingo Molnar	6e84f31	2017-02-08 18:51:29 +0100	[diff] [blame]	49	#include <linux/sched/mm.h>
Linus Torvalds	197e7e5	2017-08-20 13:26:27 -0700	[diff] [blame]	50	#include <linux/ptrace.h>
Ralph Campbell	34290e2	2020-01-30 22:14:44 -0800	[diff] [blame]	51	#include <linux/oom.h>
Dave Hansen	884a6e5	2021-09-02 14:59:09 -0700	[diff] [blame]	52	#include <linux/memory.h>
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	53
Michal Nazarewicz	0d1836c	2010-12-21 17:24:26 -0800	[diff] [blame]	54	#include <asm/tlbflush.h>
				55
Mel Gorman	7b2a2d4	2012-10-19 14:07:31 +0100	[diff] [blame]	56	#define CREATE_TRACE_POINTS
				57	#include <trace/events/migrate.h>
				58
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	59	#include "internal.h"
				60
Yisheng Xie	9e5bcd6	2017-02-24 14:57:29 -0800	[diff] [blame]	61	int isolate_movable_page(struct page *page, isolate_mode_t mode)
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	62	{
				63	struct address_space *mapping;
				64
				65	/*
				66	* Avoid burning cycles with pages that are yet under __free_pages(),
				67	* or just got freed under us.
				68	*
				69	* In case we 'win' a race for a movable page being freed under us and
				70	* raise its refcount preventing __free_pages() from doing its job
				71	* the put_page() at the end of this block will take care of
				72	* release this page, thus avoiding a nasty leakage.
				73	*/
				74	if (unlikely(!get_page_unless_zero(page)))
				75	goto out;
				76
				77	/*
				78	* Check PageMovable before holding a PG_lock because page's owner
				79	* assumes anybody doesn't touch PG_lock of newly allocated page
Wei Yang	8bb4e7a	2019-03-05 15:46:22 -0800	[diff] [blame]	80	* so unconditionally grabbing the lock ruins page's owner side.
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	81	*/
				82	if (unlikely(!__PageMovable(page)))
				83	goto out_putpage;
				84	/*
				85	* As movable pages are not isolated from LRU lists, concurrent
				86	* compaction threads can race against page migration functions
				87	* as well as race against the releasing a page.
				88	*
				89	* In order to avoid having an already isolated movable page
				90	* being (wrongly) re-isolated while it is under migration,
				91	* or to avoid attempting to isolate pages being released,
				92	* lets be sure we have the page lock
				93	* before proceeding with the movable page isolation steps.
				94	*/
				95	if (unlikely(!trylock_page(page)))
				96	goto out_putpage;
				97
				98	if (!PageMovable(page) || PageIsolated(page))
				99	goto out_no_isolated;
				100
				101	mapping = page_mapping(page);
				102	VM_BUG_ON_PAGE(!mapping, page);
				103
				104	if (!mapping->a_ops->isolate_page(page, mode))
				105	goto out_no_isolated;
				106
				107	/* Driver shouldn't use PG_isolated bit of page->flags */
				108	WARN_ON_ONCE(PageIsolated(page));
				109	__SetPageIsolated(page);
				110	unlock_page(page);
				111
Yisheng Xie	9e5bcd6	2017-02-24 14:57:29 -0800	[diff] [blame]	112	return 0;
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	113
				114	out_no_isolated:
				115	unlock_page(page);
				116	out_putpage:
				117	put_page(page);
				118	out:
Yisheng Xie	9e5bcd6	2017-02-24 14:57:29 -0800	[diff] [blame]	119	return -EBUSY;
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	120	}
				121
Miaohe Lin	606a6f7	2021-05-04 18:37:04 -0700	[diff] [blame]	122	static void putback_movable_page(struct page *page)
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	123	{
				124	struct address_space *mapping;
				125
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	126	mapping = page_mapping(page);
				127	mapping->a_ops->putback_page(page);
				128	__ClearPageIsolated(page);
				129	}
				130
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	131	/*
Rafael Aquini	5733c7d	2012-12-11 16:02:47 -0800	[diff] [blame]	132	* Put previously isolated pages back onto the appropriate lists
				133	* from where they were once taken off for compaction/migration.
				134	*
Joonsoo Kim	59c82b7	2014-01-21 15:51:17 -0800	[diff] [blame]	135	* This function shall be used whenever the isolated pageset has been
				136	* built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
				137	* and isolate_huge_page().
Rafael Aquini	5733c7d	2012-12-11 16:02:47 -0800	[diff] [blame]	138	*/
				139	void putback_movable_pages(struct list_head *l)
				140	{
				141	struct page *page;
				142	struct page *page2;
				143
				144	list_for_each_entry_safe(page, page2, l, lru) {
Naoya Horiguchi	31caf66	2013-09-11 14:21:59 -0700	[diff] [blame]	145	if (unlikely(PageHuge(page))) {
				146	putback_active_hugepage(page);
				147	continue;
				148	}
Rafael Aquini	5733c7d	2012-12-11 16:02:47 -0800	[diff] [blame]	149	list_del(&page->lru);
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	150	/*
				151	* We isolated non-lru movable page so here we can use
				152	* __PageMovable because LRU page's mapping cannot have
				153	* PAGE_MAPPING_MOVABLE.
				154	*/
Minchan Kim	b1123ea6	2016-07-26 15:23:09 -0700	[diff] [blame]	155	if (unlikely(__PageMovable(page))) {
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	156	VM_BUG_ON_PAGE(!PageIsolated(page), page);
				157	lock_page(page);
				158	if (PageMovable(page))
				159	putback_movable_page(page);
				160	else
				161	__ClearPageIsolated(page);
				162	unlock_page(page);
				163	put_page(page);
				164	} else {
Naoya Horiguchi	e8db67e	2017-09-08 16:11:12 -0700	[diff] [blame]	165	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
Matthew Wilcox (Oracle)	6c35784	2020-08-14 17:30:37 -0700	[diff] [blame]	166	page_is_file_lru(page), -thp_nr_pages(page));
Rabin Vincent	fc280fe	2017-04-20 14:37:46 -0700	[diff] [blame]	167	putback_lru_page(page);
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	168	}
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	169	}
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	170	}
				171
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	172	/*
				173	* Restore a potential migration pte to a working pte entry
				174	*/
Minchan Kim	e4b8222	2017-05-03 14:54:27 -0700	[diff] [blame]	175	static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
Hugh Dickins	e9995ef	2009-12-14 17:59:31 -0800	[diff] [blame]	176	unsigned long addr, void *old)
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	177	{
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	178	struct page_vma_mapped_walk pvmw = {
				179	.page = old,
				180	.vma = vma,
				181	.address = addr,
				182	.flags = PVMW_SYNC | PVMW_MIGRATION,
				183	};
				184	struct page *new;
				185	pte_t pte;
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	186	swp_entry_t entry;
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	187
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	188	VM_BUG_ON_PAGE(PageTail(page), page);
				189	while (page_vma_mapped_walk(&pvmw)) {
Naoya Horiguchi	4b0ece6	2017-03-31 15:11:44 -0700	[diff] [blame]	190	if (PageKsm(page))
				191	new = page;
				192	else
				193	new = page - pvmw.page->index +
				194	linear_page_index(vma, pvmw.address);
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	195
Zi Yan	616b837	2017-09-08 16:10:57 -0700	[diff] [blame]	196	#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
				197	/* PMD-mapped THP migration entry */
				198	if (!pvmw.pte) {
				199	VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
				200	remove_migration_pmd(&pvmw, new);
				201	continue;
				202	}
				203	#endif
				204
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	205	get_page(new);
				206	pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
				207	if (pte_swp_soft_dirty(*pvmw.pte))
				208	pte = pte_mksoft_dirty(pte);
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	209
Hugh Dickins	486cf46	2011-10-19 12:50:35 -0700	[diff] [blame]	210	/*
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	211	* Recheck VMA as permissions can change since migration started
Hugh Dickins	486cf46	2011-10-19 12:50:35 -0700	[diff] [blame]	212	*/
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	213	entry = pte_to_swp_entry(*pvmw.pte);
Alistair Popple	4dd845b	2021-06-30 18:54:09 -0700	[diff] [blame]	214	if (is_writable_migration_entry(entry))
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	215	pte = maybe_mkwrite(pte, vma);
Peter Xu	f45ec5f	2020-04-06 20:06:01 -0700	[diff] [blame]	216	else if (pte_swp_uffd_wp(*pvmw.pte))
				217	pte = pte_mkuffd_wp(pte);
Mel Gorman	d3cb8bf	2014-10-02 19:47:41 +0100	[diff] [blame]	218
Ralph Campbell	6128763	2020-09-04 16:36:04 -0700	[diff] [blame]	219	if (unlikely(is_device_private_page(new))) {
Alistair Popple	4dd845b	2021-06-30 18:54:09 -0700	[diff] [blame]	220	if (pte_write(pte))
				221	entry = make_writable_device_private_entry(
				222	page_to_pfn(new));
				223	else
				224	entry = make_readable_device_private_entry(
				225	page_to_pfn(new));
Ralph Campbell	6128763	2020-09-04 16:36:04 -0700	[diff] [blame]	226	pte = swp_entry_to_pte(entry);
Ralph Campbell	3d321bf8	2020-09-04 16:36:07 -0700	[diff] [blame]	227	if (pte_swp_soft_dirty(*pvmw.pte))
				228	pte = pte_swp_mksoft_dirty(pte);
Ralph Campbell	6128763	2020-09-04 16:36:04 -0700	[diff] [blame]	229	if (pte_swp_uffd_wp(*pvmw.pte))
				230	pte = pte_swp_mkuffd_wp(pte);
Lars Persson	d2b2c6dd	2019-03-28 20:44:28 -0700	[diff] [blame]	231	}
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	232
Andi Kleen	3ef8fd7	2010-10-11 16:03:21 +0200	[diff] [blame]	233	#ifdef CONFIG_HUGETLB_PAGE
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	234	if (PageHuge(new)) {
Christophe Leroy	79c1c59	2021-06-30 18:48:00 -0700	[diff] [blame]	235	unsigned int shift = huge_page_shift(hstate_vma(vma));
				236
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	237	pte = pte_mkhuge(pte);
Christophe Leroy	79c1c59	2021-06-30 18:48:00 -0700	[diff] [blame]	238	pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
Aneesh Kumar K.V	383321a	2017-07-06 15:38:41 -0700	[diff] [blame]	239	set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	240	if (PageAnon(new))
				241	hugepage_add_anon_rmap(new, vma, pvmw.address);
				242	else
				243	page_dup_rmap(new, true);
Aneesh Kumar K.V	383321a	2017-07-06 15:38:41 -0700	[diff] [blame]	244	} else
				245	#endif
				246	{
				247	set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	248
Aneesh Kumar K.V	383321a	2017-07-06 15:38:41 -0700	[diff] [blame]	249	if (PageAnon(new))
				250	page_add_anon_rmap(new, vma, pvmw.address, false);
				251	else
				252	page_add_file_rmap(new, false);
				253	}
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	254	if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
				255	mlock_vma_page(new);
Hugh Dickins	51afb12	2015-11-05 18:49:37 -0800	[diff] [blame]	256
Kirill A. Shutemov	e125fe4	2018-10-05 15:51:41 -0700	[diff] [blame]	257	if (PageTransHuge(page) && PageMlocked(page))
				258	clear_page_mlock(page);
				259
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	260	/* No need to invalidate - it was non-present before */
				261	update_mmu_cache(vma, pvmw.address, pvmw.pte);
				262	}
				263
Minchan Kim	e4b8222	2017-05-03 14:54:27 -0700	[diff] [blame]	264	return true;
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	265	}
				266
				267	/*
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	268	* Get rid of all migration entries and replace them by
				269	* references to the indicated page.
				270	*/
Kirill A. Shutemov	e388466	2016-03-17 14:20:07 -0700	[diff] [blame]	271	void remove_migration_ptes(struct page *old, struct page *new, bool locked)
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	272	{
Joonsoo Kim	051ac83	2014-01-21 15:49:48 -0800	[diff] [blame]	273	struct rmap_walk_control rwc = {
				274	.rmap_one = remove_migration_pte,
				275	.arg = old,
				276	};
				277
Kirill A. Shutemov	e388466	2016-03-17 14:20:07 -0700	[diff] [blame]	278	if (locked)
				279	rmap_walk_locked(new, &rwc);
				280	else
				281	rmap_walk(new, &rwc);
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	282	}
				283
				284	/*
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	285	* Something used the pte of a page under migration. We need to
				286	* get to the page and wait until migration is finished.
				287	* When we return from this function the fault will be retried.
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	288	*/
Naoya Horiguchi	e66f17f	2015-02-11 15:25:22 -0800	[diff] [blame]	289	void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
Naoya Horiguchi	30dad30	2013-06-12 14:05:04 -0700	[diff] [blame]	290	spinlock_t *ptl)
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	291	{
Naoya Horiguchi	30dad30	2013-06-12 14:05:04 -0700	[diff] [blame]	292	pte_t pte;
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	293	swp_entry_t entry;
				294	struct page *page;
				295
Naoya Horiguchi	30dad30	2013-06-12 14:05:04 -0700	[diff] [blame]	296	spin_lock(ptl);
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	297	pte = *ptep;
				298	if (!is_swap_pte(pte))
				299	goto out;
				300
				301	entry = pte_to_swp_entry(pte);
				302	if (!is_migration_entry(entry))
				303	goto out;
				304
Alistair Popple	af5cdaf	2021-06-30 18:54:06 -0700	[diff] [blame]	305	page = pfn_swap_entry_to_page(entry);
Xu Yu	ffc90cb	2021-06-15 18:23:42 -0700	[diff] [blame]	306	page = compound_head(page);
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	307
Nick Piggin	e286781	2008-07-25 19:45:30 -0700	[diff] [blame]	308	/*
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	309	* Once page cache replacement of page migration started, page_count
Hugh Dickins	9a1ea43	2018-12-28 00:36:14 -0800	[diff] [blame]	310	* is zero; but we must not call put_and_wait_on_page_locked() without
				311	* a ref. Use get_page_unless_zero(), and just fault again if it fails.
Nick Piggin	e286781	2008-07-25 19:45:30 -0700	[diff] [blame]	312	*/
				313	if (!get_page_unless_zero(page))
				314	goto out;
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	315	pte_unmap_unlock(ptep, ptl);
Matthew Wilcox (Oracle)	4805462	2021-02-24 12:02:02 -0800	[diff] [blame]	316	put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	317	return;
				318	out:
				319	pte_unmap_unlock(ptep, ptl);
				320	}
				321
Naoya Horiguchi	30dad30	2013-06-12 14:05:04 -0700	[diff] [blame]	322	void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
				323	unsigned long address)
				324	{
				325	spinlock_t *ptl = pte_lockptr(mm, pmd);
				326	pte_t *ptep = pte_offset_map(pmd, address);
				327	__migration_entry_wait(mm, ptep, ptl);
				328	}
				329
Kirill A. Shutemov	cb900f4	2013-11-14 14:31:02 -0800	[diff] [blame]	330	void migration_entry_wait_huge(struct vm_area_struct *vma,
				331	struct mm_struct *mm, pte_t *pte)
Naoya Horiguchi	30dad30	2013-06-12 14:05:04 -0700	[diff] [blame]	332	{
Kirill A. Shutemov	cb900f4	2013-11-14 14:31:02 -0800	[diff] [blame]	333	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
Naoya Horiguchi	30dad30	2013-06-12 14:05:04 -0700	[diff] [blame]	334	__migration_entry_wait(mm, pte, ptl);
				335	}
				336
Zi Yan	616b837	2017-09-08 16:10:57 -0700	[diff] [blame]	337	#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
				338	void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
				339	{
				340	spinlock_t *ptl;
				341	struct page *page;
				342
				343	ptl = pmd_lock(mm, pmd);
				344	if (!is_pmd_migration_entry(*pmd))
				345	goto unlock;
Alistair Popple	af5cdaf	2021-06-30 18:54:06 -0700	[diff] [blame]	346	page = pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd));
Zi Yan	616b837	2017-09-08 16:10:57 -0700	[diff] [blame]	347	if (!get_page_unless_zero(page))
				348	goto unlock;
				349	spin_unlock(ptl);
Matthew Wilcox (Oracle)	4805462	2021-02-24 12:02:02 -0800	[diff] [blame]	350	put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
Zi Yan	616b837	2017-09-08 16:10:57 -0700	[diff] [blame]	351	return;
				352	unlock:
				353	spin_unlock(ptl);
				354	}
				355	#endif
				356
Jan Kara	f900482	2019-03-05 15:48:46 -0800	[diff] [blame]	357	static int expected_page_refs(struct address_space *mapping, struct page *page)
Jan Kara	0b3901b	2018-12-28 00:39:01 -0800	[diff] [blame]	358	{
				359	int expected_count = 1;
				360
				361	/*
Ralph Campbell	f1f4f3a	2020-10-13 16:58:42 -0700	[diff] [blame]	362	* Device private pages have an extra refcount as they are
Jan Kara	0b3901b	2018-12-28 00:39:01 -0800	[diff] [blame]	363	* ZONE_DEVICE pages.
				364	*/
				365	expected_count += is_device_private_page(page);
Jan Kara	f900482	2019-03-05 15:48:46 -0800	[diff] [blame]	366	if (mapping)
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	367	expected_count += compound_nr(page) + page_has_private(page);
Jan Kara	0b3901b	2018-12-28 00:39:01 -0800	[diff] [blame]	368
				369	return expected_count;
				370	}
				371
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	372	/*
Christoph Lameter	c3fcf8a	2006-06-23 02:03:32 -0700	[diff] [blame]	373	* Replace the page in the mapping.
Christoph Lameter	5b5c712	2006-06-23 02:03:29 -0700	[diff] [blame]	374	*
				375	* The number of remaining references must be:
				376	* 1 for anonymous pages without a mapping
				377	* 2 for pages with a mapping
David Howells	266cf65	2009-04-03 16:42:36 +0100	[diff] [blame]	378	* 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	379	*/
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	380	int folio_migrate_mapping(struct address_space *mapping,
				381	struct folio *newfolio, struct folio *folio, int extra_count)
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	382	{
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	383	XA_STATE(xas, &mapping->i_pages, folio_index(folio));
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	384	struct zone *oldzone, *newzone;
				385	int dirty;
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	386	int expected_count = expected_page_refs(mapping, &folio->page) + extra_count;
				387	long nr = folio_nr_pages(folio);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	388
Christoph Lameter	6c5240a	2006-06-23 02:03:37 -0700	[diff] [blame]	389	if (!mapping) {
Christoph Lameter	0e8c7d0	2007-04-23 14:41:09 -0700	[diff] [blame]	390	/* Anonymous page without mapping */
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	391	if (folio_ref_count(folio) != expected_count)
Christoph Lameter	6c5240a	2006-06-23 02:03:37 -0700	[diff] [blame]	392	return -EAGAIN;
Hugh Dickins	cf4b769	2015-11-05 18:50:02 -0800	[diff] [blame]	393
				394	/* No turning back from here */
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	395	newfolio->index = folio->index;
				396	newfolio->mapping = folio->mapping;
				397	if (folio_test_swapbacked(folio))
				398	__folio_set_swapbacked(newfolio);
Hugh Dickins	cf4b769	2015-11-05 18:50:02 -0800	[diff] [blame]	399
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	400	return MIGRATEPAGE_SUCCESS;
Christoph Lameter	6c5240a	2006-06-23 02:03:37 -0700	[diff] [blame]	401	}
				402
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	403	oldzone = folio_zone(folio);
				404	newzone = folio_zone(newfolio);
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	405
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	406	xas_lock_irq(&xas);
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	407	if (!folio_ref_freeze(folio, expected_count)) {
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	408	xas_unlock_irq(&xas);
Nick Piggin	e286781	2008-07-25 19:45:30 -0700	[diff] [blame]	409	return -EAGAIN;
				410	}
				411
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	412	/*
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	413	* Now we know that no one else is looking at the folio:
Hugh Dickins	cf4b769	2015-11-05 18:50:02 -0800	[diff] [blame]	414	* no turning back from here.
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	415	*/
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	416	newfolio->index = folio->index;
				417	newfolio->mapping = folio->mapping;
				418	folio_ref_add(newfolio, nr); /* add cache reference */
				419	if (folio_test_swapbacked(folio)) {
				420	__folio_set_swapbacked(newfolio);
				421	if (folio_test_swapcache(folio)) {
				422	folio_set_swapcache(newfolio);
				423	newfolio->private = folio_get_private(folio);
Nicholas Piggin	6326fec	2016-12-25 13:00:29 +1000	[diff] [blame]	424	}
				425	} else {
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	426	VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	427	}
				428
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	429	/* Move dirty while page refs frozen and newpage not yet exposed */
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	430	dirty = folio_test_dirty(folio);
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	431	if (dirty) {
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	432	folio_clear_dirty(folio);
				433	folio_set_dirty(newfolio);
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	434	}
				435
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	436	xas_store(&xas, newfolio);
				437	if (nr > 1) {
Naoya Horiguchi	e71769a	2018-04-20 14:55:45 -0700	[diff] [blame]	438	int i;
Naoya Horiguchi	e71769a	2018-04-20 14:55:45 -0700	[diff] [blame]	439
Shakeel Butt	5c447d2	2021-01-23 21:01:15 -0800	[diff] [blame]	440	for (i = 1; i < nr; i++) {
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	441	xas_next(&xas);
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	442	xas_store(&xas, newfolio);
Naoya Horiguchi	e71769a	2018-04-20 14:55:45 -0700	[diff] [blame]	443	}
Naoya Horiguchi	e71769a	2018-04-20 14:55:45 -0700	[diff] [blame]	444	}
Nick Piggin	7cf9c2c	2006-12-06 20:33:44 -0800	[diff] [blame]	445
				446	/*
Jacobo Giralt	937a94c	2012-01-10 15:07:11 -0800	[diff] [blame]	447	* Drop cache reference from old page by unfreezing
				448	* to one less reference.
Nick Piggin	7cf9c2c	2006-12-06 20:33:44 -0800	[diff] [blame]	449	* We know this isn't the last reference.
				450	*/
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	451	folio_ref_unfreeze(folio, expected_count - nr);
Nick Piggin	7cf9c2c	2006-12-06 20:33:44 -0800	[diff] [blame]	452
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	453	xas_unlock(&xas);
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	454	/* Leave irq disabled to prevent preemption while updating stats */
				455
Christoph Lameter	0e8c7d0	2007-04-23 14:41:09 -0700	[diff] [blame]	456	/*
				457	* If moved to a different zone then also account
				458	* the page for that zone. Other VM counters will be
				459	* taken care of when we establish references to the
				460	* new page and drop references to the old page.
				461	*
				462	* Note that anonymous pages are accounted for
Mel Gorman	4b9d0fa	2016-07-28 15:46:17 -0700	[diff] [blame]	463	* via NR_FILE_PAGES and NR_ANON_MAPPED if they
Christoph Lameter	0e8c7d0	2007-04-23 14:41:09 -0700	[diff] [blame]	464	* are mapped to swap space.
				465	*/
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	466	if (newzone != oldzone) {
Johannes Weiner	0d1c207	2020-06-03 16:01:54 -0700	[diff] [blame]	467	struct lruvec *old_lruvec, *new_lruvec;
				468	struct mem_cgroup *memcg;
				469
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	470	memcg = folio_memcg(folio);
Johannes Weiner	0d1c207	2020-06-03 16:01:54 -0700	[diff] [blame]	471	old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
				472	new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
				473
Shakeel Butt	5c447d2	2021-01-23 21:01:15 -0800	[diff] [blame]	474	__mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
				475	__mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	476	if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) {
Shakeel Butt	5c447d2	2021-01-23 21:01:15 -0800	[diff] [blame]	477	__mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
				478	__mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	479	}
Shakeel Butt	b603894	2021-02-24 12:03:55 -0800	[diff] [blame]	480	#ifdef CONFIG_SWAP
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	481	if (folio_test_swapcache(folio)) {
Shakeel Butt	b603894	2021-02-24 12:03:55 -0800	[diff] [blame]	482	__mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
				483	__mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
				484	}
				485	#endif
Christoph Hellwig	f56753a	2020-09-24 08:51:40 +0200	[diff] [blame]	486	if (dirty && mapping_can_writeback(mapping)) {
Shakeel Butt	5c447d2	2021-01-23 21:01:15 -0800	[diff] [blame]	487	__mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
				488	__mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
				489	__mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
				490	__mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	491	}
KOSAKI Motohiro	4b02108	2009-09-21 17:01:33 -0700	[diff] [blame]	492	}
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	493	local_irq_enable();
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	494
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	495	return MIGRATEPAGE_SUCCESS;
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	496	}
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	497	EXPORT_SYMBOL(folio_migrate_mapping);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	498
				499	/*
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	500	* The expected number of remaining references is the same as that
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	501	* of folio_migrate_mapping().
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	502	*/
				503	int migrate_huge_page_move_mapping(struct address_space *mapping,
				504	struct page *newpage, struct page *page)
				505	{
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	506	XA_STATE(xas, &mapping->i_pages, page_index(page));
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	507	int expected_count;
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	508
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	509	xas_lock_irq(&xas);
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	510	expected_count = 2 + page_has_private(page);
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	511	if (page_count(page) != expected_count \|\| xas_load(&xas) != page) {
				512	xas_unlock_irq(&xas);
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	513	return -EAGAIN;
				514	}
				515
Joonsoo Kim	fe896d1	2016-03-17 14:19:26 -0700	[diff] [blame]	516	if (!page_ref_freeze(page, expected_count)) {
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	517	xas_unlock_irq(&xas);
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	518	return -EAGAIN;
				519	}
				520
Hugh Dickins	cf4b769	2015-11-05 18:50:02 -0800	[diff] [blame]	521	newpage->index = page->index;
				522	newpage->mapping = page->mapping;
Johannes Weiner	6a93ca8	2016-03-15 14:57:19 -0700	[diff] [blame]	523
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	524	get_page(newpage);
				525
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	526	xas_store(&xas, newpage);
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	527
Joonsoo Kim	fe896d1	2016-03-17 14:19:26 -0700	[diff] [blame]	528	page_ref_unfreeze(page, expected_count - 1);
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	529
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	530	xas_unlock_irq(&xas);
Johannes Weiner	6a93ca8	2016-03-15 14:57:19 -0700	[diff] [blame]	531
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	532	return MIGRATEPAGE_SUCCESS;
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	533	}
				534
				535	/*
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	536	* Copy the flags and some other ancillary information
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	537	*/
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	538	void folio_migrate_flags(struct folio newfolio, struct folio folio)
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	539	{
Rik van Riel	7851a45	2013-10-07 11:29:23 +0100	[diff] [blame]	540	int cpupid;
				541
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	542	if (folio_test_error(folio))
				543	folio_set_error(newfolio);
				544	if (folio_test_referenced(folio))
				545	folio_set_referenced(newfolio);
				546	if (folio_test_uptodate(folio))
				547	folio_mark_uptodate(newfolio);
				548	if (folio_test_clear_active(folio)) {
				549	VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio);
				550	folio_set_active(newfolio);
				551	} else if (folio_test_clear_unevictable(folio))
				552	folio_set_unevictable(newfolio);
				553	if (folio_test_workingset(folio))
				554	folio_set_workingset(newfolio);
				555	if (folio_test_checked(folio))
				556	folio_set_checked(newfolio);
				557	if (folio_test_mappedtodisk(folio))
				558	folio_set_mappedtodisk(newfolio);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	559
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	560	/* Move dirty on pages not done by folio_migrate_mapping() */
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	561	if (folio_test_dirty(folio))
				562	folio_set_dirty(newfolio);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	563
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	564	if (folio_test_young(folio))
				565	folio_set_young(newfolio);
				566	if (folio_test_idle(folio))
				567	folio_set_idle(newfolio);
Vladimir Davydov	33c3fc7	2015-09-09 15:35:45 -0700	[diff] [blame]	568
Rik van Riel	7851a45	2013-10-07 11:29:23 +0100	[diff] [blame]	569	/*
				570	* Copy NUMA information to the new page, to prevent over-eager
				571	* future migrations of this same page.
				572	*/
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	573	cpupid = page_cpupid_xchg_last(&folio->page, -1);
				574	page_cpupid_xchg_last(&newfolio->page, cpupid);
Rik van Riel	7851a45	2013-10-07 11:29:23 +0100	[diff] [blame]	575
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	576	folio_migrate_ksm(newfolio, folio);
Hugh Dickins	c8d6553	2013-02-22 16:35:10 -0800	[diff] [blame]	577	/*
				578	* Please do not reorder this without considering how mm/ksm.c's
				579	* get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
				580	*/
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	581	if (folio_test_swapcache(folio))
				582	folio_clear_swapcache(folio);
				583	folio_clear_private(folio);
Muchun Song	ad2fa37	2021-06-30 18:47:21 -0700	[diff] [blame]	584
				585	/* page->private contains hugetlb specific flags */
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	586	if (!folio_test_hugetlb(folio))
				587	folio->private = NULL;
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	588
				589	/*
				590	* If any waiters have accumulated on the new page then
				591	* wake them up.
				592	*/
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	593	if (folio_test_writeback(newfolio))
				594	folio_end_writeback(newfolio);
Vlastimil Babka	d435edc	2016-03-15 14:56:15 -0700	[diff] [blame]	595
Yang Shi	6aeff24	2020-04-06 20:04:21 -0700	[diff] [blame]	596	/*
				597	* PG_readahead shares the same bit with PG_reclaim. The above
				598	* end_page_writeback() may clear PG_readahead mistakenly, so set the
				599	* bit after that.
				600	*/
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	601	if (folio_test_readahead(folio))
				602	folio_set_readahead(newfolio);
Yang Shi	6aeff24	2020-04-06 20:04:21 -0700	[diff] [blame]	603
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	604	folio_copy_owner(newfolio, folio);
Johannes Weiner	74485cf	2016-03-15 14:57:54 -0700	[diff] [blame]	605
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	606	if (!folio_test_hugetlb(folio))
Matthew Wilcox (Oracle)	d21bba2	2021-05-06 18:14:59 -0400	[diff] [blame]	607	mem_cgroup_migrate(folio, newfolio);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	608	}
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	609	EXPORT_SYMBOL(folio_migrate_flags);
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	610
/*
 * Copy the contents of @folio into @newfolio with folio_copy(), then
 * transfer its flags and ancillary state with folio_migrate_flags().
 */
void folio_migrate_copy(struct folio *newfolio, struct folio *folio)
{
	folio_copy(newfolio, folio);
	folio_migrate_flags(newfolio, folio);
}
EXPORT_SYMBOL(folio_migrate_copy);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	617
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	618	/************************************************************
				619	* Migration functions
				620	***********************************************************/
				621
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	622	/*
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	623	* Common logic to directly migrate a single LRU page suitable for
David Howells	266cf65	2009-04-03 16:42:36 +0100	[diff] [blame]	624	* pages that do not use PagePrivate/PagePrivate2.
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	625	*
				626	* Pages are locked upon entry and exit.
				627	*/
Christoph Lameter	2d1db3b	2006-06-23 02:03:33 -0700	[diff] [blame]	628	int migrate_page(struct address_space *mapping,
Mel Gorman	a6bc32b	2012-01-12 17:19:43 -0800	[diff] [blame]	629	struct page newpage, struct page page,
				630	enum migrate_mode mode)
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	631	{
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	632	struct folio *newfolio = page_folio(newpage);
				633	struct folio *folio = page_folio(page);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	634	int rc;
				635
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	636	BUG_ON(folio_test_writeback(folio)); /* Writeback must be complete */
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	637
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	638	rc = folio_migrate_mapping(mapping, newfolio, folio, 0);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	639
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	640	if (rc != MIGRATEPAGE_SUCCESS)
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	641	return rc;
				642
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	643	if (mode != MIGRATE_SYNC_NO_COPY)
Matthew Wilcox (Oracle)	715cbfd	2021-05-07 15:05:06 -0400	[diff] [blame]	644	folio_migrate_copy(newfolio, folio);
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	645	else
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	646	folio_migrate_flags(newfolio, folio);
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	647	return MIGRATEPAGE_SUCCESS;
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	648	}
				649	EXPORT_SYMBOL(migrate_page);
				650
David Howells	9361401	2006-09-30 20:45:40 +0200	[diff] [blame]	651	#ifdef CONFIG_BLOCK
Jan Kara	84ade7c	2018-12-28 00:39:09 -0800	[diff] [blame]	652	/* Returns true if all buffers are successfully locked */
				653	static bool buffer_migrate_lock_buffers(struct buffer_head *head,
				654	enum migrate_mode mode)
				655	{
				656	struct buffer_head *bh = head;
				657
				658	/* Simple case, sync compaction */
				659	if (mode != MIGRATE_ASYNC) {
				660	do {
Jan Kara	84ade7c	2018-12-28 00:39:09 -0800	[diff] [blame]	661	lock_buffer(bh);
				662	bh = bh->b_this_page;
				663
				664	} while (bh != head);
				665
				666	return true;
				667	}
				668
				669	/* async case, we cannot block on lock_buffer so use trylock_buffer */
				670	do {
Jan Kara	84ade7c	2018-12-28 00:39:09 -0800	[diff] [blame]	671	if (!trylock_buffer(bh)) {
				672	/*
				673	* We failed to lock the buffer and cannot stall in
				674	* async migration. Release the taken locks
				675	*/
				676	struct buffer_head *failed_bh = bh;
Jan Kara	84ade7c	2018-12-28 00:39:09 -0800	[diff] [blame]	677	bh = head;
				678	while (bh != failed_bh) {
				679	unlock_buffer(bh);
Jan Kara	84ade7c	2018-12-28 00:39:09 -0800	[diff] [blame]	680	bh = bh->b_this_page;
				681	}
				682	return false;
				683	}
				684
				685	bh = bh->b_this_page;
				686	} while (bh != head);
				687	return true;
				688	}
				689
Jan Kara	89cb088	2018-12-28 00:39:12 -0800	[diff] [blame]	690	static int __buffer_migrate_page(struct address_space *mapping,
				691	struct page newpage, struct page page, enum migrate_mode mode,
				692	bool check_refs)
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	693	{
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	694	struct buffer_head bh, head;
				695	int rc;
Jan Kara	cc4f11e	2018-12-28 00:39:05 -0800	[diff] [blame]	696	int expected_count;
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	697
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	698	if (!page_has_buffers(page))
Mel Gorman	a6bc32b	2012-01-12 17:19:43 -0800	[diff] [blame]	699	return migrate_page(mapping, newpage, page, mode);
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	700
Jan Kara	cc4f11e	2018-12-28 00:39:05 -0800	[diff] [blame]	701	/* Check whether page does not have extra refs before we do more work */
Jan Kara	f900482	2019-03-05 15:48:46 -0800	[diff] [blame]	702	expected_count = expected_page_refs(mapping, page);
Jan Kara	cc4f11e	2018-12-28 00:39:05 -0800	[diff] [blame]	703	if (page_count(page) != expected_count)
				704	return -EAGAIN;
				705
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	706	head = page_buffers(page);
Jan Kara	cc4f11e	2018-12-28 00:39:05 -0800	[diff] [blame]	707	if (!buffer_migrate_lock_buffers(head, mode))
				708	return -EAGAIN;
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	709
Jan Kara	89cb088	2018-12-28 00:39:12 -0800	[diff] [blame]	710	if (check_refs) {
				711	bool busy;
				712	bool invalidated = false;
				713
				714	recheck_buffers:
				715	busy = false;
				716	spin_lock(&mapping->private_lock);
				717	bh = head;
				718	do {
				719	if (atomic_read(&bh->b_count)) {
				720	busy = true;
				721	break;
				722	}
				723	bh = bh->b_this_page;
				724	} while (bh != head);
Jan Kara	89cb088	2018-12-28 00:39:12 -0800	[diff] [blame]	725	if (busy) {
				726	if (invalidated) {
				727	rc = -EAGAIN;
				728	goto unlock_buffers;
				729	}
Jan Kara	ebdf4de	2019-08-02 21:48:47 -0700	[diff] [blame]	730	spin_unlock(&mapping->private_lock);
Jan Kara	89cb088	2018-12-28 00:39:12 -0800	[diff] [blame]	731	invalidate_bh_lrus();
				732	invalidated = true;
				733	goto recheck_buffers;
				734	}
				735	}
				736
Keith Busch	3710969	2019-07-18 15:58:46 -0700	[diff] [blame]	737	rc = migrate_page_move_mapping(mapping, newpage, page, 0);
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	738	if (rc != MIGRATEPAGE_SUCCESS)
Jan Kara	cc4f11e	2018-12-28 00:39:05 -0800	[diff] [blame]	739	goto unlock_buffers;
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	740
Guoqing Jiang	cd0f371	2020-06-01 21:48:06 -0700	[diff] [blame]	741	attach_page_private(newpage, detach_page_private(page));
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	742
				743	bh = head;
				744	do {
				745	set_bh_page(bh, newpage, bh_offset(bh));
				746	bh = bh->b_this_page;
				747
				748	} while (bh != head);
				749
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	750	if (mode != MIGRATE_SYNC_NO_COPY)
				751	migrate_page_copy(newpage, page);
				752	else
				753	migrate_page_states(newpage, page);
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	754
Jan Kara	cc4f11e	2018-12-28 00:39:05 -0800	[diff] [blame]	755	rc = MIGRATEPAGE_SUCCESS;
				756	unlock_buffers:
Jan Kara	ebdf4de	2019-08-02 21:48:47 -0700	[diff] [blame]	757	if (check_refs)
				758	spin_unlock(&mapping->private_lock);
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	759	bh = head;
				760	do {
				761	unlock_buffer(bh);
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	762	bh = bh->b_this_page;
				763
				764	} while (bh != head);
				765
Jan Kara	cc4f11e	2018-12-28 00:39:05 -0800	[diff] [blame]	766	return rc;
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	767	}
Jan Kara	89cb088	2018-12-28 00:39:12 -0800	[diff] [blame]	768
				769	/*
				770	* Migration function for pages with buffers. This function can only be used
				771	* if the underlying filesystem guarantees that no other references to "page"
				772	* exist. For example attached buffer heads are accessed only under page lock.
				773	*/
				774	int buffer_migrate_page(struct address_space *mapping,
				775	struct page newpage, struct page page, enum migrate_mode mode)
				776	{
				777	return __buffer_migrate_page(mapping, newpage, page, mode, false);
				778	}
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	779	EXPORT_SYMBOL(buffer_migrate_page);
Jan Kara	89cb088	2018-12-28 00:39:12 -0800	[diff] [blame]	780
				781	/*
				782	* Same as above except that this variant is more careful and checks that there
				783	* are also no buffer head references. This function is the right one for
				784	* mappings where buffer heads are directly looked up and referenced (such as
				785	* block device mappings).
				786	*/
				787	int buffer_migrate_page_norefs(struct address_space *mapping,
				788	struct page newpage, struct page page, enum migrate_mode mode)
				789	{
				790	return __buffer_migrate_page(mapping, newpage, page, mode, true);
				791	}
David Howells	9361401	2006-09-30 20:45:40 +0200	[diff] [blame]	792	#endif
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	793
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	794	/*
				795	* Writeback a page to clean the dirty state
				796	*/
				797	static int writeout(struct address_space mapping, struct page page)
				798	{
				799	struct writeback_control wbc = {
				800	.sync_mode = WB_SYNC_NONE,
				801	.nr_to_write = 1,
				802	.range_start = 0,
				803	.range_end = LLONG_MAX,
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	804	.for_reclaim = 1
				805	};
				806	int rc;
				807
				808	if (!mapping->a_ops->writepage)
				809	/* No write method for the address space */
				810	return -EINVAL;
				811
				812	if (!clear_page_dirty_for_io(page))
				813	/* Someone else already triggered a write */
				814	return -EAGAIN;
				815
				816	/*
				817	* A dirty page may imply that the underlying filesystem has
				818	* the page on some queue. So the page must be clean for
				819	* migration. Writeout may mean we loose the lock and the
				820	* page state is no longer what we checked for earlier.
				821	* At this point we know that the migration attempt cannot
				822	* be successful.
				823	*/
Kirill A. Shutemov	e388466	2016-03-17 14:20:07 -0700	[diff] [blame]	824	remove_migration_ptes(page, page, false);
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	825
				826	rc = mapping->a_ops->writepage(page, &wbc);
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	827
				828	if (rc != AOP_WRITEPAGE_ACTIVATE)
				829	/* unlocked. Relock */
				830	lock_page(page);
				831
Hugh Dickins	bda8550	2008-11-19 15:36:36 -0800	[diff] [blame]	832	return (rc < 0) ? -EIO : -EAGAIN;
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	833	}
				834
				835	/*
				836	* Default handling if a filesystem does not provide a migration function.
				837	*/
Christoph Lameter	8351a6e	2006-06-23 02:03:33 -0700	[diff] [blame]	838	static int fallback_migrate_page(struct address_space *mapping,
Mel Gorman	a6bc32b	2012-01-12 17:19:43 -0800	[diff] [blame]	839	struct page newpage, struct page page, enum migrate_mode mode)
Christoph Lameter	8351a6e	2006-06-23 02:03:33 -0700	[diff] [blame]	840	{
Mel Gorman	b969c4ab	2012-01-12 17:19:34 -0800	[diff] [blame]	841	if (PageDirty(page)) {
Mel Gorman	a6bc32b	2012-01-12 17:19:43 -0800	[diff] [blame]	842	/* Only writeback pages in full synchronous migration */
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	843	switch (mode) {
				844	case MIGRATE_SYNC:
				845	case MIGRATE_SYNC_NO_COPY:
				846	break;
				847	default:
Mel Gorman	b969c4ab	2012-01-12 17:19:34 -0800	[diff] [blame]	848	return -EBUSY;
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	849	}
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	850	return writeout(mapping, page);
Mel Gorman	b969c4ab	2012-01-12 17:19:34 -0800	[diff] [blame]	851	}
Christoph Lameter	8351a6e	2006-06-23 02:03:33 -0700	[diff] [blame]	852
				853	/*
				854	* Buffers may be managed in a filesystem specific way.
				855	* We must have no buffers or drop them.
				856	*/
David Howells	266cf65	2009-04-03 16:42:36 +0100	[diff] [blame]	857	if (page_has_private(page) &&
Christoph Lameter	8351a6e	2006-06-23 02:03:33 -0700	[diff] [blame]	858	!try_to_release_page(page, GFP_KERNEL))
Mel Gorman	806031b	2019-03-05 15:44:43 -0800	[diff] [blame]	859	return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
Christoph Lameter	8351a6e	2006-06-23 02:03:33 -0700	[diff] [blame]	860
Mel Gorman	a6bc32b	2012-01-12 17:19:43 -0800	[diff] [blame]	861	return migrate_page(mapping, newpage, page, mode);
Christoph Lameter	8351a6e	2006-06-23 02:03:33 -0700	[diff] [blame]	862	}
				863
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	864	/*
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	865	* Move a page to a newly allocated page
				866	* The page is locked and all ptes have been successfully removed.
				867	*
				868	* The new page will have replaced the old page if this function
				869	* is successful.
Lee Schermerhorn	894bc31	2008-10-18 20:26:39 -0700	[diff] [blame]	870	*
				871	* Return value:
				872	* < 0 - error code
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	873	* MIGRATEPAGE_SUCCESS - success
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	874	*/
Mel Gorman	3fe2011	2010-05-24 14:32:20 -0700	[diff] [blame]	875	static int move_to_new_page(struct page newpage, struct page page,
Hugh Dickins	5c3f9a6	2015-11-05 18:49:53 -0800	[diff] [blame]	876	enum migrate_mode mode)
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	877	{
				878	struct address_space *mapping;
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	879	int rc = -EAGAIN;
				880	bool is_lru = !__PageMovable(page);
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	881
Hugh Dickins	7db7671	2015-11-05 18:49:49 -0800	[diff] [blame]	882	VM_BUG_ON_PAGE(!PageLocked(page), page);
				883	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	884
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	885	mapping = page_mapping(page);
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	886
				887	if (likely(is_lru)) {
				888	if (!mapping)
				889	rc = migrate_page(mapping, newpage, page, mode);
				890	else if (mapping->a_ops->migratepage)
				891	/*
				892	* Most pages have a mapping and most filesystems
				893	* provide a migratepage callback. Anonymous pages
				894	* are part of swap space which also has its own
				895	* migratepage callback. This is the most common path
				896	* for page migration.
				897	*/
				898	rc = mapping->a_ops->migratepage(mapping, newpage,
				899	page, mode);
				900	else
				901	rc = fallback_migrate_page(mapping, newpage,
				902	page, mode);
				903	} else {
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	904	/*
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	905	* In case of non-lru page, it could be released after
				906	* isolation step. In that case, we shouldn't try migration.
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	907	*/
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	908	VM_BUG_ON_PAGE(!PageIsolated(page), page);
				909	if (!PageMovable(page)) {
				910	rc = MIGRATEPAGE_SUCCESS;
				911	__ClearPageIsolated(page);
				912	goto out;
				913	}
				914
				915	rc = mapping->a_ops->migratepage(mapping, newpage,
				916	page, mode);
				917	WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
				918	!PageIsolated(page));
				919	}
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	920
Hugh Dickins	5c3f9a6	2015-11-05 18:49:53 -0800	[diff] [blame]	921	/*
				922	* When successful, old pagecache page->mapping must be cleared before
				923	* page is freed; but stats require that PageAnon be left as PageAnon.
				924	*/
				925	if (rc == MIGRATEPAGE_SUCCESS) {
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	926	if (__PageMovable(page)) {
				927	VM_BUG_ON_PAGE(!PageIsolated(page), page);
				928
				929	/*
				930	* We clear PG_movable under page_lock so any compactor
				931	* cannot try to migrate this page.
				932	*/
				933	__ClearPageIsolated(page);
				934	}
				935
				936	/*
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	937	* Anonymous and movable page->mapping will be cleared by
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	938	* free_pages_prepare so don't reset it here for keeping
				939	* the type to work PageAnon, for example.
				940	*/
				941	if (!PageMappingFlags(page))
Hugh Dickins	5c3f9a6	2015-11-05 18:49:53 -0800	[diff] [blame]	942	page->mapping = NULL;
Lars Persson	d2b2c6dd	2019-03-28 20:44:28 -0700	[diff] [blame]	943
Christoph Hellwig	25b2995	2019-06-13 22:50:49 +0200	[diff] [blame]	944	if (likely(!is_zone_device_page(newpage)))
Lars Persson	d2b2c6dd	2019-03-28 20:44:28 -0700	[diff] [blame]	945	flush_dcache_page(newpage);
				946
Mel Gorman	3fe2011	2010-05-24 14:32:20 -0700	[diff] [blame]	947	}
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	948	out:
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	949	return rc;
				950	}
				951
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	952	static int __unmap_and_move(struct page page, struct page newpage,
Hugh Dickins	9c620e2	2013-02-22 16:35:14 -0800	[diff] [blame]	953	int force, enum migrate_mode mode)
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	954	{
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	955	int rc = -EAGAIN;
Baolin Wang	213ecb3	2021-09-08 15:18:06 -0700	[diff] [blame]	956	bool page_was_mapped = false;
Mel Gorman	3f6c827	2010-05-24 14:32:17 -0700	[diff] [blame]	957	struct anon_vma *anon_vma = NULL;
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	958	bool is_lru = !__PageMovable(page);
Christoph Lameter	95a402c	2006-06-23 02:03:53 -0700	[diff] [blame]	959
Nick Piggin	529ae9a	2008-08-02 12:01:03 +0200	[diff] [blame]	960	if (!trylock_page(page)) {
Mel Gorman	a6bc32b	2012-01-12 17:19:43 -0800	[diff] [blame]	961	if (!force \|\| mode == MIGRATE_ASYNC)
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	962	goto out;
Mel Gorman	3e7d344	2011-01-13 15:45:56 -0800	[diff] [blame]	963
				964	/*
				965	* It's not safe for direct compaction to call lock_page.
				966	* For example, during page readahead pages are added locked
				967	* to the LRU. Later, when the IO completes the pages are
				968	* marked uptodate and unlocked. However, the queueing
				969	* could be merging multiple pages for one bio (e.g.
Matthew Wilcox (Oracle)	d438834	2020-06-01 21:47:02 -0700	[diff] [blame]	970	* mpage_readahead). If an allocation happens for the
Mel Gorman	3e7d344	2011-01-13 15:45:56 -0800	[diff] [blame]	971	* second or third page, the process can end up locking
				972	* the same page twice and deadlocking. Rather than
				973	* trying to be clever about what pages can be locked,
				974	* avoid the use of lock_page for direct compaction
				975	* altogether.
				976	*/
				977	if (current->flags & PF_MEMALLOC)
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	978	goto out;
Mel Gorman	3e7d344	2011-01-13 15:45:56 -0800	[diff] [blame]	979
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	980	lock_page(page);
				981	}
				982
				983	if (PageWriteback(page)) {
Andrea Arcangeli	11bc82d	2011-03-22 16:33:11 -0700	[diff] [blame]	984	/*
Jianguo Wu	fed5b64	2013-04-29 15:07:58 -0700	[diff] [blame]	985	* Only in the case of a full synchronous migration is it
Mel Gorman	a6bc32b	2012-01-12 17:19:43 -0800	[diff] [blame]	986	* necessary to wait for PageWriteback. In the async case,
				987	* the retry loop is too short and in the sync-light case,
				988	* the overhead of stalling is too much
Andrea Arcangeli	11bc82d	2011-03-22 16:33:11 -0700	[diff] [blame]	989	*/
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	990	switch (mode) {
				991	case MIGRATE_SYNC:
				992	case MIGRATE_SYNC_NO_COPY:
				993	break;
				994	default:
Andrea Arcangeli	11bc82d	2011-03-22 16:33:11 -0700	[diff] [blame]	995	rc = -EBUSY;
Johannes Weiner	0a31bc9	2014-08-08 14:19:22 -0700	[diff] [blame]	996	goto out_unlock;
Andrea Arcangeli	11bc82d	2011-03-22 16:33:11 -0700	[diff] [blame]	997	}
				998	if (!force)
Johannes Weiner	0a31bc9	2014-08-08 14:19:22 -0700	[diff] [blame]	999	goto out_unlock;
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1000	wait_on_page_writeback(page);
				1001	}
Hugh Dickins	03f15c8	2015-11-05 18:49:56 -0800	[diff] [blame]	1002
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1003	/*
Baolin Wang	68a9843	2021-09-08 15:18:03 -0700	[diff] [blame]	1004	* By try_to_migrate(), page->mapcount goes down to 0 here. In this case,
KAMEZAWA Hiroyuki	dc386d4	2007-07-26 10:41:07 -0700	[diff] [blame]	1005	* we cannot notice that anon_vma is freed while we migrates a page.
Hugh Dickins	1ce82b6	2011-01-13 15:47:30 -0800	[diff] [blame]	1006	* This get_anon_vma() delays freeing anon_vma pointer until the end
KAMEZAWA Hiroyuki	dc386d4	2007-07-26 10:41:07 -0700	[diff] [blame]	1007	* of migration. File cache pages are no problem because of page_lock()
KAMEZAWA Hiroyuki	989f89c	2007-08-30 23:56:21 -0700	[diff] [blame]	1008	* File Caches may use write_page() or lock_page() in migration, then,
				1009	* just care Anon page here.
Hugh Dickins	03f15c8	2015-11-05 18:49:56 -0800	[diff] [blame]	1010	*
				1011	* Only page_get_anon_vma() understands the subtleties of
				1012	* getting a hold on an anon_vma from outside one of its mms.
				1013	* But if we cannot get anon_vma, then we won't need it anyway,
				1014	* because that implies that the anon page is no longer mapped
				1015	* (and cannot be remapped so long as we hold the page lock).
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1016	*/
Hugh Dickins	03f15c8	2015-11-05 18:49:56 -0800	[diff] [blame]	1017	if (PageAnon(page) && !PageKsm(page))
Peter Zijlstra	746b18d	2011-05-24 17:12:10 -0700	[diff] [blame]	1018	anon_vma = page_get_anon_vma(page);
Shaohua Li	62e1c55	2008-02-04 22:29:33 -0800	[diff] [blame]	1019
Hugh Dickins	7db7671	2015-11-05 18:49:49 -0800	[diff] [blame]	1020	/*
				1021	* Block others from accessing the new page when we get around to
				1022	* establishing additional references. We are usually the only one
				1023	* holding a reference to newpage at this point. We used to have a BUG
				1024	* here if trylock_page(newpage) fails, but would like to allow for
				1025	* cases where there might be a race with the previous use of newpage.
				1026	* This is much like races on refcount of oldpage: just don't BUG().
				1027	*/
				1028	if (unlikely(!trylock_page(newpage)))
				1029	goto out_unlock;
				1030
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	1031	if (unlikely(!is_lru)) {
				1032	rc = move_to_new_page(newpage, page, mode);
				1033	goto out_unlock_both;
				1034	}
				1035
KAMEZAWA Hiroyuki	dc386d4	2007-07-26 10:41:07 -0700	[diff] [blame]	1036	/*
Shaohua Li	62e1c55	2008-02-04 22:29:33 -0800	[diff] [blame]	1037	* Corner case handling:
				1038	* 1. When a new swap-cache page is read into, it is added to the LRU
				1039	* and treated as swapcache but it has no rmap yet.
				1040	* Calling try_to_unmap() against a page->mapping==NULL page will
				1041	* trigger a BUG. So handle it here.
Yang Shi	d12b895	2020-12-14 19:13:02 -0800	[diff] [blame]	1042	* 2. An orphaned page (see truncate_cleanup_page) might have
Shaohua Li	62e1c55	2008-02-04 22:29:33 -0800	[diff] [blame]	1043	* fs-private metadata. The page can be picked up due to memory
				1044	* offlining. Everywhere else except page reclaim, the page is
				1045	* invisible to the vm, so the page can not be migrated. So try to
				1046	* free the metadata, so the page can be freed.
KAMEZAWA Hiroyuki	dc386d4	2007-07-26 10:41:07 -0700	[diff] [blame]	1047	*/
Shaohua Li	62e1c55	2008-02-04 22:29:33 -0800	[diff] [blame]	1048	if (!page->mapping) {
Sasha Levin	309381fea	2014-01-23 15:52:54 -0800	[diff] [blame]	1049	VM_BUG_ON_PAGE(PageAnon(page), page);
Hugh Dickins	1ce82b6	2011-01-13 15:47:30 -0800	[diff] [blame]	1050	if (page_has_private(page)) {
Shaohua Li	62e1c55	2008-02-04 22:29:33 -0800	[diff] [blame]	1051	try_to_free_buffers(page);
Hugh Dickins	7db7671	2015-11-05 18:49:49 -0800	[diff] [blame]	1052	goto out_unlock_both;
Shaohua Li	62e1c55	2008-02-04 22:29:33 -0800	[diff] [blame]	1053	}
Hugh Dickins	7db7671	2015-11-05 18:49:49 -0800	[diff] [blame]	1054	} else if (page_mapped(page)) {
				1055	/* Establish migration ptes */
Hugh Dickins	03f15c8	2015-11-05 18:49:56 -0800	[diff] [blame]	1056	VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
				1057	page);
Alistair Popple	a98a2f0	2021-06-30 18:54:16 -0700	[diff] [blame]	1058	try_to_migrate(page, 0);
Baolin Wang	213ecb3	2021-09-08 15:18:06 -0700	[diff] [blame]	1059	page_was_mapped = true;
Hugh Dickins	2ebba6b	2014-12-12 16:56:19 -0800	[diff] [blame]	1060	}
KAMEZAWA Hiroyuki	dc386d4	2007-07-26 10:41:07 -0700	[diff] [blame]	1061
Christoph Lameter	e6a1530	2006-06-25 05:46:49 -0700	[diff] [blame]	1062	if (!page_mapped(page))
Hugh Dickins	5c3f9a6	2015-11-05 18:49:53 -0800	[diff] [blame]	1063	rc = move_to_new_page(newpage, page, mode);
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1064
Hugh Dickins	5c3f9a6	2015-11-05 18:49:53 -0800	[diff] [blame]	1065	if (page_was_mapped)
				1066	remove_migration_ptes(page,
Kirill A. Shutemov	e388466	2016-03-17 14:20:07 -0700	[diff] [blame]	1067	rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);
Mel Gorman	3f6c827	2010-05-24 14:32:17 -0700	[diff] [blame]	1068
Hugh Dickins	7db7671	2015-11-05 18:49:49 -0800	[diff] [blame]	1069	out_unlock_both:
				1070	unlock_page(newpage);
				1071	out_unlock:
Mel Gorman	3f6c827	2010-05-24 14:32:17 -0700	[diff] [blame]	1072	/* Drop an anon_vma reference if we took one */
Rik van Riel	7654506	2010-08-09 17:18:41 -0700	[diff] [blame]	1073	if (anon_vma)
Peter Zijlstra	9e60109	2011-03-22 16:32:46 -0700	[diff] [blame]	1074	put_anon_vma(anon_vma);
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1075	unlock_page(page);
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1076	out:
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1077	/*
				1078	* If migration is successful, decrease refcount of the newpage
				1079	* which will not free the page because new page owner increased
				1080	* refcounter. As well, if it is LRU page, add the page to LRU
David Hildenbrand	e0a352f	2019-02-01 14:21:19 -0800	[diff] [blame]	1081	* list in here. Use the old state of the isolated source page to
				1082	* determine if we migrated a LRU page. newpage was already unlocked
				1083	* and possibly modified by its owner - don't rely on the page
				1084	* state.
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1085	*/
				1086	if (rc == MIGRATEPAGE_SUCCESS) {
David Hildenbrand	e0a352f	2019-02-01 14:21:19 -0800	[diff] [blame]	1087	if (unlikely(!is_lru))
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1088	put_page(newpage);
				1089	else
				1090	putback_lru_page(newpage);
				1091	}
				1092
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1093	return rc;
				1094	}
Christoph Lameter	95a402c	2006-06-23 02:03:53 -0700	[diff] [blame]	1095
Dave Hansen	79c28a4	2021-09-02 14:59:06 -0700	[diff] [blame]	1096
				1097	/*
				1098	* node_demotion[] example:
				1099	*
				1100	* Consider a system with two sockets. Each socket has
				1101	* three classes of memory attached: fast, medium and slow.
				1102	* Each memory class is placed in its own NUMA node. The
				1103	* CPUs are placed in the node with the "fast" memory. The
				1104	* 6 NUMA nodes (0-5) might be split among the sockets like
				1105	* this:
				1106	*
				1107	* Socket A: 0, 1, 2
				1108	* Socket B: 3, 4, 5
				1109	*
				1110	* When Node 0 fills up, its memory should be migrated to
				1111	* Node 1. When Node 1 fills up, it should be migrated to
				1112	* Node 2. The migration path starts on the nodes with the
				1113	* processors (since allocations default to this node) and
				1114	* fast memory, progresses through medium and ends with the
				1115	* slow memory:
				1116	*
				1117	* 0 -> 1 -> 2 -> stop
				1118	* 3 -> 4 -> 5 -> stop
				1119	*
				1120	* This is represented in the node_demotion[] like this:
				1121	*
				1122	* { 1, // Node 0 migrates to 1
				1123	* 2, // Node 1 migrates to 2
				1124	* -1, // Node 2 does not migrate
				1125	* 4, // Node 3 migrates to 4
				1126	* 5, // Node 4 migrates to 5
				1127	* -1} // Node 5 does not migrate
				1128	*/
				1129
				1130	/*
				1131	* Writes to this array occur without locking. Cycles are
				1132	* not allowed: Node X demotes to Y which demotes to X...
				1133	*
				1134	* If multiple reads are performed, a single rcu_read_lock()
				1135	* must be held over all reads to ensure that no cycles are
				1136	* observed.
				1137	*/
				1138	static int node_demotion[MAX_NUMNODES] __read_mostly =
				1139	{[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE};
				1140
				1141	/**
				1142	* next_demotion_node() - Get the next node in the demotion path
				1143	* @node: The starting node to lookup the next node
				1144	*
Randy Dunlap	c9bd7d1	2021-09-02 15:00:36 -0700	[diff] [blame]	1145	* Return: node id for next memory node in the demotion path hierarchy
Dave Hansen	79c28a4	2021-09-02 14:59:06 -0700	[diff] [blame]	1146	* from @node; NUMA_NO_NODE if @node is terminal. This does not keep
				1147	* @node online or guarantee that it continues to be the next demotion
				1148	* target.
				1149	*/
				1150	int next_demotion_node(int node)
				1151	{
				1152	int target;
				1153
				1154	/*
				1155	* node_demotion[] is updated without excluding this
				1156	* function from running. RCU doesn't provide any
				1157	* compiler barriers, so the READ_ONCE() is required
				1158	* to avoid compiler reordering or read merging.
				1159	*
				1160	* Make sure to use RCU over entire code blocks if
				1161	* node_demotion[] reads need to be consistent.
				1162	*/
				1163	rcu_read_lock();
				1164	target = READ_ONCE(node_demotion[node]);
				1165	rcu_read_unlock();
				1166
				1167	return target;
				1168	}
				1169
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1170	/*
				1171	* Obtain the lock on page, remove all ptes and migrate the page
				1172	* to the newly allocated page in newpage.
				1173	*/
Linus Torvalds	6ec4476	2020-07-08 10:48:35 -0700	[diff] [blame]	1174	static int unmap_and_move(new_page_t get_new_page,
Geert Uytterhoeven	ef2a515	2015-04-14 15:44:22 -0700	[diff] [blame]	1175	free_page_t put_new_page,
				1176	unsigned long private, struct page *page,
Naoya Horiguchi	add05ce	2015-06-24 16:56:50 -0700	[diff] [blame]	1177	int force, enum migrate_mode mode,
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1178	enum migrate_reason reason,
				1179	struct list_head *ret)
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1180	{
Hugh Dickins	2def742	2015-11-05 18:49:46 -0800	[diff] [blame]	1181	int rc = MIGRATEPAGE_SUCCESS;
Yang Shi	74d4a57	2019-11-30 17:57:12 -0800	[diff] [blame]	1182	struct page *newpage = NULL;
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1183
Michal Hocko	94723aa	2018-04-10 16:30:07 -0700	[diff] [blame]	1184	if (!thp_migration_supported() && PageTransHuge(page))
Yang Shi	d532e2e	2020-12-14 19:13:16 -0800	[diff] [blame]	1185	return -ENOSYS;
Michal Hocko	94723aa	2018-04-10 16:30:07 -0700	[diff] [blame]	1186
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1187	if (page_count(page) == 1) {
				1188	/* page was freed from under us. So we are done. */
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1189	ClearPageActive(page);
				1190	ClearPageUnevictable(page);
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	1191	if (unlikely(__PageMovable(page))) {
				1192	lock_page(page);
				1193	if (!PageMovable(page))
				1194	__ClearPageIsolated(page);
				1195	unlock_page(page);
				1196	}
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1197	goto out;
				1198	}
				1199
Yang Shi	74d4a57	2019-11-30 17:57:12 -0800	[diff] [blame]	1200	newpage = get_new_page(page, private);
				1201	if (!newpage)
				1202	return -ENOMEM;
				1203
Hugh Dickins	9c620e2	2013-02-22 16:35:14 -0800	[diff] [blame]	1204	rc = __unmap_and_move(page, newpage, force, mode);
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1205	if (rc == MIGRATEPAGE_SUCCESS)
Vlastimil Babka	7cd12b4	2016-03-15 14:56:18 -0700	[diff] [blame]	1206	set_page_owner_migrate_reason(newpage, reason);
Rafael Aquini	bf6bddf1	2012-12-11 16:02:42 -0800	[diff] [blame]	1207
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1208	out:
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1209	if (rc != -EAGAIN) {
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1210	/*
				1211	* A page that has been migrated has all references
				1212	* removed and will be freed. A page that has not been
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	1213	* migrated will have kept its references and be restored.
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1214	*/
				1215	list_del(&page->lru);
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1216	}
David Rientjes	68711a7	2014-06-04 16:08:25 -0700	[diff] [blame]	1217
Christoph Lameter	95a402c	2006-06-23 02:03:53 -0700	[diff] [blame]	1218	/*
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1219	* If migration is successful, releases reference grabbed during
				1220	* isolation. Otherwise, restore the page to right list unless
				1221	* we want to retry.
Christoph Lameter	95a402c	2006-06-23 02:03:53 -0700	[diff] [blame]	1222	*/
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1223	if (rc == MIGRATEPAGE_SUCCESS) {
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1224	/*
				1225	* Compaction can migrate also non-LRU pages which are
				1226	* not accounted to NR_ISOLATED_*. They can be recognized
				1227	* as __PageMovable
				1228	*/
				1229	if (likely(!__PageMovable(page)))
				1230	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
				1231	page_is_file_lru(page), -thp_nr_pages(page));
				1232
Oscar Salvador	79f5f8f	2020-10-15 20:07:09 -0700	[diff] [blame]	1233	if (reason != MR_MEMORY_FAILURE)
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1234	/*
Oscar Salvador	79f5f8f	2020-10-15 20:07:09 -0700	[diff] [blame]	1235	* We release the page in page_handle_poison.
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1236	*/
Oscar Salvador	79f5f8f	2020-10-15 20:07:09 -0700	[diff] [blame]	1237	put_page(page);
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1238	} else {
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1239	if (rc != -EAGAIN)
				1240	list_add_tail(&page->lru, ret);
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	1241
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1242	if (put_new_page)
				1243	put_new_page(newpage, private);
				1244	else
				1245	put_page(newpage);
				1246	}
David Rientjes	68711a7	2014-06-04 16:08:25 -0700	[diff] [blame]	1247
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1248	return rc;
				1249	}
				1250
				1251	/*
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1252	* Counterpart of unmap_and_move_page() for hugepage migration.
				1253	*
				1254	* This function doesn't wait the completion of hugepage I/O
				1255	* because there is no race between I/O and migration for hugepage.
				1256	* Note that currently hugepage I/O occurs only in direct I/O
				1257	* where no lock is held and PG_writeback is irrelevant,
				1258	* and writeback status of all subpages are counted in the reference
				1259	* count of the head page (i.e. if all subpages of a 2MB hugepage are
				1260	* under direct I/O, the reference of the head page is 512 and a bit more.)
				1261	* This means that when we try to migrate hugepage whose subpages are
				1262	* doing direct I/O, some references remain after try_to_unmap() and
				1263	* hugepage migration fails without data corruption.
				1264	*
				1265	* There is also no race when direct I/O is issued on the page under migration,
				1266	* because then pte is replaced with migration swap entry and direct I/O code
				1267	* will wait in the page fault for migration to complete.
				1268	*/
				1269	static int unmap_and_move_huge_page(new_page_t get_new_page,
David Rientjes	68711a7	2014-06-04 16:08:25 -0700	[diff] [blame]	1270	free_page_t put_new_page, unsigned long private,
				1271	struct page *hpage, int force,
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1272	enum migrate_mode mode, int reason,
				1273	struct list_head *ret)
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1274	{
Hugh Dickins	2def742	2015-11-05 18:49:46 -0800	[diff] [blame]	1275	int rc = -EAGAIN;
Hugh Dickins	2ebba6b	2014-12-12 16:56:19 -0800	[diff] [blame]	1276	int page_was_mapped = 0;
Joonsoo Kim	32665f2	2014-01-21 15:51:15 -0800	[diff] [blame]	1277	struct page *new_hpage;
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1278	struct anon_vma *anon_vma = NULL;
Mike Kravetz	c0d0381	2020-04-01 21:11:05 -0700	[diff] [blame]	1279	struct address_space *mapping = NULL;
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1280
Naoya Horiguchi	83467ef	2013-09-11 14:22:11 -0700	[diff] [blame]	1281	/*
Anshuman Khandual	7ed2c31	2019-03-05 15:43:44 -0800	[diff] [blame]	1282	* Migratability of hugepages depends on architectures and their size.
Naoya Horiguchi	83467ef	2013-09-11 14:22:11 -0700	[diff] [blame]	1283	* This check is necessary because some callers of hugepage migration
				1284	* like soft offline and memory hotremove don't walk through page
				1285	* tables or check whether the hugepage is pmd-based or not before
				1286	* kicking migration.
				1287	*/
Naoya Horiguchi	100873d	2014-06-04 16:10:56 -0700	[diff] [blame]	1288	if (!hugepage_migration_supported(page_hstate(hpage))) {
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1289	list_move_tail(&hpage->lru, ret);
Naoya Horiguchi	83467ef	2013-09-11 14:22:11 -0700	[diff] [blame]	1290	return -ENOSYS;
Joonsoo Kim	32665f2	2014-01-21 15:51:15 -0800	[diff] [blame]	1291	}
Naoya Horiguchi	83467ef	2013-09-11 14:22:11 -0700	[diff] [blame]	1292
Muchun Song	71a64f6	2021-02-04 18:32:17 -0800	[diff] [blame]	1293	if (page_count(hpage) == 1) {
				1294	/* page was freed from under us. So we are done. */
				1295	putback_active_hugepage(hpage);
				1296	return MIGRATEPAGE_SUCCESS;
				1297	}
				1298
Michal Hocko	666feb2	2018-04-10 16:30:03 -0700	[diff] [blame]	1299	new_hpage = get_new_page(hpage, private);
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1300	if (!new_hpage)
				1301	return -ENOMEM;
				1302
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1303	if (!trylock_page(hpage)) {
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	1304	if (!force)
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1305	goto out;
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	1306	switch (mode) {
				1307	case MIGRATE_SYNC:
				1308	case MIGRATE_SYNC_NO_COPY:
				1309	break;
				1310	default:
				1311	goto out;
				1312	}
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1313	lock_page(hpage);
				1314	}
				1315
Mike Kravetz	cb6acd0	2019-02-28 16:22:02 -0800	[diff] [blame]	1316	/*
				1317	* Check for pages which are in the process of being freed. Without
				1318	* page_mapping() set, hugetlbfs specific move page routine will not
				1319	* be called and we could leak usage counts for subpools.
				1320	*/
Muchun Song	6acfb5b	2021-06-30 18:51:29 -0700	[diff] [blame]	1321	if (hugetlb_page_subpool(hpage) && !page_mapping(hpage)) {
Mike Kravetz	cb6acd0	2019-02-28 16:22:02 -0800	[diff] [blame]	1322	rc = -EBUSY;
				1323	goto out_unlock;
				1324	}
				1325
Peter Zijlstra	746b18d	2011-05-24 17:12:10 -0700	[diff] [blame]	1326	if (PageAnon(hpage))
				1327	anon_vma = page_get_anon_vma(hpage);
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1328
Hugh Dickins	7db7671	2015-11-05 18:49:49 -0800	[diff] [blame]	1329	if (unlikely(!trylock_page(new_hpage)))
				1330	goto put_anon;
				1331
Hugh Dickins	2ebba6b	2014-12-12 16:56:19 -0800	[diff] [blame]	1332	if (page_mapped(hpage)) {
Mike Kravetz	336bf30	2020-11-13 22:52:16 -0800	[diff] [blame]	1333	bool mapping_locked = false;
Alistair Popple	a98a2f0	2021-06-30 18:54:16 -0700	[diff] [blame]	1334	enum ttu_flags ttu = 0;
Mike Kravetz	c0d0381	2020-04-01 21:11:05 -0700	[diff] [blame]	1335
Mike Kravetz	336bf30	2020-11-13 22:52:16 -0800	[diff] [blame]	1336	if (!PageAnon(hpage)) {
				1337	/*
				1338	* In shared mappings, try_to_unmap could potentially
				1339	* call huge_pmd_unshare. Because of this, take
				1340	* semaphore in write mode here and set TTU_RMAP_LOCKED
				1341	* to let lower levels know we have taken the lock.
				1342	*/
				1343	mapping = hugetlb_page_mapping_lock_write(hpage);
				1344	if (unlikely(!mapping))
				1345	goto unlock_put_anon;
				1346
				1347	mapping_locked = true;
				1348	ttu \|= TTU_RMAP_LOCKED;
				1349	}
				1350
Alistair Popple	a98a2f0	2021-06-30 18:54:16 -0700	[diff] [blame]	1351	try_to_migrate(hpage, ttu);
Hugh Dickins	2ebba6b	2014-12-12 16:56:19 -0800	[diff] [blame]	1352	page_was_mapped = 1;
Mike Kravetz	336bf30	2020-11-13 22:52:16 -0800	[diff] [blame]	1353
				1354	if (mapping_locked)
				1355	i_mmap_unlock_write(mapping);
Hugh Dickins	2ebba6b	2014-12-12 16:56:19 -0800	[diff] [blame]	1356	}
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1357
				1358	if (!page_mapped(hpage))
Hugh Dickins	5c3f9a6	2015-11-05 18:49:53 -0800	[diff] [blame]	1359	rc = move_to_new_page(new_hpage, hpage, mode);
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1360
Mike Kravetz	336bf30	2020-11-13 22:52:16 -0800	[diff] [blame]	1361	if (page_was_mapped)
Hugh Dickins	5c3f9a6	2015-11-05 18:49:53 -0800	[diff] [blame]	1362	remove_migration_ptes(hpage,
Mike Kravetz	336bf30	2020-11-13 22:52:16 -0800	[diff] [blame]	1363	rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1364
Mike Kravetz	c0d0381	2020-04-01 21:11:05 -0700	[diff] [blame]	1365	unlock_put_anon:
Hugh Dickins	7db7671	2015-11-05 18:49:49 -0800	[diff] [blame]	1366	unlock_page(new_hpage);
				1367
				1368	put_anon:
Hugh Dickins	fd4a466	2011-01-13 15:47:31 -0800	[diff] [blame]	1369	if (anon_vma)
Peter Zijlstra	9e60109	2011-03-22 16:32:46 -0700	[diff] [blame]	1370	put_anon_vma(anon_vma);
Aneesh Kumar K.V	8e6ac7f	2012-07-31 16:42:27 -0700	[diff] [blame]	1371
Hugh Dickins	2def742	2015-11-05 18:49:46 -0800	[diff] [blame]	1372	if (rc == MIGRATEPAGE_SUCCESS) {
Michal Hocko	ab5ac90	2018-01-31 16:20:48 -0800	[diff] [blame]	1373	move_hugetlb_state(hpage, new_hpage, reason);
Hugh Dickins	2def742	2015-11-05 18:49:46 -0800	[diff] [blame]	1374	put_new_page = NULL;
				1375	}
Aneesh Kumar K.V	8e6ac7f	2012-07-31 16:42:27 -0700	[diff] [blame]	1376
Mike Kravetz	cb6acd0	2019-02-28 16:22:02 -0800	[diff] [blame]	1377	out_unlock:
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1378	unlock_page(hpage);
Hillf Danton	0976133	2011-12-08 14:34:20 -0800	[diff] [blame]	1379	out:
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1380	if (rc == MIGRATEPAGE_SUCCESS)
Naoya Horiguchi	b8ec1ce	2013-09-11 14:22:01 -0700	[diff] [blame]	1381	putback_active_hugepage(hpage);
Miaohe Lin	a04840c	2021-05-04 18:37:07 -0700	[diff] [blame]	1382	else if (rc != -EAGAIN)
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1383	list_move_tail(&hpage->lru, ret);
David Rientjes	68711a7	2014-06-04 16:08:25 -0700	[diff] [blame]	1384
				1385	/*
				1386	* If migration was not successful and there's a freeing callback, use
				1387	* it. Otherwise, put_page() will drop the reference grabbed during
				1388	* isolation.
				1389	*/
Hugh Dickins	2def742	2015-11-05 18:49:46 -0800	[diff] [blame]	1390	if (put_new_page)
David Rientjes	68711a7	2014-06-04 16:08:25 -0700	[diff] [blame]	1391	put_new_page(new_hpage, private);
				1392	else
Naoya Horiguchi	3aaa76e	2015-09-22 14:59:14 -0700	[diff] [blame]	1393	putback_active_hugepage(new_hpage);
David Rientjes	68711a7	2014-06-04 16:08:25 -0700	[diff] [blame]	1394
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1395	return rc;
				1396	}
				1397
Yang Shi	d532e2e	2020-12-14 19:13:16 -0800	[diff] [blame]	1398	static inline int try_split_thp(struct page page, struct page *page2,
				1399	struct list_head *from)
				1400	{
				1401	int rc = 0;
				1402
				1403	lock_page(page);
				1404	rc = split_huge_page_to_list(page, from);
				1405	unlock_page(page);
				1406	if (!rc)
				1407	list_safe_reset_next(page, *page2, lru);
				1408
				1409	return rc;
				1410	}
				1411
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1412	/*
Srivatsa S. Bhat	c73e5c9	2013-04-29 15:08:16 -0700	[diff] [blame]	1413	* migrate_pages - migrate the pages specified in a list, to the free pages
				1414	* supplied as the target for the page migration
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1415	*
Srivatsa S. Bhat	c73e5c9	2013-04-29 15:08:16 -0700	[diff] [blame]	1416	* @from: The list of pages to be migrated.
				1417	* @get_new_page: The function used to allocate free pages to be used
				1418	* as the target of the page migration.
David Rientjes	68711a7	2014-06-04 16:08:25 -0700	[diff] [blame]	1419	* @put_new_page: The function used to free target pages if migration
				1420	* fails, or NULL if no special handling is necessary.
Srivatsa S. Bhat	c73e5c9	2013-04-29 15:08:16 -0700	[diff] [blame]	1421	* @private: Private data to be passed on to get_new_page()
				1422	* @mode: The migration mode that specifies the constraints for
				1423	* page migration, if any.
				1424	* @reason: The reason for page migration.
Yang Shi	5ac9588	2021-09-02 14:59:13 -0700	[diff] [blame]	1425	* @ret_succeeded: Set to the number of pages migrated successfully if
				1426	* the caller passes a non-NULL pointer.
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1427	*
Srivatsa S. Bhat	c73e5c9	2013-04-29 15:08:16 -0700	[diff] [blame]	1428	* The function returns after 10 attempts or if no pages are movable any more
				1429	* because the list has become empty or no retryable pages exist any more.
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1430	* It is caller's responsibility to call putback_movable_pages() to return pages
				1431	* to the LRU or free list only if ret != 0.
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1432	*
Srivatsa S. Bhat	c73e5c9	2013-04-29 15:08:16 -0700	[diff] [blame]	1433	* Returns the number of pages that were not migrated, or an error code.
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1434	*/
Hugh Dickins	9c620e2	2013-02-22 16:35:14 -0800	[diff] [blame]	1435	int migrate_pages(struct list_head *from, new_page_t get_new_page,
David Rientjes	68711a7	2014-06-04 16:08:25 -0700	[diff] [blame]	1436	free_page_t put_new_page, unsigned long private,
Yang Shi	5ac9588	2021-09-02 14:59:13 -0700	[diff] [blame]	1437	enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1438	{
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1439	int retry = 1;
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1440	int thp_retry = 1;
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1441	int nr_failed = 0;
Mel Gorman	5647bc2	2012-10-19 10:46:20 +0100	[diff] [blame]	1442	int nr_succeeded = 0;
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1443	int nr_thp_succeeded = 0;
				1444	int nr_thp_failed = 0;
				1445	int nr_thp_split = 0;
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1446	int pass = 0;
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1447	bool is_thp = false;
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1448	struct page *page;
				1449	struct page *page2;
				1450	int swapwrite = current->flags & PF_SWAPWRITE;
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1451	int rc, nr_subpages;
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1452	LIST_HEAD(ret_pages);
Yang Shi	b0b515b	2021-06-30 18:51:48 -0700	[diff] [blame]	1453	bool nosplit = (reason == MR_NUMA_MISPLACED);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1454
Liam Mark	7bc1aec	2021-05-04 18:37:25 -0700	[diff] [blame]	1455	trace_mm_migrate_pages_start(mode, reason);
				1456
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1457	if (!swapwrite)
				1458	current->flags \|= PF_SWAPWRITE;
				1459
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1460	for (pass = 0; pass < 10 && (retry \|\| thp_retry); pass++) {
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1461	retry = 0;
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1462	thp_retry = 0;
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1463
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1464	list_for_each_entry_safe(page, page2, from, lru) {
Michal Hocko	94723aa	2018-04-10 16:30:07 -0700	[diff] [blame]	1465	retry:
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1466	/*
				1467	* THP statistics is based on the source huge page.
				1468	* Capture required information that might get lost
				1469	* during migration.
				1470	*/
Zi Yan	6c5c7b9	2020-09-25 21:19:14 -0700	[diff] [blame]	1471	is_thp = PageTransHuge(page) && !PageHuge(page);
Matthew Wilcox (Oracle)	6c35784	2020-08-14 17:30:37 -0700	[diff] [blame]	1472	nr_subpages = thp_nr_pages(page);
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1473	cond_resched();
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1474
Naoya Horiguchi	31caf66	2013-09-11 14:21:59 -0700	[diff] [blame]	1475	if (PageHuge(page))
				1476	rc = unmap_and_move_huge_page(get_new_page,
David Rientjes	68711a7	2014-06-04 16:08:25 -0700	[diff] [blame]	1477	put_new_page, private, page,
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1478	pass > 2, mode, reason,
				1479	&ret_pages);
Naoya Horiguchi	31caf66	2013-09-11 14:21:59 -0700	[diff] [blame]	1480	else
David Rientjes	68711a7	2014-06-04 16:08:25 -0700	[diff] [blame]	1481	rc = unmap_and_move(get_new_page, put_new_page,
Naoya Horiguchi	add05ce	2015-06-24 16:56:50 -0700	[diff] [blame]	1482	private, page, pass > 2, mode,
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1483	reason, &ret_pages);
				1484	/*
				1485	* The rules are:
				1486	* Success: non hugetlb page will be freed, hugetlb
				1487	* page will be put back
				1488	* -EAGAIN: stay on the from list
				1489	* -ENOMEM: stay on the from list
				1490	* Other errno: put on ret_pages list then splice to
				1491	* from list
				1492	*/
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1493	switch(rc) {
Yang Shi	d532e2e	2020-12-14 19:13:16 -0800	[diff] [blame]	1494	/*
				1495	* THP migration might be unsupported or the
				1496	* allocation could've failed so we should
				1497	* retry on the same page with the THP split
				1498	* to base pages.
				1499	*
				1500	* Head page is retried immediately and tail
				1501	* pages are added to the tail of the list so
				1502	* we encounter them after the rest of the list
				1503	* is processed.
				1504	*/
				1505	case -ENOSYS:
				1506	/* THP migration is unsupported */
				1507	if (is_thp) {
				1508	if (!try_split_thp(page, &page2, from)) {
				1509	nr_thp_split++;
				1510	goto retry;
				1511	}
				1512
				1513	nr_thp_failed++;
				1514	nr_failed += nr_subpages;
				1515	break;
				1516	}
				1517
				1518	/* Hugetlb migration is unsupported */
				1519	nr_failed++;
				1520	break;
Christoph Lameter	95a402c	2006-06-23 02:03:53 -0700	[diff] [blame]	1521	case -ENOMEM:
Michal Hocko	94723aa	2018-04-10 16:30:07 -0700	[diff] [blame]	1522	/*
Yang Shi	d532e2e	2020-12-14 19:13:16 -0800	[diff] [blame]	1523	* When memory is low, don't bother to try to migrate
				1524	* other pages, just exit.
Yang Shi	b0b515b	2021-06-30 18:51:48 -0700	[diff] [blame]	1525	* THP NUMA faulting doesn't split THP to retry.
Michal Hocko	94723aa	2018-04-10 16:30:07 -0700	[diff] [blame]	1526	*/
Yang Shi	b0b515b	2021-06-30 18:51:48 -0700	[diff] [blame]	1527	if (is_thp && !nosplit) {
Yang Shi	d532e2e	2020-12-14 19:13:16 -0800	[diff] [blame]	1528	if (!try_split_thp(page, &page2, from)) {
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1529	nr_thp_split++;
Michal Hocko	94723aa	2018-04-10 16:30:07 -0700	[diff] [blame]	1530	goto retry;
				1531	}
Zi Yan	6c5c7b9	2020-09-25 21:19:14 -0700	[diff] [blame]	1532
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1533	nr_thp_failed++;
				1534	nr_failed += nr_subpages;
				1535	goto out;
				1536	}
David Rientjes	dfef2ef	2016-05-20 16:59:05 -0700	[diff] [blame]	1537	nr_failed++;
Christoph Lameter	95a402c	2006-06-23 02:03:53 -0700	[diff] [blame]	1538	goto out;
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1539	case -EAGAIN:
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1540	if (is_thp) {
				1541	thp_retry++;
				1542	break;
				1543	}
Christoph Lameter	2d1db3b	2006-06-23 02:03:33 -0700	[diff] [blame]	1544	retry++;
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1545	break;
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	1546	case MIGRATEPAGE_SUCCESS:
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1547	if (is_thp) {
				1548	nr_thp_succeeded++;
				1549	nr_succeeded += nr_subpages;
				1550	break;
				1551	}
Mel Gorman	5647bc2	2012-10-19 10:46:20 +0100	[diff] [blame]	1552	nr_succeeded++;
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1553	break;
				1554	default:
Naoya Horiguchi	354a336	2014-01-21 15:51:14 -0800	[diff] [blame]	1555	/*
Yang Shi	d532e2e	2020-12-14 19:13:16 -0800	[diff] [blame]	1556	* Permanent failure (-EBUSY, etc.):
Naoya Horiguchi	354a336	2014-01-21 15:51:14 -0800	[diff] [blame]	1557	* unlike -EAGAIN case, the failed page is
				1558	* removed from migration page list and not
				1559	* retried in the next outer loop.
				1560	*/
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1561	if (is_thp) {
				1562	nr_thp_failed++;
				1563	nr_failed += nr_subpages;
				1564	break;
				1565	}
Christoph Lameter	2d1db3b	2006-06-23 02:03:33 -0700	[diff] [blame]	1566	nr_failed++;
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1567	break;
Christoph Lameter	2d1db3b	2006-06-23 02:03:33 -0700	[diff] [blame]	1568	}
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1569	}
				1570	}
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1571	nr_failed += retry + thp_retry;
				1572	nr_thp_failed += thp_retry;
Vlastimil Babka	f2f81fb	2015-11-05 18:47:03 -0800	[diff] [blame]	1573	rc = nr_failed;
Christoph Lameter	95a402c	2006-06-23 02:03:53 -0700	[diff] [blame]	1574	out:
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1575	/*
				1576	* Put the permanent failure page back to migration list, they
				1577	* will be put back to the right list by the caller.
				1578	*/
				1579	list_splice(&ret_pages, from);
				1580
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1581	count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
				1582	count_vm_events(PGMIGRATE_FAIL, nr_failed);
				1583	count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
				1584	count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
				1585	count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
				1586	trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded,
				1587	nr_thp_failed, nr_thp_split, mode, reason);
Mel Gorman	7b2a2d4	2012-10-19 14:07:31 +0100	[diff] [blame]	1588
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1589	if (!swapwrite)
				1590	current->flags &= ~PF_SWAPWRITE;
				1591
Yang Shi	5ac9588	2021-09-02 14:59:13 -0700	[diff] [blame]	1592	if (ret_succeeded)
				1593	*ret_succeeded = nr_succeeded;
				1594
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	1595	return rc;
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1596	}
				1597
Joonsoo Kim	19fc7be	2020-08-11 18:37:25 -0700	[diff] [blame]	1598	struct page alloc_migration_target(struct page page, unsigned long private)
Joonsoo Kim	b4b3822	2020-08-11 18:37:14 -0700	[diff] [blame]	1599	{
Joonsoo Kim	19fc7be	2020-08-11 18:37:25 -0700	[diff] [blame]	1600	struct migration_target_control *mtc;
				1601	gfp_t gfp_mask;
Joonsoo Kim	b4b3822	2020-08-11 18:37:14 -0700	[diff] [blame]	1602	unsigned int order = 0;
				1603	struct page *new_page = NULL;
Joonsoo Kim	19fc7be	2020-08-11 18:37:25 -0700	[diff] [blame]	1604	int nid;
				1605	int zidx;
				1606
				1607	mtc = (struct migration_target_control *)private;
				1608	gfp_mask = mtc->gfp_mask;
				1609	nid = mtc->nid;
				1610	if (nid == NUMA_NO_NODE)
				1611	nid = page_to_nid(page);
Joonsoo Kim	b4b3822	2020-08-11 18:37:14 -0700	[diff] [blame]	1612
Joonsoo Kim	d92bbc2	2020-08-11 18:37:17 -0700	[diff] [blame]	1613	if (PageHuge(page)) {
				1614	struct hstate *h = page_hstate(compound_head(page));
				1615
Joonsoo Kim	19fc7be	2020-08-11 18:37:25 -0700	[diff] [blame]	1616	gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
				1617	return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
Joonsoo Kim	d92bbc2	2020-08-11 18:37:17 -0700	[diff] [blame]	1618	}
Joonsoo Kim	b4b3822	2020-08-11 18:37:14 -0700	[diff] [blame]	1619
				1620	if (PageTransHuge(page)) {
Joonsoo Kim	9933a0c	2020-08-11 18:37:20 -0700	[diff] [blame]	1621	/*
				1622	* clear __GFP_RECLAIM to make the migration callback
				1623	* consistent with regular THP allocations.
				1624	*/
				1625	gfp_mask &= ~__GFP_RECLAIM;
Joonsoo Kim	b4b3822	2020-08-11 18:37:14 -0700	[diff] [blame]	1626	gfp_mask \|= GFP_TRANSHUGE;
				1627	order = HPAGE_PMD_ORDER;
				1628	}
Joonsoo Kim	19fc7be	2020-08-11 18:37:25 -0700	[diff] [blame]	1629	zidx = zone_idx(page_zone(page));
				1630	if (is_highmem_idx(zidx) \|\| zidx == ZONE_MOVABLE)
Joonsoo Kim	b4b3822	2020-08-11 18:37:14 -0700	[diff] [blame]	1631	gfp_mask \|= __GFP_HIGHMEM;
				1632
Matthew Wilcox (Oracle)	84172f4	2021-04-29 23:01:15 -0700	[diff] [blame]	1633	new_page = __alloc_pages(gfp_mask, order, nid, mtc->nmask);
Joonsoo Kim	b4b3822	2020-08-11 18:37:14 -0700	[diff] [blame]	1634
				1635	if (new_page && PageTransHuge(new_page))
				1636	prep_transhuge_page(new_page);
				1637
				1638	return new_page;
				1639	}
				1640
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1641	#ifdef CONFIG_NUMA
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1642
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1643	static int store_status(int __user *status, int start, int value, int nr)
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1644	{
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1645	while (nr-- > 0) {
				1646	if (put_user(value, status + start))
				1647	return -EFAULT;
				1648	start++;
				1649	}
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1650
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1651	return 0;
				1652	}
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1653
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1654	static int do_move_pages_to_node(struct mm_struct *mm,
				1655	struct list_head *pagelist, int node)
				1656	{
				1657	int err;
Joonsoo Kim	a097631	2020-08-11 18:37:28 -0700	[diff] [blame]	1658	struct migration_target_control mtc = {
				1659	.nid = node,
				1660	.gfp_mask = GFP_HIGHUSER_MOVABLE \| __GFP_THISNODE,
				1661	};
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1662
Joonsoo Kim	a097631	2020-08-11 18:37:28 -0700	[diff] [blame]	1663	err = migrate_pages(pagelist, alloc_migration_target, NULL,
Yang Shi	5ac9588	2021-09-02 14:59:13 -0700	[diff] [blame]	1664	(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1665	if (err)
				1666	putback_movable_pages(pagelist);
				1667	return err;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1668	}
				1669
				1670	/*
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1671	* Resolves the given address to a struct page, isolates it from the LRU and
				1672	* puts it to the given pagelist.
Yang Shi	e0153fc	2020-01-04 12:59:46 -0800	[diff] [blame]	1673	* Returns:
				1674	* errno - if the page cannot be found/isolated
				1675	* 0 - when it doesn't have to be migrated because it is already on the
				1676	* target node
				1677	* 1 - when it has been queued
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1678	*/
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1679	static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
				1680	int node, struct list_head *pagelist, bool migrate_all)
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1681	{
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1682	struct vm_area_struct *vma;
				1683	struct page *page;
				1684	unsigned int follflags;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1685	int err;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1686
Michel Lespinasse	d8ed45c	2020-06-08 21:33:25 -0700	[diff] [blame]	1687	mmap_read_lock(mm);
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1688	err = -EFAULT;
				1689	vma = find_vma(mm, addr);
				1690	if (!vma \|\| addr < vma->vm_start \|\| !vma_migratable(vma))
				1691	goto out;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1692
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1693	/* FOLL_DUMP to ignore special (like zero) pages */
				1694	follflags = FOLL_GET \| FOLL_DUMP;
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1695	page = follow_page(vma, addr, follflags);
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1696
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1697	err = PTR_ERR(page);
				1698	if (IS_ERR(page))
				1699	goto out;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1700
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1701	err = -ENOENT;
				1702	if (!page)
				1703	goto out;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1704
Brice Goglin	e78bbfa	2008-10-18 20:27:15 -0700	[diff] [blame]	1705	err = 0;
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1706	if (page_to_nid(page) == node)
				1707	goto out_putpage;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1708
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1709	err = -EACCES;
				1710	if (page_mapcount(page) > 1 && !migrate_all)
				1711	goto out_putpage;
				1712
				1713	if (PageHuge(page)) {
				1714	if (PageHead(page)) {
				1715	isolate_huge_page(page, pagelist);
Yang Shi	e0153fc	2020-01-04 12:59:46 -0800	[diff] [blame]	1716	err = 1;
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1717	}
				1718	} else {
				1719	struct page *head;
				1720
				1721	head = compound_head(page);
				1722	err = isolate_lru_page(head);
				1723	if (err)
				1724	goto out_putpage;
				1725
Yang Shi	e0153fc	2020-01-04 12:59:46 -0800	[diff] [blame]	1726	err = 1;
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1727	list_add_tail(&head->lru, pagelist);
				1728	mod_node_page_state(page_pgdat(head),
Huang Ying	9de4f22	2020-04-06 20:04:41 -0700	[diff] [blame]	1729	NR_ISOLATED_ANON + page_is_file_lru(head),
Matthew Wilcox (Oracle)	6c35784	2020-08-14 17:30:37 -0700	[diff] [blame]	1730	thp_nr_pages(head));
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1731	}
				1732	out_putpage:
				1733	/*
				1734	* Either remove the duplicate refcount from
				1735	* isolate_lru_page() or drop the page ref if it was
				1736	* not isolated.
				1737	*/
				1738	put_page(page);
				1739	out:
Michel Lespinasse	d8ed45c	2020-06-08 21:33:25 -0700	[diff] [blame]	1740	mmap_read_unlock(mm);
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1741	return err;
				1742	}
				1743
Wei Yang	7ca8783	2020-04-06 20:04:12 -0700	[diff] [blame]	1744	static int move_pages_and_store_status(struct mm_struct *mm, int node,
				1745	struct list_head pagelist, int __user status,
				1746	int start, int i, unsigned long nr_pages)
				1747	{
				1748	int err;
				1749
Wei Yang	5d7ae89	2020-04-06 20:04:15 -0700	[diff] [blame]	1750	if (list_empty(pagelist))
				1751	return 0;
				1752
Wei Yang	7ca8783	2020-04-06 20:04:12 -0700	[diff] [blame]	1753	err = do_move_pages_to_node(mm, pagelist, node);
				1754	if (err) {
				1755	/*
				1756	* Positive err means the number of failed
				1757	* pages to migrate. Since we are going to
				1758	* abort and return the number of non-migrated
Long Li	ab9dd4f	2020-12-14 19:12:52 -0800	[diff] [blame]	1759	* pages, so need to include the rest of the
Wei Yang	7ca8783	2020-04-06 20:04:12 -0700	[diff] [blame]	1760	* nr_pages that have not been attempted as
				1761	* well.
				1762	*/
				1763	if (err > 0)
				1764	err += nr_pages - i - 1;
				1765	return err;
				1766	}
				1767	return store_status(status, start, node, i - start);
				1768	}
				1769
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1770	/*
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1771	* Migrate an array of page address onto an array of nodes and fill
				1772	* the corresponding array of status.
				1773	*/
Christoph Lameter	3268c63	2012-03-21 16:34:06 -0700	[diff] [blame]	1774	static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1775	unsigned long nr_pages,
				1776	const void __user * __user *pages,
				1777	const int __user *nodes,
				1778	int __user *status, int flags)
				1779	{
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1780	int current_node = NUMA_NO_NODE;
				1781	LIST_HEAD(pagelist);
				1782	int start, i;
				1783	int err = 0, err1;
Brice Goglin	35282a2	2009-06-16 15:32:43 -0700	[diff] [blame]	1784
Minchan Kim	361a2a2	2021-05-04 18:36:57 -0700	[diff] [blame]	1785	lru_cache_disable();
Brice Goglin	35282a2	2009-06-16 15:32:43 -0700	[diff] [blame]	1786
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1787	for (i = start = 0; i < nr_pages; i++) {
				1788	const void __user *p;
				1789	unsigned long addr;
				1790	int node;
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1791
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1792	err = -EFAULT;
				1793	if (get_user(p, pages + i))
				1794	goto out_flush;
				1795	if (get_user(node, nodes + i))
				1796	goto out_flush;
Andrey Konovalov	057d3389	2019-09-25 16:48:30 -0700	[diff] [blame]	1797	addr = (unsigned long)untagged_addr(p);
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1798
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1799	err = -ENODEV;
				1800	if (node < 0 \|\| node >= MAX_NUMNODES)
				1801	goto out_flush;
				1802	if (!node_state(node, N_MEMORY))
				1803	goto out_flush;
Brice Goglin	3140a22	2009-01-06 14:38:57 -0800	[diff] [blame]	1804
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1805	err = -EACCES;
				1806	if (!node_isset(node, task_nodes))
				1807	goto out_flush;
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1808
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1809	if (current_node == NUMA_NO_NODE) {
				1810	current_node = node;
				1811	start = i;
				1812	} else if (node != current_node) {
Wei Yang	7ca8783	2020-04-06 20:04:12 -0700	[diff] [blame]	1813	err = move_pages_and_store_status(mm, current_node,
				1814	&pagelist, status, start, i, nr_pages);
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1815	if (err)
				1816	goto out;
				1817	start = i;
				1818	current_node = node;
Brice Goglin	3140a22	2009-01-06 14:38:57 -0800	[diff] [blame]	1819	}
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1820
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1821	/*
				1822	* Errors in the page lookup or isolation are not fatal and we simply
				1823	* report them via status
				1824	*/
				1825	err = add_page_for_migration(mm, addr, current_node,
				1826	&pagelist, flags & MPOL_MF_MOVE_ALL);
Yang Shi	e0153fc	2020-01-04 12:59:46 -0800	[diff] [blame]	1827
Wei Yang	d08221a	2020-04-06 20:04:18 -0700	[diff] [blame]	1828	if (err > 0) {
Yang Shi	e0153fc	2020-01-04 12:59:46 -0800	[diff] [blame]	1829	/* The page is successfully queued for migration */
				1830	continue;
				1831	}
Brice Goglin	3140a22	2009-01-06 14:38:57 -0800	[diff] [blame]	1832
Wei Yang	d08221a	2020-04-06 20:04:18 -0700	[diff] [blame]	1833	/*
				1834	* If the page is already on the target node (!err), store the
				1835	* node, otherwise, store the err.
				1836	*/
				1837	err = store_status(status, i, err ? : current_node, 1);
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1838	if (err)
				1839	goto out_flush;
Brice Goglin	3140a22	2009-01-06 14:38:57 -0800	[diff] [blame]	1840
Wei Yang	7ca8783	2020-04-06 20:04:12 -0700	[diff] [blame]	1841	err = move_pages_and_store_status(mm, current_node, &pagelist,
				1842	status, start, i, nr_pages);
Wei Yang	4afdace	2020-04-06 20:04:09 -0700	[diff] [blame]	1843	if (err)
				1844	goto out;
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1845	current_node = NUMA_NO_NODE;
Brice Goglin	3140a22	2009-01-06 14:38:57 -0800	[diff] [blame]	1846	}
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1847	out_flush:
				1848	/* Make sure we do not overwrite the existing error */
Wei Yang	7ca8783	2020-04-06 20:04:12 -0700	[diff] [blame]	1849	err1 = move_pages_and_store_status(mm, current_node, &pagelist,
				1850	status, start, i, nr_pages);
Wei Yang	dfe9aa2	2020-01-30 22:11:14 -0800	[diff] [blame]	1851	if (err >= 0)
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1852	err = err1;
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1853	out:
Minchan Kim	361a2a2	2021-05-04 18:36:57 -0700	[diff] [blame]	1854	lru_cache_enable();
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1855	return err;
				1856	}
				1857
				1858	/*
Brice Goglin	2f007e7	2008-10-18 20:27:16 -0700	[diff] [blame]	1859	* Determine the nodes of an array of pages and store it in an array of status.
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1860	*/
Brice Goglin	80bba12	2008-12-09 13:14:23 -0800	[diff] [blame]	1861	static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
				1862	const void __user *pages, int status)
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1863	{
Brice Goglin	2f007e7	2008-10-18 20:27:16 -0700	[diff] [blame]	1864	unsigned long i;
Brice Goglin	2f007e7	2008-10-18 20:27:16 -0700	[diff] [blame]	1865
Michel Lespinasse	d8ed45c	2020-06-08 21:33:25 -0700	[diff] [blame]	1866	mmap_read_lock(mm);
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1867
Brice Goglin	2f007e7	2008-10-18 20:27:16 -0700	[diff] [blame]	1868	for (i = 0; i < nr_pages; i++) {
Brice Goglin	80bba12	2008-12-09 13:14:23 -0800	[diff] [blame]	1869	unsigned long addr = (unsigned long)(*pages);
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1870	struct vm_area_struct *vma;
				1871	struct page *page;
KOSAKI Motohiro	c095adb	2008-12-16 16:06:43 +0900	[diff] [blame]	1872	int err = -EFAULT;
Brice Goglin	2f007e7	2008-10-18 20:27:16 -0700	[diff] [blame]	1873
Liam Howlett	059b8b4	2021-06-28 19:39:44 -0700	[diff] [blame]	1874	vma = vma_lookup(mm, addr);
				1875	if (!vma)
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1876	goto set_status;
				1877
Kirill A. Shutemov	d899844	2015-09-04 15:47:53 -0700	[diff] [blame]	1878	/* FOLL_DUMP to ignore special (like zero) pages */
				1879	page = follow_page(vma, addr, FOLL_DUMP);
Linus Torvalds	89f5b7d	2008-06-20 11:18:25 -0700	[diff] [blame]	1880
				1881	err = PTR_ERR(page);
				1882	if (IS_ERR(page))
				1883	goto set_status;
				1884
Kirill A. Shutemov	d899844	2015-09-04 15:47:53 -0700	[diff] [blame]	1885	err = page ? page_to_nid(page) : -ENOENT;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1886	set_status:
Brice Goglin	80bba12	2008-12-09 13:14:23 -0800	[diff] [blame]	1887	*status = err;
				1888
				1889	pages++;
				1890	status++;
				1891	}
				1892
Michel Lespinasse	d8ed45c	2020-06-08 21:33:25 -0700	[diff] [blame]	1893	mmap_read_unlock(mm);
Brice Goglin	80bba12	2008-12-09 13:14:23 -0800	[diff] [blame]	1894	}
				1895
Arnd Bergmann	5b1b561	2021-09-08 15:18:17 -0700	[diff] [blame]	1896	static int get_compat_pages_array(const void __user *chunk_pages[],
				1897	const void __user * __user *pages,
				1898	unsigned long chunk_nr)
				1899	{
				1900	compat_uptr_t __user pages32 = (compat_uptr_t __user )pages;
				1901	compat_uptr_t p;
				1902	int i;
				1903
				1904	for (i = 0; i < chunk_nr; i++) {
				1905	if (get_user(p, pages32 + i))
				1906	return -EFAULT;
				1907	chunk_pages[i] = compat_ptr(p);
				1908	}
				1909
				1910	return 0;
				1911	}
				1912
Brice Goglin	80bba12	2008-12-09 13:14:23 -0800	[diff] [blame]	1913	/*
				1914	* Determine the nodes of a user array of pages and store it in
				1915	* a user array of status.
				1916	*/
				1917	static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
				1918	const void __user * __user *pages,
				1919	int __user *status)
				1920	{
				1921	#define DO_PAGES_STAT_CHUNK_NR 16
				1922	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
				1923	int chunk_status[DO_PAGES_STAT_CHUNK_NR];
Brice Goglin	80bba12	2008-12-09 13:14:23 -0800	[diff] [blame]	1924
H. Peter Anvin	87b8d1a	2010-02-18 16:13:40 -0800	[diff] [blame]	1925	while (nr_pages) {
				1926	unsigned long chunk_nr;
Brice Goglin	80bba12	2008-12-09 13:14:23 -0800	[diff] [blame]	1927
H. Peter Anvin	87b8d1a	2010-02-18 16:13:40 -0800	[diff] [blame]	1928	chunk_nr = nr_pages;
				1929	if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
				1930	chunk_nr = DO_PAGES_STAT_CHUNK_NR;
				1931
Arnd Bergmann	5b1b561	2021-09-08 15:18:17 -0700	[diff] [blame]	1932	if (in_compat_syscall()) {
				1933	if (get_compat_pages_array(chunk_pages, pages,
				1934	chunk_nr))
				1935	break;
				1936	} else {
				1937	if (copy_from_user(chunk_pages, pages,
				1938	chunk_nr * sizeof(*chunk_pages)))
				1939	break;
				1940	}
Brice Goglin	80bba12	2008-12-09 13:14:23 -0800	[diff] [blame]	1941
				1942	do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
				1943
H. Peter Anvin	87b8d1a	2010-02-18 16:13:40 -0800	[diff] [blame]	1944	if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
				1945	break;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1946
H. Peter Anvin	87b8d1a	2010-02-18 16:13:40 -0800	[diff] [blame]	1947	pages += chunk_nr;
				1948	status += chunk_nr;
				1949	nr_pages -= chunk_nr;
				1950	}
				1951	return nr_pages ? -EFAULT : 0;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1952	}
				1953
Miaohe Lin	4dc200c	2020-10-17 16:14:03 -0700	[diff] [blame]	1954	static struct mm_struct find_mm_struct(pid_t pid, nodemask_t mem_nodes)
				1955	{
				1956	struct task_struct *task;
				1957	struct mm_struct *mm;
				1958
				1959	/*
				1960	* There is no need to check if current process has the right to modify
				1961	* the specified process when they are same.
				1962	*/
				1963	if (!pid) {
				1964	mmget(current->mm);
				1965	*mem_nodes = cpuset_mems_allowed(current);
				1966	return current->mm;
				1967	}
				1968
				1969	/* Find the mm_struct */
				1970	rcu_read_lock();
				1971	task = find_task_by_vpid(pid);
				1972	if (!task) {
				1973	rcu_read_unlock();
				1974	return ERR_PTR(-ESRCH);
				1975	}
				1976	get_task_struct(task);
				1977
				1978	/*
				1979	* Check if this process has the right to modify the specified
				1980	* process. Use the regular "ptrace_may_access()" checks.
				1981	*/
				1982	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
				1983	rcu_read_unlock();
				1984	mm = ERR_PTR(-EPERM);
				1985	goto out;
				1986	}
				1987	rcu_read_unlock();
				1988
				1989	mm = ERR_PTR(security_task_movememory(task));
				1990	if (IS_ERR(mm))
				1991	goto out;
				1992	*mem_nodes = cpuset_mems_allowed(task);
				1993	mm = get_task_mm(task);
				1994	out:
				1995	put_task_struct(task);
				1996	if (!mm)
				1997	mm = ERR_PTR(-EINVAL);
				1998	return mm;
				1999	}
				2000
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	2001	/*
				2002	* Move a list of pages in the address space of the currently executing
				2003	* process.
				2004	*/
Dominik Brodowski	7addf44	2018-03-17 16:08:03 +0100	[diff] [blame]	2005	static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
				2006	const void __user * __user *pages,
				2007	const int __user *nodes,
				2008	int __user *status, int flags)
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	2009	{
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	2010	struct mm_struct *mm;
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	2011	int err;
Christoph Lameter	3268c63	2012-03-21 16:34:06 -0700	[diff] [blame]	2012	nodemask_t task_nodes;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	2013
				2014	/* Check flags */
				2015	if (flags & ~(MPOL_MF_MOVE\|MPOL_MF_MOVE_ALL))
				2016	return -EINVAL;
				2017
				2018	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
				2019	return -EPERM;
				2020
Miaohe Lin	4dc200c	2020-10-17 16:14:03 -0700	[diff] [blame]	2021	mm = find_mm_struct(pid, &task_nodes);
				2022	if (IS_ERR(mm))
				2023	return PTR_ERR(mm);
Sasha Levin	6e8b09e	2012-04-25 16:01:53 -0700	[diff] [blame]	2024
				2025	if (nodes)
				2026	err = do_pages_move(mm, task_nodes, nr_pages, pages,
				2027	nodes, status, flags);
				2028	else
				2029	err = do_pages_stat(mm, nr_pages, pages, status);
Christoph Lameter	3268c63	2012-03-21 16:34:06 -0700	[diff] [blame]	2030
				2031	mmput(mm);
				2032	return err;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	2033	}
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	2034
Dominik Brodowski	7addf44	2018-03-17 16:08:03 +0100	[diff] [blame]	2035	SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
				2036	const void __user * __user *, pages,
				2037	const int __user *, nodes,
				2038	int __user *, status, int, flags)
				2039	{
				2040	return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
				2041	}
				2042
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2043	#ifdef CONFIG_NUMA_BALANCING
				2044	/*
				2045	* Returns true if this is a safe migration target node for misplaced NUMA
				2046	* pages. Currently it only checks the watermarks which crude
				2047	*/
				2048	static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
Mel Gorman	3abef4e	2013-02-22 16:34:27 -0800	[diff] [blame]	2049	unsigned long nr_migrate_pages)
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2050	{
				2051	int z;
Mel Gorman	599d0c9	2016-07-28 15:45:31 -0700	[diff] [blame]	2052
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2053	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
				2054	struct zone *zone = pgdat->node_zones + z;
				2055
				2056	if (!populated_zone(zone))
				2057	continue;
				2058
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2059	/* Avoid waking kswapd by allocating pages_to_migrate pages. */
				2060	if (!zone_watermark_ok(zone, 0,
				2061	high_wmark_pages(zone) +
				2062	nr_migrate_pages,
Huang Ying	bfe9d00	2019-11-30 17:57:28 -0800	[diff] [blame]	2063	ZONE_MOVABLE, 0))
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2064	continue;
				2065	return true;
				2066	}
				2067	return false;
				2068	}
				2069
				2070	static struct page alloc_misplaced_dst_page(struct page page,
Michal Hocko	666feb2	2018-04-10 16:30:03 -0700	[diff] [blame]	2071	unsigned long data)
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2072	{
				2073	int nid = (int) data;
				2074	struct page *newpage;
				2075
Vlastimil Babka	96db800	2015-09-08 15:03:50 -0700	[diff] [blame]	2076	newpage = __alloc_pages_node(nid,
Johannes Weiner	e97ca8e5	2014-03-10 15:49:43 -0700	[diff] [blame]	2077	(GFP_HIGHUSER_MOVABLE \|
				2078	__GFP_THISNODE \| __GFP_NOMEMALLOC \|
				2079	__GFP_NORETRY \| __GFP_NOWARN) &
Mel Gorman	8479eba	2016-02-26 15:19:31 -0800	[diff] [blame]	2080	~__GFP_RECLAIM, 0);
Hillf Danton	bac0382	2012-11-27 14:46:24 +0000	[diff] [blame]	2081
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2082	return newpage;
				2083	}
				2084
Yang Shi	c5b5a3d	2021-06-30 18:51:42 -0700	[diff] [blame]	2085	static struct page alloc_misplaced_dst_page_thp(struct page page,
				2086	unsigned long data)
				2087	{
				2088	int nid = (int) data;
				2089	struct page *newpage;
				2090
				2091	newpage = alloc_pages_node(nid, (GFP_TRANSHUGE_LIGHT \| __GFP_THISNODE),
				2092	HPAGE_PMD_ORDER);
				2093	if (!newpage)
				2094	goto out;
				2095
				2096	prep_transhuge_page(newpage);
				2097
				2098	out:
				2099	return newpage;
				2100	}
				2101
Mel Gorman	1c30e01	2014-01-21 15:50:58 -0800	[diff] [blame]	2102	static int numamigrate_isolate_page(pg_data_t pgdat, struct page page)
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2103	{
Hugh Dickins	340ef39	2013-02-22 16:34:33 -0800	[diff] [blame]	2104	int page_lru;
Baolin Wang	2b9b624	2021-09-08 15:18:01 -0700	[diff] [blame]	2105	int nr_pages = thp_nr_pages(page);
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2106
Sasha Levin	309381fea	2014-01-23 15:52:54 -0800	[diff] [blame]	2107	VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
Mel Gorman	3abef4e	2013-02-22 16:34:27 -0800	[diff] [blame]	2108
Yang Shi	662aeea	2021-06-30 18:51:51 -0700	[diff] [blame]	2109	/* Do not migrate THP mapped by multiple processes */
				2110	if (PageTransHuge(page) && total_mapcount(page) > 1)
				2111	return 0;
				2112
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2113	/* Avoid migrating to a node that is nearly full */
Baolin Wang	2b9b624	2021-09-08 15:18:01 -0700	[diff] [blame]	2114	if (!migrate_balanced_pgdat(pgdat, nr_pages))
Hugh Dickins	340ef39	2013-02-22 16:34:33 -0800	[diff] [blame]	2115	return 0;
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2116
Hugh Dickins	340ef39	2013-02-22 16:34:33 -0800	[diff] [blame]	2117	if (isolate_lru_page(page))
				2118	return 0;
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2119
Huang Ying	9de4f22	2020-04-06 20:04:41 -0700	[diff] [blame]	2120	page_lru = page_is_file_lru(page);
Mel Gorman	599d0c9	2016-07-28 15:45:31 -0700	[diff] [blame]	2121	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
Baolin Wang	2b9b624	2021-09-08 15:18:01 -0700	[diff] [blame]	2122	nr_pages);
Hugh Dickins	340ef39	2013-02-22 16:34:33 -0800	[diff] [blame]	2123
				2124	/*
				2125	* Isolating the page has taken another reference, so the
				2126	* caller's reference can be safely dropped without the page
				2127	* disappearing underneath us during migration.
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2128	*/
				2129	put_page(page);
Hugh Dickins	340ef39	2013-02-22 16:34:33 -0800	[diff] [blame]	2130	return 1;
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2131	}
				2132
Mel Gorman	a8f6077	2012-11-14 21:41:46 +0000	[diff] [blame]	2133	/*
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2134	* Attempt to migrate a misplaced page to the specified destination
				2135	* node. Caller is expected to have an elevated reference count on
				2136	* the page that will be dropped by this function before returning.
				2137	*/
Mel Gorman	1bc115d	2013-10-07 11:29:05 +0100	[diff] [blame]	2138	int migrate_misplaced_page(struct page page, struct vm_area_struct vma,
				2139	int node)
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2140	{
Mel Gorman	a8f6077	2012-11-14 21:41:46 +0000	[diff] [blame]	2141	pg_data_t *pgdat = NODE_DATA(node);
Hugh Dickins	340ef39	2013-02-22 16:34:33 -0800	[diff] [blame]	2142	int isolated;
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2143	int nr_remaining;
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2144	LIST_HEAD(migratepages);
Yang Shi	c5b5a3d	2021-06-30 18:51:42 -0700	[diff] [blame]	2145	new_page_t *new;
				2146	bool compound;
Aneesh Kumar K.V	b5916c0	2021-07-29 14:53:47 -0700	[diff] [blame]	2147	int nr_pages = thp_nr_pages(page);
Yang Shi	c5b5a3d	2021-06-30 18:51:42 -0700	[diff] [blame]	2148
				2149	/*
				2150	* PTE mapped THP or HugeTLB page can't reach here so the page could
				2151	* be either base page or THP. And it must be head page if it is
				2152	* THP.
				2153	*/
				2154	compound = PageTransHuge(page);
				2155
				2156	if (compound)
				2157	new = alloc_misplaced_dst_page_thp;
				2158	else
				2159	new = alloc_misplaced_dst_page;
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2160
				2161	/*
Mel Gorman	1bc115d	2013-10-07 11:29:05 +0100	[diff] [blame]	2162	* Don't migrate file pages that are mapped in multiple processes
				2163	* with execute permissions as they are probably shared libraries.
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2164	*/
Miaohe Lin	7ee820e	2021-05-04 18:37:16 -0700	[diff] [blame]	2165	if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
				2166	(vma->vm_flags & VM_EXEC))
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2167	goto out;
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2168
Mel Gorman	a8f6077	2012-11-14 21:41:46 +0000	[diff] [blame]	2169	/*
Mel Gorman	09a913a	2018-04-10 16:29:20 -0700	[diff] [blame]	2170	* Also do not migrate dirty pages as not all filesystems can move
				2171	* dirty pages in MIGRATE_ASYNC mode which is a waste of cycles.
				2172	*/
Huang Ying	9de4f22	2020-04-06 20:04:41 -0700	[diff] [blame]	2173	if (page_is_file_lru(page) && PageDirty(page))
Mel Gorman	09a913a	2018-04-10 16:29:20 -0700	[diff] [blame]	2174	goto out;
				2175
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2176	isolated = numamigrate_isolate_page(pgdat, page);
				2177	if (!isolated)
				2178	goto out;
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2179
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2180	list_add(&page->lru, &migratepages);
Yang Shi	c5b5a3d	2021-06-30 18:51:42 -0700	[diff] [blame]	2181	nr_remaining = migrate_pages(&migratepages, *new, NULL, node,
Yang Shi	5ac9588	2021-09-02 14:59:13 -0700	[diff] [blame]	2182	MIGRATE_ASYNC, MR_NUMA_MISPLACED, NULL);
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2183	if (nr_remaining) {
Joonsoo Kim	59c82b7	2014-01-21 15:51:17 -0800	[diff] [blame]	2184	if (!list_empty(&migratepages)) {
				2185	list_del(&page->lru);
Yang Shi	c5fc5c3	2021-06-30 18:51:45 -0700	[diff] [blame]	2186	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
				2187	page_is_file_lru(page), -nr_pages);
Joonsoo Kim	59c82b7	2014-01-21 15:51:17 -0800	[diff] [blame]	2188	putback_lru_page(page);
				2189	}
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2190	isolated = 0;
				2191	} else
Yang Shi	c5fc5c3	2021-06-30 18:51:45 -0700	[diff] [blame]	2192	count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_pages);
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2193	BUG_ON(!list_empty(&migratepages));
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2194	return isolated;
Hugh Dickins	340ef39	2013-02-22 16:34:33 -0800	[diff] [blame]	2195
				2196	out:
				2197	put_page(page);
				2198	return 0;
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2199	}
Mel Gorman	220018d	2012-12-05 09:32:56 +0000	[diff] [blame]	2200	#endif /* CONFIG_NUMA_BALANCING */
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2201	#endif /* CONFIG_NUMA */
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2202
Christoph Hellwig	9b2ed9c	2019-08-14 09:59:28 +0200	[diff] [blame]	2203	#ifdef CONFIG_DEVICE_PRIVATE
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2204	static int migrate_vma_collect_skip(unsigned long start,
				2205	unsigned long end,
				2206	struct mm_walk *walk)
				2207	{
				2208	struct migrate_vma *migrate = walk->private;
				2209	unsigned long addr;
				2210
Ralph Campbell	872ea70	2020-01-30 22:14:38 -0800	[diff] [blame]	2211	for (addr = start; addr < end; addr += PAGE_SIZE) {
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2212	migrate->dst[migrate->npages] = 0;
				2213	migrate->src[migrate->npages++] = 0;
				2214	}
				2215
				2216	return 0;
				2217	}
				2218
Miaohe Lin	843e1be	2021-05-04 18:37:13 -0700	[diff] [blame]	2219	static int migrate_vma_collect_hole(unsigned long start,
				2220	unsigned long end,
				2221	__always_unused int depth,
				2222	struct mm_walk *walk)
				2223	{
				2224	struct migrate_vma *migrate = walk->private;
				2225	unsigned long addr;
				2226
				2227	/* Only allow populating anonymous memory. */
				2228	if (!vma_is_anonymous(walk->vma))
				2229	return migrate_vma_collect_skip(start, end, walk);
				2230
				2231	for (addr = start; addr < end; addr += PAGE_SIZE) {
				2232	migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
				2233	migrate->dst[migrate->npages] = 0;
				2234	migrate->npages++;
				2235	migrate->cpages++;
				2236	}
				2237
				2238	return 0;
				2239	}
				2240
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2241	static int migrate_vma_collect_pmd(pmd_t *pmdp,
				2242	unsigned long start,
				2243	unsigned long end,
				2244	struct mm_walk *walk)
				2245	{
				2246	struct migrate_vma *migrate = walk->private;
				2247	struct vm_area_struct *vma = walk->vma;
				2248	struct mm_struct *mm = vma->vm_mm;
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2249	unsigned long addr = start, unmapped = 0;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2250	spinlock_t *ptl;
				2251	pte_t *ptep;
				2252
				2253	again:
				2254	if (pmd_none(*pmdp))
Steven Price	b7a16c7	2020-02-03 17:36:03 -0800	[diff] [blame]	2255	return migrate_vma_collect_hole(start, end, -1, walk);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2256
				2257	if (pmd_trans_huge(*pmdp)) {
				2258	struct page *page;
				2259
				2260	ptl = pmd_lock(mm, pmdp);
				2261	if (unlikely(!pmd_trans_huge(*pmdp))) {
				2262	spin_unlock(ptl);
				2263	goto again;
				2264	}
				2265
				2266	page = pmd_page(*pmdp);
				2267	if (is_huge_zero_page(page)) {
				2268	spin_unlock(ptl);
				2269	split_huge_pmd(vma, pmdp, addr);
				2270	if (pmd_trans_unstable(pmdp))
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2271	return migrate_vma_collect_skip(start, end,
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2272	walk);
				2273	} else {
				2274	int ret;
				2275
				2276	get_page(page);
				2277	spin_unlock(ptl);
				2278	if (unlikely(!trylock_page(page)))
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2279	return migrate_vma_collect_skip(start, end,
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2280	walk);
				2281	ret = split_huge_page(page);
				2282	unlock_page(page);
				2283	put_page(page);
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2284	if (ret)
				2285	return migrate_vma_collect_skip(start, end,
				2286	walk);
				2287	if (pmd_none(*pmdp))
Steven Price	b7a16c7	2020-02-03 17:36:03 -0800	[diff] [blame]	2288	return migrate_vma_collect_hole(start, end, -1,
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2289	walk);
				2290	}
				2291	}
				2292
				2293	if (unlikely(pmd_bad(*pmdp)))
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2294	return migrate_vma_collect_skip(start, end, walk);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2295
				2296	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2297	arch_enter_lazy_mmu_mode();
				2298
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2299	for (; addr < end; addr += PAGE_SIZE, ptep++) {
Christoph Hellwig	800bb1c	2020-03-16 20:32:14 +0100	[diff] [blame]	2300	unsigned long mpfn = 0, pfn;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2301	struct page *page;
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2302	swp_entry_t entry;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2303	pte_t pte;
				2304
				2305	pte = *ptep;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2306
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2307	if (pte_none(pte)) {
Ralph Campbell	0744f28	2020-08-11 18:31:41 -0700	[diff] [blame]	2308	if (vma_is_anonymous(vma)) {
				2309	mpfn = MIGRATE_PFN_MIGRATE;
				2310	migrate->cpages++;
				2311	}
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2312	goto next;
				2313	}
				2314
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2315	if (!pte_present(pte)) {
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2316	/*
				2317	* Only care about unaddressable device page special
				2318	* page table entry. Other special swap entries are not
				2319	* migratable, and we ignore regular swapped page.
				2320	*/
				2321	entry = pte_to_swp_entry(pte);
				2322	if (!is_device_private_entry(entry))
				2323	goto next;
				2324
Alistair Popple	af5cdaf	2021-06-30 18:54:06 -0700	[diff] [blame]	2325	page = pfn_swap_entry_to_page(entry);
Ralph Campbell	5143192	2020-07-23 15:30:00 -0700	[diff] [blame]	2326	if (!(migrate->flags &
				2327	MIGRATE_VMA_SELECT_DEVICE_PRIVATE) \|\|
				2328	page->pgmap->owner != migrate->pgmap_owner)
Christoph Hellwig	800bb1c	2020-03-16 20:32:14 +0100	[diff] [blame]	2329	goto next;
				2330
Christoph Hellwig	06d462b	2019-08-14 09:59:27 +0200	[diff] [blame]	2331	mpfn = migrate_pfn(page_to_pfn(page)) \|
				2332	MIGRATE_PFN_MIGRATE;
Alistair Popple	4dd845b	2021-06-30 18:54:09 -0700	[diff] [blame]	2333	if (is_writable_device_private_entry(entry))
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2334	mpfn \|= MIGRATE_PFN_WRITE;
				2335	} else {
Ralph Campbell	5143192	2020-07-23 15:30:00 -0700	[diff] [blame]	2336	if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
Christoph Hellwig	800bb1c	2020-03-16 20:32:14 +0100	[diff] [blame]	2337	goto next;
Pingfan Liu	276f756	2019-09-23 15:37:38 -0700	[diff] [blame]	2338	pfn = pte_pfn(pte);
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2339	if (is_zero_pfn(pfn)) {
				2340	mpfn = MIGRATE_PFN_MIGRATE;
				2341	migrate->cpages++;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2342	goto next;
				2343	}
Christoph Hellwig	25b2995	2019-06-13 22:50:49 +0200	[diff] [blame]	2344	page = vm_normal_page(migrate->vma, addr, pte);
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2345	mpfn = migrate_pfn(pfn) \| MIGRATE_PFN_MIGRATE;
				2346	mpfn \|= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
				2347	}
				2348
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2349	/* FIXME support THP */
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2350	if (!page \|\| !page->mapping \|\| PageTransCompound(page)) {
Pingfan Liu	276f756	2019-09-23 15:37:38 -0700	[diff] [blame]	2351	mpfn = 0;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2352	goto next;
				2353	}
				2354
				2355	/*
				2356	* By getting a reference on the page we pin it and that blocks
				2357	* any kind of migration. Side effect is that it "freezes" the
				2358	* pte.
				2359	*
				2360	* We drop this reference after isolating the page from the lru
				2361	* for non device page (device page are not on the lru and thus
				2362	* can't be dropped from it).
				2363	*/
				2364	get_page(page);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2365
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2366	/*
				2367	* Optimize for the common case where page is only mapped once
				2368	* in one process. If we can lock the page, then we can safely
				2369	* set up a special migration page table entry now.
				2370	*/
				2371	if (trylock_page(page)) {
				2372	pte_t swp_pte;
				2373
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2374	migrate->cpages++;
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2375	ptep_get_and_clear(mm, addr, ptep);
				2376
				2377	/* Setup special migration page table entry */
Alistair Popple	4dd845b	2021-06-30 18:54:09 -0700	[diff] [blame]	2378	if (mpfn & MIGRATE_PFN_WRITE)
				2379	entry = make_writable_migration_entry(
				2380	page_to_pfn(page));
				2381	else
				2382	entry = make_readable_migration_entry(
				2383	page_to_pfn(page));
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2384	swp_pte = swp_entry_to_pte(entry);
Alistair Popple	ad7df76	2020-09-04 16:36:01 -0700	[diff] [blame]	2385	if (pte_present(pte)) {
				2386	if (pte_soft_dirty(pte))
				2387	swp_pte = pte_swp_mksoft_dirty(swp_pte);
				2388	if (pte_uffd_wp(pte))
				2389	swp_pte = pte_swp_mkuffd_wp(swp_pte);
				2390	} else {
				2391	if (pte_swp_soft_dirty(pte))
				2392	swp_pte = pte_swp_mksoft_dirty(swp_pte);
				2393	if (pte_swp_uffd_wp(pte))
				2394	swp_pte = pte_swp_mkuffd_wp(swp_pte);
				2395	}
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2396	set_pte_at(mm, addr, ptep, swp_pte);
				2397
				2398	/*
				2399	* This is like regular unmap: we remove the rmap and
				2400	* drop page refcount. Page won't be freed, as we took
				2401	* a reference just above.
				2402	*/
				2403	page_remove_rmap(page, false);
				2404	put_page(page);
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2405
				2406	if (pte_present(pte))
				2407	unmapped++;
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2408	} else {
				2409	put_page(page);
				2410	mpfn = 0;
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2411	}
				2412
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2413	next:
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2414	migrate->dst[migrate->npages] = 0;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2415	migrate->src[migrate->npages++] = mpfn;
				2416	}
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2417	arch_leave_lazy_mmu_mode();
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2418	pte_unmap_unlock(ptep - 1, ptl);
				2419
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2420	/* Only flush the TLB if we actually modified any entries */
				2421	if (unmapped)
				2422	flush_tlb_range(walk->vma, start, end);
				2423
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2424	return 0;
				2425	}
				2426
Christoph Hellwig	7b86ac3	2019-08-28 16:19:54 +0200	[diff] [blame]	2427	static const struct mm_walk_ops migrate_vma_walk_ops = {
				2428	.pmd_entry = migrate_vma_collect_pmd,
				2429	.pte_hole = migrate_vma_collect_hole,
				2430	};
				2431
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2432	/*
				2433	* migrate_vma_collect() - collect pages over a range of virtual addresses
				2434	* @migrate: migrate struct containing all migration information
				2435	*
				2436	* This will walk the CPU page table. For each virtual address backed by a
				2437	* valid page, it updates the src array and takes a reference on the page, in
				2438	* order to pin the page until we lock it and unmap it.
				2439	*/
				2440	static void migrate_vma_collect(struct migrate_vma *migrate)
				2441	{
Jérôme Glisse	ac46d4f	2018-12-28 00:38:09 -0800	[diff] [blame]	2442	struct mmu_notifier_range range;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2443
Ralph Campbell	998427b	2020-07-23 15:30:01 -0700	[diff] [blame]	2444	/*
				2445	* Note that the pgmap_owner is passed to the mmu notifier callback so
				2446	* that the registered device driver can skip invalidating device
				2447	* private page mappings that won't be migrated.
				2448	*/
Alistair Popple	6b49bf6	2021-06-30 18:54:19 -0700	[diff] [blame]	2449	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0,
				2450	migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end,
Ralph Campbell	c1a06df	2020-08-06 23:17:09 -0700	[diff] [blame]	2451	migrate->pgmap_owner);
Jérôme Glisse	ac46d4f	2018-12-28 00:38:09 -0800	[diff] [blame]	2452	mmu_notifier_invalidate_range_start(&range);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2453
Christoph Hellwig	7b86ac3	2019-08-28 16:19:54 +0200	[diff] [blame]	2454	walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
				2455	&migrate_vma_walk_ops, migrate);
				2456
				2457	mmu_notifier_invalidate_range_end(&range);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2458	migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
				2459	}
				2460
				2461	/*
				2462	* migrate_vma_check_page() - check if page is pinned or not
				2463	* @page: struct page to check
				2464	*
				2465	* Pinned pages cannot be migrated. This is the same test as in
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	2466	* folio_migrate_mapping(), except that here we allow migration of a
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2467	* ZONE_DEVICE page.
				2468	*/
				2469	static bool migrate_vma_check_page(struct page *page)
				2470	{
				2471	/*
				2472	* One extra ref because caller holds an extra reference, either from
				2473	* isolate_lru_page() for a regular page, or migrate_vma_collect() for
				2474	* a device page.
				2475	*/
				2476	int extra = 1;
				2477
				2478	/*
				2479	* FIXME support THP (transparent huge page), it is bit more complex to
				2480	* check them than regular pages, because they can be mapped with a pmd
				2481	* or with a pte (split pte mapping).
				2482	*/
				2483	if (PageCompound(page))
				2484	return false;
				2485
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2486	/* Page from ZONE_DEVICE have one extra reference */
				2487	if (is_zone_device_page(page)) {
				2488	/*
				2489	* Private page can never be pin as they have no valid pte and
				2490	* GUP will fail for those. Yet if there is a pending migration
				2491	* a thread might try to wait on the pte migration entry and
				2492	* will bump the page reference count. Sadly there is no way to
				2493	* differentiate a regular pin from migration wait. Hence to
				2494	* avoid 2 racing thread trying to migrate back to CPU to enter
Haitao Shi	8958b24	2020-12-15 20:47:26 -0800	[diff] [blame]	2495	* infinite loop (one stopping migration because the other is
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2496	* waiting on pte migration entry). We always return true here.
				2497	*
				2498	* FIXME proper solution is to rework migration_entry_wait() so
				2499	* it does not need to take a reference on page.
				2500	*/
Christoph Hellwig	25b2995	2019-06-13 22:50:49 +0200	[diff] [blame]	2501	return is_device_private_page(page);
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2502	}
				2503
Jérôme Glisse	df6ad69	2017-09-08 16:12:24 -0700	[diff] [blame]	2504	/* For file back page */
				2505	if (page_mapping(page))
				2506	extra += 1 + page_has_private(page);
				2507
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2508	if ((page_count(page) - extra) > page_mapcount(page))
				2509	return false;
				2510
				2511	return true;
				2512	}
				2513
				2514	/*
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2515	* migrate_vma_unmap() - replace page mapping with special migration pte entry
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2516	* @migrate: migrate struct containing all migration information
				2517	*
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2518	* Isolate pages from the LRU and replace mappings (CPU page table pte) with a
				2519	* special migration pte entry and check if it has been pinned. Pinned pages are
				2520	* restored because we cannot migrate them.
				2521	*
				2522	* This is the last step before we call the device driver callback to allocate
				2523	* destination memory and copy contents of original page over to new page.
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2524	*/
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2525	static void migrate_vma_unmap(struct migrate_vma *migrate)
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2526	{
				2527	const unsigned long npages = migrate->npages;
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2528	const unsigned long start = migrate->start;
				2529	unsigned long addr, i, restore = 0;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2530	bool allow_drain = true;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2531
				2532	lru_add_drain();
				2533
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2534	for (i = 0; i < npages; i++) {
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2535	struct page *page = migrate_pfn_to_page(migrate->src[i]);
				2536
				2537	if (!page)
				2538	continue;
				2539
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2540	/* ZONE_DEVICE pages are not on LRU */
				2541	if (!is_zone_device_page(page)) {
				2542	if (!PageLRU(page) && allow_drain) {
				2543	/* Drain CPU's pagevec */
				2544	lru_add_drain_all();
				2545	allow_drain = false;
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2546	}
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2547
				2548	if (isolate_lru_page(page)) {
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2549	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				2550	migrate->cpages--;
				2551	restore++;
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2552	continue;
				2553	}
				2554
				2555	/* Drop the reference we took in collect */
				2556	put_page(page);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2557	}
				2558
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2559	if (page_mapped(page))
Alistair Popple	a98a2f0	2021-06-30 18:54:16 -0700	[diff] [blame]	2560	try_to_migrate(page, 0);
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2561
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2562	if (page_mapped(page) \|\| !migrate_vma_check_page(page)) {
				2563	if (!is_zone_device_page(page)) {
				2564	get_page(page);
				2565	putback_lru_page(page);
				2566	}
				2567
				2568	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				2569	migrate->cpages--;
				2570	restore++;
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2571	continue;
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2572	}
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2573	}
				2574
				2575	for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
				2576	struct page *page = migrate_pfn_to_page(migrate->src[i]);
				2577
				2578	if (!page \|\| (migrate->src[i] & MIGRATE_PFN_MIGRATE))
				2579	continue;
				2580
				2581	remove_migration_ptes(page, page, false);
				2582
				2583	migrate->src[i] = 0;
				2584	unlock_page(page);
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2585	put_page(page);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2586	restore--;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2587	}
				2588	}
				2589
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2590	/**
				2591	* migrate_vma_setup() - prepare to migrate a range of memory
Randy Dunlap	eaf444d	2020-08-11 18:33:08 -0700	[diff] [blame]	2592	* @args: contains the vma, start, and pfns arrays for the migration
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2593	*
				2594	* Returns: negative errno on failures, 0 when 0 or more pages were migrated
				2595	* without an error.
				2596	*
				2597	* Prepare to migrate a range of memory virtual address range by collecting all
				2598	* the pages backing each virtual address in the range, saving them inside the
				2599	* src array. Then lock those pages and unmap them. Once the pages are locked
				2600	* and unmapped, check whether each page is pinned or not. Pages that aren't
				2601	* pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
				2602	* corresponding src array entry. Then restores any pages that are pinned, by
				2603	* remapping and unlocking those pages.
				2604	*
				2605	* The caller should then allocate destination memory and copy source memory to
				2606	* it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
				2607	* flag set). Once these are allocated and copied, the caller must update each
				2608	* corresponding entry in the dst array with the pfn value of the destination
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2609	* page and with MIGRATE_PFN_VALID. Destination pages must be locked via
				2610	* lock_page().
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2611	*
				2612	* Note that the caller does not have to migrate all the pages that are marked
				2613	* with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
				2614	* device memory to system memory. If the caller cannot migrate a device page
				2615	* back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
				2616	* consequences for the userspace process, so it must be avoided if at all
				2617	* possible.
				2618	*
				2619	* For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
				2620	* do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
Ingo Molnar	f0953a1	2021-05-06 18:06:47 -0700	[diff] [blame]	2621	* allowing the caller to allocate device memory for those unbacked virtual
				2622	* addresses. For this the caller simply has to allocate device memory and
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2623	* properly set the destination entry like for regular migration. Note that
Ingo Molnar	f0953a1	2021-05-06 18:06:47 -0700	[diff] [blame]	2624	* this can still fail, and thus inside the device driver you must check if the
				2625	* migration was successful for those entries after calling migrate_vma_pages(),
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2626	* just like for regular migration.
				2627	*
				2628	* After that, the callers must call migrate_vma_pages() to go over each entry
				2629	* in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
				2630	* set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
				2631	* then migrate_vma_pages() to migrate struct page information from the source
				2632	* struct page to the destination struct page. If it fails to migrate the
				2633	* struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the
				2634	* src array.
				2635	*
				2636	* At this point all successfully migrated pages have an entry in the src
				2637	* array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
				2638	* array entry with MIGRATE_PFN_VALID flag set.
				2639	*
				2640	* Once migrate_vma_pages() returns the caller may inspect which pages were
				2641	* successfully migrated, and which were not. Successfully migrated pages will
				2642	* have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
				2643	*
				2644	* It is safe to update device page table after migrate_vma_pages() because
Michel Lespinasse	c1e8d7c	2020-06-08 21:33:54 -0700	[diff] [blame]	2645	* both destination and source page are still locked, and the mmap_lock is held
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2646	* in read mode (hence no one can unmap the range being migrated).
				2647	*
				2648	* Once the caller is done cleaning up things and updating its page table (if it
				2649	* chose to do so, this is not an obligation) it finally calls
				2650	* migrate_vma_finalize() to update the CPU page table to point to new pages
				2651	* for successfully migrated pages or otherwise restore the CPU page table to
				2652	* point to the original source pages.
				2653	*/
				2654	int migrate_vma_setup(struct migrate_vma *args)
				2655	{
				2656	long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
				2657
				2658	args->start &= PAGE_MASK;
				2659	args->end &= PAGE_MASK;
				2660	if (!args->vma \|\| is_vm_hugetlb_page(args->vma) \|\|
				2661	(args->vma->vm_flags & VM_SPECIAL) \|\| vma_is_dax(args->vma))
				2662	return -EINVAL;
				2663	if (nr_pages <= 0)
				2664	return -EINVAL;
				2665	if (args->start < args->vma->vm_start \|\|
				2666	args->start >= args->vma->vm_end)
				2667	return -EINVAL;
				2668	if (args->end <= args->vma->vm_start \|\| args->end > args->vma->vm_end)
				2669	return -EINVAL;
				2670	if (!args->src \|\| !args->dst)
				2671	return -EINVAL;
				2672
				2673	memset(args->src, 0, sizeof(args->src) nr_pages);
				2674	args->cpages = 0;
				2675	args->npages = 0;
				2676
				2677	migrate_vma_collect(args);
				2678
				2679	if (args->cpages)
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2680	migrate_vma_unmap(args);
				2681
				2682	/*
				2683	* At this point pages are locked and unmapped, and thus they have
				2684	* stable content and can safely be copied to destination memory that
				2685	* is allocated by the drivers.
				2686	*/
				2687	return 0;
				2688
				2689	}
				2690	EXPORT_SYMBOL(migrate_vma_setup);
				2691
Ralph Campbell	34290e2	2020-01-30 22:14:44 -0800	[diff] [blame]	2692	/*
				2693	* This code closely matches the code in:
				2694	* __handle_mm_fault()
				2695	* handle_pte_fault()
				2696	* do_anonymous_page()
				2697	* to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
				2698	* private page.
				2699	*/
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2700	static void migrate_vma_insert_page(struct migrate_vma *migrate,
				2701	unsigned long addr,
				2702	struct page *page,
Stephen Zhang	d85c6db	2020-12-14 19:13:20 -0800	[diff] [blame]	2703	unsigned long *src)
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2704	{
				2705	struct vm_area_struct *vma = migrate->vma;
				2706	struct mm_struct *mm = vma->vm_mm;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2707	bool flush = false;
				2708	spinlock_t *ptl;
				2709	pte_t entry;
				2710	pgd_t *pgdp;
				2711	p4d_t *p4dp;
				2712	pud_t *pudp;
				2713	pmd_t *pmdp;
				2714	pte_t *ptep;
				2715
				2716	/* Only allow populating anonymous memory */
				2717	if (!vma_is_anonymous(vma))
				2718	goto abort;
				2719
				2720	pgdp = pgd_offset(mm, addr);
				2721	p4dp = p4d_alloc(mm, pgdp, addr);
				2722	if (!p4dp)
				2723	goto abort;
				2724	pudp = pud_alloc(mm, p4dp, addr);
				2725	if (!pudp)
				2726	goto abort;
				2727	pmdp = pmd_alloc(mm, pudp, addr);
				2728	if (!pmdp)
				2729	goto abort;
				2730
				2731	if (pmd_trans_huge(pmdp) \|\| pmd_devmap(pmdp))
				2732	goto abort;
				2733
				2734	/*
				2735	* Use pte_alloc() instead of pte_alloc_map(). We can't run
				2736	* pte_offset_map() on pmds where a huge pmd might be created
				2737	* from a different thread.
				2738	*
Michel Lespinasse	3e4e28c	2020-06-08 21:33:51 -0700	[diff] [blame]	2739	* pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2740	* parallel threads are excluded by other means.
				2741	*
Michel Lespinasse	3e4e28c	2020-06-08 21:33:51 -0700	[diff] [blame]	2742	* Here we only have mmap_read_lock(mm).
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2743	*/
Joel Fernandes (Google)	4cf5892	2019-01-03 15:28:34 -0800	[diff] [blame]	2744	if (pte_alloc(mm, pmdp))
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2745	goto abort;
				2746
				2747	/* See the comment in pte_alloc_one_map() */
				2748	if (unlikely(pmd_trans_unstable(pmdp)))
				2749	goto abort;
				2750
				2751	if (unlikely(anon_vma_prepare(vma)))
				2752	goto abort;
Matthew Wilcox (Oracle)	8f425e4	2021-06-25 09:27:04 -0400	[diff] [blame]	2753	if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2754	goto abort;
				2755
				2756	/*
				2757	* The memory barrier inside __SetPageUptodate makes sure that
				2758	* preceding stores to the page contents become visible before
				2759	* the set_pte_at() write.
				2760	*/
				2761	__SetPageUptodate(page);
				2762
Jérôme Glisse	df6ad69	2017-09-08 16:12:24 -0700	[diff] [blame]	2763	if (is_zone_device_page(page)) {
				2764	if (is_device_private_page(page)) {
				2765	swp_entry_t swp_entry;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2766
Alistair Popple	4dd845b	2021-06-30 18:54:09 -0700	[diff] [blame]	2767	if (vma->vm_flags & VM_WRITE)
				2768	swp_entry = make_writable_device_private_entry(
				2769	page_to_pfn(page));
				2770	else
				2771	swp_entry = make_readable_device_private_entry(
				2772	page_to_pfn(page));
Jérôme Glisse	df6ad69	2017-09-08 16:12:24 -0700	[diff] [blame]	2773	entry = swp_entry_to_pte(swp_entry);
Miaohe Lin	34f5e9b	2021-05-04 18:37:10 -0700	[diff] [blame]	2774	} else {
				2775	/*
				2776	* For now we only support migrating to un-addressable
				2777	* device memory.
				2778	*/
				2779	pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
				2780	goto abort;
Jérôme Glisse	df6ad69	2017-09-08 16:12:24 -0700	[diff] [blame]	2781	}
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2782	} else {
				2783	entry = mk_pte(page, vma->vm_page_prot);
				2784	if (vma->vm_flags & VM_WRITE)
				2785	entry = pte_mkwrite(pte_mkdirty(entry));
				2786	}
				2787
				2788	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
				2789
Ralph Campbell	34290e2	2020-01-30 22:14:44 -0800	[diff] [blame]	2790	if (check_stable_address_space(mm))
				2791	goto unlock_abort;
				2792
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2793	if (pte_present(*ptep)) {
				2794	unsigned long pfn = pte_pfn(*ptep);
				2795
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	2796	if (!is_zero_pfn(pfn))
				2797	goto unlock_abort;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2798	flush = true;
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	2799	} else if (!pte_none(*ptep))
				2800	goto unlock_abort;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2801
				2802	/*
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	2803	* Check for userfaultfd but do not deliver the fault. Instead,
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2804	* just back off.
				2805	*/
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	2806	if (userfaultfd_missing(vma))
				2807	goto unlock_abort;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2808
				2809	inc_mm_counter(mm, MM_ANONPAGES);
Johannes Weiner	be5d0a7	2020-06-03 16:01:57 -0700	[diff] [blame]	2810	page_add_new_anon_rmap(page, vma, addr, false);
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2811	if (!is_zone_device_page(page))
Joonsoo Kim	b518154	2020-08-11 18:30:40 -0700	[diff] [blame]	2812	lru_cache_add_inactive_or_unevictable(page, vma);
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2813	get_page(page);
				2814
				2815	if (flush) {
				2816	flush_cache_page(vma, addr, pte_pfn(*ptep));
				2817	ptep_clear_flush_notify(vma, addr, ptep);
				2818	set_pte_at_notify(mm, addr, ptep, entry);
				2819	update_mmu_cache(vma, addr, ptep);
				2820	} else {
				2821	/* No need to invalidate - it was non-present before */
				2822	set_pte_at(mm, addr, ptep, entry);
				2823	update_mmu_cache(vma, addr, ptep);
				2824	}
				2825
				2826	pte_unmap_unlock(ptep, ptl);
				2827	*src = MIGRATE_PFN_MIGRATE;
				2828	return;
				2829
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	2830	unlock_abort:
				2831	pte_unmap_unlock(ptep, ptl);
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2832	abort:
				2833	*src &= ~MIGRATE_PFN_MIGRATE;
				2834	}
				2835
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2836	/**
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2837	* migrate_vma_pages() - migrate meta-data from src page to dst page
				2838	* @migrate: migrate struct containing all migration information
				2839	*
				2840	* This migrates struct page meta-data from source struct page to destination
				2841	* struct page. This effectively finishes the migration from source page to the
				2842	* destination page.
				2843	*/
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2844	void migrate_vma_pages(struct migrate_vma *migrate)
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2845	{
				2846	const unsigned long npages = migrate->npages;
				2847	const unsigned long start = migrate->start;
Jérôme Glisse	ac46d4f	2018-12-28 00:38:09 -0800	[diff] [blame]	2848	struct mmu_notifier_range range;
				2849	unsigned long addr, i;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2850	bool notified = false;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2851
				2852	for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
				2853	struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
				2854	struct page *page = migrate_pfn_to_page(migrate->src[i]);
				2855	struct address_space *mapping;
				2856	int r;
				2857
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2858	if (!newpage) {
				2859	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2860	continue;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2861	}
				2862
				2863	if (!page) {
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	2864	if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2865	continue;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2866	if (!notified) {
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2867	notified = true;
Jérôme Glisse	ac46d4f	2018-12-28 00:38:09 -0800	[diff] [blame]	2868
Alistair Popple	6b49bf6	2021-06-30 18:54:19 -0700	[diff] [blame]	2869	mmu_notifier_range_init_owner(&range,
				2870	MMU_NOTIFY_MIGRATE, 0, migrate->vma,
				2871	migrate->vma->vm_mm, addr, migrate->end,
Ralph Campbell	5e5dda8	2020-12-14 19:12:55 -0800	[diff] [blame]	2872	migrate->pgmap_owner);
Jérôme Glisse	ac46d4f	2018-12-28 00:38:09 -0800	[diff] [blame]	2873	mmu_notifier_invalidate_range_start(&range);
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2874	}
				2875	migrate_vma_insert_page(migrate, addr, newpage,
Stephen Zhang	d85c6db	2020-12-14 19:13:20 -0800	[diff] [blame]	2876	&migrate->src[i]);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2877	continue;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2878	}
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2879
				2880	mapping = page_mapping(page);
				2881
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2882	if (is_zone_device_page(newpage)) {
				2883	if (is_device_private_page(newpage)) {
				2884	/*
				2885	* For now only support private anonymous when
				2886	* migrating to un-addressable device memory.
				2887	*/
				2888	if (mapping) {
				2889	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				2890	continue;
				2891	}
Christoph Hellwig	25b2995	2019-06-13 22:50:49 +0200	[diff] [blame]	2892	} else {
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2893	/*
				2894	* Other types of ZONE_DEVICE page are not
				2895	* supported.
				2896	*/
				2897	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				2898	continue;
				2899	}
				2900	}
				2901
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2902	r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
				2903	if (r != MIGRATEPAGE_SUCCESS)
				2904	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				2905	}
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2906
Jérôme Glisse	4645b9f	2017-11-15 17:34:11 -0800	[diff] [blame]	2907	/*
				2908	* No need to double call mmu_notifier->invalidate_range() callback as
				2909	* the above ptep_clear_flush_notify() inside migrate_vma_insert_page()
				2910	* did already call it.
				2911	*/
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2912	if (notified)
Jérôme Glisse	ac46d4f	2018-12-28 00:38:09 -0800	[diff] [blame]	2913	mmu_notifier_invalidate_range_only_end(&range);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2914	}
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2915	EXPORT_SYMBOL(migrate_vma_pages);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2916
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2917	/**
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2918	* migrate_vma_finalize() - restore CPU page table entry
				2919	* @migrate: migrate struct containing all migration information
				2920	*
				2921	* This replaces the special migration pte entry with either a mapping to the
				2922	* new page if migration was successful for that page, or to the original page
				2923	* otherwise.
				2924	*
				2925	* This also unlocks the pages and puts them back on the lru, or drops the extra
				2926	* refcount, for device pages.
				2927	*/
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2928	void migrate_vma_finalize(struct migrate_vma *migrate)
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2929	{
				2930	const unsigned long npages = migrate->npages;
				2931	unsigned long i;
				2932
				2933	for (i = 0; i < npages; i++) {
				2934	struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
				2935	struct page *page = migrate_pfn_to_page(migrate->src[i]);
				2936
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2937	if (!page) {
				2938	if (newpage) {
				2939	unlock_page(newpage);
				2940	put_page(newpage);
				2941	}
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2942	continue;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2943	}
				2944
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2945	if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) \|\| !newpage) {
				2946	if (newpage) {
				2947	unlock_page(newpage);
				2948	put_page(newpage);
				2949	}
				2950	newpage = page;
				2951	}
				2952
				2953	remove_migration_ptes(page, newpage, false);
				2954	unlock_page(page);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2955
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2956	if (is_zone_device_page(page))
				2957	put_page(page);
				2958	else
				2959	putback_lru_page(page);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2960
				2961	if (newpage != page) {
				2962	unlock_page(newpage);
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2963	if (is_zone_device_page(newpage))
				2964	put_page(newpage);
				2965	else
				2966	putback_lru_page(newpage);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2967	}
				2968	}
				2969	}
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2970	EXPORT_SYMBOL(migrate_vma_finalize);
Christoph Hellwig	9b2ed9c	2019-08-14 09:59:28 +0200	[diff] [blame]	2971	#endif /* CONFIG_DEVICE_PRIVATE */
Dave Hansen	79c28a4	2021-09-02 14:59:06 -0700	[diff] [blame]	2972
Dave Hansen	76af6a0	2021-10-18 15:15:32 -0700	[diff] [blame]	2973	#if defined(CONFIG_HOTPLUG_CPU)
Dave Hansen	79c28a4	2021-09-02 14:59:06 -0700	[diff] [blame]	2974	/* Disable reclaim-based migration. */
				2975	static void __disable_all_migrate_targets(void)
				2976	{
				2977	int node;
				2978
				2979	for_each_online_node(node)
				2980	node_demotion[node] = NUMA_NO_NODE;
				2981	}
				2982
				2983	static void disable_all_migrate_targets(void)
				2984	{
				2985	__disable_all_migrate_targets();
				2986
				2987	/*
				2988	* Ensure that the "disable" is visible across the system.
				2989	* Readers will see either a combination of before+disable
				2990	* state or disable+after. They will never see before and
				2991	* after state together.
				2992	*
				2993	* The before+after state together might have cycles and
				2994	* could cause readers to do things like loop until this
				2995	* function finishes. This ensures they can only see a
				2996	* single "bad" read and would, for instance, only loop
				2997	* once.
				2998	*/
				2999	synchronize_rcu();
				3000	}
				3001
				3002	/*
				3003	* Find an automatic demotion target for 'node'.
				3004	* Failing here is OK. It might just indicate
				3005	* being at the end of a chain.
				3006	*/
				3007	static int establish_migrate_target(int node, nodemask_t *used)
				3008	{
				3009	int migration_target;
				3010
				3011	/*
				3012	* Can not set a migration target on a
				3013	* node with it already set.
				3014	*
				3015	* No need for READ_ONCE() here since this
				3016	* in the write path for node_demotion[].
				3017	* This should be the only thread writing.
				3018	*/
				3019	if (node_demotion[node] != NUMA_NO_NODE)
				3020	return NUMA_NO_NODE;
				3021
				3022	migration_target = find_next_best_node(node, used);
				3023	if (migration_target == NUMA_NO_NODE)
				3024	return NUMA_NO_NODE;
				3025
				3026	node_demotion[node] = migration_target;
				3027
				3028	return migration_target;
				3029	}
				3030
				3031	/*
				3032	* When memory fills up on a node, memory contents can be
				3033	* automatically migrated to another node instead of
				3034	* discarded at reclaim.
				3035	*
				3036	* Establish a "migration path" which will start at nodes
				3037	* with CPUs and will follow the priorities used to build the
				3038	* page allocator zonelists.
				3039	*
				3040	* The difference here is that cycles must be avoided. If
				3041	* node0 migrates to node1, then neither node1, nor anything
				3042	* node1 migrates to can migrate to node0.
				3043	*
				3044	* This function can run simultaneously with readers of
				3045	* node_demotion[]. However, it can not run simultaneously
				3046	* with itself. Exclusion is provided by memory hotplug events
				3047	* being single-threaded.
				3048	*/
				3049	static void __set_migration_target_nodes(void)
				3050	{
				3051	nodemask_t next_pass = NODE_MASK_NONE;
				3052	nodemask_t this_pass = NODE_MASK_NONE;
				3053	nodemask_t used_targets = NODE_MASK_NONE;
				3054	int node;
				3055
				3056	/*
				3057	* Avoid any oddities like cycles that could occur
				3058	* from changes in the topology. This will leave
				3059	* a momentary gap when migration is disabled.
				3060	*/
				3061	disable_all_migrate_targets();
				3062
				3063	/*
				3064	* Allocations go close to CPUs, first. Assume that
				3065	* the migration path starts at the nodes with CPUs.
				3066	*/
				3067	next_pass = node_states[N_CPU];
				3068	again:
				3069	this_pass = next_pass;
				3070	next_pass = NODE_MASK_NONE;
				3071	/*
				3072	* To avoid cycles in the migration "graph", ensure
				3073	* that migration sources are not future targets by
				3074	* setting them in 'used_targets'. Do this only
				3075	* once per pass so that multiple source nodes can
				3076	* share a target node.
				3077	*
				3078	* 'used_targets' will become unavailable in future
				3079	* passes. This limits some opportunities for
				3080	* multiple source nodes to share a destination.
				3081	*/
				3082	nodes_or(used_targets, used_targets, this_pass);
				3083	for_each_node_mask(node, this_pass) {
				3084	int target_node = establish_migrate_target(node, &used_targets);
				3085
				3086	if (target_node == NUMA_NO_NODE)
				3087	continue;
				3088
				3089	/*
				3090	* Visit targets from this pass in the next pass.
				3091	* Eventually, every node will have been part of
				3092	* a pass, and will become set in 'used_targets'.
				3093	*/
				3094	node_set(target_node, next_pass);
				3095	}
				3096	/*
				3097	* 'next_pass' contains nodes which became migration
				3098	* targets in this pass. Make additional passes until
				3099	* no more migrations targets are available.
				3100	*/
				3101	if (!nodes_empty(next_pass))
				3102	goto again;
				3103	}
				3104
				3105	/*
				3106	* For callers that do not hold get_online_mems() already.
				3107	*/
Dave Hansen	79c28a4	2021-09-02 14:59:06 -0700	[diff] [blame]	3108	static void set_migration_target_nodes(void)
				3109	{
				3110	get_online_mems();
				3111	__set_migration_target_nodes();
				3112	put_online_mems();
				3113	}
Dave Hansen	884a6e5	2021-09-02 14:59:09 -0700	[diff] [blame]	3114
				3115	/*
Dave Hansen	884a6e5	2021-09-02 14:59:09 -0700	[diff] [blame]	3116	* This leaves migrate-on-reclaim transiently disabled between
				3117	* the MEM_GOING_OFFLINE and MEM_OFFLINE events. This runs
				3118	* whether reclaim-based migration is enabled or not, which
				3119	* ensures that the user can turn reclaim-based migration at
				3120	* any time without needing to recalculate migration targets.
				3121	*
				3122	* These callbacks already hold get_online_mems(). That is why
				3123	* __set_migration_target_nodes() can be used as opposed to
				3124	* set_migration_target_nodes().
				3125	*/
				3126	static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
Dave Hansen	295be91	2021-10-18 15:15:29 -0700	[diff] [blame]	3127	unsigned long action, void *_arg)
Dave Hansen	884a6e5	2021-09-02 14:59:09 -0700	[diff] [blame]	3128	{
Dave Hansen	295be91	2021-10-18 15:15:29 -0700	[diff] [blame]	3129	struct memory_notify *arg = _arg;
				3130
				3131	/*
				3132	* Only update the node migration order when a node is
				3133	* changing status, like online->offline. This avoids
				3134	* the overhead of synchronize_rcu() in most cases.
				3135	*/
				3136	if (arg->status_change_nid < 0)
				3137	return notifier_from_errno(0);
				3138
Dave Hansen	884a6e5	2021-09-02 14:59:09 -0700	[diff] [blame]	3139	switch (action) {
				3140	case MEM_GOING_OFFLINE:
				3141	/*
				3142	* Make sure there are not transient states where
				3143	* an offline node is a migration target. This
				3144	* will leave migration disabled until the offline
				3145	* completes and the MEM_OFFLINE case below runs.
				3146	*/
				3147	disable_all_migrate_targets();
				3148	break;
				3149	case MEM_OFFLINE:
				3150	case MEM_ONLINE:
				3151	/*
				3152	* Recalculate the target nodes once the node
				3153	* reaches its final state (online or offline).
				3154	*/
				3155	__set_migration_target_nodes();
				3156	break;
				3157	case MEM_CANCEL_OFFLINE:
				3158	/*
				3159	* MEM_GOING_OFFLINE disabled all the migration
				3160	* targets. Reenable them.
				3161	*/
				3162	__set_migration_target_nodes();
				3163	break;
				3164	case MEM_GOING_ONLINE:
				3165	case MEM_CANCEL_ONLINE:
				3166	break;
				3167	}
				3168
				3169	return notifier_from_errno(0);
				3170	}
				3171
/*
 * React to hotplug events that might affect the migration targets
 * like events that online or offline NUMA nodes.
 *
 * The ordering is also currently dependent on which nodes have
 * CPUs. That means we need CPU on/offline notification too.
 */

/* CPU online callback: rebuild the migration targets. Always returns 0. */
static int migration_online_cpu(unsigned int cpu)
{
	set_migration_target_nodes();

	return 0;
}
				3184
				3185	static int migration_offline_cpu(unsigned int cpu)
				3186	{
				3187	set_migration_target_nodes();
				3188	return 0;
				3189	}
				3190
Dave Hansen	884a6e5	2021-09-02 14:59:09 -0700	[diff] [blame]	3191	static int __init migrate_on_reclaim_init(void)
				3192	{
				3193	int ret;
				3194
Huang Ying	a6a0251	2021-10-18 15:15:35 -0700	[diff] [blame]	3195	ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
				3196	NULL, migration_offline_cpu);
Dave Hansen	884a6e5	2021-09-02 14:59:09 -0700	[diff] [blame]	3197	/*
				3198	* In the unlikely case that this fails, the automatic
				3199	* migration targets may become suboptimal for nodes
				3200	* where N_CPU changes. With such a small impact in a
				3201	* rare case, do not bother trying to do anything special.
				3202	*/
				3203	WARN_ON(ret < 0);
Huang Ying	a6a0251	2021-10-18 15:15:35 -0700	[diff] [blame]	3204	ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
				3205	migration_online_cpu, NULL);
				3206	WARN_ON(ret < 0);
Dave Hansen	884a6e5	2021-09-02 14:59:09 -0700	[diff] [blame]	3207
				3208	hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
				3209	return 0;
				3210	}
				3211	late_initcall(migrate_on_reclaim_init);
Dave Hansen	76af6a0	2021-10-18 15:15:32 -0700	[diff] [blame]	3212	#endif /* CONFIG_HOTPLUG_CPU */
Yang Shi	20f9ba4	2021-11-05 13:43:35 -0700	[diff] [blame]	3213
				3214	bool numa_demotion_enabled = false;
				3215
				3216	#ifdef CONFIG_SYSFS
				3217	static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
				3218	struct kobj_attribute attr, char buf)
				3219	{
				3220	return sysfs_emit(buf, "%s\n",
				3221	numa_demotion_enabled ? "true" : "false");
				3222	}
				3223
				3224	static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
				3225	struct kobj_attribute *attr,
				3226	const char *buf, size_t count)
				3227	{
				3228	if (!strncmp(buf, "true", 4) \|\| !strncmp(buf, "1", 1))
				3229	numa_demotion_enabled = true;
				3230	else if (!strncmp(buf, "false", 5) \|\| !strncmp(buf, "0", 1))
				3231	numa_demotion_enabled = false;
				3232	else
				3233	return -EINVAL;
				3234
				3235	return count;
				3236	}
				3237
				3238	static struct kobj_attribute numa_demotion_enabled_attr =
				3239	__ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
				3240	numa_demotion_enabled_store);
				3241
				3242	static struct attribute *numa_attrs[] = {
				3243	&numa_demotion_enabled_attr.attr,
				3244	NULL,
				3245	};
				3246
				3247	static const struct attribute_group numa_attr_group = {
				3248	.attrs = numa_attrs,
				3249	};
				3250
				3251	static int __init numa_init_sysfs(void)
				3252	{
				3253	int err;
				3254	struct kobject *numa_kobj;
				3255
				3256	numa_kobj = kobject_create_and_add("numa", mm_kobj);
				3257	if (!numa_kobj) {
				3258	pr_err("failed to create numa kobject\n");
				3259	return -ENOMEM;
				3260	}
				3261	err = sysfs_create_group(numa_kobj, &numa_attr_group);
				3262	if (err) {
				3263	pr_err("failed to register numa group\n");
				3264	goto delete_obj;
				3265	}
				3266	return 0;
				3267
				3268	delete_obj:
				3269	kobject_put(numa_kobj);
				3270	return err;
				3271	}
				3272	subsys_initcall(numa_init_sysfs);
				3273	#endif