Blame - mm/migrate.c - SHIFTPHONES/mainline/linux

blob: c7da064b4781b80c46f8c4e18abecfddf81b2283 [file] [log] [blame]

Greg Kroah-Hartman	b244131	2017-11-01 15:07:57 +0100	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	2	/*
Hugh Dickins	14e0f9b	2015-11-05 18:49:43 -0800	[diff] [blame]	3	* Memory Migration functionality - linux/mm/migrate.c
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	4	*
				5	* Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
				6	*
				7	* Page migration was first developed in the context of the memory hotplug
				8	* project. The main authors of the migration code are:
				9	*
				10	* IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
				11	* Hirokazu Takahashi <taka@valinux.co.jp>
				12	* Dave Hansen <haveblue@us.ibm.com>
Christoph Lameter	cde5353	2008-07-04 09:59:22 -0700	[diff] [blame]	13	* Christoph Lameter
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	14	*/
				15
				16	#include <linux/migrate.h>
Paul Gortmaker	b95f1b31	2011-10-16 02:01:52 -0400	[diff] [blame]	17	#include <linux/export.h>
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	18	#include <linux/swap.h>
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	19	#include <linux/swapops.h>
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	20	#include <linux/pagemap.h>
Christoph Lameter	e23ca00	2006-04-10 22:52:57 -0700	[diff] [blame]	21	#include <linux/buffer_head.h>
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	22	#include <linux/mm_inline.h>
Pavel Emelyanov	b488893	2007-10-18 23:40:14 -0700	[diff] [blame]	23	#include <linux/nsproxy.h>
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	24	#include <linux/pagevec.h>
Hugh Dickins	e9995ef	2009-12-14 17:59:31 -0800	[diff] [blame]	25	#include <linux/ksm.h>
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	26	#include <linux/rmap.h>
				27	#include <linux/topology.h>
				28	#include <linux/cpu.h>
				29	#include <linux/cpuset.h>
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	30	#include <linux/writeback.h>
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	31	#include <linux/mempolicy.h>
				32	#include <linux/vmalloc.h>
David Quigley	86c3a76	2006-06-23 02:04:02 -0700	[diff] [blame]	33	#include <linux/security.h>
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	34	#include <linux/backing-dev.h>
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	35	#include <linux/compaction.h>
Adrian Bunk	4f5ca26	2008-07-23 21:27:02 -0700	[diff] [blame]	36	#include <linux/syscalls.h>
Dominik Brodowski	7addf44	2018-03-17 16:08:03 +0100	[diff] [blame]	37	#include <linux/compat.h>
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	38	#include <linux/hugetlb.h>
Aneesh Kumar K.V	8e6ac7f	2012-07-31 16:42:27 -0700	[diff] [blame]	39	#include <linux/hugetlb_cgroup.h>
Tejun Heo	5a0e3ad	2010-03-24 17:04:11 +0900	[diff] [blame]	40	#include <linux/gfp.h>
Christoph Hellwig	a520110	2019-08-28 16:19:53 +0200	[diff] [blame]	41	#include <linux/pagewalk.h>
Jérôme Glisse	df6ad69	2017-09-08 16:12:24 -0700	[diff] [blame]	42	#include <linux/pfn_t.h>
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	43	#include <linux/memremap.h>
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	44	#include <linux/userfaultfd_k.h>
Rafael Aquini	bf6bddf1	2012-12-11 16:02:42 -0800	[diff] [blame]	45	#include <linux/balloon_compaction.h>
Mel Gorman	f714f4f	2013-12-18 17:08:33 -0800	[diff] [blame]	46	#include <linux/mmu_notifier.h>
Vladimir Davydov	33c3fc7	2015-09-09 15:35:45 -0700	[diff] [blame]	47	#include <linux/page_idle.h>
Vlastimil Babka	d435edc	2016-03-15 14:56:15 -0700	[diff] [blame]	48	#include <linux/page_owner.h>
Ingo Molnar	6e84f31	2017-02-08 18:51:29 +0100	[diff] [blame]	49	#include <linux/sched/mm.h>
Linus Torvalds	197e7e5	2017-08-20 13:26:27 -0700	[diff] [blame]	50	#include <linux/ptrace.h>
Ralph Campbell	34290e2	2020-01-30 22:14:44 -0800	[diff] [blame]	51	#include <linux/oom.h>
Dave Hansen	884a6e5	2021-09-02 14:59:09 -0700	[diff] [blame]	52	#include <linux/memory.h>
Baolin Wang	ac16ec8	2022-01-14 14:08:43 -0800	[diff] [blame]	53	#include <linux/random.h>
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	54
Michal Nazarewicz	0d1836c	2010-12-21 17:24:26 -0800	[diff] [blame]	55	#include <asm/tlbflush.h>
				56
Mel Gorman	7b2a2d4	2012-10-19 14:07:31 +0100	[diff] [blame]	57	#define CREATE_TRACE_POINTS
				58	#include <trace/events/migrate.h>
				59
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	60	#include "internal.h"
				61
Yisheng Xie	9e5bcd6	2017-02-24 14:57:29 -0800	[diff] [blame]	62	int isolate_movable_page(struct page *page, isolate_mode_t mode)
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	63	{
				64	struct address_space *mapping;
				65
				66	/*
				67	* Avoid burning cycles with pages that are yet under __free_pages(),
				68	* or just got freed under us.
				69	*
				70	* In case we 'win' a race for a movable page being freed under us and
				71	* raise its refcount preventing __free_pages() from doing its job
				72	* the put_page() at the end of this block will take care of
				73	* releasing this page, thus avoiding a nasty leakage.
				74	*/
				75	if (unlikely(!get_page_unless_zero(page)))
				76	goto out;
				77
				78	/*
				79	* Check PageMovable before holding a PG_lock because page's owner
				80	* assumes anybody doesn't touch PG_lock of newly allocated page
Wei Yang	8bb4e7a	2019-03-05 15:46:22 -0800	[diff] [blame]	81	* so unconditionally grabbing the lock ruins page's owner side.
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	82	*/
				83	if (unlikely(!__PageMovable(page)))
				84	goto out_putpage;
				85	/*
				86	* As movable pages are not isolated from LRU lists, concurrent
				87	* compaction threads can race against page migration functions
				88	* as well as race against the release of a page.
				89	*
				90	* In order to avoid having an already isolated movable page
				91	* being (wrongly) re-isolated while it is under migration,
				92	* or to avoid attempting to isolate pages being released,
				93	* lets be sure we have the page lock
				94	* before proceeding with the movable page isolation steps.
				95	*/
				96	if (unlikely(!trylock_page(page)))
				97	goto out_putpage;
				98
				99	if (!PageMovable(page) || PageIsolated(page))
				100	goto out_no_isolated;
				101
				102	mapping = page_mapping(page);
				103	VM_BUG_ON_PAGE(!mapping, page);
				104
				105	if (!mapping->a_ops->isolate_page(page, mode))
				106	goto out_no_isolated;
				107
				108	/* Driver shouldn't use PG_isolated bit of page->flags */
				109	WARN_ON_ONCE(PageIsolated(page));
				110	__SetPageIsolated(page);
				111	unlock_page(page);
				112
Yisheng Xie	9e5bcd6	2017-02-24 14:57:29 -0800	[diff] [blame]	113	return 0;
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	114
				115	out_no_isolated:
				116	unlock_page(page);
				117	out_putpage:
				118	put_page(page);
				119	out:
Yisheng Xie	9e5bcd6	2017-02-24 14:57:29 -0800	[diff] [blame]	120	return -EBUSY;
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	121	}
				122
Miaohe Lin	606a6f7	2021-05-04 18:37:04 -0700	[diff] [blame]	123	static void putback_movable_page(struct page *page)
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	124	{
				125	struct address_space *mapping;
				126
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	127	mapping = page_mapping(page);
				128	mapping->a_ops->putback_page(page);
				129	__ClearPageIsolated(page);
				130	}
				131
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	132	/*
Rafael Aquini	5733c7d	2012-12-11 16:02:47 -0800	[diff] [blame]	133	* Put previously isolated pages back onto the appropriate lists
				134	* from where they were once taken off for compaction/migration.
				135	*
Joonsoo Kim	59c82b7	2014-01-21 15:51:17 -0800	[diff] [blame]	136	* This function shall be used whenever the isolated pageset has been
				137	* built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
				138	* and isolate_huge_page().
Rafael Aquini	5733c7d	2012-12-11 16:02:47 -0800	[diff] [blame]	139	*/
				140	void putback_movable_pages(struct list_head *l)
				141	{
				142	struct page *page;
				143	struct page *page2;
				144
				145	list_for_each_entry_safe(page, page2, l, lru) {
Naoya Horiguchi	31caf66	2013-09-11 14:21:59 -0700	[diff] [blame]	146	if (unlikely(PageHuge(page))) {
				147	putback_active_hugepage(page);
				148	continue;
				149	}
Rafael Aquini	5733c7d	2012-12-11 16:02:47 -0800	[diff] [blame]	150	list_del(&page->lru);
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	151	/*
				152	* We isolated non-lru movable page so here we can use
				153	* __PageMovable because LRU page's mapping cannot have
				154	* PAGE_MAPPING_MOVABLE.
				155	*/
Minchan Kim	b1123ea6	2016-07-26 15:23:09 -0700	[diff] [blame]	156	if (unlikely(__PageMovable(page))) {
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	157	VM_BUG_ON_PAGE(!PageIsolated(page), page);
				158	lock_page(page);
				159	if (PageMovable(page))
				160	putback_movable_page(page);
				161	else
				162	__ClearPageIsolated(page);
				163	unlock_page(page);
				164	put_page(page);
				165	} else {
Naoya Horiguchi	e8db67e	2017-09-08 16:11:12 -0700	[diff] [blame]	166	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
Matthew Wilcox (Oracle)	6c35784	2020-08-14 17:30:37 -0700	[diff] [blame]	167	page_is_file_lru(page), -thp_nr_pages(page));
Rabin Vincent	fc280fe	2017-04-20 14:37:46 -0700	[diff] [blame]	168	putback_lru_page(page);
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	169	}
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	170	}
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	171	}
				172
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	173	/*
				174	* Restore a potential migration pte to a working pte entry
				175	*/
Minchan Kim	e4b8222	2017-05-03 14:54:27 -0700	[diff] [blame]	176	static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
Hugh Dickins	e9995ef	2009-12-14 17:59:31 -0800	[diff] [blame]	177	unsigned long addr, void *old)
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	178	{
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	179	struct page_vma_mapped_walk pvmw = {
				180	.page = old,
				181	.vma = vma,
				182	.address = addr,
				183	.flags = PVMW_SYNC | PVMW_MIGRATION,
				184	};
				185	struct page *new;
				186	pte_t pte;
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	187	swp_entry_t entry;
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	188
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	189	VM_BUG_ON_PAGE(PageTail(page), page);
				190	while (page_vma_mapped_walk(&pvmw)) {
Naoya Horiguchi	4b0ece6	2017-03-31 15:11:44 -0700	[diff] [blame]	191	if (PageKsm(page))
				192	new = page;
				193	else
				194	new = page - pvmw.page->index +
				195	linear_page_index(vma, pvmw.address);
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	196
Zi Yan	616b837	2017-09-08 16:10:57 -0700	[diff] [blame]	197	#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
				198	/* PMD-mapped THP migration entry */
				199	if (!pvmw.pte) {
				200	VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
				201	remove_migration_pmd(&pvmw, new);
				202	continue;
				203	}
				204	#endif
				205
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	206	get_page(new);
				207	pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
				208	if (pte_swp_soft_dirty(*pvmw.pte))
				209	pte = pte_mksoft_dirty(pte);
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	210
Hugh Dickins	486cf46	2011-10-19 12:50:35 -0700	[diff] [blame]	211	/*
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	212	* Recheck VMA as permissions can change since migration started
Hugh Dickins	486cf46	2011-10-19 12:50:35 -0700	[diff] [blame]	213	*/
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	214	entry = pte_to_swp_entry(*pvmw.pte);
Alistair Popple	4dd845b	2021-06-30 18:54:09 -0700	[diff] [blame]	215	if (is_writable_migration_entry(entry))
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	216	pte = maybe_mkwrite(pte, vma);
Peter Xu	f45ec5f	2020-04-06 20:06:01 -0700	[diff] [blame]	217	else if (pte_swp_uffd_wp(*pvmw.pte))
				218	pte = pte_mkuffd_wp(pte);
Mel Gorman	d3cb8bf	2014-10-02 19:47:41 +0100	[diff] [blame]	219
Ralph Campbell	6128763	2020-09-04 16:36:04 -0700	[diff] [blame]	220	if (unlikely(is_device_private_page(new))) {
Alistair Popple	4dd845b	2021-06-30 18:54:09 -0700	[diff] [blame]	221	if (pte_write(pte))
				222	entry = make_writable_device_private_entry(
				223	page_to_pfn(new));
				224	else
				225	entry = make_readable_device_private_entry(
				226	page_to_pfn(new));
Ralph Campbell	6128763	2020-09-04 16:36:04 -0700	[diff] [blame]	227	pte = swp_entry_to_pte(entry);
Ralph Campbell	3d321bf8	2020-09-04 16:36:07 -0700	[diff] [blame]	228	if (pte_swp_soft_dirty(*pvmw.pte))
				229	pte = pte_swp_mksoft_dirty(pte);
Ralph Campbell	6128763	2020-09-04 16:36:04 -0700	[diff] [blame]	230	if (pte_swp_uffd_wp(*pvmw.pte))
				231	pte = pte_swp_mkuffd_wp(pte);
Lars Persson	d2b2c6dd	2019-03-28 20:44:28 -0700	[diff] [blame]	232	}
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	233
Andi Kleen	3ef8fd7	2010-10-11 16:03:21 +0200	[diff] [blame]	234	#ifdef CONFIG_HUGETLB_PAGE
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	235	if (PageHuge(new)) {
Christophe Leroy	79c1c59	2021-06-30 18:48:00 -0700	[diff] [blame]	236	unsigned int shift = huge_page_shift(hstate_vma(vma));
				237
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	238	pte = pte_mkhuge(pte);
Christophe Leroy	79c1c59	2021-06-30 18:48:00 -0700	[diff] [blame]	239	pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	240	if (PageAnon(new))
				241	hugepage_add_anon_rmap(new, vma, pvmw.address);
				242	else
				243	page_dup_rmap(new, true);
Pasha Tatashin	1eba86c	2022-01-14 14:06:29 -0800	[diff] [blame]	244	set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
Aneesh Kumar K.V	383321a	2017-07-06 15:38:41 -0700	[diff] [blame]	245	} else
				246	#endif
				247	{
Aneesh Kumar K.V	383321a	2017-07-06 15:38:41 -0700	[diff] [blame]	248	if (PageAnon(new))
				249	page_add_anon_rmap(new, vma, pvmw.address, false);
				250	else
				251	page_add_file_rmap(new, false);
Pasha Tatashin	1eba86c	2022-01-14 14:06:29 -0800	[diff] [blame]	252	set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
Aneesh Kumar K.V	383321a	2017-07-06 15:38:41 -0700	[diff] [blame]	253	}
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	254	if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
				255	mlock_vma_page(new);
Hugh Dickins	51afb12	2015-11-05 18:49:37 -0800	[diff] [blame]	256
Kirill A. Shutemov	e125fe4	2018-10-05 15:51:41 -0700	[diff] [blame]	257	if (PageTransHuge(page) && PageMlocked(page))
				258	clear_page_mlock(page);
				259
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	260	/* No need to invalidate - it was non-present before */
				261	update_mmu_cache(vma, pvmw.address, pvmw.pte);
				262	}
				263
Minchan Kim	e4b8222	2017-05-03 14:54:27 -0700	[diff] [blame]	264	return true;
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	265	}
				266
				267	/*
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	268	* Get rid of all migration entries and replace them by
				269	* references to the indicated page.
				270	*/
Kirill A. Shutemov	e388466	2016-03-17 14:20:07 -0700	[diff] [blame]	271	void remove_migration_ptes(struct page *old, struct page *new, bool locked)
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	272	{
Joonsoo Kim	051ac83	2014-01-21 15:49:48 -0800	[diff] [blame]	273	struct rmap_walk_control rwc = {
				274	.rmap_one = remove_migration_pte,
				275	.arg = old,
				276	};
				277
Kirill A. Shutemov	e388466	2016-03-17 14:20:07 -0700	[diff] [blame]	278	if (locked)
				279	rmap_walk_locked(new, &rwc);
				280	else
				281	rmap_walk(new, &rwc);
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	282	}
				283
				284	/*
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	285	* Something used the pte of a page under migration. We need to
				286	* get to the page and wait until migration is finished.
				287	* When we return from this function the fault will be retried.
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	288	*/
Naoya Horiguchi	e66f17f	2015-02-11 15:25:22 -0800	[diff] [blame]	289	void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
Naoya Horiguchi	30dad30	2013-06-12 14:05:04 -0700	[diff] [blame]	290	spinlock_t *ptl)
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	291	{
Naoya Horiguchi	30dad30	2013-06-12 14:05:04 -0700	[diff] [blame]	292	pte_t pte;
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	293	swp_entry_t entry;
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	294
Naoya Horiguchi	30dad30	2013-06-12 14:05:04 -0700	[diff] [blame]	295	spin_lock(ptl);
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	296	pte = *ptep;
				297	if (!is_swap_pte(pte))
				298	goto out;
				299
				300	entry = pte_to_swp_entry(pte);
				301	if (!is_migration_entry(entry))
				302	goto out;
				303
Alistair Popple	ffa6575	2022-01-21 22:10:46 -0800	[diff] [blame]	304	migration_entry_wait_on_locked(entry, ptep, ptl);
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	305	return;
				306	out:
				307	pte_unmap_unlock(ptep, ptl);
				308	}
				309
Naoya Horiguchi	30dad30	2013-06-12 14:05:04 -0700	[diff] [blame]	310	void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
				311	unsigned long address)
				312	{
				313	spinlock_t *ptl = pte_lockptr(mm, pmd);
				314	pte_t *ptep = pte_offset_map(pmd, address);
				315	__migration_entry_wait(mm, ptep, ptl);
				316	}
				317
Kirill A. Shutemov	cb900f4	2013-11-14 14:31:02 -0800	[diff] [blame]	318	void migration_entry_wait_huge(struct vm_area_struct *vma,
				319	struct mm_struct *mm, pte_t *pte)
Naoya Horiguchi	30dad30	2013-06-12 14:05:04 -0700	[diff] [blame]	320	{
Kirill A. Shutemov	cb900f4	2013-11-14 14:31:02 -0800	[diff] [blame]	321	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
Naoya Horiguchi	30dad30	2013-06-12 14:05:04 -0700	[diff] [blame]	322	__migration_entry_wait(mm, pte, ptl);
				323	}
				324
Zi Yan	616b837	2017-09-08 16:10:57 -0700	[diff] [blame]	325	#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
				326	void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
				327	{
				328	spinlock_t *ptl;
Zi Yan	616b837	2017-09-08 16:10:57 -0700	[diff] [blame]	329
				330	ptl = pmd_lock(mm, pmd);
				331	if (!is_pmd_migration_entry(*pmd))
				332	goto unlock;
Alistair Popple	ffa6575	2022-01-21 22:10:46 -0800	[diff] [blame]	333	migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), NULL, ptl);
Zi Yan	616b837	2017-09-08 16:10:57 -0700	[diff] [blame]	334	return;
				335	unlock:
				336	spin_unlock(ptl);
				337	}
				338	#endif
				339
Jan Kara	f900482	2019-03-05 15:48:46 -0800	[diff] [blame]	340	static int expected_page_refs(struct address_space *mapping, struct page *page)
Jan Kara	0b3901b	2018-12-28 00:39:01 -0800	[diff] [blame]	341	{
				342	int expected_count = 1;
				343
				344	/*
Ralph Campbell	f1f4f3a	2020-10-13 16:58:42 -0700	[diff] [blame]	345	* Device private pages have an extra refcount as they are
Jan Kara	0b3901b	2018-12-28 00:39:01 -0800	[diff] [blame]	346	* ZONE_DEVICE pages.
				347	*/
				348	expected_count += is_device_private_page(page);
Jan Kara	f900482	2019-03-05 15:48:46 -0800	[diff] [blame]	349	if (mapping)
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	350	expected_count += compound_nr(page) + page_has_private(page);
Jan Kara	0b3901b	2018-12-28 00:39:01 -0800	[diff] [blame]	351
				352	return expected_count;
				353	}
				354
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	355	/*
Christoph Lameter	c3fcf8a	2006-06-23 02:03:32 -0700	[diff] [blame]	356	* Replace the page in the mapping.
Christoph Lameter	5b5c712	2006-06-23 02:03:29 -0700	[diff] [blame]	357	*
				358	* The number of remaining references must be:
				359	* 1 for anonymous pages without a mapping
				360	* 2 for pages with a mapping
David Howells	266cf65	2009-04-03 16:42:36 +0100	[diff] [blame]	361	* 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	362	*/
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	363	int folio_migrate_mapping(struct address_space *mapping,
				364	struct folio *newfolio, struct folio *folio, int extra_count)
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	365	{
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	366	XA_STATE(xas, &mapping->i_pages, folio_index(folio));
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	367	struct zone *oldzone, *newzone;
				368	int dirty;
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	369	int expected_count = expected_page_refs(mapping, &folio->page) + extra_count;
				370	long nr = folio_nr_pages(folio);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	371
Christoph Lameter	6c5240a	2006-06-23 02:03:37 -0700	[diff] [blame]	372	if (!mapping) {
Christoph Lameter	0e8c7d0	2007-04-23 14:41:09 -0700	[diff] [blame]	373	/* Anonymous page without mapping */
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	374	if (folio_ref_count(folio) != expected_count)
Christoph Lameter	6c5240a	2006-06-23 02:03:37 -0700	[diff] [blame]	375	return -EAGAIN;
Hugh Dickins	cf4b769	2015-11-05 18:50:02 -0800	[diff] [blame]	376
				377	/* No turning back from here */
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	378	newfolio->index = folio->index;
				379	newfolio->mapping = folio->mapping;
				380	if (folio_test_swapbacked(folio))
				381	__folio_set_swapbacked(newfolio);
Hugh Dickins	cf4b769	2015-11-05 18:50:02 -0800	[diff] [blame]	382
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	383	return MIGRATEPAGE_SUCCESS;
Christoph Lameter	6c5240a	2006-06-23 02:03:37 -0700	[diff] [blame]	384	}
				385
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	386	oldzone = folio_zone(folio);
				387	newzone = folio_zone(newfolio);
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	388
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	389	xas_lock_irq(&xas);
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	390	if (!folio_ref_freeze(folio, expected_count)) {
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	391	xas_unlock_irq(&xas);
Nick Piggin	e286781	2008-07-25 19:45:30 -0700	[diff] [blame]	392	return -EAGAIN;
				393	}
				394
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	395	/*
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	396	* Now we know that no one else is looking at the folio:
Hugh Dickins	cf4b769	2015-11-05 18:50:02 -0800	[diff] [blame]	397	* no turning back from here.
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	398	*/
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	399	newfolio->index = folio->index;
				400	newfolio->mapping = folio->mapping;
				401	folio_ref_add(newfolio, nr); /* add cache reference */
				402	if (folio_test_swapbacked(folio)) {
				403	__folio_set_swapbacked(newfolio);
				404	if (folio_test_swapcache(folio)) {
				405	folio_set_swapcache(newfolio);
				406	newfolio->private = folio_get_private(folio);
Nicholas Piggin	6326fec	2016-12-25 13:00:29 +1000	[diff] [blame]	407	}
				408	} else {
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	409	VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	410	}
				411
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	412	/* Move dirty while page refs frozen and newpage not yet exposed */
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	413	dirty = folio_test_dirty(folio);
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	414	if (dirty) {
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	415	folio_clear_dirty(folio);
				416	folio_set_dirty(newfolio);
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	417	}
				418
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	419	xas_store(&xas, newfolio);
Nick Piggin	7cf9c2c	2006-12-06 20:33:44 -0800	[diff] [blame]	420
				421	/*
Jacobo Giralt	937a94c	2012-01-10 15:07:11 -0800	[diff] [blame]	422	* Drop cache reference from old page by unfreezing
				423	* to one less reference.
Nick Piggin	7cf9c2c	2006-12-06 20:33:44 -0800	[diff] [blame]	424	* We know this isn't the last reference.
				425	*/
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	426	folio_ref_unfreeze(folio, expected_count - nr);
Nick Piggin	7cf9c2c	2006-12-06 20:33:44 -0800	[diff] [blame]	427
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	428	xas_unlock(&xas);
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	429	/* Leave irq disabled to prevent preemption while updating stats */
				430
Christoph Lameter	0e8c7d0	2007-04-23 14:41:09 -0700	[diff] [blame]	431	/*
				432	* If moved to a different zone then also account
				433	* the page for that zone. Other VM counters will be
				434	* taken care of when we establish references to the
				435	* new page and drop references to the old page.
				436	*
				437	* Note that anonymous pages are accounted for
Mel Gorman	4b9d0fa	2016-07-28 15:46:17 -0700	[diff] [blame]	438	* via NR_FILE_PAGES and NR_ANON_MAPPED if they
Christoph Lameter	0e8c7d0	2007-04-23 14:41:09 -0700	[diff] [blame]	439	* are mapped to swap space.
				440	*/
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	441	if (newzone != oldzone) {
Johannes Weiner	0d1c207	2020-06-03 16:01:54 -0700	[diff] [blame]	442	struct lruvec *old_lruvec, *new_lruvec;
				443	struct mem_cgroup *memcg;
				444
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	445	memcg = folio_memcg(folio);
Johannes Weiner	0d1c207	2020-06-03 16:01:54 -0700	[diff] [blame]	446	old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
				447	new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
				448
Shakeel Butt	5c447d2	2021-01-23 21:01:15 -0800	[diff] [blame]	449	__mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
				450	__mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	451	if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) {
Shakeel Butt	5c447d2	2021-01-23 21:01:15 -0800	[diff] [blame]	452	__mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
				453	__mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	454	}
Shakeel Butt	b603894	2021-02-24 12:03:55 -0800	[diff] [blame]	455	#ifdef CONFIG_SWAP
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	456	if (folio_test_swapcache(folio)) {
Shakeel Butt	b603894	2021-02-24 12:03:55 -0800	[diff] [blame]	457	__mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
				458	__mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
				459	}
				460	#endif
Christoph Hellwig	f56753a	2020-09-24 08:51:40 +0200	[diff] [blame]	461	if (dirty && mapping_can_writeback(mapping)) {
Shakeel Butt	5c447d2	2021-01-23 21:01:15 -0800	[diff] [blame]	462	__mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
				463	__mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
				464	__mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
				465	__mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	466	}
KOSAKI Motohiro	4b02108	2009-09-21 17:01:33 -0700	[diff] [blame]	467	}
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	468	local_irq_enable();
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	469
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	470	return MIGRATEPAGE_SUCCESS;
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	471	}
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	472	EXPORT_SYMBOL(folio_migrate_mapping);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	473
				474	/*
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	475	* The expected number of remaining references is the same as that
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	476	* of folio_migrate_mapping().
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	477	*/
				478	int migrate_huge_page_move_mapping(struct address_space *mapping,
				479	struct page *newpage, struct page *page)
				480	{
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	481	XA_STATE(xas, &mapping->i_pages, page_index(page));
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	482	int expected_count;
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	483
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	484	xas_lock_irq(&xas);
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	485	expected_count = 2 + page_has_private(page);
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	486	if (page_count(page) != expected_count || xas_load(&xas) != page) {
				487	xas_unlock_irq(&xas);
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	488	return -EAGAIN;
				489	}
				490
Joonsoo Kim	fe896d1	2016-03-17 14:19:26 -0700	[diff] [blame]	491	if (!page_ref_freeze(page, expected_count)) {
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	492	xas_unlock_irq(&xas);
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	493	return -EAGAIN;
				494	}
				495
Hugh Dickins	cf4b769	2015-11-05 18:50:02 -0800	[diff] [blame]	496	newpage->index = page->index;
				497	newpage->mapping = page->mapping;
Johannes Weiner	6a93ca8	2016-03-15 14:57:19 -0700	[diff] [blame]	498
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	499	get_page(newpage);
				500
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	501	xas_store(&xas, newpage);
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	502
Joonsoo Kim	fe896d1	2016-03-17 14:19:26 -0700	[diff] [blame]	503	page_ref_unfreeze(page, expected_count - 1);
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	504
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	505	xas_unlock_irq(&xas);
Johannes Weiner	6a93ca8	2016-03-15 14:57:19 -0700	[diff] [blame]	506
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	507	return MIGRATEPAGE_SUCCESS;
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	508	}
				509
				510	/*
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	511	* Copy the flags and some other ancillary information
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	512	*/
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	513	void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	514	{
Rik van Riel	7851a45	2013-10-07 11:29:23 +0100	[diff] [blame]	515	int cpupid;
				516
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	517	if (folio_test_error(folio))
				518	folio_set_error(newfolio);
				519	if (folio_test_referenced(folio))
				520	folio_set_referenced(newfolio);
				521	if (folio_test_uptodate(folio))
				522	folio_mark_uptodate(newfolio);
				523	if (folio_test_clear_active(folio)) {
				524	VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio);
				525	folio_set_active(newfolio);
				526	} else if (folio_test_clear_unevictable(folio))
				527	folio_set_unevictable(newfolio);
				528	if (folio_test_workingset(folio))
				529	folio_set_workingset(newfolio);
				530	if (folio_test_checked(folio))
				531	folio_set_checked(newfolio);
				532	if (folio_test_mappedtodisk(folio))
				533	folio_set_mappedtodisk(newfolio);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	534
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	535	/* Move dirty on pages not done by folio_migrate_mapping() */
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	536	if (folio_test_dirty(folio))
				537	folio_set_dirty(newfolio);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	538
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	539	if (folio_test_young(folio))
				540	folio_set_young(newfolio);
				541	if (folio_test_idle(folio))
				542	folio_set_idle(newfolio);
Vladimir Davydov	33c3fc7	2015-09-09 15:35:45 -0700	[diff] [blame]	543
Rik van Riel	7851a45	2013-10-07 11:29:23 +0100	[diff] [blame]	544	/*
				545	* Copy NUMA information to the new page, to prevent over-eager
				546	* future migrations of this same page.
				547	*/
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	548	cpupid = page_cpupid_xchg_last(&folio->page, -1);
				549	page_cpupid_xchg_last(&newfolio->page, cpupid);
Rik van Riel	7851a45	2013-10-07 11:29:23 +0100	[diff] [blame]	550
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	551	folio_migrate_ksm(newfolio, folio);
Hugh Dickins	c8d6553	2013-02-22 16:35:10 -0800	[diff] [blame]	552	/*
				553	* Please do not reorder this without considering how mm/ksm.c's
				554	* get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
				555	*/
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	556	if (folio_test_swapcache(folio))
				557	folio_clear_swapcache(folio);
				558	folio_clear_private(folio);
Muchun Song	ad2fa37	2021-06-30 18:47:21 -0700	[diff] [blame]	559
				560	/* page->private contains hugetlb specific flags */
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	561	if (!folio_test_hugetlb(folio))
				562	folio->private = NULL;
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	563
				564	/*
				565	* If any waiters have accumulated on the new page then
				566	* wake them up.
				567	*/
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	568	if (folio_test_writeback(newfolio))
				569	folio_end_writeback(newfolio);
Vlastimil Babka	d435edc	2016-03-15 14:56:15 -0700	[diff] [blame]	570
Yang Shi	6aeff24	2020-04-06 20:04:21 -0700	[diff] [blame]	571	/*
				572	* PG_readahead shares the same bit with PG_reclaim. The above
				573	* end_page_writeback() may clear PG_readahead mistakenly, so set the
				574	* bit after that.
				575	*/
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	576	if (folio_test_readahead(folio))
				577	folio_set_readahead(newfolio);
Yang Shi	6aeff24	2020-04-06 20:04:21 -0700	[diff] [blame]	578
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	579	folio_copy_owner(newfolio, folio);
Johannes Weiner	74485cf	2016-03-15 14:57:54 -0700	[diff] [blame]	580
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	581	if (!folio_test_hugetlb(folio))
Matthew Wilcox (Oracle)	d21bba2	2021-05-06 18:14:59 -0400	[diff] [blame]	582	mem_cgroup_migrate(folio, newfolio);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	583	}
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	584	EXPORT_SYMBOL(folio_migrate_flags);
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	585
/*
 * Run folio_copy() from @folio to @newfolio and then transfer the flags
 * and ancillary state with folio_migrate_flags().
 *
 * @newfolio: destination folio.
 * @folio: source folio.
 */
void folio_migrate_copy(struct folio *newfolio, struct folio *folio)
{
	folio_copy(newfolio, folio);
	folio_migrate_flags(newfolio, folio);
}
EXPORT_SYMBOL(folio_migrate_copy);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	592
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	593	/************************************************************
				594	* Migration functions
				595	***********************************************************/
				596
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	597	/*
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	598	* Common logic to directly migrate a single LRU page suitable for
David Howells	266cf65	2009-04-03 16:42:36 +0100	[diff] [blame]	599	* pages that do not use PagePrivate/PagePrivate2.
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	600	*
				601	* Pages are locked upon entry and exit.
				602	*/
Christoph Lameter	2d1db3b	2006-06-23 02:03:33 -0700	[diff] [blame]	603	int migrate_page(struct address_space *mapping,
Mel Gorman	a6bc32b	2012-01-12 17:19:43 -0800	[diff] [blame]	604	struct page newpage, struct page page,
				605	enum migrate_mode mode)
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	606	{
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	607	struct folio *newfolio = page_folio(newpage);
				608	struct folio *folio = page_folio(page);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	609	int rc;
				610
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	611	BUG_ON(folio_test_writeback(folio)); /* Writeback must be complete */
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	612
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	613	rc = folio_migrate_mapping(mapping, newfolio, folio, 0);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	614
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	615	if (rc != MIGRATEPAGE_SUCCESS)
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	616	return rc;
				617
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	618	if (mode != MIGRATE_SYNC_NO_COPY)
Matthew Wilcox (Oracle)	715cbfd	2021-05-07 15:05:06 -0400	[diff] [blame]	619	folio_migrate_copy(newfolio, folio);
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	620	else
Matthew Wilcox (Oracle)	1913834	2021-05-07 15:26:29 -0400	[diff] [blame]	621	folio_migrate_flags(newfolio, folio);
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	622	return MIGRATEPAGE_SUCCESS;
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	623	}
				624	EXPORT_SYMBOL(migrate_page);
				625
David Howells	9361401	2006-09-30 20:45:40 +0200	[diff] [blame]	626	#ifdef CONFIG_BLOCK
Jan Kara	84ade7c	2018-12-28 00:39:09 -0800	[diff] [blame]	627	/* Returns true if all buffers are successfully locked */
				628	static bool buffer_migrate_lock_buffers(struct buffer_head *head,
				629	enum migrate_mode mode)
				630	{
				631	struct buffer_head *bh = head;
				632
				633	/* Simple case, sync compaction */
				634	if (mode != MIGRATE_ASYNC) {
				635	do {
Jan Kara	84ade7c	2018-12-28 00:39:09 -0800	[diff] [blame]	636	lock_buffer(bh);
				637	bh = bh->b_this_page;
				638
				639	} while (bh != head);
				640
				641	return true;
				642	}
				643
				644	/* async case, we cannot block on lock_buffer so use trylock_buffer */
				645	do {
Jan Kara	84ade7c	2018-12-28 00:39:09 -0800	[diff] [blame]	646	if (!trylock_buffer(bh)) {
				647	/*
				648	* We failed to lock the buffer and cannot stall in
				649	* async migration. Release the taken locks
				650	*/
				651	struct buffer_head *failed_bh = bh;
Jan Kara	84ade7c	2018-12-28 00:39:09 -0800	[diff] [blame]	652	bh = head;
				653	while (bh != failed_bh) {
				654	unlock_buffer(bh);
Jan Kara	84ade7c	2018-12-28 00:39:09 -0800	[diff] [blame]	655	bh = bh->b_this_page;
				656	}
				657	return false;
				658	}
				659
				660	bh = bh->b_this_page;
				661	} while (bh != head);
				662	return true;
				663	}
				664
Jan Kara	89cb088	2018-12-28 00:39:12 -0800	[diff] [blame]	665	static int __buffer_migrate_page(struct address_space *mapping,
				666	struct page newpage, struct page page, enum migrate_mode mode,
				667	bool check_refs)
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	668	{
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	669	struct buffer_head bh, head;
				670	int rc;
Jan Kara	cc4f11e	2018-12-28 00:39:05 -0800	[diff] [blame]	671	int expected_count;
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	672
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	673	if (!page_has_buffers(page))
Mel Gorman	a6bc32b	2012-01-12 17:19:43 -0800	[diff] [blame]	674	return migrate_page(mapping, newpage, page, mode);
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	675
Jan Kara	cc4f11e	2018-12-28 00:39:05 -0800	[diff] [blame]	676	/* Check whether page does not have extra refs before we do more work */
Jan Kara	f900482	2019-03-05 15:48:46 -0800	[diff] [blame]	677	expected_count = expected_page_refs(mapping, page);
Jan Kara	cc4f11e	2018-12-28 00:39:05 -0800	[diff] [blame]	678	if (page_count(page) != expected_count)
				679	return -EAGAIN;
				680
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	681	head = page_buffers(page);
Jan Kara	cc4f11e	2018-12-28 00:39:05 -0800	[diff] [blame]	682	if (!buffer_migrate_lock_buffers(head, mode))
				683	return -EAGAIN;
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	684
Jan Kara	89cb088	2018-12-28 00:39:12 -0800	[diff] [blame]	685	if (check_refs) {
				686	bool busy;
				687	bool invalidated = false;
				688
				689	recheck_buffers:
				690	busy = false;
				691	spin_lock(&mapping->private_lock);
				692	bh = head;
				693	do {
				694	if (atomic_read(&bh->b_count)) {
				695	busy = true;
				696	break;
				697	}
				698	bh = bh->b_this_page;
				699	} while (bh != head);
Jan Kara	89cb088	2018-12-28 00:39:12 -0800	[diff] [blame]	700	if (busy) {
				701	if (invalidated) {
				702	rc = -EAGAIN;
				703	goto unlock_buffers;
				704	}
Jan Kara	ebdf4de	2019-08-02 21:48:47 -0700	[diff] [blame]	705	spin_unlock(&mapping->private_lock);
Jan Kara	89cb088	2018-12-28 00:39:12 -0800	[diff] [blame]	706	invalidate_bh_lrus();
				707	invalidated = true;
				708	goto recheck_buffers;
				709	}
				710	}
				711
Keith Busch	3710969	2019-07-18 15:58:46 -0700	[diff] [blame]	712	rc = migrate_page_move_mapping(mapping, newpage, page, 0);
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	713	if (rc != MIGRATEPAGE_SUCCESS)
Jan Kara	cc4f11e	2018-12-28 00:39:05 -0800	[diff] [blame]	714	goto unlock_buffers;
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	715
Guoqing Jiang	cd0f371	2020-06-01 21:48:06 -0700	[diff] [blame]	716	attach_page_private(newpage, detach_page_private(page));
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	717
				718	bh = head;
				719	do {
				720	set_bh_page(bh, newpage, bh_offset(bh));
				721	bh = bh->b_this_page;
				722
				723	} while (bh != head);
				724
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	725	if (mode != MIGRATE_SYNC_NO_COPY)
				726	migrate_page_copy(newpage, page);
				727	else
				728	migrate_page_states(newpage, page);
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	729
Jan Kara	cc4f11e	2018-12-28 00:39:05 -0800	[diff] [blame]	730	rc = MIGRATEPAGE_SUCCESS;
				731	unlock_buffers:
Jan Kara	ebdf4de	2019-08-02 21:48:47 -0700	[diff] [blame]	732	if (check_refs)
				733	spin_unlock(&mapping->private_lock);
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	734	bh = head;
				735	do {
				736	unlock_buffer(bh);
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	737	bh = bh->b_this_page;
				738
				739	} while (bh != head);
				740
Jan Kara	cc4f11e	2018-12-28 00:39:05 -0800	[diff] [blame]	741	return rc;
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	742	}
Jan Kara	89cb088	2018-12-28 00:39:12 -0800	[diff] [blame]	743
				744	/*
				745	* Migration function for pages with buffers. This function can only be used
				746	* if the underlying filesystem guarantees that no other references to "page"
				747	* exist. For example attached buffer heads are accessed only under page lock.
				748	*/
				749	int buffer_migrate_page(struct address_space *mapping,
				750	struct page newpage, struct page page, enum migrate_mode mode)
				751	{
				752	return __buffer_migrate_page(mapping, newpage, page, mode, false);
				753	}
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	754	EXPORT_SYMBOL(buffer_migrate_page);
Jan Kara	89cb088	2018-12-28 00:39:12 -0800	[diff] [blame]	755
				756	/*
				757	* Same as above except that this variant is more careful and checks that there
				758	* are also no buffer head references. This function is the right one for
				759	* mappings where buffer heads are directly looked up and referenced (such as
				760	* block device mappings).
				761	*/
				762	int buffer_migrate_page_norefs(struct address_space *mapping,
				763	struct page newpage, struct page page, enum migrate_mode mode)
				764	{
				765	return __buffer_migrate_page(mapping, newpage, page, mode, true);
				766	}
David Howells	9361401	2006-09-30 20:45:40 +0200	[diff] [blame]	767	#endif
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	768
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	769	/*
				770	* Writeback a page to clean the dirty state
				771	*/
				772	static int writeout(struct address_space mapping, struct page page)
				773	{
				774	struct writeback_control wbc = {
				775	.sync_mode = WB_SYNC_NONE,
				776	.nr_to_write = 1,
				777	.range_start = 0,
				778	.range_end = LLONG_MAX,
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	779	.for_reclaim = 1
				780	};
				781	int rc;
				782
				783	if (!mapping->a_ops->writepage)
				784	/* No write method for the address space */
				785	return -EINVAL;
				786
				787	if (!clear_page_dirty_for_io(page))
				788	/* Someone else already triggered a write */
				789	return -EAGAIN;
				790
				791	/*
				792	* A dirty page may imply that the underlying filesystem has
				793	* the page on some queue. So the page must be clean for
				794	* migration. Writeout may mean we loose the lock and the
				795	* page state is no longer what we checked for earlier.
				796	* At this point we know that the migration attempt cannot
				797	* be successful.
				798	*/
Kirill A. Shutemov	e388466	2016-03-17 14:20:07 -0700	[diff] [blame]	799	remove_migration_ptes(page, page, false);
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	800
				801	rc = mapping->a_ops->writepage(page, &wbc);
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	802
				803	if (rc != AOP_WRITEPAGE_ACTIVATE)
				804	/* unlocked. Relock */
				805	lock_page(page);
				806
Hugh Dickins	bda8550	2008-11-19 15:36:36 -0800	[diff] [blame]	807	return (rc < 0) ? -EIO : -EAGAIN;
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	808	}
				809
				810	/*
				811	* Default handling if a filesystem does not provide a migration function.
				812	*/
Christoph Lameter	8351a6e	2006-06-23 02:03:33 -0700	[diff] [blame]	813	static int fallback_migrate_page(struct address_space *mapping,
Mel Gorman	a6bc32b	2012-01-12 17:19:43 -0800	[diff] [blame]	814	struct page newpage, struct page page, enum migrate_mode mode)
Christoph Lameter	8351a6e	2006-06-23 02:03:33 -0700	[diff] [blame]	815	{
Mel Gorman	b969c4ab	2012-01-12 17:19:34 -0800	[diff] [blame]	816	if (PageDirty(page)) {
Mel Gorman	a6bc32b	2012-01-12 17:19:43 -0800	[diff] [blame]	817	/* Only writeback pages in full synchronous migration */
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	818	switch (mode) {
				819	case MIGRATE_SYNC:
				820	case MIGRATE_SYNC_NO_COPY:
				821	break;
				822	default:
Mel Gorman	b969c4ab	2012-01-12 17:19:34 -0800	[diff] [blame]	823	return -EBUSY;
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	824	}
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	825	return writeout(mapping, page);
Mel Gorman	b969c4ab	2012-01-12 17:19:34 -0800	[diff] [blame]	826	}
Christoph Lameter	8351a6e	2006-06-23 02:03:33 -0700	[diff] [blame]	827
				828	/*
				829	* Buffers may be managed in a filesystem specific way.
				830	* We must have no buffers or drop them.
				831	*/
David Howells	266cf65	2009-04-03 16:42:36 +0100	[diff] [blame]	832	if (page_has_private(page) &&
Christoph Lameter	8351a6e	2006-06-23 02:03:33 -0700	[diff] [blame]	833	!try_to_release_page(page, GFP_KERNEL))
Mel Gorman	806031b	2019-03-05 15:44:43 -0800	[diff] [blame]	834	return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
Christoph Lameter	8351a6e	2006-06-23 02:03:33 -0700	[diff] [blame]	835
Mel Gorman	a6bc32b	2012-01-12 17:19:43 -0800	[diff] [blame]	836	return migrate_page(mapping, newpage, page, mode);
Christoph Lameter	8351a6e	2006-06-23 02:03:33 -0700	[diff] [blame]	837	}
				838
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	839	/*
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	840	* Move a page to a newly allocated page
				841	* The page is locked and all ptes have been successfully removed.
				842	*
				843	* The new page will have replaced the old page if this function
				844	* is successful.
Lee Schermerhorn	894bc31	2008-10-18 20:26:39 -0700	[diff] [blame]	845	*
				846	* Return value:
				847	* < 0 - error code
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	848	* MIGRATEPAGE_SUCCESS - success
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	849	*/
Mel Gorman	3fe2011	2010-05-24 14:32:20 -0700	[diff] [blame]	850	static int move_to_new_page(struct page newpage, struct page page,
Hugh Dickins	5c3f9a6	2015-11-05 18:49:53 -0800	[diff] [blame]	851	enum migrate_mode mode)
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	852	{
				853	struct address_space *mapping;
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	854	int rc = -EAGAIN;
				855	bool is_lru = !__PageMovable(page);
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	856
Hugh Dickins	7db7671	2015-11-05 18:49:49 -0800	[diff] [blame]	857	VM_BUG_ON_PAGE(!PageLocked(page), page);
				858	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	859
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	860	mapping = page_mapping(page);
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	861
				862	if (likely(is_lru)) {
				863	if (!mapping)
				864	rc = migrate_page(mapping, newpage, page, mode);
				865	else if (mapping->a_ops->migratepage)
				866	/*
				867	* Most pages have a mapping and most filesystems
				868	* provide a migratepage callback. Anonymous pages
				869	* are part of swap space which also has its own
				870	* migratepage callback. This is the most common path
				871	* for page migration.
				872	*/
				873	rc = mapping->a_ops->migratepage(mapping, newpage,
				874	page, mode);
				875	else
				876	rc = fallback_migrate_page(mapping, newpage,
				877	page, mode);
				878	} else {
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	879	/*
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	880	* In case of non-lru page, it could be released after
				881	* isolation step. In that case, we shouldn't try migration.
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	882	*/
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	883	VM_BUG_ON_PAGE(!PageIsolated(page), page);
				884	if (!PageMovable(page)) {
				885	rc = MIGRATEPAGE_SUCCESS;
				886	__ClearPageIsolated(page);
				887	goto out;
				888	}
				889
				890	rc = mapping->a_ops->migratepage(mapping, newpage,
				891	page, mode);
				892	WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
				893	!PageIsolated(page));
				894	}
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	895
Hugh Dickins	5c3f9a6	2015-11-05 18:49:53 -0800	[diff] [blame]	896	/*
				897	* When successful, old pagecache page->mapping must be cleared before
				898	* page is freed; but stats require that PageAnon be left as PageAnon.
				899	*/
				900	if (rc == MIGRATEPAGE_SUCCESS) {
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	901	if (__PageMovable(page)) {
				902	VM_BUG_ON_PAGE(!PageIsolated(page), page);
				903
				904	/*
				905	* We clear PG_movable under page_lock so any compactor
				906	* cannot try to migrate this page.
				907	*/
				908	__ClearPageIsolated(page);
				909	}
				910
				911	/*
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	912	* Anonymous and movable page->mapping will be cleared by
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	913	* free_pages_prepare so don't reset it here for keeping
				914	* the type to work PageAnon, for example.
				915	*/
				916	if (!PageMappingFlags(page))
Hugh Dickins	5c3f9a6	2015-11-05 18:49:53 -0800	[diff] [blame]	917	page->mapping = NULL;
Lars Persson	d2b2c6dd	2019-03-28 20:44:28 -0700	[diff] [blame]	918
Christoph Hellwig	25b2995	2019-06-13 22:50:49 +0200	[diff] [blame]	919	if (likely(!is_zone_device_page(newpage)))
Lars Persson	d2b2c6dd	2019-03-28 20:44:28 -0700	[diff] [blame]	920	flush_dcache_page(newpage);
				921
Mel Gorman	3fe2011	2010-05-24 14:32:20 -0700	[diff] [blame]	922	}
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	923	out:
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	924	return rc;
				925	}
				926
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	927	static int __unmap_and_move(struct page page, struct page newpage,
Hugh Dickins	9c620e2	2013-02-22 16:35:14 -0800	[diff] [blame]	928	int force, enum migrate_mode mode)
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	929	{
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	930	int rc = -EAGAIN;
Baolin Wang	213ecb3	2021-09-08 15:18:06 -0700	[diff] [blame]	931	bool page_was_mapped = false;
Mel Gorman	3f6c827	2010-05-24 14:32:17 -0700	[diff] [blame]	932	struct anon_vma *anon_vma = NULL;
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	933	bool is_lru = !__PageMovable(page);
Christoph Lameter	95a402c	2006-06-23 02:03:53 -0700	[diff] [blame]	934
Nick Piggin	529ae9a	2008-08-02 12:01:03 +0200	[diff] [blame]	935	if (!trylock_page(page)) {
Mel Gorman	a6bc32b	2012-01-12 17:19:43 -0800	[diff] [blame]	936	if (!force \|\| mode == MIGRATE_ASYNC)
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	937	goto out;
Mel Gorman	3e7d344	2011-01-13 15:45:56 -0800	[diff] [blame]	938
				939	/*
				940	* It's not safe for direct compaction to call lock_page.
				941	* For example, during page readahead pages are added locked
				942	* to the LRU. Later, when the IO completes the pages are
				943	* marked uptodate and unlocked. However, the queueing
				944	* could be merging multiple pages for one bio (e.g.
Matthew Wilcox (Oracle)	d438834	2020-06-01 21:47:02 -0700	[diff] [blame]	945	* mpage_readahead). If an allocation happens for the
Mel Gorman	3e7d344	2011-01-13 15:45:56 -0800	[diff] [blame]	946	* second or third page, the process can end up locking
				947	* the same page twice and deadlocking. Rather than
				948	* trying to be clever about what pages can be locked,
				949	* avoid the use of lock_page for direct compaction
				950	* altogether.
				951	*/
				952	if (current->flags & PF_MEMALLOC)
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	953	goto out;
Mel Gorman	3e7d344	2011-01-13 15:45:56 -0800	[diff] [blame]	954
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	955	lock_page(page);
				956	}
				957
				958	if (PageWriteback(page)) {
Andrea Arcangeli	11bc82d	2011-03-22 16:33:11 -0700	[diff] [blame]	959	/*
Jianguo Wu	fed5b64	2013-04-29 15:07:58 -0700	[diff] [blame]	960	* Only in the case of a full synchronous migration is it
Mel Gorman	a6bc32b	2012-01-12 17:19:43 -0800	[diff] [blame]	961	* necessary to wait for PageWriteback. In the async case,
				962	* the retry loop is too short and in the sync-light case,
				963	* the overhead of stalling is too much
Andrea Arcangeli	11bc82d	2011-03-22 16:33:11 -0700	[diff] [blame]	964	*/
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	965	switch (mode) {
				966	case MIGRATE_SYNC:
				967	case MIGRATE_SYNC_NO_COPY:
				968	break;
				969	default:
Andrea Arcangeli	11bc82d	2011-03-22 16:33:11 -0700	[diff] [blame]	970	rc = -EBUSY;
Johannes Weiner	0a31bc9	2014-08-08 14:19:22 -0700	[diff] [blame]	971	goto out_unlock;
Andrea Arcangeli	11bc82d	2011-03-22 16:33:11 -0700	[diff] [blame]	972	}
				973	if (!force)
Johannes Weiner	0a31bc9	2014-08-08 14:19:22 -0700	[diff] [blame]	974	goto out_unlock;
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	975	wait_on_page_writeback(page);
				976	}
Hugh Dickins	03f15c8	2015-11-05 18:49:56 -0800	[diff] [blame]	977
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	978	/*
Baolin Wang	68a9843	2021-09-08 15:18:03 -0700	[diff] [blame]	979	* By try_to_migrate(), page->mapcount goes down to 0 here. In this case,
KAMEZAWA Hiroyuki	dc386d4	2007-07-26 10:41:07 -0700	[diff] [blame]	980	* we cannot notice that anon_vma is freed while we migrates a page.
Hugh Dickins	1ce82b6	2011-01-13 15:47:30 -0800	[diff] [blame]	981	* This get_anon_vma() delays freeing anon_vma pointer until the end
KAMEZAWA Hiroyuki	dc386d4	2007-07-26 10:41:07 -0700	[diff] [blame]	982	* of migration. File cache pages are no problem because of page_lock()
KAMEZAWA Hiroyuki	989f89c	2007-08-30 23:56:21 -0700	[diff] [blame]	983	* File Caches may use write_page() or lock_page() in migration, then,
				984	* just care Anon page here.
Hugh Dickins	03f15c8	2015-11-05 18:49:56 -0800	[diff] [blame]	985	*
				986	* Only page_get_anon_vma() understands the subtleties of
				987	* getting a hold on an anon_vma from outside one of its mms.
				988	* But if we cannot get anon_vma, then we won't need it anyway,
				989	* because that implies that the anon page is no longer mapped
				990	* (and cannot be remapped so long as we hold the page lock).
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	991	*/
Hugh Dickins	03f15c8	2015-11-05 18:49:56 -0800	[diff] [blame]	992	if (PageAnon(page) && !PageKsm(page))
Peter Zijlstra	746b18d	2011-05-24 17:12:10 -0700	[diff] [blame]	993	anon_vma = page_get_anon_vma(page);
Shaohua Li	62e1c55	2008-02-04 22:29:33 -0800	[diff] [blame]	994
Hugh Dickins	7db7671	2015-11-05 18:49:49 -0800	[diff] [blame]	995	/*
				996	* Block others from accessing the new page when we get around to
				997	* establishing additional references. We are usually the only one
				998	* holding a reference to newpage at this point. We used to have a BUG
				999	* here if trylock_page(newpage) fails, but would like to allow for
				1000	* cases where there might be a race with the previous use of newpage.
				1001	* This is much like races on refcount of oldpage: just don't BUG().
				1002	*/
				1003	if (unlikely(!trylock_page(newpage)))
				1004	goto out_unlock;
				1005
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	1006	if (unlikely(!is_lru)) {
				1007	rc = move_to_new_page(newpage, page, mode);
				1008	goto out_unlock_both;
				1009	}
				1010
KAMEZAWA Hiroyuki	dc386d4	2007-07-26 10:41:07 -0700	[diff] [blame]	1011	/*
Shaohua Li	62e1c55	2008-02-04 22:29:33 -0800	[diff] [blame]	1012	* Corner case handling:
				1013	* 1. When a new swap-cache page is read into, it is added to the LRU
				1014	* and treated as swapcache but it has no rmap yet.
				1015	* Calling try_to_unmap() against a page->mapping==NULL page will
				1016	* trigger a BUG. So handle it here.
Yang Shi	d12b895	2020-12-14 19:13:02 -0800	[diff] [blame]	1017	* 2. An orphaned page (see truncate_cleanup_page) might have
Shaohua Li	62e1c55	2008-02-04 22:29:33 -0800	[diff] [blame]	1018	* fs-private metadata. The page can be picked up due to memory
				1019	* offlining. Everywhere else except page reclaim, the page is
				1020	* invisible to the vm, so the page can not be migrated. So try to
				1021	* free the metadata, so the page can be freed.
KAMEZAWA Hiroyuki	dc386d4	2007-07-26 10:41:07 -0700	[diff] [blame]	1022	*/
Shaohua Li	62e1c55	2008-02-04 22:29:33 -0800	[diff] [blame]	1023	if (!page->mapping) {
Sasha Levin	309381fea	2014-01-23 15:52:54 -0800	[diff] [blame]	1024	VM_BUG_ON_PAGE(PageAnon(page), page);
Hugh Dickins	1ce82b6	2011-01-13 15:47:30 -0800	[diff] [blame]	1025	if (page_has_private(page)) {
Shaohua Li	62e1c55	2008-02-04 22:29:33 -0800	[diff] [blame]	1026	try_to_free_buffers(page);
Hugh Dickins	7db7671	2015-11-05 18:49:49 -0800	[diff] [blame]	1027	goto out_unlock_both;
Shaohua Li	62e1c55	2008-02-04 22:29:33 -0800	[diff] [blame]	1028	}
Hugh Dickins	7db7671	2015-11-05 18:49:49 -0800	[diff] [blame]	1029	} else if (page_mapped(page)) {
				1030	/* Establish migration ptes */
Hugh Dickins	03f15c8	2015-11-05 18:49:56 -0800	[diff] [blame]	1031	VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
				1032	page);
Alistair Popple	a98a2f0	2021-06-30 18:54:16 -0700	[diff] [blame]	1033	try_to_migrate(page, 0);
Baolin Wang	213ecb3	2021-09-08 15:18:06 -0700	[diff] [blame]	1034	page_was_mapped = true;
Hugh Dickins	2ebba6b	2014-12-12 16:56:19 -0800	[diff] [blame]	1035	}
KAMEZAWA Hiroyuki	dc386d4	2007-07-26 10:41:07 -0700	[diff] [blame]	1036
Christoph Lameter	e6a1530	2006-06-25 05:46:49 -0700	[diff] [blame]	1037	if (!page_mapped(page))
Hugh Dickins	5c3f9a6	2015-11-05 18:49:53 -0800	[diff] [blame]	1038	rc = move_to_new_page(newpage, page, mode);
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1039
Hugh Dickins	5c3f9a6	2015-11-05 18:49:53 -0800	[diff] [blame]	1040	if (page_was_mapped)
				1041	remove_migration_ptes(page,
Kirill A. Shutemov	e388466	2016-03-17 14:20:07 -0700	[diff] [blame]	1042	rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);
Mel Gorman	3f6c827	2010-05-24 14:32:17 -0700	[diff] [blame]	1043
Hugh Dickins	7db7671	2015-11-05 18:49:49 -0800	[diff] [blame]	1044	out_unlock_both:
				1045	unlock_page(newpage);
				1046	out_unlock:
Mel Gorman	3f6c827	2010-05-24 14:32:17 -0700	[diff] [blame]	1047	/* Drop an anon_vma reference if we took one */
Rik van Riel	7654506	2010-08-09 17:18:41 -0700	[diff] [blame]	1048	if (anon_vma)
Peter Zijlstra	9e60109	2011-03-22 16:32:46 -0700	[diff] [blame]	1049	put_anon_vma(anon_vma);
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1050	unlock_page(page);
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1051	out:
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1052	/*
				1053	* If migration is successful, decrease refcount of the newpage
				1054	* which will not free the page because new page owner increased
				1055	* refcounter. As well, if it is LRU page, add the page to LRU
David Hildenbrand	e0a352f	2019-02-01 14:21:19 -0800	[diff] [blame]	1056	* list in here. Use the old state of the isolated source page to
				1057	* determine if we migrated a LRU page. newpage was already unlocked
				1058	* and possibly modified by its owner - don't rely on the page
				1059	* state.
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1060	*/
				1061	if (rc == MIGRATEPAGE_SUCCESS) {
David Hildenbrand	e0a352f	2019-02-01 14:21:19 -0800	[diff] [blame]	1062	if (unlikely(!is_lru))
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1063	put_page(newpage);
				1064	else
				1065	putback_lru_page(newpage);
				1066	}
				1067
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1068	return rc;
				1069	}
Christoph Lameter	95a402c	2006-06-23 02:03:53 -0700	[diff] [blame]	1070
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1071	/*
				1072	* Obtain the lock on page, remove all ptes and migrate the page
				1073	* to the newly allocated page in newpage.
				1074	*/
Linus Torvalds	6ec4476	2020-07-08 10:48:35 -0700	[diff] [blame]	1075	static int unmap_and_move(new_page_t get_new_page,
Geert Uytterhoeven	ef2a515	2015-04-14 15:44:22 -0700	[diff] [blame]	1076	free_page_t put_new_page,
				1077	unsigned long private, struct page *page,
Naoya Horiguchi	add05ce	2015-06-24 16:56:50 -0700	[diff] [blame]	1078	int force, enum migrate_mode mode,
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1079	enum migrate_reason reason,
				1080	struct list_head *ret)
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1081	{
Hugh Dickins	2def742	2015-11-05 18:49:46 -0800	[diff] [blame]	1082	int rc = MIGRATEPAGE_SUCCESS;
Yang Shi	74d4a57	2019-11-30 17:57:12 -0800	[diff] [blame]	1083	struct page *newpage = NULL;
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1084
Michal Hocko	94723aa	2018-04-10 16:30:07 -0700	[diff] [blame]	1085	if (!thp_migration_supported() && PageTransHuge(page))
Yang Shi	d532e2e	2020-12-14 19:13:16 -0800	[diff] [blame]	1086	return -ENOSYS;
Michal Hocko	94723aa	2018-04-10 16:30:07 -0700	[diff] [blame]	1087
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1088	if (page_count(page) == 1) {
				1089	/* page was freed from under us. So we are done. */
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1090	ClearPageActive(page);
				1091	ClearPageUnevictable(page);
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	1092	if (unlikely(__PageMovable(page))) {
				1093	lock_page(page);
				1094	if (!PageMovable(page))
				1095	__ClearPageIsolated(page);
				1096	unlock_page(page);
				1097	}
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1098	goto out;
				1099	}
				1100
Yang Shi	74d4a57	2019-11-30 17:57:12 -0800	[diff] [blame]	1101	newpage = get_new_page(page, private);
				1102	if (!newpage)
				1103	return -ENOMEM;
				1104
Hugh Dickins	9c620e2	2013-02-22 16:35:14 -0800	[diff] [blame]	1105	rc = __unmap_and_move(page, newpage, force, mode);
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1106	if (rc == MIGRATEPAGE_SUCCESS)
Vlastimil Babka	7cd12b4	2016-03-15 14:56:18 -0700	[diff] [blame]	1107	set_page_owner_migrate_reason(newpage, reason);
Rafael Aquini	bf6bddf1	2012-12-11 16:02:42 -0800	[diff] [blame]	1108
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1109	out:
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1110	if (rc != -EAGAIN) {
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1111	/*
				1112	* A page that has been migrated has all references
				1113	* removed and will be freed. A page that has not been
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	1114	* migrated will have kept its references and be restored.
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1115	*/
				1116	list_del(&page->lru);
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1117	}
David Rientjes	68711a7	2014-06-04 16:08:25 -0700	[diff] [blame]	1118
Christoph Lameter	95a402c	2006-06-23 02:03:53 -0700	[diff] [blame]	1119	/*
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1120	* If migration is successful, release the reference grabbed during
				1121	* isolation. Otherwise, restore the page to right list unless
				1122	* we want to retry.
Christoph Lameter	95a402c	2006-06-23 02:03:53 -0700	[diff] [blame]	1123	*/
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1124	if (rc == MIGRATEPAGE_SUCCESS) {
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1125	/*
				1126	* Compaction can also migrate non-LRU pages which are
				1127	* not accounted to NR_ISOLATED_*. They can be recognized
				1128	* as __PageMovable
				1129	*/
				1130	if (likely(!__PageMovable(page)))
				1131	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
				1132	page_is_file_lru(page), -thp_nr_pages(page));
				1133
Oscar Salvador	79f5f8f	2020-10-15 20:07:09 -0700	[diff] [blame]	1134	if (reason != MR_MEMORY_FAILURE)
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1135	/*
Oscar Salvador	79f5f8f	2020-10-15 20:07:09 -0700	[diff] [blame]	1136	* We release the page in page_handle_poison.
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1137	*/
Oscar Salvador	79f5f8f	2020-10-15 20:07:09 -0700	[diff] [blame]	1138	put_page(page);
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1139	} else {
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1140	if (rc != -EAGAIN)
				1141	list_add_tail(&page->lru, ret);
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	1142
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1143	if (put_new_page)
				1144	put_new_page(newpage, private);
				1145	else
				1146	put_page(newpage);
				1147	}
David Rientjes	68711a7	2014-06-04 16:08:25 -0700	[diff] [blame]	1148
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1149	return rc;
				1150	}
				1151
				1152	/*
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1153	* Counterpart of unmap_and_move() for hugepage migration.
				1154	*
				1155	* This function doesn't wait for the completion of hugepage I/O
				1156	* because there is no race between I/O and migration for hugepage.
				1157	* Note that currently hugepage I/O occurs only in direct I/O
				1158	* where no lock is held and PG_writeback is irrelevant,
				1159	* and writeback status of all subpages is counted in the reference
				1160	* count of the head page (i.e. if all subpages of a 2MB hugepage are
				1161	* under direct I/O, the reference of the head page is 512 and a bit more.)
				1162	* This means that when we try to migrate a hugepage whose subpages are
				1163	* doing direct I/O, some references remain after try_to_migrate() and
				1164	* hugepage migration fails without data corruption.
				1165	*
				1166	* There is also no race when direct I/O is issued on the page under migration,
				1167	* because then pte is replaced with migration swap entry and direct I/O code
				1168	* will wait in the page fault for migration to complete.
				1169	*/
				1170	static int unmap_and_move_huge_page(new_page_t get_new_page,
David Rientjes	68711a7	2014-06-04 16:08:25 -0700	[diff] [blame]	1171	free_page_t put_new_page, unsigned long private,
				1172	struct page *hpage, int force,
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1173	enum migrate_mode mode, int reason,
				1174	struct list_head *ret)
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1175	{
Hugh Dickins	2def742	2015-11-05 18:49:46 -0800	[diff] [blame]	1176	int rc = -EAGAIN;
Hugh Dickins	2ebba6b	2014-12-12 16:56:19 -0800	[diff] [blame]	1177	int page_was_mapped = 0;
Joonsoo Kim	32665f2	2014-01-21 15:51:15 -0800	[diff] [blame]	1178	struct page *new_hpage;
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1179	struct anon_vma *anon_vma = NULL;
Mike Kravetz	c0d0381	2020-04-01 21:11:05 -0700	[diff] [blame]	1180	struct address_space *mapping = NULL;
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1181
Naoya Horiguchi	83467ef	2013-09-11 14:22:11 -0700	[diff] [blame]	1182	/*
Anshuman Khandual	7ed2c31	2019-03-05 15:43:44 -0800	[diff] [blame]	1183	* Migratability of hugepages depends on architectures and their size.
Naoya Horiguchi	83467ef	2013-09-11 14:22:11 -0700	[diff] [blame]	1184	* This check is necessary because some callers of hugepage migration
				1185	* like soft offline and memory hotremove don't walk through page
				1186	* tables or check whether the hugepage is pmd-based or not before
				1187	* kicking migration.
				1188	*/
Naoya Horiguchi	100873d	2014-06-04 16:10:56 -0700	[diff] [blame]	1189	if (!hugepage_migration_supported(page_hstate(hpage))) {
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1190	list_move_tail(&hpage->lru, ret);
Naoya Horiguchi	83467ef	2013-09-11 14:22:11 -0700	[diff] [blame]	1191	return -ENOSYS;
Joonsoo Kim	32665f2	2014-01-21 15:51:15 -0800	[diff] [blame]	1192	}
Naoya Horiguchi	83467ef	2013-09-11 14:22:11 -0700	[diff] [blame]	1193
Muchun Song	71a64f6	2021-02-04 18:32:17 -0800	[diff] [blame]	1194	if (page_count(hpage) == 1) {
				1195	/* page was freed from under us. So we are done. */
				1196	putback_active_hugepage(hpage);
				1197	return MIGRATEPAGE_SUCCESS;
				1198	}
				1199
Michal Hocko	666feb2	2018-04-10 16:30:03 -0700	[diff] [blame]	1200	new_hpage = get_new_page(hpage, private);
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1201	if (!new_hpage)
				1202	return -ENOMEM;
				1203
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1204	if (!trylock_page(hpage)) {
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	1205	if (!force)
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1206	goto out;
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	1207	switch (mode) {
				1208	case MIGRATE_SYNC:
				1209	case MIGRATE_SYNC_NO_COPY:
				1210	break;
				1211	default:
				1212	goto out;
				1213	}
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1214	lock_page(hpage);
				1215	}
				1216
Mike Kravetz	cb6acd0	2019-02-28 16:22:02 -0800	[diff] [blame]	1217	/*
				1218	* Check for pages which are in the process of being freed. Without
				1219	* page_mapping() set, hugetlbfs specific move page routine will not
				1220	* be called and we could leak usage counts for subpools.
				1221	*/
Muchun Song	6acfb5b	2021-06-30 18:51:29 -0700	[diff] [blame]	1222	if (hugetlb_page_subpool(hpage) && !page_mapping(hpage)) {
Mike Kravetz	cb6acd0	2019-02-28 16:22:02 -0800	[diff] [blame]	1223	rc = -EBUSY;
				1224	goto out_unlock;
				1225	}
				1226
Peter Zijlstra	746b18d	2011-05-24 17:12:10 -0700	[diff] [blame]	1227	if (PageAnon(hpage))
				1228	anon_vma = page_get_anon_vma(hpage);
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1229
Hugh Dickins	7db7671	2015-11-05 18:49:49 -0800	[diff] [blame]	1230	if (unlikely(!trylock_page(new_hpage)))
				1231	goto put_anon;
				1232
Hugh Dickins	2ebba6b	2014-12-12 16:56:19 -0800	[diff] [blame]	1233	if (page_mapped(hpage)) {
Mike Kravetz	336bf30	2020-11-13 22:52:16 -0800	[diff] [blame]	1234	bool mapping_locked = false;
Alistair Popple	a98a2f0	2021-06-30 18:54:16 -0700	[diff] [blame]	1235	enum ttu_flags ttu = 0;
Mike Kravetz	c0d0381	2020-04-01 21:11:05 -0700	[diff] [blame]	1236
Mike Kravetz	336bf30	2020-11-13 22:52:16 -0800	[diff] [blame]	1237	if (!PageAnon(hpage)) {
				1238	/*
				1239	* In shared mappings, try_to_unmap could potentially
				1240	* call huge_pmd_unshare. Because of this, take
				1241	* semaphore in write mode here and set TTU_RMAP_LOCKED
				1242	* to let lower levels know we have taken the lock.
				1243	*/
				1244	mapping = hugetlb_page_mapping_lock_write(hpage);
				1245	if (unlikely(!mapping))
				1246	goto unlock_put_anon;
				1247
				1248	mapping_locked = true;
				1249	ttu |= TTU_RMAP_LOCKED;
				1250	}
				1251
Alistair Popple	a98a2f0	2021-06-30 18:54:16 -0700	[diff] [blame]	1252	try_to_migrate(hpage, ttu);
Hugh Dickins	2ebba6b	2014-12-12 16:56:19 -0800	[diff] [blame]	1253	page_was_mapped = 1;
Mike Kravetz	336bf30	2020-11-13 22:52:16 -0800	[diff] [blame]	1254
				1255	if (mapping_locked)
				1256	i_mmap_unlock_write(mapping);
Hugh Dickins	2ebba6b	2014-12-12 16:56:19 -0800	[diff] [blame]	1257	}
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1258
				1259	if (!page_mapped(hpage))
Hugh Dickins	5c3f9a6	2015-11-05 18:49:53 -0800	[diff] [blame]	1260	rc = move_to_new_page(new_hpage, hpage, mode);
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1261
Mike Kravetz	336bf30	2020-11-13 22:52:16 -0800	[diff] [blame]	1262	if (page_was_mapped)
Hugh Dickins	5c3f9a6	2015-11-05 18:49:53 -0800	[diff] [blame]	1263	remove_migration_ptes(hpage,
Mike Kravetz	336bf30	2020-11-13 22:52:16 -0800	[diff] [blame]	1264	rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1265
Mike Kravetz	c0d0381	2020-04-01 21:11:05 -0700	[diff] [blame]	1266	unlock_put_anon:
Hugh Dickins	7db7671	2015-11-05 18:49:49 -0800	[diff] [blame]	1267	unlock_page(new_hpage);
				1268
				1269	put_anon:
Hugh Dickins	fd4a466	2011-01-13 15:47:31 -0800	[diff] [blame]	1270	if (anon_vma)
Peter Zijlstra	9e60109	2011-03-22 16:32:46 -0700	[diff] [blame]	1271	put_anon_vma(anon_vma);
Aneesh Kumar K.V	8e6ac7f	2012-07-31 16:42:27 -0700	[diff] [blame]	1272
Hugh Dickins	2def742	2015-11-05 18:49:46 -0800	[diff] [blame]	1273	if (rc == MIGRATEPAGE_SUCCESS) {
Michal Hocko	ab5ac90	2018-01-31 16:20:48 -0800	[diff] [blame]	1274	move_hugetlb_state(hpage, new_hpage, reason);
Hugh Dickins	2def742	2015-11-05 18:49:46 -0800	[diff] [blame]	1275	put_new_page = NULL;
				1276	}
Aneesh Kumar K.V	8e6ac7f	2012-07-31 16:42:27 -0700	[diff] [blame]	1277
Mike Kravetz	cb6acd0	2019-02-28 16:22:02 -0800	[diff] [blame]	1278	out_unlock:
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1279	unlock_page(hpage);
Hillf Danton	0976133	2011-12-08 14:34:20 -0800	[diff] [blame]	1280	out:
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1281	if (rc == MIGRATEPAGE_SUCCESS)
Naoya Horiguchi	b8ec1ce	2013-09-11 14:22:01 -0700	[diff] [blame]	1282	putback_active_hugepage(hpage);
Miaohe Lin	a04840c	2021-05-04 18:37:07 -0700	[diff] [blame]	1283	else if (rc != -EAGAIN)
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1284	list_move_tail(&hpage->lru, ret);
David Rientjes	68711a7	2014-06-04 16:08:25 -0700	[diff] [blame]	1285
				1286	/*
				1287	* If migration was not successful and there's a freeing callback, use
				1288	* it. Otherwise, put_page() will drop the reference grabbed during
				1289	* isolation.
				1290	*/
Hugh Dickins	2def742	2015-11-05 18:49:46 -0800	[diff] [blame]	1291	if (put_new_page)
David Rientjes	68711a7	2014-06-04 16:08:25 -0700	[diff] [blame]	1292	put_new_page(new_hpage, private);
				1293	else
Naoya Horiguchi	3aaa76e	2015-09-22 14:59:14 -0700	[diff] [blame]	1294	putback_active_hugepage(new_hpage);
David Rientjes	68711a7	2014-06-04 16:08:25 -0700	[diff] [blame]	1295
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1296	return rc;
				1297	}
				1298
Yang Shi	d532e2e	2020-12-14 19:13:16 -0800	[diff] [blame]	1299	static inline int try_split_thp(struct page *page, struct page **page2,
				1300	struct list_head *from)
				1301	{
				1302	int rc = 0;
				1303
				1304	lock_page(page);
				1305	rc = split_huge_page_to_list(page, from);
				1306	unlock_page(page);
				1307	if (!rc)
				1308	list_safe_reset_next(page, *page2, lru);
				1309
				1310	return rc;
				1311	}
				1312
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	1313	/*
Srivatsa S. Bhat	c73e5c9	2013-04-29 15:08:16 -0700	[diff] [blame]	1314	* migrate_pages - migrate the pages specified in a list, to the free pages
				1315	* supplied as the target for the page migration
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1316	*
Srivatsa S. Bhat	c73e5c9	2013-04-29 15:08:16 -0700	[diff] [blame]	1317	* @from: The list of pages to be migrated.
				1318	* @get_new_page: The function used to allocate free pages to be used
				1319	* as the target of the page migration.
David Rientjes	68711a7	2014-06-04 16:08:25 -0700	[diff] [blame]	1320	* @put_new_page: The function used to free target pages if migration
				1321	* fails, or NULL if no special handling is necessary.
Srivatsa S. Bhat	c73e5c9	2013-04-29 15:08:16 -0700	[diff] [blame]	1322	* @private: Private data to be passed on to get_new_page()
				1323	* @mode: The migration mode that specifies the constraints for
				1324	* page migration, if any.
				1325	* @reason: The reason for page migration.
Baolin Wang	b5bade9	2022-01-14 14:08:34 -0800	[diff] [blame]	1326	* @ret_succeeded: Set to the number of normal pages migrated successfully if
Yang Shi	5ac9588	2021-09-02 14:59:13 -0700	[diff] [blame]	1327	* the caller passes a non-NULL pointer.
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1328	*
Srivatsa S. Bhat	c73e5c9	2013-04-29 15:08:16 -0700	[diff] [blame]	1329	* The function returns after 10 attempts or if no pages are movable any more
				1330	* because the list has become empty or no retryable pages exist any more.
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1331	* It is caller's responsibility to call putback_movable_pages() to return pages
				1332	* to the LRU or free list only if ret != 0.
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1333	*
Baolin Wang	5d39a7e	2022-01-14 14:08:37 -0800	[diff] [blame]	1334	* Returns the number of {normal page, THP, hugetlb} that were not migrated, or
				1335	* an error code. The number of THP splits will be considered as the number of
				1336	* non-migrated THP, no matter how many subpages of the THP are migrated successfully.
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1337	*/
Hugh Dickins	9c620e2	2013-02-22 16:35:14 -0800	[diff] [blame]	1338	int migrate_pages(struct list_head *from, new_page_t get_new_page,
David Rientjes	68711a7	2014-06-04 16:08:25 -0700	[diff] [blame]	1339	free_page_t put_new_page, unsigned long private,
Yang Shi	5ac9588	2021-09-02 14:59:13 -0700	[diff] [blame]	1340	enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1341	{
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1342	int retry = 1;
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1343	int thp_retry = 1;
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1344	int nr_failed = 0;
Baolin Wang	b5bade9	2022-01-14 14:08:34 -0800	[diff] [blame]	1345	int nr_failed_pages = 0;
Mel Gorman	5647bc2	2012-10-19 10:46:20 +0100	[diff] [blame]	1346	int nr_succeeded = 0;
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1347	int nr_thp_succeeded = 0;
				1348	int nr_thp_failed = 0;
				1349	int nr_thp_split = 0;
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1350	int pass = 0;
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1351	bool is_thp = false;
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1352	struct page *page;
				1353	struct page *page2;
				1354	int swapwrite = current->flags & PF_SWAPWRITE;
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1355	int rc, nr_subpages;
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1356	LIST_HEAD(ret_pages);
Baolin Wang	b5bade9	2022-01-14 14:08:34 -0800	[diff] [blame]	1357	LIST_HEAD(thp_split_pages);
Yang Shi	b0b515b	2021-06-30 18:51:48 -0700	[diff] [blame]	1358	bool nosplit = (reason == MR_NUMA_MISPLACED);
Baolin Wang	b5bade9	2022-01-14 14:08:34 -0800	[diff] [blame]	1359	bool no_subpage_counting = false;
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1360
Liam Mark	7bc1aec	2021-05-04 18:37:25 -0700	[diff] [blame]	1361	trace_mm_migrate_pages_start(mode, reason);
				1362
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1363	if (!swapwrite)
				1364	current->flags |= PF_SWAPWRITE;
				1365
Baolin Wang	b5bade9	2022-01-14 14:08:34 -0800	[diff] [blame]	1366	thp_subpage_migration:
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1367	for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1368	retry = 0;
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1369	thp_retry = 0;
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1370
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1371	list_for_each_entry_safe(page, page2, from, lru) {
Michal Hocko	94723aa	2018-04-10 16:30:07 -0700	[diff] [blame]	1372	retry:
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1373	/*
				1374	* THP statistics are based on the source huge page.
				1375	* Capture required information that might get lost
				1376	* during migration.
				1377	*/
Zi Yan	6c5c7b9	2020-09-25 21:19:14 -0700	[diff] [blame]	1378	is_thp = PageTransHuge(page) && !PageHuge(page);
Baolin Wang	5d39a7e	2022-01-14 14:08:37 -0800	[diff] [blame]	1379	nr_subpages = compound_nr(page);
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1380	cond_resched();
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1381
Naoya Horiguchi	31caf66	2013-09-11 14:21:59 -0700	[diff] [blame]	1382	if (PageHuge(page))
				1383	rc = unmap_and_move_huge_page(get_new_page,
David Rientjes	68711a7	2014-06-04 16:08:25 -0700	[diff] [blame]	1384	put_new_page, private, page,
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1385	pass > 2, mode, reason,
				1386	&ret_pages);
Naoya Horiguchi	31caf66	2013-09-11 14:21:59 -0700	[diff] [blame]	1387	else
David Rientjes	68711a7	2014-06-04 16:08:25 -0700	[diff] [blame]	1388	rc = unmap_and_move(get_new_page, put_new_page,
Naoya Horiguchi	add05ce	2015-06-24 16:56:50 -0700	[diff] [blame]	1389	private, page, pass > 2, mode,
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1390	reason, &ret_pages);
				1391	/*
				1392	* The rules are:
				1393	* Success: non hugetlb page will be freed, hugetlb
				1394	* page will be put back
				1395	* -EAGAIN: stay on the from list
				1396	* -ENOMEM: stay on the from list
				1397	* Other errno: put on ret_pages list then splice to
				1398	* from list
				1399	*/
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1400	switch(rc) {
Yang Shi	d532e2e	2020-12-14 19:13:16 -0800	[diff] [blame]	1401	/*
				1402	* THP migration might be unsupported or the
				1403	* allocation could've failed so we should
				1404	* retry on the same page with the THP split
				1405	* to base pages.
				1406	*
				1407	* Head page is retried immediately and tail
				1408	* pages are added to the tail of the list so
				1409	* we encounter them after the rest of the list
				1410	* is processed.
				1411	*/
				1412	case -ENOSYS:
				1413	/* THP migration is unsupported */
				1414	if (is_thp) {
Baolin Wang	b5bade9	2022-01-14 14:08:34 -0800	[diff] [blame]	1415	nr_thp_failed++;
				1416	if (!try_split_thp(page, &page2, &thp_split_pages)) {
Yang Shi	d532e2e	2020-12-14 19:13:16 -0800	[diff] [blame]	1417	nr_thp_split++;
				1418	goto retry;
				1419	}
				1420
Baolin Wang	b5bade9	2022-01-14 14:08:34 -0800	[diff] [blame]	1421	nr_failed_pages += nr_subpages;
Yang Shi	d532e2e	2020-12-14 19:13:16 -0800	[diff] [blame]	1422	break;
				1423	}
				1424
				1425	/* Hugetlb migration is unsupported */
Baolin Wang	b5bade9	2022-01-14 14:08:34 -0800	[diff] [blame]	1426	if (!no_subpage_counting)
				1427	nr_failed++;
Baolin Wang	5d39a7e	2022-01-14 14:08:37 -0800	[diff] [blame]	1428	nr_failed_pages += nr_subpages;
Yang Shi	d532e2e	2020-12-14 19:13:16 -0800	[diff] [blame]	1429	break;
Christoph Lameter	95a402c	2006-06-23 02:03:53 -0700	[diff] [blame]	1430	case -ENOMEM:
Michal Hocko	94723aa	2018-04-10 16:30:07 -0700	[diff] [blame]	1431	/*
Yang Shi	d532e2e	2020-12-14 19:13:16 -0800	[diff] [blame]	1432	* When memory is low, don't bother to try to migrate
				1433	* other pages, just exit.
Yang Shi	b0b515b	2021-06-30 18:51:48 -0700	[diff] [blame]	1434	* THP NUMA faulting doesn't split THP to retry.
Michal Hocko	94723aa	2018-04-10 16:30:07 -0700	[diff] [blame]	1435	*/
Yang Shi	b0b515b	2021-06-30 18:51:48 -0700	[diff] [blame]	1436	if (is_thp && !nosplit) {
Baolin Wang	b5bade9	2022-01-14 14:08:34 -0800	[diff] [blame]	1437	nr_thp_failed++;
				1438	if (!try_split_thp(page, &page2, &thp_split_pages)) {
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1439	nr_thp_split++;
Michal Hocko	94723aa	2018-04-10 16:30:07 -0700	[diff] [blame]	1440	goto retry;
				1441	}
Zi Yan	6c5c7b9	2020-09-25 21:19:14 -0700	[diff] [blame]	1442
Baolin Wang	b5bade9	2022-01-14 14:08:34 -0800	[diff] [blame]	1443	nr_failed_pages += nr_subpages;
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1444	goto out;
				1445	}
Baolin Wang	b5bade9	2022-01-14 14:08:34 -0800	[diff] [blame]	1446
				1447	if (!no_subpage_counting)
				1448	nr_failed++;
Baolin Wang	5d39a7e	2022-01-14 14:08:37 -0800	[diff] [blame]	1449	nr_failed_pages += nr_subpages;
Christoph Lameter	95a402c	2006-06-23 02:03:53 -0700	[diff] [blame]	1450	goto out;
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1451	case -EAGAIN:
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1452	if (is_thp) {
				1453	thp_retry++;
				1454	break;
				1455	}
Christoph Lameter	2d1db3b	2006-06-23 02:03:33 -0700	[diff] [blame]	1456	retry++;
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1457	break;
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	1458	case MIGRATEPAGE_SUCCESS:
Baolin Wang	5d39a7e	2022-01-14 14:08:37 -0800	[diff] [blame]	1459	nr_succeeded += nr_subpages;
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1460	if (is_thp) {
				1461	nr_thp_succeeded++;
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1462	break;
				1463	}
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1464	break;
				1465	default:
Naoya Horiguchi	354a336	2014-01-21 15:51:14 -0800	[diff] [blame]	1466	/*
Yang Shi	d532e2e	2020-12-14 19:13:16 -0800	[diff] [blame]	1467	* Permanent failure (-EBUSY, etc.):
Naoya Horiguchi	354a336	2014-01-21 15:51:14 -0800	[diff] [blame]	1468	* unlike -EAGAIN case, the failed page is
				1469	* removed from migration page list and not
				1470	* retried in the next outer loop.
				1471	*/
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1472	if (is_thp) {
				1473	nr_thp_failed++;
Baolin Wang	b5bade9	2022-01-14 14:08:34 -0800	[diff] [blame]	1474	nr_failed_pages += nr_subpages;
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1475	break;
				1476	}
Baolin Wang	b5bade9	2022-01-14 14:08:34 -0800	[diff] [blame]	1477
				1478	if (!no_subpage_counting)
				1479	nr_failed++;
Baolin Wang	5d39a7e	2022-01-14 14:08:37 -0800	[diff] [blame]	1480	nr_failed_pages += nr_subpages;
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1481	break;
Christoph Lameter	2d1db3b	2006-06-23 02:03:33 -0700	[diff] [blame]	1482	}
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1483	}
				1484	}
Baolin Wang	b5bade9	2022-01-14 14:08:34 -0800	[diff] [blame]	1485	nr_failed += retry;
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1486	nr_thp_failed += thp_retry;
Baolin Wang	b5bade9	2022-01-14 14:08:34 -0800	[diff] [blame]	1487	/*
				1488	* Try to migrate subpages of fail-to-migrate THPs, no nr_failed
				1489	* counting in this round, since all subpages of a THP are counted
				1490	* as 1 failure in the first round.
				1491	*/
				1492	if (!list_empty(&thp_split_pages)) {
				1493	/*
				1494	* Move non-migrated pages (after 10 retries) to ret_pages
				1495	* to avoid migrating them again.
				1496	*/
				1497	list_splice_init(from, &ret_pages);
				1498	list_splice_init(&thp_split_pages, from);
				1499	no_subpage_counting = true;
				1500	retry = 1;
				1501	goto thp_subpage_migration;
				1502	}
				1503
				1504	rc = nr_failed + nr_thp_failed;
Christoph Lameter	95a402c	2006-06-23 02:03:53 -0700	[diff] [blame]	1505	out:
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1506	/*
				1507	* Put the permanently failed pages back on the migration list; they
				1508	* will be put back to the right list by the caller.
				1509	*/
				1510	list_splice(&ret_pages, from);
				1511
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1512	count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
Baolin Wang	b5bade9	2022-01-14 14:08:34 -0800	[diff] [blame]	1513	count_vm_events(PGMIGRATE_FAIL, nr_failed_pages);
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1514	count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
				1515	count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
				1516	count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
Baolin Wang	b5bade9	2022-01-14 14:08:34 -0800	[diff] [blame]	1517	trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded,
Anshuman Khandual	1a5bae2	2020-08-11 18:31:51 -0700	[diff] [blame]	1518	nr_thp_failed, nr_thp_split, mode, reason);
Mel Gorman	7b2a2d4	2012-10-19 14:07:31 +0100	[diff] [blame]	1519
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1520	if (!swapwrite)
				1521	current->flags &= ~PF_SWAPWRITE;
				1522
Yang Shi	5ac9588	2021-09-02 14:59:13 -0700	[diff] [blame]	1523	if (ret_succeeded)
				1524	*ret_succeeded = nr_succeeded;
				1525
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	1526	return rc;
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	1527	}
				1528
Joonsoo Kim	19fc7be	2020-08-11 18:37:25 -0700	[diff] [blame]	1529	struct page *alloc_migration_target(struct page *page, unsigned long private)
Joonsoo Kim	b4b3822	2020-08-11 18:37:14 -0700	[diff] [blame]	1530	{
Joonsoo Kim	19fc7be	2020-08-11 18:37:25 -0700	[diff] [blame]	1531	struct migration_target_control *mtc;
				1532	gfp_t gfp_mask;
Joonsoo Kim	b4b3822	2020-08-11 18:37:14 -0700	[diff] [blame]	1533	unsigned int order = 0;
				1534	struct page *new_page = NULL;
Joonsoo Kim	19fc7be	2020-08-11 18:37:25 -0700	[diff] [blame]	1535	int nid;
				1536	int zidx;
				1537
				1538	mtc = (struct migration_target_control *)private;
				1539	gfp_mask = mtc->gfp_mask;
				1540	nid = mtc->nid;
				1541	if (nid == NUMA_NO_NODE)
				1542	nid = page_to_nid(page);
Joonsoo Kim	b4b3822	2020-08-11 18:37:14 -0700	[diff] [blame]	1543
Joonsoo Kim	d92bbc2	2020-08-11 18:37:17 -0700	[diff] [blame]	1544	if (PageHuge(page)) {
				1545	struct hstate *h = page_hstate(compound_head(page));
				1546
Joonsoo Kim	19fc7be	2020-08-11 18:37:25 -0700	[diff] [blame]	1547	gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
				1548	return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
Joonsoo Kim	d92bbc2	2020-08-11 18:37:17 -0700	[diff] [blame]	1549	}
Joonsoo Kim	b4b3822	2020-08-11 18:37:14 -0700	[diff] [blame]	1550
				1551	if (PageTransHuge(page)) {
Joonsoo Kim	9933a0c	2020-08-11 18:37:20 -0700	[diff] [blame]	1552	/*
				1553	* clear __GFP_RECLAIM to make the migration callback
				1554	* consistent with regular THP allocations.
				1555	*/
				1556	gfp_mask &= ~__GFP_RECLAIM;
Joonsoo Kim	b4b3822	2020-08-11 18:37:14 -0700	[diff] [blame]	1557	gfp_mask |= GFP_TRANSHUGE;
				1558	order = HPAGE_PMD_ORDER;
				1559	}
Joonsoo Kim	19fc7be	2020-08-11 18:37:25 -0700	[diff] [blame]	1560	zidx = zone_idx(page_zone(page));
				1561	if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
Joonsoo Kim	b4b3822	2020-08-11 18:37:14 -0700	[diff] [blame]	1562	gfp_mask |= __GFP_HIGHMEM;
				1563
Matthew Wilcox (Oracle)	84172f4	2021-04-29 23:01:15 -0700	[diff] [blame]	1564	new_page = __alloc_pages(gfp_mask, order, nid, mtc->nmask);
Joonsoo Kim	b4b3822	2020-08-11 18:37:14 -0700	[diff] [blame]	1565
				1566	if (new_page && PageTransHuge(new_page))
				1567	prep_transhuge_page(new_page);
				1568
				1569	return new_page;
				1570	}
				1571
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1572	#ifdef CONFIG_NUMA
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1573
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1574	static int store_status(int __user *status, int start, int value, int nr)
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1575	{
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1576	while (nr-- > 0) {
				1577	if (put_user(value, status + start))
				1578	return -EFAULT;
				1579	start++;
				1580	}
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1581
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1582	return 0;
				1583	}
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1584
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1585	static int do_move_pages_to_node(struct mm_struct *mm,
				1586	struct list_head *pagelist, int node)
				1587	{
				1588	int err;
Joonsoo Kim	a097631	2020-08-11 18:37:28 -0700	[diff] [blame]	1589	struct migration_target_control mtc = {
				1590	.nid = node,
				1591	.gfp_mask = GFP_HIGHUSER_MOVABLE \| __GFP_THISNODE,
				1592	};
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1593
Joonsoo Kim	a097631	2020-08-11 18:37:28 -0700	[diff] [blame]	1594	err = migrate_pages(pagelist, alloc_migration_target, NULL,
Yang Shi	5ac9588	2021-09-02 14:59:13 -0700	[diff] [blame]	1595	(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1596	if (err)
				1597	putback_movable_pages(pagelist);
				1598	return err;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1599	}
				1600
				1601	/*
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1602	* Resolves the given address to a struct page, isolates it from the LRU and
				1603	* puts it to the given pagelist.
Yang Shi	e0153fc	2020-01-04 12:59:46 -0800	[diff] [blame]	1604	* Returns:
				1605	* errno - if the page cannot be found/isolated
				1606	* 0 - when it doesn't have to be migrated because it is already on the
				1607	* target node
				1608	* 1 - when it has been queued
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1609	*/
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1610	static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
				1611	int node, struct list_head *pagelist, bool migrate_all)
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1612	{
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1613	struct vm_area_struct *vma;
				1614	struct page *page;
				1615	unsigned int follflags;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1616	int err;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1617
Michel Lespinasse	d8ed45c	2020-06-08 21:33:25 -0700	[diff] [blame]	1618	mmap_read_lock(mm);
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1619	err = -EFAULT;
				1620	vma = find_vma(mm, addr);
				1621	if (!vma \|\| addr < vma->vm_start \|\| !vma_migratable(vma))
				1622	goto out;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1623
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1624	/* FOLL_DUMP to ignore special (like zero) pages */
				1625	follflags = FOLL_GET \| FOLL_DUMP;
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1626	page = follow_page(vma, addr, follflags);
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1627
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1628	err = PTR_ERR(page);
				1629	if (IS_ERR(page))
				1630	goto out;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1631
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1632	err = -ENOENT;
				1633	if (!page)
				1634	goto out;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1635
Brice Goglin	e78bbfa	2008-10-18 20:27:15 -0700	[diff] [blame]	1636	err = 0;
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1637	if (page_to_nid(page) == node)
				1638	goto out_putpage;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1639
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1640	err = -EACCES;
				1641	if (page_mapcount(page) > 1 && !migrate_all)
				1642	goto out_putpage;
				1643
				1644	if (PageHuge(page)) {
				1645	if (PageHead(page)) {
				1646	isolate_huge_page(page, pagelist);
Yang Shi	e0153fc	2020-01-04 12:59:46 -0800	[diff] [blame]	1647	err = 1;
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1648	}
				1649	} else {
				1650	struct page *head;
				1651
				1652	head = compound_head(page);
				1653	err = isolate_lru_page(head);
				1654	if (err)
				1655	goto out_putpage;
				1656
Yang Shi	e0153fc	2020-01-04 12:59:46 -0800	[diff] [blame]	1657	err = 1;
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1658	list_add_tail(&head->lru, pagelist);
				1659	mod_node_page_state(page_pgdat(head),
Huang Ying	9de4f22	2020-04-06 20:04:41 -0700	[diff] [blame]	1660	NR_ISOLATED_ANON + page_is_file_lru(head),
Matthew Wilcox (Oracle)	6c35784	2020-08-14 17:30:37 -0700	[diff] [blame]	1661	thp_nr_pages(head));
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1662	}
				1663	out_putpage:
				1664	/*
				1665	* Either remove the duplicate refcount from
				1666	* isolate_lru_page() or drop the page ref if it was
				1667	* not isolated.
				1668	*/
				1669	put_page(page);
				1670	out:
Michel Lespinasse	d8ed45c	2020-06-08 21:33:25 -0700	[diff] [blame]	1671	mmap_read_unlock(mm);
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1672	return err;
				1673	}
				1674
Wei Yang	7ca8783	2020-04-06 20:04:12 -0700	[diff] [blame]	1675	static int move_pages_and_store_status(struct mm_struct *mm, int node,
				1676	struct list_head pagelist, int __user status,
				1677	int start, int i, unsigned long nr_pages)
				1678	{
				1679	int err;
				1680
Wei Yang	5d7ae89	2020-04-06 20:04:15 -0700	[diff] [blame]	1681	if (list_empty(pagelist))
				1682	return 0;
				1683
Wei Yang	7ca8783	2020-04-06 20:04:12 -0700	[diff] [blame]	1684	err = do_move_pages_to_node(mm, pagelist, node);
				1685	if (err) {
				1686	/*
				1687	* Positive err means the number of failed
				1688	* pages to migrate. Since we are going to
				1689	* abort and return the number of non-migrated
Long Li	ab9dd4f	2020-12-14 19:12:52 -0800	[diff] [blame]	1690	* pages, so need to include the rest of the
Wei Yang	7ca8783	2020-04-06 20:04:12 -0700	[diff] [blame]	1691	* nr_pages that have not been attempted as
				1692	* well.
				1693	*/
				1694	if (err > 0)
				1695	err += nr_pages - i - 1;
				1696	return err;
				1697	}
				1698	return store_status(status, start, node, i - start);
				1699	}
				1700
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1701	/*
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1702	* Migrate an array of page address onto an array of nodes and fill
				1703	* the corresponding array of status.
				1704	*/
Christoph Lameter	3268c63	2012-03-21 16:34:06 -0700	[diff] [blame]	1705	static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1706	unsigned long nr_pages,
				1707	const void __user * __user *pages,
				1708	const int __user *nodes,
				1709	int __user *status, int flags)
				1710	{
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1711	int current_node = NUMA_NO_NODE;
				1712	LIST_HEAD(pagelist);
				1713	int start, i;
				1714	int err = 0, err1;
Brice Goglin	35282a2	2009-06-16 15:32:43 -0700	[diff] [blame]	1715
Minchan Kim	361a2a2	2021-05-04 18:36:57 -0700	[diff] [blame]	1716	lru_cache_disable();
Brice Goglin	35282a2	2009-06-16 15:32:43 -0700	[diff] [blame]	1717
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1718	for (i = start = 0; i < nr_pages; i++) {
				1719	const void __user *p;
				1720	unsigned long addr;
				1721	int node;
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1722
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1723	err = -EFAULT;
				1724	if (get_user(p, pages + i))
				1725	goto out_flush;
				1726	if (get_user(node, nodes + i))
				1727	goto out_flush;
Andrey Konovalov	057d3389	2019-09-25 16:48:30 -0700	[diff] [blame]	1728	addr = (unsigned long)untagged_addr(p);
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1729
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1730	err = -ENODEV;
				1731	if (node < 0 \|\| node >= MAX_NUMNODES)
				1732	goto out_flush;
				1733	if (!node_state(node, N_MEMORY))
				1734	goto out_flush;
Brice Goglin	3140a22	2009-01-06 14:38:57 -0800	[diff] [blame]	1735
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1736	err = -EACCES;
				1737	if (!node_isset(node, task_nodes))
				1738	goto out_flush;
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1739
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1740	if (current_node == NUMA_NO_NODE) {
				1741	current_node = node;
				1742	start = i;
				1743	} else if (node != current_node) {
Wei Yang	7ca8783	2020-04-06 20:04:12 -0700	[diff] [blame]	1744	err = move_pages_and_store_status(mm, current_node,
				1745	&pagelist, status, start, i, nr_pages);
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1746	if (err)
				1747	goto out;
				1748	start = i;
				1749	current_node = node;
Brice Goglin	3140a22	2009-01-06 14:38:57 -0800	[diff] [blame]	1750	}
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1751
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1752	/*
				1753	* Errors in the page lookup or isolation are not fatal and we simply
				1754	* report them via status
				1755	*/
				1756	err = add_page_for_migration(mm, addr, current_node,
				1757	&pagelist, flags & MPOL_MF_MOVE_ALL);
Yang Shi	e0153fc	2020-01-04 12:59:46 -0800	[diff] [blame]	1758
Wei Yang	d08221a	2020-04-06 20:04:18 -0700	[diff] [blame]	1759	if (err > 0) {
Yang Shi	e0153fc	2020-01-04 12:59:46 -0800	[diff] [blame]	1760	/* The page is successfully queued for migration */
				1761	continue;
				1762	}
Brice Goglin	3140a22	2009-01-06 14:38:57 -0800	[diff] [blame]	1763
Wei Yang	d08221a	2020-04-06 20:04:18 -0700	[diff] [blame]	1764	/*
				1765	* If the page is already on the target node (!err), store the
				1766	* node, otherwise, store the err.
				1767	*/
				1768	err = store_status(status, i, err ? : current_node, 1);
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1769	if (err)
				1770	goto out_flush;
Brice Goglin	3140a22	2009-01-06 14:38:57 -0800	[diff] [blame]	1771
Wei Yang	7ca8783	2020-04-06 20:04:12 -0700	[diff] [blame]	1772	err = move_pages_and_store_status(mm, current_node, &pagelist,
				1773	status, start, i, nr_pages);
Wei Yang	4afdace	2020-04-06 20:04:09 -0700	[diff] [blame]	1774	if (err)
				1775	goto out;
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1776	current_node = NUMA_NO_NODE;
Brice Goglin	3140a22	2009-01-06 14:38:57 -0800	[diff] [blame]	1777	}
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1778	out_flush:
				1779	/* Make sure we do not overwrite the existing error */
Wei Yang	7ca8783	2020-04-06 20:04:12 -0700	[diff] [blame]	1780	err1 = move_pages_and_store_status(mm, current_node, &pagelist,
				1781	status, start, i, nr_pages);
Wei Yang	dfe9aa2	2020-01-30 22:11:14 -0800	[diff] [blame]	1782	if (err >= 0)
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1783	err = err1;
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1784	out:
Minchan Kim	361a2a2	2021-05-04 18:36:57 -0700	[diff] [blame]	1785	lru_cache_enable();
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1786	return err;
				1787	}
				1788
				1789	/*
Brice Goglin	2f007e7	2008-10-18 20:27:16 -0700	[diff] [blame]	1790	* Determine the nodes of an array of pages and store it in an array of status.
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1791	*/
Brice Goglin	80bba12	2008-12-09 13:14:23 -0800	[diff] [blame]	1792	static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
				1793	const void __user *pages, int status)
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1794	{
Brice Goglin	2f007e7	2008-10-18 20:27:16 -0700	[diff] [blame]	1795	unsigned long i;
Brice Goglin	2f007e7	2008-10-18 20:27:16 -0700	[diff] [blame]	1796
Michel Lespinasse	d8ed45c	2020-06-08 21:33:25 -0700	[diff] [blame]	1797	mmap_read_lock(mm);
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1798
Brice Goglin	2f007e7	2008-10-18 20:27:16 -0700	[diff] [blame]	1799	for (i = 0; i < nr_pages; i++) {
Brice Goglin	80bba12	2008-12-09 13:14:23 -0800	[diff] [blame]	1800	unsigned long addr = (unsigned long)(*pages);
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1801	struct vm_area_struct *vma;
				1802	struct page *page;
KOSAKI Motohiro	c095adb	2008-12-16 16:06:43 +0900	[diff] [blame]	1803	int err = -EFAULT;
Brice Goglin	2f007e7	2008-10-18 20:27:16 -0700	[diff] [blame]	1804
Liam Howlett	059b8b4	2021-06-28 19:39:44 -0700	[diff] [blame]	1805	vma = vma_lookup(mm, addr);
				1806	if (!vma)
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1807	goto set_status;
				1808
Kirill A. Shutemov	d899844	2015-09-04 15:47:53 -0700	[diff] [blame]	1809	/* FOLL_DUMP to ignore special (like zero) pages */
				1810	page = follow_page(vma, addr, FOLL_DUMP);
Linus Torvalds	89f5b7d	2008-06-20 11:18:25 -0700	[diff] [blame]	1811
				1812	err = PTR_ERR(page);
				1813	if (IS_ERR(page))
				1814	goto set_status;
				1815
Kirill A. Shutemov	d899844	2015-09-04 15:47:53 -0700	[diff] [blame]	1816	err = page ? page_to_nid(page) : -ENOENT;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1817	set_status:
Brice Goglin	80bba12	2008-12-09 13:14:23 -0800	[diff] [blame]	1818	*status = err;
				1819
				1820	pages++;
				1821	status++;
				1822	}
				1823
Michel Lespinasse	d8ed45c	2020-06-08 21:33:25 -0700	[diff] [blame]	1824	mmap_read_unlock(mm);
Brice Goglin	80bba12	2008-12-09 13:14:23 -0800	[diff] [blame]	1825	}
				1826
Arnd Bergmann	5b1b561	2021-09-08 15:18:17 -0700	[diff] [blame]	1827	static int get_compat_pages_array(const void __user *chunk_pages[],
				1828	const void __user * __user *pages,
				1829	unsigned long chunk_nr)
				1830	{
				1831	compat_uptr_t __user pages32 = (compat_uptr_t __user )pages;
				1832	compat_uptr_t p;
				1833	int i;
				1834
				1835	for (i = 0; i < chunk_nr; i++) {
				1836	if (get_user(p, pages32 + i))
				1837	return -EFAULT;
				1838	chunk_pages[i] = compat_ptr(p);
				1839	}
				1840
				1841	return 0;
				1842	}
				1843
Brice Goglin	80bba12	2008-12-09 13:14:23 -0800	[diff] [blame]	1844	/*
				1845	* Determine the nodes of a user array of pages and store it in
				1846	* a user array of status.
				1847	*/
				1848	static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
				1849	const void __user * __user *pages,
				1850	int __user *status)
				1851	{
				1852	#define DO_PAGES_STAT_CHUNK_NR 16
				1853	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
				1854	int chunk_status[DO_PAGES_STAT_CHUNK_NR];
Brice Goglin	80bba12	2008-12-09 13:14:23 -0800	[diff] [blame]	1855
H. Peter Anvin	87b8d1a	2010-02-18 16:13:40 -0800	[diff] [blame]	1856	while (nr_pages) {
				1857	unsigned long chunk_nr;
Brice Goglin	80bba12	2008-12-09 13:14:23 -0800	[diff] [blame]	1858
H. Peter Anvin	87b8d1a	2010-02-18 16:13:40 -0800	[diff] [blame]	1859	chunk_nr = nr_pages;
				1860	if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
				1861	chunk_nr = DO_PAGES_STAT_CHUNK_NR;
				1862
Arnd Bergmann	5b1b561	2021-09-08 15:18:17 -0700	[diff] [blame]	1863	if (in_compat_syscall()) {
				1864	if (get_compat_pages_array(chunk_pages, pages,
				1865	chunk_nr))
				1866	break;
				1867	} else {
				1868	if (copy_from_user(chunk_pages, pages,
				1869	chunk_nr * sizeof(*chunk_pages)))
				1870	break;
				1871	}
Brice Goglin	80bba12	2008-12-09 13:14:23 -0800	[diff] [blame]	1872
				1873	do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
				1874
H. Peter Anvin	87b8d1a	2010-02-18 16:13:40 -0800	[diff] [blame]	1875	if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
				1876	break;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1877
H. Peter Anvin	87b8d1a	2010-02-18 16:13:40 -0800	[diff] [blame]	1878	pages += chunk_nr;
				1879	status += chunk_nr;
				1880	nr_pages -= chunk_nr;
				1881	}
				1882	return nr_pages ? -EFAULT : 0;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1883	}
				1884
Miaohe Lin	4dc200c	2020-10-17 16:14:03 -0700	[diff] [blame]	1885	static struct mm_struct find_mm_struct(pid_t pid, nodemask_t mem_nodes)
				1886	{
				1887	struct task_struct *task;
				1888	struct mm_struct *mm;
				1889
				1890	/*
				1891	* There is no need to check if current process has the right to modify
				1892	* the specified process when they are same.
				1893	*/
				1894	if (!pid) {
				1895	mmget(current->mm);
				1896	*mem_nodes = cpuset_mems_allowed(current);
				1897	return current->mm;
				1898	}
				1899
				1900	/* Find the mm_struct */
				1901	rcu_read_lock();
				1902	task = find_task_by_vpid(pid);
				1903	if (!task) {
				1904	rcu_read_unlock();
				1905	return ERR_PTR(-ESRCH);
				1906	}
				1907	get_task_struct(task);
				1908
				1909	/*
				1910	* Check if this process has the right to modify the specified
				1911	* process. Use the regular "ptrace_may_access()" checks.
				1912	*/
				1913	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
				1914	rcu_read_unlock();
				1915	mm = ERR_PTR(-EPERM);
				1916	goto out;
				1917	}
				1918	rcu_read_unlock();
				1919
				1920	mm = ERR_PTR(security_task_movememory(task));
				1921	if (IS_ERR(mm))
				1922	goto out;
				1923	*mem_nodes = cpuset_mems_allowed(task);
				1924	mm = get_task_mm(task);
				1925	out:
				1926	put_task_struct(task);
				1927	if (!mm)
				1928	mm = ERR_PTR(-EINVAL);
				1929	return mm;
				1930	}
				1931
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1932	/*
				1933	* Move a list of pages in the address space of the currently executing
				1934	* process.
				1935	*/
Dominik Brodowski	7addf44	2018-03-17 16:08:03 +0100	[diff] [blame]	1936	static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
				1937	const void __user * __user *pages,
				1938	const int __user *nodes,
				1939	int __user *status, int flags)
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1940	{
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1941	struct mm_struct *mm;
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1942	int err;
Christoph Lameter	3268c63	2012-03-21 16:34:06 -0700	[diff] [blame]	1943	nodemask_t task_nodes;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1944
				1945	/* Check flags */
				1946	if (flags & ~(MPOL_MF_MOVE\|MPOL_MF_MOVE_ALL))
				1947	return -EINVAL;
				1948
				1949	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
				1950	return -EPERM;
				1951
Miaohe Lin	4dc200c	2020-10-17 16:14:03 -0700	[diff] [blame]	1952	mm = find_mm_struct(pid, &task_nodes);
				1953	if (IS_ERR(mm))
				1954	return PTR_ERR(mm);
Sasha Levin	6e8b09e	2012-04-25 16:01:53 -0700	[diff] [blame]	1955
				1956	if (nodes)
				1957	err = do_pages_move(mm, task_nodes, nr_pages, pages,
				1958	nodes, status, flags);
				1959	else
				1960	err = do_pages_stat(mm, nr_pages, pages, status);
Christoph Lameter	3268c63	2012-03-21 16:34:06 -0700	[diff] [blame]	1961
				1962	mmput(mm);
				1963	return err;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1964	}
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1965
Dominik Brodowski	7addf44	2018-03-17 16:08:03 +0100	[diff] [blame]	1966	SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
				1967	const void __user * __user *, pages,
				1968	const int __user *, nodes,
				1969	int __user *, status, int, flags)
				1970	{
				1971	return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
				1972	}
				1973
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	1974	#ifdef CONFIG_NUMA_BALANCING
				1975	/*
				1976	* Returns true if this is a safe migration target node for misplaced NUMA
				1977	* pages. Currently it only checks the watermarks which crude
				1978	*/
				1979	static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
Mel Gorman	3abef4e	2013-02-22 16:34:27 -0800	[diff] [blame]	1980	unsigned long nr_migrate_pages)
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	1981	{
				1982	int z;
Mel Gorman	599d0c9	2016-07-28 15:45:31 -0700	[diff] [blame]	1983
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	1984	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
				1985	struct zone *zone = pgdat->node_zones + z;
				1986
				1987	if (!populated_zone(zone))
				1988	continue;
				1989
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	1990	/* Avoid waking kswapd by allocating pages_to_migrate pages. */
				1991	if (!zone_watermark_ok(zone, 0,
				1992	high_wmark_pages(zone) +
				1993	nr_migrate_pages,
Huang Ying	bfe9d00	2019-11-30 17:57:28 -0800	[diff] [blame]	1994	ZONE_MOVABLE, 0))
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	1995	continue;
				1996	return true;
				1997	}
				1998	return false;
				1999	}
				2000
				2001	static struct page alloc_misplaced_dst_page(struct page page,
Michal Hocko	666feb2	2018-04-10 16:30:03 -0700	[diff] [blame]	2002	unsigned long data)
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2003	{
				2004	int nid = (int) data;
				2005	struct page *newpage;
				2006
Vlastimil Babka	96db800	2015-09-08 15:03:50 -0700	[diff] [blame]	2007	newpage = __alloc_pages_node(nid,
Johannes Weiner	e97ca8e5	2014-03-10 15:49:43 -0700	[diff] [blame]	2008	(GFP_HIGHUSER_MOVABLE \|
				2009	__GFP_THISNODE \| __GFP_NOMEMALLOC \|
				2010	__GFP_NORETRY \| __GFP_NOWARN) &
Mel Gorman	8479eba	2016-02-26 15:19:31 -0800	[diff] [blame]	2011	~__GFP_RECLAIM, 0);
Hillf Danton	bac0382	2012-11-27 14:46:24 +0000	[diff] [blame]	2012
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2013	return newpage;
				2014	}
				2015
Yang Shi	c5b5a3d	2021-06-30 18:51:42 -0700	[diff] [blame]	2016	static struct page alloc_misplaced_dst_page_thp(struct page page,
				2017	unsigned long data)
				2018	{
				2019	int nid = (int) data;
				2020	struct page *newpage;
				2021
				2022	newpage = alloc_pages_node(nid, (GFP_TRANSHUGE_LIGHT \| __GFP_THISNODE),
				2023	HPAGE_PMD_ORDER);
				2024	if (!newpage)
				2025	goto out;
				2026
				2027	prep_transhuge_page(newpage);
				2028
				2029	out:
				2030	return newpage;
				2031	}
				2032
Mel Gorman	1c30e01	2014-01-21 15:50:58 -0800	[diff] [blame]	2033	static int numamigrate_isolate_page(pg_data_t pgdat, struct page page)
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2034	{
Hugh Dickins	340ef39	2013-02-22 16:34:33 -0800	[diff] [blame]	2035	int page_lru;
Baolin Wang	2b9b624	2021-09-08 15:18:01 -0700	[diff] [blame]	2036	int nr_pages = thp_nr_pages(page);
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2037
Sasha Levin	309381fea	2014-01-23 15:52:54 -0800	[diff] [blame]	2038	VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
Mel Gorman	3abef4e	2013-02-22 16:34:27 -0800	[diff] [blame]	2039
Yang Shi	662aeea	2021-06-30 18:51:51 -0700	[diff] [blame]	2040	/* Do not migrate THP mapped by multiple processes */
				2041	if (PageTransHuge(page) && total_mapcount(page) > 1)
				2042	return 0;
				2043
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2044	/* Avoid migrating to a node that is nearly full */
Baolin Wang	2b9b624	2021-09-08 15:18:01 -0700	[diff] [blame]	2045	if (!migrate_balanced_pgdat(pgdat, nr_pages))
Hugh Dickins	340ef39	2013-02-22 16:34:33 -0800	[diff] [blame]	2046	return 0;
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2047
Hugh Dickins	340ef39	2013-02-22 16:34:33 -0800	[diff] [blame]	2048	if (isolate_lru_page(page))
				2049	return 0;
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2050
Huang Ying	9de4f22	2020-04-06 20:04:41 -0700	[diff] [blame]	2051	page_lru = page_is_file_lru(page);
Mel Gorman	599d0c9	2016-07-28 15:45:31 -0700	[diff] [blame]	2052	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
Baolin Wang	2b9b624	2021-09-08 15:18:01 -0700	[diff] [blame]	2053	nr_pages);
Hugh Dickins	340ef39	2013-02-22 16:34:33 -0800	[diff] [blame]	2054
				2055	/*
				2056	* Isolating the page has taken another reference, so the
				2057	* caller's reference can be safely dropped without the page
				2058	* disappearing underneath us during migration.
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2059	*/
				2060	put_page(page);
Hugh Dickins	340ef39	2013-02-22 16:34:33 -0800	[diff] [blame]	2061	return 1;
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2062	}
				2063
Mel Gorman	a8f6077	2012-11-14 21:41:46 +0000	[diff] [blame]	2064	/*
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2065	* Attempt to migrate a misplaced page to the specified destination
				2066	* node. Caller is expected to have an elevated reference count on
				2067	* the page that will be dropped by this function before returning.
				2068	*/
Mel Gorman	1bc115d	2013-10-07 11:29:05 +0100	[diff] [blame]	2069	int migrate_misplaced_page(struct page page, struct vm_area_struct vma,
				2070	int node)
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2071	{
Mel Gorman	a8f6077	2012-11-14 21:41:46 +0000	[diff] [blame]	2072	pg_data_t *pgdat = NODE_DATA(node);
Hugh Dickins	340ef39	2013-02-22 16:34:33 -0800	[diff] [blame]	2073	int isolated;
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2074	int nr_remaining;
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2075	LIST_HEAD(migratepages);
Yang Shi	c5b5a3d	2021-06-30 18:51:42 -0700	[diff] [blame]	2076	new_page_t *new;
				2077	bool compound;
Aneesh Kumar K.V	b5916c0	2021-07-29 14:53:47 -0700	[diff] [blame]	2078	int nr_pages = thp_nr_pages(page);
Yang Shi	c5b5a3d	2021-06-30 18:51:42 -0700	[diff] [blame]	2079
				2080	/*
				2081	* PTE mapped THP or HugeTLB page can't reach here so the page could
				2082	* be either base page or THP. And it must be head page if it is
				2083	* THP.
				2084	*/
				2085	compound = PageTransHuge(page);
				2086
				2087	if (compound)
				2088	new = alloc_misplaced_dst_page_thp;
				2089	else
				2090	new = alloc_misplaced_dst_page;
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2091
				2092	/*
Mel Gorman	1bc115d	2013-10-07 11:29:05 +0100	[diff] [blame]	2093	* Don't migrate file pages that are mapped in multiple processes
				2094	* with execute permissions as they are probably shared libraries.
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2095	*/
Miaohe Lin	7ee820e	2021-05-04 18:37:16 -0700	[diff] [blame]	2096	if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
				2097	(vma->vm_flags & VM_EXEC))
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2098	goto out;
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2099
Mel Gorman	a8f6077	2012-11-14 21:41:46 +0000	[diff] [blame]	2100	/*
Mel Gorman	09a913a	2018-04-10 16:29:20 -0700	[diff] [blame]	2101	* Also do not migrate dirty pages as not all filesystems can move
				2102	* dirty pages in MIGRATE_ASYNC mode which is a waste of cycles.
				2103	*/
Huang Ying	9de4f22	2020-04-06 20:04:41 -0700	[diff] [blame]	2104	if (page_is_file_lru(page) && PageDirty(page))
Mel Gorman	09a913a	2018-04-10 16:29:20 -0700	[diff] [blame]	2105	goto out;
				2106
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2107	isolated = numamigrate_isolate_page(pgdat, page);
				2108	if (!isolated)
				2109	goto out;
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2110
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2111	list_add(&page->lru, &migratepages);
Yang Shi	c5b5a3d	2021-06-30 18:51:42 -0700	[diff] [blame]	2112	nr_remaining = migrate_pages(&migratepages, *new, NULL, node,
Yang Shi	5ac9588	2021-09-02 14:59:13 -0700	[diff] [blame]	2113	MIGRATE_ASYNC, MR_NUMA_MISPLACED, NULL);
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2114	if (nr_remaining) {
Joonsoo Kim	59c82b7	2014-01-21 15:51:17 -0800	[diff] [blame]	2115	if (!list_empty(&migratepages)) {
				2116	list_del(&page->lru);
Yang Shi	c5fc5c3	2021-06-30 18:51:45 -0700	[diff] [blame]	2117	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
				2118	page_is_file_lru(page), -nr_pages);
Joonsoo Kim	59c82b7	2014-01-21 15:51:17 -0800	[diff] [blame]	2119	putback_lru_page(page);
				2120	}
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2121	isolated = 0;
				2122	} else
Yang Shi	c5fc5c3	2021-06-30 18:51:45 -0700	[diff] [blame]	2123	count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_pages);
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2124	BUG_ON(!list_empty(&migratepages));
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2125	return isolated;
Hugh Dickins	340ef39	2013-02-22 16:34:33 -0800	[diff] [blame]	2126
				2127	out:
				2128	put_page(page);
				2129	return 0;
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2130	}
Mel Gorman	220018d	2012-12-05 09:32:56 +0000	[diff] [blame]	2131	#endif /* CONFIG_NUMA_BALANCING */
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2132	#endif /* CONFIG_NUMA */
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2133
Christoph Hellwig	9b2ed9c	2019-08-14 09:59:28 +0200	[diff] [blame]	2134	#ifdef CONFIG_DEVICE_PRIVATE
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2135	static int migrate_vma_collect_skip(unsigned long start,
				2136	unsigned long end,
				2137	struct mm_walk *walk)
				2138	{
				2139	struct migrate_vma *migrate = walk->private;
				2140	unsigned long addr;
				2141
Ralph Campbell	872ea70	2020-01-30 22:14:38 -0800	[diff] [blame]	2142	for (addr = start; addr < end; addr += PAGE_SIZE) {
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2143	migrate->dst[migrate->npages] = 0;
				2144	migrate->src[migrate->npages++] = 0;
				2145	}
				2146
				2147	return 0;
				2148	}
				2149
Miaohe Lin	843e1be	2021-05-04 18:37:13 -0700	[diff] [blame]	2150	static int migrate_vma_collect_hole(unsigned long start,
				2151	unsigned long end,
				2152	__always_unused int depth,
				2153	struct mm_walk *walk)
				2154	{
				2155	struct migrate_vma *migrate = walk->private;
				2156	unsigned long addr;
				2157
				2158	/* Only allow populating anonymous memory. */
				2159	if (!vma_is_anonymous(walk->vma))
				2160	return migrate_vma_collect_skip(start, end, walk);
				2161
				2162	for (addr = start; addr < end; addr += PAGE_SIZE) {
				2163	migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
				2164	migrate->dst[migrate->npages] = 0;
				2165	migrate->npages++;
				2166	migrate->cpages++;
				2167	}
				2168
				2169	return 0;
				2170	}
				2171
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2172	static int migrate_vma_collect_pmd(pmd_t *pmdp,
				2173	unsigned long start,
				2174	unsigned long end,
				2175	struct mm_walk *walk)
				2176	{
				2177	struct migrate_vma *migrate = walk->private;
				2178	struct vm_area_struct *vma = walk->vma;
				2179	struct mm_struct *mm = vma->vm_mm;
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2180	unsigned long addr = start, unmapped = 0;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2181	spinlock_t *ptl;
				2182	pte_t *ptep;
				2183
				2184	again:
				2185	if (pmd_none(*pmdp))
Steven Price	b7a16c7	2020-02-03 17:36:03 -0800	[diff] [blame]	2186	return migrate_vma_collect_hole(start, end, -1, walk);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2187
				2188	if (pmd_trans_huge(*pmdp)) {
				2189	struct page *page;
				2190
				2191	ptl = pmd_lock(mm, pmdp);
				2192	if (unlikely(!pmd_trans_huge(*pmdp))) {
				2193	spin_unlock(ptl);
				2194	goto again;
				2195	}
				2196
				2197	page = pmd_page(*pmdp);
				2198	if (is_huge_zero_page(page)) {
				2199	spin_unlock(ptl);
				2200	split_huge_pmd(vma, pmdp, addr);
				2201	if (pmd_trans_unstable(pmdp))
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2202	return migrate_vma_collect_skip(start, end,
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2203	walk);
				2204	} else {
				2205	int ret;
				2206
				2207	get_page(page);
				2208	spin_unlock(ptl);
				2209	if (unlikely(!trylock_page(page)))
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2210	return migrate_vma_collect_skip(start, end,
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2211	walk);
				2212	ret = split_huge_page(page);
				2213	unlock_page(page);
				2214	put_page(page);
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2215	if (ret)
				2216	return migrate_vma_collect_skip(start, end,
				2217	walk);
				2218	if (pmd_none(*pmdp))
Steven Price	b7a16c7	2020-02-03 17:36:03 -0800	[diff] [blame]	2219	return migrate_vma_collect_hole(start, end, -1,
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2220	walk);
				2221	}
				2222	}
				2223
				2224	if (unlikely(pmd_bad(*pmdp)))
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2225	return migrate_vma_collect_skip(start, end, walk);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2226
				2227	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2228	arch_enter_lazy_mmu_mode();
				2229
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2230	for (; addr < end; addr += PAGE_SIZE, ptep++) {
Christoph Hellwig	800bb1c	2020-03-16 20:32:14 +0100	[diff] [blame]	2231	unsigned long mpfn = 0, pfn;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2232	struct page *page;
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2233	swp_entry_t entry;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2234	pte_t pte;
				2235
				2236	pte = *ptep;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2237
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2238	if (pte_none(pte)) {
Ralph Campbell	0744f28	2020-08-11 18:31:41 -0700	[diff] [blame]	2239	if (vma_is_anonymous(vma)) {
				2240	mpfn = MIGRATE_PFN_MIGRATE;
				2241	migrate->cpages++;
				2242	}
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2243	goto next;
				2244	}
				2245
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2246	if (!pte_present(pte)) {
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2247	/*
				2248	* Only care about unaddressable device page special
				2249	* page table entry. Other special swap entries are not
				2250	* migratable, and we ignore regular swapped page.
				2251	*/
				2252	entry = pte_to_swp_entry(pte);
				2253	if (!is_device_private_entry(entry))
				2254	goto next;
				2255
Alistair Popple	af5cdaf	2021-06-30 18:54:06 -0700	[diff] [blame]	2256	page = pfn_swap_entry_to_page(entry);
Ralph Campbell	5143192	2020-07-23 15:30:00 -0700	[diff] [blame]	2257	if (!(migrate->flags &
				2258	MIGRATE_VMA_SELECT_DEVICE_PRIVATE) \|\|
				2259	page->pgmap->owner != migrate->pgmap_owner)
Christoph Hellwig	800bb1c	2020-03-16 20:32:14 +0100	[diff] [blame]	2260	goto next;
				2261
Christoph Hellwig	06d462b	2019-08-14 09:59:27 +0200	[diff] [blame]	2262	mpfn = migrate_pfn(page_to_pfn(page)) \|
				2263	MIGRATE_PFN_MIGRATE;
Alistair Popple	4dd845b	2021-06-30 18:54:09 -0700	[diff] [blame]	2264	if (is_writable_device_private_entry(entry))
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2265	mpfn \|= MIGRATE_PFN_WRITE;
				2266	} else {
Ralph Campbell	5143192	2020-07-23 15:30:00 -0700	[diff] [blame]	2267	if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
Christoph Hellwig	800bb1c	2020-03-16 20:32:14 +0100	[diff] [blame]	2268	goto next;
Pingfan Liu	276f756	2019-09-23 15:37:38 -0700	[diff] [blame]	2269	pfn = pte_pfn(pte);
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2270	if (is_zero_pfn(pfn)) {
				2271	mpfn = MIGRATE_PFN_MIGRATE;
				2272	migrate->cpages++;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2273	goto next;
				2274	}
Christoph Hellwig	25b2995	2019-06-13 22:50:49 +0200	[diff] [blame]	2275	page = vm_normal_page(migrate->vma, addr, pte);
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2276	mpfn = migrate_pfn(pfn) \| MIGRATE_PFN_MIGRATE;
				2277	mpfn \|= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
				2278	}
				2279
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2280	/* FIXME support THP */
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2281	if (!page \|\| !page->mapping \|\| PageTransCompound(page)) {
Pingfan Liu	276f756	2019-09-23 15:37:38 -0700	[diff] [blame]	2282	mpfn = 0;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2283	goto next;
				2284	}
				2285
				2286	/*
				2287	* By getting a reference on the page we pin it and that blocks
				2288	* any kind of migration. Side effect is that it "freezes" the
				2289	* pte.
				2290	*
				2291	* We drop this reference after isolating the page from the lru
				2292	* for non device page (device page are not on the lru and thus
				2293	* can't be dropped from it).
				2294	*/
				2295	get_page(page);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2296
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2297	/*
				2298	* Optimize for the common case where page is only mapped once
				2299	* in one process. If we can lock the page, then we can safely
				2300	* set up a special migration page table entry now.
				2301	*/
				2302	if (trylock_page(page)) {
				2303	pte_t swp_pte;
				2304
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2305	migrate->cpages++;
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2306	ptep_get_and_clear(mm, addr, ptep);
				2307
				2308	/* Setup special migration page table entry */
Alistair Popple	4dd845b	2021-06-30 18:54:09 -0700	[diff] [blame]	2309	if (mpfn & MIGRATE_PFN_WRITE)
				2310	entry = make_writable_migration_entry(
				2311	page_to_pfn(page));
				2312	else
				2313	entry = make_readable_migration_entry(
				2314	page_to_pfn(page));
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2315	swp_pte = swp_entry_to_pte(entry);
Alistair Popple	ad7df76	2020-09-04 16:36:01 -0700	[diff] [blame]	2316	if (pte_present(pte)) {
				2317	if (pte_soft_dirty(pte))
				2318	swp_pte = pte_swp_mksoft_dirty(swp_pte);
				2319	if (pte_uffd_wp(pte))
				2320	swp_pte = pte_swp_mkuffd_wp(swp_pte);
				2321	} else {
				2322	if (pte_swp_soft_dirty(pte))
				2323	swp_pte = pte_swp_mksoft_dirty(swp_pte);
				2324	if (pte_swp_uffd_wp(pte))
				2325	swp_pte = pte_swp_mkuffd_wp(swp_pte);
				2326	}
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2327	set_pte_at(mm, addr, ptep, swp_pte);
				2328
				2329	/*
				2330	* This is like regular unmap: we remove the rmap and
				2331	* drop page refcount. Page won't be freed, as we took
				2332	* a reference just above.
				2333	*/
				2334	page_remove_rmap(page, false);
				2335	put_page(page);
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2336
				2337	if (pte_present(pte))
				2338	unmapped++;
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2339	} else {
				2340	put_page(page);
				2341	mpfn = 0;
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2342	}
				2343
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2344	next:
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2345	migrate->dst[migrate->npages] = 0;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2346	migrate->src[migrate->npages++] = mpfn;
				2347	}
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2348	arch_leave_lazy_mmu_mode();
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2349	pte_unmap_unlock(ptep - 1, ptl);
				2350
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2351	/* Only flush the TLB if we actually modified any entries */
				2352	if (unmapped)
				2353	flush_tlb_range(walk->vma, start, end);
				2354
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2355	return 0;
				2356	}
				2357
Christoph Hellwig	7b86ac3	2019-08-28 16:19:54 +0200	[diff] [blame]	2358	static const struct mm_walk_ops migrate_vma_walk_ops = {
				2359	.pmd_entry = migrate_vma_collect_pmd,
				2360	.pte_hole = migrate_vma_collect_hole,
				2361	};
				2362
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2363	/*
				2364	* migrate_vma_collect() - collect pages over a range of virtual addresses
				2365	* @migrate: migrate struct containing all migration information
				2366	*
				2367	* This will walk the CPU page table. For each virtual address backed by a
				2368	* valid page, it updates the src array and takes a reference on the page, in
				2369	* order to pin the page until we lock it and unmap it.
				2370	*/
				2371	static void migrate_vma_collect(struct migrate_vma *migrate)
				2372	{
Jérôme Glisse	ac46d4f	2018-12-28 00:38:09 -0800	[diff] [blame]	2373	struct mmu_notifier_range range;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2374
Ralph Campbell	998427b	2020-07-23 15:30:01 -0700	[diff] [blame]	2375	/*
				2376	* Note that the pgmap_owner is passed to the mmu notifier callback so
				2377	* that the registered device driver can skip invalidating device
				2378	* private page mappings that won't be migrated.
				2379	*/
Alistair Popple	6b49bf6	2021-06-30 18:54:19 -0700	[diff] [blame]	2380	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0,
				2381	migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end,
Ralph Campbell	c1a06df	2020-08-06 23:17:09 -0700	[diff] [blame]	2382	migrate->pgmap_owner);
Jérôme Glisse	ac46d4f	2018-12-28 00:38:09 -0800	[diff] [blame]	2383	mmu_notifier_invalidate_range_start(&range);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2384
Christoph Hellwig	7b86ac3	2019-08-28 16:19:54 +0200	[diff] [blame]	2385	walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
				2386	&migrate_vma_walk_ops, migrate);
				2387
				2388	mmu_notifier_invalidate_range_end(&range);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2389	migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
				2390	}
				2391
				2392	/*
				2393	* migrate_vma_check_page() - check if page is pinned or not
				2394	* @page: struct page to check
				2395	*
				2396	* Pinned pages cannot be migrated. This is the same test as in
Matthew Wilcox (Oracle)	3417013	2021-05-07 07:28:40 -0400	[diff] [blame]	2397	* folio_migrate_mapping(), except that here we allow migration of a
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2398	* ZONE_DEVICE page.
				2399	*/
				2400	static bool migrate_vma_check_page(struct page *page)
				2401	{
				2402	/*
				2403	* One extra ref because caller holds an extra reference, either from
				2404	* isolate_lru_page() for a regular page, or migrate_vma_collect() for
				2405	* a device page.
				2406	*/
				2407	int extra = 1;
				2408
				2409	/*
				2410	* FIXME support THP (transparent huge page), it is bit more complex to
				2411	* check them than regular pages, because they can be mapped with a pmd
				2412	* or with a pte (split pte mapping).
				2413	*/
				2414	if (PageCompound(page))
				2415	return false;
				2416
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2417	/* Page from ZONE_DEVICE have one extra reference */
Alistair Popple	ffa6575	2022-01-21 22:10:46 -0800	[diff] [blame]	2418	if (is_zone_device_page(page))
				2419	extra++;
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2420
Jérôme Glisse	df6ad69	2017-09-08 16:12:24 -0700	[diff] [blame]	2421	/* For file back page */
				2422	if (page_mapping(page))
				2423	extra += 1 + page_has_private(page);
				2424
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2425	if ((page_count(page) - extra) > page_mapcount(page))
				2426	return false;
				2427
				2428	return true;
				2429	}
				2430
				2431	/*
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2432	* migrate_vma_unmap() - replace page mapping with special migration pte entry
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2433	* @migrate: migrate struct containing all migration information
				2434	*
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2435	* Isolate pages from the LRU and replace mappings (CPU page table pte) with a
				2436	* special migration pte entry and check if it has been pinned. Pinned pages are
				2437	* restored because we cannot migrate them.
				2438	*
				2439	* This is the last step before we call the device driver callback to allocate
				2440	* destination memory and copy contents of original page over to new page.
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2441	*/
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2442	static void migrate_vma_unmap(struct migrate_vma *migrate)
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2443	{
				2444	const unsigned long npages = migrate->npages;
Colin Ian King	f1e8db0	2022-01-14 14:08:53 -0800	[diff] [blame]	2445	unsigned long i, restore = 0;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2446	bool allow_drain = true;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2447
				2448	lru_add_drain();
				2449
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2450	for (i = 0; i < npages; i++) {
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2451	struct page *page = migrate_pfn_to_page(migrate->src[i]);
				2452
				2453	if (!page)
				2454	continue;
				2455
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2456	/* ZONE_DEVICE pages are not on LRU */
				2457	if (!is_zone_device_page(page)) {
				2458	if (!PageLRU(page) && allow_drain) {
				2459	/* Drain CPU's pagevec */
				2460	lru_add_drain_all();
				2461	allow_drain = false;
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2462	}
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2463
				2464	if (isolate_lru_page(page)) {
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2465	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				2466	migrate->cpages--;
				2467	restore++;
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2468	continue;
				2469	}
				2470
				2471	/* Drop the reference we took in collect */
				2472	put_page(page);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2473	}
				2474
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2475	if (page_mapped(page))
Alistair Popple	a98a2f0	2021-06-30 18:54:16 -0700	[diff] [blame]	2476	try_to_migrate(page, 0);
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2477
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2478	if (page_mapped(page) \|\| !migrate_vma_check_page(page)) {
				2479	if (!is_zone_device_page(page)) {
				2480	get_page(page);
				2481	putback_lru_page(page);
				2482	}
				2483
				2484	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				2485	migrate->cpages--;
				2486	restore++;
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2487	continue;
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2488	}
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2489	}
				2490
Colin Ian King	f1e8db0	2022-01-14 14:08:53 -0800	[diff] [blame]	2491	for (i = 0; i < npages && restore; i++) {
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2492	struct page *page = migrate_pfn_to_page(migrate->src[i]);
				2493
				2494	if (!page \|\| (migrate->src[i] & MIGRATE_PFN_MIGRATE))
				2495	continue;
				2496
				2497	remove_migration_ptes(page, page, false);
				2498
				2499	migrate->src[i] = 0;
				2500	unlock_page(page);
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2501	put_page(page);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2502	restore--;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2503	}
				2504	}
				2505
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2506	/**
				2507	* migrate_vma_setup() - prepare to migrate a range of memory
Randy Dunlap	eaf444d	2020-08-11 18:33:08 -0700	[diff] [blame]	2508	* @args: contains the vma, start, and pfns arrays for the migration
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2509	*
				2510	* Returns: negative errno on failures, 0 when 0 or more pages were migrated
				2511	* without an error.
				2512	*
				2513	* Prepare to migrate a range of memory virtual address range by collecting all
				2514	* the pages backing each virtual address in the range, saving them inside the
				2515	* src array. Then lock those pages and unmap them. Once the pages are locked
				2516	* and unmapped, check whether each page is pinned or not. Pages that aren't
				2517	* pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
				2518	* corresponding src array entry. Then restores any pages that are pinned, by
				2519	* remapping and unlocking those pages.
				2520	*
				2521	* The caller should then allocate destination memory and copy source memory to
				2522	* it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
				2523	* flag set). Once these are allocated and copied, the caller must update each
				2524	* corresponding entry in the dst array with the pfn value of the destination
Alistair Popple	ab09243	2021-11-10 20:32:40 -0800	[diff] [blame]	2525	* page and with MIGRATE_PFN_VALID. Destination pages must be locked via
				2526	* lock_page().
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2527	*
				2528	* Note that the caller does not have to migrate all the pages that are marked
				2529	* with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
				2530	* device memory to system memory. If the caller cannot migrate a device page
				2531	* back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
				2532	* consequences for the userspace process, so it must be avoided if at all
				2533	* possible.
				2534	*
				2535	* For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
				2536	* do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
Ingo Molnar	f0953a1	2021-05-06 18:06:47 -0700	[diff] [blame]	2537	* allowing the caller to allocate device memory for those unbacked virtual
				2538	* addresses. For this the caller simply has to allocate device memory and
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2539	* properly set the destination entry like for regular migration. Note that
Ingo Molnar	f0953a1	2021-05-06 18:06:47 -0700	[diff] [blame]	2540	* this can still fail, and thus inside the device driver you must check if the
				2541	* migration was successful for those entries after calling migrate_vma_pages(),
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2542	* just like for regular migration.
				2543	*
				2544	* After that, the callers must call migrate_vma_pages() to go over each entry
				2545	* in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
				2546	* set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
				2547	* then migrate_vma_pages() to migrate struct page information from the source
				2548	* struct page to the destination struct page. If it fails to migrate the
				2549	* struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the
				2550	* src array.
				2551	*
				2552	* At this point all successfully migrated pages have an entry in the src
				2553	* array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
				2554	* array entry with MIGRATE_PFN_VALID flag set.
				2555	*
				2556	* Once migrate_vma_pages() returns the caller may inspect which pages were
				2557	* successfully migrated, and which were not. Successfully migrated pages will
				2558	* have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
				2559	*
				2560	* It is safe to update device page table after migrate_vma_pages() because
Michel Lespinasse	c1e8d7c	2020-06-08 21:33:54 -0700	[diff] [blame]	2561	* both destination and source page are still locked, and the mmap_lock is held
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2562	* in read mode (hence no one can unmap the range being migrated).
				2563	*
				2564	* Once the caller is done cleaning up things and updating its page table (if it
				2565	* chose to do so, this is not an obligation) it finally calls
				2566	* migrate_vma_finalize() to update the CPU page table to point to new pages
				2567	* for successfully migrated pages or otherwise restore the CPU page table to
				2568	* point to the original source pages.
				2569	*/
				2570	int migrate_vma_setup(struct migrate_vma *args)
				2571	{
				2572	long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
				2573
				2574	args->start &= PAGE_MASK;
				2575	args->end &= PAGE_MASK;
				2576	if (!args->vma \|\| is_vm_hugetlb_page(args->vma) \|\|
				2577	(args->vma->vm_flags & VM_SPECIAL) \|\| vma_is_dax(args->vma))
				2578	return -EINVAL;
				2579	if (nr_pages <= 0)
				2580	return -EINVAL;
				2581	if (args->start < args->vma->vm_start \|\|
				2582	args->start >= args->vma->vm_end)
				2583	return -EINVAL;
				2584	if (args->end <= args->vma->vm_start \|\| args->end > args->vma->vm_end)
				2585	return -EINVAL;
				2586	if (!args->src \|\| !args->dst)
				2587	return -EINVAL;
				2588
				2589	memset(args->src, 0, sizeof(args->src) nr_pages);
				2590	args->cpages = 0;
				2591	args->npages = 0;
				2592
				2593	migrate_vma_collect(args);
				2594
				2595	if (args->cpages)
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2596	migrate_vma_unmap(args);
				2597
				2598	/*
				2599	* At this point pages are locked and unmapped, and thus they have
				2600	* stable content and can safely be copied to destination memory that
				2601	* is allocated by the drivers.
				2602	*/
				2603	return 0;
				2604
				2605	}
				2606	EXPORT_SYMBOL(migrate_vma_setup);
				2607
Ralph Campbell	34290e2	2020-01-30 22:14:44 -0800	[diff] [blame]	2608	/*
				2609	* This code closely matches the code in:
				2610	* __handle_mm_fault()
				2611	* handle_pte_fault()
				2612	* do_anonymous_page()
				2613	* to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
				2614	* private page.
				2615	*/
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2616	static void migrate_vma_insert_page(struct migrate_vma *migrate,
				2617	unsigned long addr,
				2618	struct page *page,
Stephen Zhang	d85c6db	2020-12-14 19:13:20 -0800	[diff] [blame]	2619	unsigned long *src)
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2620	{
				2621	struct vm_area_struct *vma = migrate->vma;
				2622	struct mm_struct *mm = vma->vm_mm;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2623	bool flush = false;
				2624	spinlock_t *ptl;
				2625	pte_t entry;
				2626	pgd_t *pgdp;
				2627	p4d_t *p4dp;
				2628	pud_t *pudp;
				2629	pmd_t *pmdp;
				2630	pte_t *ptep;
				2631
				2632	/* Only allow populating anonymous memory */
				2633	if (!vma_is_anonymous(vma))
				2634	goto abort;
				2635
				2636	pgdp = pgd_offset(mm, addr);
				2637	p4dp = p4d_alloc(mm, pgdp, addr);
				2638	if (!p4dp)
				2639	goto abort;
				2640	pudp = pud_alloc(mm, p4dp, addr);
				2641	if (!pudp)
				2642	goto abort;
				2643	pmdp = pmd_alloc(mm, pudp, addr);
				2644	if (!pmdp)
				2645	goto abort;
				2646
				2647	if (pmd_trans_huge(pmdp) \|\| pmd_devmap(pmdp))
				2648	goto abort;
				2649
				2650	/*
				2651	* Use pte_alloc() instead of pte_alloc_map(). We can't run
				2652	* pte_offset_map() on pmds where a huge pmd might be created
				2653	* from a different thread.
				2654	*
Michel Lespinasse	3e4e28c	2020-06-08 21:33:51 -0700	[diff] [blame]	2655	* pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2656	* parallel threads are excluded by other means.
				2657	*
Michel Lespinasse	3e4e28c	2020-06-08 21:33:51 -0700	[diff] [blame]	2658	* Here we only have mmap_read_lock(mm).
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2659	*/
Joel Fernandes (Google)	4cf5892	2019-01-03 15:28:34 -0800	[diff] [blame]	2660	if (pte_alloc(mm, pmdp))
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2661	goto abort;
				2662
				2663	/* See the comment in pte_alloc_one_map() */
				2664	if (unlikely(pmd_trans_unstable(pmdp)))
				2665	goto abort;
				2666
				2667	if (unlikely(anon_vma_prepare(vma)))
				2668	goto abort;
Matthew Wilcox (Oracle)	8f425e4	2021-06-25 09:27:04 -0400	[diff] [blame]	2669	if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2670	goto abort;
				2671
				2672	/*
				2673	* The memory barrier inside __SetPageUptodate makes sure that
				2674	* preceding stores to the page contents become visible before
				2675	* the set_pte_at() write.
				2676	*/
				2677	__SetPageUptodate(page);
				2678
Jérôme Glisse	df6ad69	2017-09-08 16:12:24 -0700	[diff] [blame]	2679	if (is_zone_device_page(page)) {
				2680	if (is_device_private_page(page)) {
				2681	swp_entry_t swp_entry;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2682
Alistair Popple	4dd845b	2021-06-30 18:54:09 -0700	[diff] [blame]	2683	if (vma->vm_flags & VM_WRITE)
				2684	swp_entry = make_writable_device_private_entry(
				2685	page_to_pfn(page));
				2686	else
				2687	swp_entry = make_readable_device_private_entry(
				2688	page_to_pfn(page));
Jérôme Glisse	df6ad69	2017-09-08 16:12:24 -0700	[diff] [blame]	2689	entry = swp_entry_to_pte(swp_entry);
Miaohe Lin	34f5e9b	2021-05-04 18:37:10 -0700	[diff] [blame]	2690	} else {
				2691	/*
				2692	* For now we only support migrating to un-addressable
				2693	* device memory.
				2694	*/
				2695	pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
				2696	goto abort;
Jérôme Glisse	df6ad69	2017-09-08 16:12:24 -0700	[diff] [blame]	2697	}
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2698	} else {
				2699	entry = mk_pte(page, vma->vm_page_prot);
				2700	if (vma->vm_flags & VM_WRITE)
				2701	entry = pte_mkwrite(pte_mkdirty(entry));
				2702	}
				2703
				2704	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
				2705
Ralph Campbell	34290e2	2020-01-30 22:14:44 -0800	[diff] [blame]	2706	if (check_stable_address_space(mm))
				2707	goto unlock_abort;
				2708
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2709	if (pte_present(*ptep)) {
				2710	unsigned long pfn = pte_pfn(*ptep);
				2711
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	2712	if (!is_zero_pfn(pfn))
				2713	goto unlock_abort;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2714	flush = true;
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	2715	} else if (!pte_none(*ptep))
				2716	goto unlock_abort;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2717
				2718	/*
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	2719	* Check for userfaultfd but do not deliver the fault. Instead,
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2720	* just back off.
				2721	*/
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	2722	if (userfaultfd_missing(vma))
				2723	goto unlock_abort;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2724
				2725	inc_mm_counter(mm, MM_ANONPAGES);
Johannes Weiner	be5d0a7	2020-06-03 16:01:57 -0700	[diff] [blame]	2726	page_add_new_anon_rmap(page, vma, addr, false);
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2727	if (!is_zone_device_page(page))
Joonsoo Kim	b518154	2020-08-11 18:30:40 -0700	[diff] [blame]	2728	lru_cache_add_inactive_or_unevictable(page, vma);
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2729	get_page(page);
				2730
				2731	if (flush) {
				2732	flush_cache_page(vma, addr, pte_pfn(*ptep));
				2733	ptep_clear_flush_notify(vma, addr, ptep);
				2734	set_pte_at_notify(mm, addr, ptep, entry);
				2735	update_mmu_cache(vma, addr, ptep);
				2736	} else {
				2737	/* No need to invalidate - it was non-present before */
				2738	set_pte_at(mm, addr, ptep, entry);
				2739	update_mmu_cache(vma, addr, ptep);
				2740	}
				2741
				2742	pte_unmap_unlock(ptep, ptl);
				2743	*src = MIGRATE_PFN_MIGRATE;
				2744	return;
				2745
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	2746	unlock_abort:
				2747	pte_unmap_unlock(ptep, ptl);
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2748	abort:
				2749	*src &= ~MIGRATE_PFN_MIGRATE;
				2750	}
				2751
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2752	/**
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2753	* migrate_vma_pages() - migrate meta-data from src page to dst page
				2754	* @migrate: migrate struct containing all migration information
				2755	*
				2756	* This migrates struct page meta-data from source struct page to destination
				2757	* struct page. This effectively finishes the migration from source page to the
				2758	* destination page.
				2759	*/
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2760	void migrate_vma_pages(struct migrate_vma *migrate)
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2761	{
				2762	const unsigned long npages = migrate->npages;
				2763	const unsigned long start = migrate->start;
Jérôme Glisse	ac46d4f	2018-12-28 00:38:09 -0800	[diff] [blame]	2764	struct mmu_notifier_range range;
				2765	unsigned long addr, i;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2766	bool notified = false;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2767
				2768	for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
				2769	struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
				2770	struct page *page = migrate_pfn_to_page(migrate->src[i]);
				2771	struct address_space *mapping;
				2772	int r;
				2773
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2774	if (!newpage) {
				2775	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2776	continue;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2777	}
				2778
				2779	if (!page) {
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	2780	if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2781	continue;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2782	if (!notified) {
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2783	notified = true;
Jérôme Glisse	ac46d4f	2018-12-28 00:38:09 -0800	[diff] [blame]	2784
Alistair Popple	6b49bf6	2021-06-30 18:54:19 -0700	[diff] [blame]	2785	mmu_notifier_range_init_owner(&range,
				2786	MMU_NOTIFY_MIGRATE, 0, migrate->vma,
				2787	migrate->vma->vm_mm, addr, migrate->end,
Ralph Campbell	5e5dda8	2020-12-14 19:12:55 -0800	[diff] [blame]	2788	migrate->pgmap_owner);
Jérôme Glisse	ac46d4f	2018-12-28 00:38:09 -0800	[diff] [blame]	2789	mmu_notifier_invalidate_range_start(&range);
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2790	}
				2791	migrate_vma_insert_page(migrate, addr, newpage,
Stephen Zhang	d85c6db	2020-12-14 19:13:20 -0800	[diff] [blame]	2792	&migrate->src[i]);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2793	continue;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2794	}
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2795
				2796	mapping = page_mapping(page);
				2797
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2798	if (is_zone_device_page(newpage)) {
				2799	if (is_device_private_page(newpage)) {
				2800	/*
				2801	* For now only support private anonymous when
				2802	* migrating to un-addressable device memory.
				2803	*/
				2804	if (mapping) {
				2805	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				2806	continue;
				2807	}
Christoph Hellwig	25b2995	2019-06-13 22:50:49 +0200	[diff] [blame]	2808	} else {
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2809	/*
				2810	* Other types of ZONE_DEVICE page are not
				2811	* supported.
				2812	*/
				2813	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				2814	continue;
				2815	}
				2816	}
				2817
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2818	r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
				2819	if (r != MIGRATEPAGE_SUCCESS)
				2820	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				2821	}
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2822
Jérôme Glisse	4645b9f	2017-11-15 17:34:11 -0800	[diff] [blame]	2823	/*
				2824	* No need to double call mmu_notifier->invalidate_range() callback as
				2825	* the above ptep_clear_flush_notify() inside migrate_vma_insert_page()
				2826	* did already call it.
				2827	*/
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2828	if (notified)
Jérôme Glisse	ac46d4f	2018-12-28 00:38:09 -0800	[diff] [blame]	2829	mmu_notifier_invalidate_range_only_end(&range);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2830	}
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2831	EXPORT_SYMBOL(migrate_vma_pages);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2832
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2833	/**
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2834	* migrate_vma_finalize() - restore CPU page table entry
				2835	* @migrate: migrate struct containing all migration information
				2836	*
				2837	* This replaces the special migration pte entry with either a mapping to the
				2838	* new page if migration was successful for that page, or to the original page
				2839	* otherwise.
				2840	*
				2841	* This also unlocks the pages and puts them back on the lru, or drops the extra
				2842	* refcount, for device pages.
				2843	*/
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2844	void migrate_vma_finalize(struct migrate_vma *migrate)
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2845	{
				2846	const unsigned long npages = migrate->npages;
				2847	unsigned long i;
				2848
				2849	for (i = 0; i < npages; i++) {
				2850	struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
				2851	struct page *page = migrate_pfn_to_page(migrate->src[i]);
				2852
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2853	if (!page) {
				2854	if (newpage) {
				2855	unlock_page(newpage);
				2856	put_page(newpage);
				2857	}
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2858	continue;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2859	}
				2860
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2861	if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) \|\| !newpage) {
				2862	if (newpage) {
				2863	unlock_page(newpage);
				2864	put_page(newpage);
				2865	}
				2866	newpage = page;
				2867	}
				2868
				2869	remove_migration_ptes(page, newpage, false);
				2870	unlock_page(page);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2871
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2872	if (is_zone_device_page(page))
				2873	put_page(page);
				2874	else
				2875	putback_lru_page(page);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2876
				2877	if (newpage != page) {
				2878	unlock_page(newpage);
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2879	if (is_zone_device_page(newpage))
				2880	put_page(newpage);
				2881	else
				2882	putback_lru_page(newpage);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2883	}
				2884	}
				2885	}
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2886	EXPORT_SYMBOL(migrate_vma_finalize);
Christoph Hellwig	9b2ed9c	2019-08-14 09:59:28 +0200	[diff] [blame]	2887	#endif /* CONFIG_DEVICE_PRIVATE */
Dave Hansen	79c28a4	2021-09-02 14:59:06 -0700	[diff] [blame]	2888
Huang Ying	dcee9bf5	2022-01-14 14:08:49 -0800	[diff] [blame]	2889	/*
				2890	* node_demotion[] example:
				2891	*
				2892	* Consider a system with two sockets. Each socket has
				2893	* three classes of memory attached: fast, medium and slow.
				2894	* Each memory class is placed in its own NUMA node. The
				2895	* CPUs are placed in the node with the "fast" memory. The
				2896	* 6 NUMA nodes (0-5) might be split among the sockets like
				2897	* this:
				2898	*
				2899	* Socket A: 0, 1, 2
				2900	* Socket B: 3, 4, 5
				2901	*
				2902	* When Node 0 fills up, its memory should be migrated to
				2903	* Node 1. When Node 1 fills up, it should be migrated to
				2904	* Node 2. The migration path starts on the nodes with the
				2905	* processors (since allocations default to this node) and
				2906	* fast memory, progress through medium and end with the
				2907	* slow memory:
				2908	*
				2909	* 0 -> 1 -> 2 -> stop
				2910	* 3 -> 4 -> 5 -> stop
				2911	*
				2912	* This is represented in the node_demotion[] like this:
				2913	*
				2914	* { nr=1, nodes[0]=1 }, // Node 0 migrates to 1
				2915	* { nr=1, nodes[0]=2 }, // Node 1 migrates to 2
				2916	* { nr=0, nodes[0]=-1 }, // Node 2 does not migrate
				2917	* { nr=1, nodes[0]=4 }, // Node 3 migrates to 4
				2918	* { nr=1, nodes[0]=5 }, // Node 4 migrates to 5
				2919	* { nr=0, nodes[0]=-1 }, // Node 5 does not migrate
				2920	*
				2921	* Moreover, some systems may have multiple slow memory nodes.
				2922	* Suppose a system has one socket with 3 memory nodes: node 0
				2923	* is the fast memory type, nodes 1 and 2 are both the slow memory
				2924	* type, and the distance from the fast memory node to each slow
				2925	* memory node is the same. So the migration path should be:
				2926	*
				2927	* 0 -> 1/2 -> stop
				2928	*
				2929	* This is represented in the node_demotion[] like this:
				2930	* { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2
				2931	* { nr=0, nodes[0]=-1, }, // Node 1 does not migrate
				2932	* { nr=0, nodes[0]=-1, }, // Node 2 does not migrate
				2933	*/
				2934
				2935	/*
				2936	* Writes to this array occur without locking. Cycles are
				2937	* not allowed: Node X demotes to Y which demotes to X...
				2938	*
				2939	* If multiple reads are performed, a single rcu_read_lock()
				2940	* must be held over all reads to ensure that no cycles are
				2941	* observed.
				2942	*/
				2943	#define DEFAULT_DEMOTION_TARGET_NODES 15
				2944
				2945	#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
				2946	#define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1)
				2947	#else
				2948	#define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES
				2949	#endif
				2950
				2951	struct demotion_nodes {
				2952	unsigned short nr;
				2953	short nodes[DEMOTION_TARGET_NODES];
				2954	};
				2955
				2956	static struct demotion_nodes *node_demotion __read_mostly;
				2957
				2958	/**
				2959	* next_demotion_node() - Get the next node in the demotion path
				2960	* @node: The starting node to lookup the next node
				2961	*
				2962	* Return: node id for next memory node in the demotion path hierarchy
				2963	* from @node; NUMA_NO_NODE if @node is terminal. This does not keep
				2964	* @node online or guarantee that it continues to be the next demotion
				2965	* target.
				2966	*/
				2967	int next_demotion_node(int node)
				2968	{
				2969	struct demotion_nodes *nd;
				2970	unsigned short target_nr, index;
				2971	int target;
				2972
				2973	if (!node_demotion)
				2974	return NUMA_NO_NODE;
				2975
				2976	nd = &node_demotion[node];
				2977
				2978	/*
				2979	* node_demotion[] is updated without excluding this
				2980	* function from running. RCU doesn't provide any
				2981	* compiler barriers, so the READ_ONCE() is required
				2982	* to avoid compiler reordering or read merging.
				2983	*
				2984	* Make sure to use RCU over entire code blocks if
				2985	* node_demotion[] reads need to be consistent.
				2986	*/
				2987	rcu_read_lock();
				2988	target_nr = READ_ONCE(nd->nr);
				2989
				2990	switch (target_nr) {
				2991	case 0:
				2992	target = NUMA_NO_NODE;
				2993	goto out;
				2994	case 1:
				2995	index = 0;
				2996	break;
				2997	default:
				2998	/*
				2999	* If there are multiple target nodes, just select one
				3000	* target node randomly.
				3001	*
				3002	* In addition, we can also use round-robin to select
				3003	* target node, but we should introduce another variable
				3004	* for node_demotion[] to record last selected target node,
				3005	* that may cause cache ping-pong due to the changing of
				3006	* last target node. Or introducing per-cpu data to avoid
				3007	* caching issue, which seems more complicated. So selecting
				3008	* target node randomly seems better until now.
				3009	*/
				3010	index = get_random_int() % target_nr;
				3011	break;
				3012	}
				3013
				3014	target = READ_ONCE(nd->nodes[index]);
				3015
				3016	out:
				3017	rcu_read_unlock();
				3018	return target;
				3019	}
				3020
Dave Hansen	76af6a0	2021-10-18 15:15:32 -0700	[diff] [blame]	3021	#if defined(CONFIG_HOTPLUG_CPU)
Dave Hansen	79c28a4	2021-09-02 14:59:06 -0700	[diff] [blame]	3022	/* Disable reclaim-based migration. */
				3023	static void __disable_all_migrate_targets(void)
				3024	{
Baolin Wang	ac16ec8	2022-01-14 14:08:43 -0800	[diff] [blame]	3025	int node, i;
Dave Hansen	79c28a4	2021-09-02 14:59:06 -0700	[diff] [blame]	3026
Baolin Wang	ac16ec8	2022-01-14 14:08:43 -0800	[diff] [blame]	3027	if (!node_demotion)
				3028	return;
				3029
				3030	for_each_online_node(node) {
				3031	node_demotion[node].nr = 0;
				3032	for (i = 0; i < DEMOTION_TARGET_NODES; i++)
				3033	node_demotion[node].nodes[i] = NUMA_NO_NODE;
				3034	}
Dave Hansen	79c28a4	2021-09-02 14:59:06 -0700	[diff] [blame]	3035	}
				3036
				3037	static void disable_all_migrate_targets(void)
				3038	{
				3039	__disable_all_migrate_targets();
				3040
				3041	/*
				3042	* Ensure that the "disable" is visible across the system.
				3043	* Readers will see either a combination of before+disable
				3044	* state or disable+after. They will never see before and
				3045	* after state together.
				3046	*
				3047	* The before+after state together might have cycles and
				3048	* could cause readers to do things like loop until this
				3049	* function finishes. This ensures they can only see a
				3050	* single "bad" read and would, for instance, only loop
				3051	* once.
				3052	*/
				3053	synchronize_rcu();
				3054	}
				3055
				3056	/*
				3057	* Find an automatic demotion target for 'node'.
				3058	* Failing here is OK. It might just indicate
				3059	* being at the end of a chain.
				3060	*/
Baolin Wang	ac16ec8	2022-01-14 14:08:43 -0800	[diff] [blame]	3061	static int establish_migrate_target(int node, nodemask_t *used,
				3062	int best_distance)
Dave Hansen	79c28a4	2021-09-02 14:59:06 -0700	[diff] [blame]	3063	{
Baolin Wang	ac16ec8	2022-01-14 14:08:43 -0800	[diff] [blame]	3064	int migration_target, index, val;
				3065	struct demotion_nodes *nd;
Dave Hansen	79c28a4	2021-09-02 14:59:06 -0700	[diff] [blame]	3066
Baolin Wang	ac16ec8	2022-01-14 14:08:43 -0800	[diff] [blame]	3067	if (!node_demotion)
Dave Hansen	79c28a4	2021-09-02 14:59:06 -0700	[diff] [blame]	3068	return NUMA_NO_NODE;
				3069
Baolin Wang	ac16ec8	2022-01-14 14:08:43 -0800	[diff] [blame]	3070	nd = &node_demotion[node];
				3071
Dave Hansen	79c28a4	2021-09-02 14:59:06 -0700	[diff] [blame]	3072	migration_target = find_next_best_node(node, used);
				3073	if (migration_target == NUMA_NO_NODE)
				3074	return NUMA_NO_NODE;
				3075
Baolin Wang	ac16ec8	2022-01-14 14:08:43 -0800	[diff] [blame]	3076	/*
				3077	* If the node has been set a migration target node before,
				3078	* which means it's the best distance between them. Still
				3079	* check if this node can be demoted to other target nodes
				3080	* if they have a same best distance.
				3081	*/
				3082	if (best_distance != -1) {
				3083	val = node_distance(node, migration_target);
				3084	if (val > best_distance)
				3085	return NUMA_NO_NODE;
				3086	}
				3087
				3088	index = nd->nr;
				3089	if (WARN_ONCE(index >= DEMOTION_TARGET_NODES,
				3090	"Exceeds maximum demotion target nodes\n"))
				3091	return NUMA_NO_NODE;
				3092
				3093	nd->nodes[index] = migration_target;
				3094	nd->nr++;
Dave Hansen	79c28a4	2021-09-02 14:59:06 -0700	[diff] [blame]	3095
				3096	return migration_target;
				3097	}
				3098
				3099	/*
				3100	* When memory fills up on a node, memory contents can be
				3101	* automatically migrated to another node instead of
				3102	* discarded at reclaim.
				3103	*
				3104	* Establish a "migration path" which will start at nodes
				3105	* with CPUs and will follow the priorities used to build the
				3106	* page allocator zonelists.
				3107	*
				3108	* The difference here is that cycles must be avoided. If
				3109	* node0 migrates to node1, then neither node1, nor anything
Baolin Wang	ac16ec8	2022-01-14 14:08:43 -0800	[diff] [blame]	3110	* node1 migrates to can migrate to node0. Also one node can
				3111	* be migrated to multiple nodes if the target nodes all have
				3112	* a same best-distance against the source node.
Dave Hansen	79c28a4	2021-09-02 14:59:06 -0700	[diff] [blame]	3113	*
				3114	* This function can run simultaneously with readers of
				3115	* node_demotion[]. However, it can not run simultaneously
				3116	* with itself. Exclusion is provided by memory hotplug events
				3117	* being single-threaded.
				3118	*/
				3119	static void __set_migration_target_nodes(void)
				3120	{
				3121	nodemask_t next_pass = NODE_MASK_NONE;
				3122	nodemask_t this_pass = NODE_MASK_NONE;
				3123	nodemask_t used_targets = NODE_MASK_NONE;
Baolin Wang	ac16ec8	2022-01-14 14:08:43 -0800	[diff] [blame]	3124	int node, best_distance;
Dave Hansen	79c28a4	2021-09-02 14:59:06 -0700	[diff] [blame]	3125
				3126	/*
				3127	* Avoid any oddities like cycles that could occur
				3128	* from changes in the topology. This will leave
				3129	* a momentary gap when migration is disabled.
				3130	*/
				3131	disable_all_migrate_targets();
				3132
				3133	/*
				3134	* Allocations go close to CPUs, first. Assume that
				3135	* the migration path starts at the nodes with CPUs.
				3136	*/
				3137	next_pass = node_states[N_CPU];
				3138	again:
				3139	this_pass = next_pass;
				3140	next_pass = NODE_MASK_NONE;
				3141	/*
				3142	* To avoid cycles in the migration "graph", ensure
				3143	* that migration sources are not future targets by
				3144	* setting them in 'used_targets'. Do this only
				3145	* once per pass so that multiple source nodes can
				3146	* share a target node.
				3147	*
				3148	* 'used_targets' will become unavailable in future
				3149	* passes. This limits some opportunities for
				3150	* multiple source nodes to share a destination.
				3151	*/
				3152	nodes_or(used_targets, used_targets, this_pass);
Dave Hansen	79c28a4	2021-09-02 14:59:06 -0700	[diff] [blame]	3153
Baolin Wang	ac16ec8	2022-01-14 14:08:43 -0800	[diff] [blame]	3154	for_each_node_mask(node, this_pass) {
				3155	best_distance = -1;
Dave Hansen	79c28a4	2021-09-02 14:59:06 -0700	[diff] [blame]	3156
				3157	/*
Baolin Wang	ac16ec8	2022-01-14 14:08:43 -0800	[diff] [blame]	3158	* Try to set up the migration path for the node, and the target
				3159	* migration nodes can be multiple, so doing a loop to find all
				3160	* the target nodes if they all have a best node distance.
Dave Hansen	79c28a4	2021-09-02 14:59:06 -0700	[diff] [blame]	3161	*/
Baolin Wang	ac16ec8	2022-01-14 14:08:43 -0800	[diff] [blame]	3162	do {
				3163	int target_node =
				3164	establish_migrate_target(node, &used_targets,
				3165	best_distance);
				3166
				3167	if (target_node == NUMA_NO_NODE)
				3168	break;
				3169
				3170	if (best_distance == -1)
				3171	best_distance = node_distance(node, target_node);
				3172
				3173	/*
				3174	* Visit targets from this pass in the next pass.
				3175	* Eventually, every node will have been part of
				3176	* a pass, and will become set in 'used_targets'.
				3177	*/
				3178	node_set(target_node, next_pass);
				3179	} while (1);
Dave Hansen	79c28a4	2021-09-02 14:59:06 -0700	[diff] [blame]	3180	}
				3181	/*
				3182	* 'next_pass' contains nodes which became migration
				3183	* targets in this pass. Make additional passes until
				3184	* no more migrations targets are available.
				3185	*/
				3186	if (!nodes_empty(next_pass))
				3187	goto again;
				3188	}
				3189
				3190	/*
				3191	* For callers that do not hold get_online_mems() already.
				3192	*/
Dave Hansen	79c28a4	2021-09-02 14:59:06 -0700	[diff] [blame]	3193	static void set_migration_target_nodes(void)
				3194	{
				3195	get_online_mems();
				3196	__set_migration_target_nodes();
				3197	put_online_mems();
				3198	}
Dave Hansen	884a6e5	2021-09-02 14:59:09 -0700	[diff] [blame]	3199
				3200	/*
Dave Hansen	884a6e5	2021-09-02 14:59:09 -0700	[diff] [blame]	3201	* This leaves migrate-on-reclaim transiently disabled between
				3202	* the MEM_GOING_OFFLINE and MEM_OFFLINE events. This runs
				3203	* whether reclaim-based migration is enabled or not, which
				3204	* ensures that the user can turn reclaim-based migration at
				3205	* any time without needing to recalculate migration targets.
				3206	*
				3207	* These callbacks already hold get_online_mems(). That is why
				3208	* __set_migration_target_nodes() can be used as opposed to
				3209	* set_migration_target_nodes().
				3210	*/
				3211	static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
Dave Hansen	295be91	2021-10-18 15:15:29 -0700	[diff] [blame]	3212	unsigned long action, void *_arg)
Dave Hansen	884a6e5	2021-09-02 14:59:09 -0700	[diff] [blame]	3213	{
Dave Hansen	295be91	2021-10-18 15:15:29 -0700	[diff] [blame]	3214	struct memory_notify *arg = _arg;
				3215
				3216	/*
				3217	* Only update the node migration order when a node is
				3218	* changing status, like online->offline. This avoids
				3219	* the overhead of synchronize_rcu() in most cases.
				3220	*/
				3221	if (arg->status_change_nid < 0)
				3222	return notifier_from_errno(0);
				3223
Dave Hansen	884a6e5	2021-09-02 14:59:09 -0700	[diff] [blame]	3224	switch (action) {
				3225	case MEM_GOING_OFFLINE:
				3226	/*
				3227	* Make sure there are not transient states where
				3228	* an offline node is a migration target. This
				3229	* will leave migration disabled until the offline
				3230	* completes and the MEM_OFFLINE case below runs.
				3231	*/
				3232	disable_all_migrate_targets();
				3233	break;
				3234	case MEM_OFFLINE:
				3235	case MEM_ONLINE:
				3236	/*
				3237	* Recalculate the target nodes once the node
				3238	* reaches its final state (online or offline).
				3239	*/
				3240	__set_migration_target_nodes();
				3241	break;
				3242	case MEM_CANCEL_OFFLINE:
				3243	/*
				3244	* MEM_GOING_OFFLINE disabled all the migration
				3245	* targets. Reenable them.
				3246	*/
				3247	__set_migration_target_nodes();
				3248	break;
				3249	case MEM_GOING_ONLINE:
				3250	case MEM_CANCEL_ONLINE:
				3251	break;
				3252	}
				3253
				3254	return notifier_from_errno(0);
				3255	}
				3256
/*
 * React to hotplug events that might affect the migration targets
 * like events that online or offline NUMA nodes.
 *
 * The ordering is also currently dependent on which nodes have
 * CPUs. That means we need CPU on/offline notification too.
 */

/* CPU online callback: rebuild the demotion targets. @cpu is unused. */
static int migration_online_cpu(unsigned int cpu)
{
	set_migration_target_nodes();

	return 0;
}
				3269
				3270	static int migration_offline_cpu(unsigned int cpu)
				3271	{
				3272	set_migration_target_nodes();
				3273	return 0;
				3274	}
				3275
Dave Hansen	884a6e5	2021-09-02 14:59:09 -0700	[diff] [blame]	3276	static int __init migrate_on_reclaim_init(void)
				3277	{
				3278	int ret;
				3279
Baolin Wang	ac16ec8	2022-01-14 14:08:43 -0800	[diff] [blame]	3280	node_demotion = kmalloc_array(nr_node_ids,
				3281	sizeof(struct demotion_nodes),
				3282	GFP_KERNEL);
				3283	WARN_ON(!node_demotion);
				3284
Huang Ying	a6a0251	2021-10-18 15:15:35 -0700	[diff] [blame]	3285	ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
				3286	NULL, migration_offline_cpu);
Dave Hansen	884a6e5	2021-09-02 14:59:09 -0700	[diff] [blame]	3287	/*
				3288	* In the unlikely case that this fails, the automatic
				3289	* migration targets may become suboptimal for nodes
				3290	* where N_CPU changes. With such a small impact in a
				3291	* rare case, do not bother trying to do anything special.
				3292	*/
				3293	WARN_ON(ret < 0);
Huang Ying	a6a0251	2021-10-18 15:15:35 -0700	[diff] [blame]	3294	ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
				3295	migration_online_cpu, NULL);
				3296	WARN_ON(ret < 0);
Dave Hansen	884a6e5	2021-09-02 14:59:09 -0700	[diff] [blame]	3297
				3298	hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
				3299	return 0;
				3300	}
				3301	late_initcall(migrate_on_reclaim_init);
Dave Hansen	76af6a0	2021-10-18 15:15:32 -0700	[diff] [blame]	3302	#endif /* CONFIG_HOTPLUG_CPU */
Yang Shi	20f9ba4	2021-11-05 13:43:35 -0700	[diff] [blame]	3303
				3304	bool numa_demotion_enabled = false;
				3305
				3306	#ifdef CONFIG_SYSFS
				3307	static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
				3308	struct kobj_attribute attr, char buf)
				3309	{
				3310	return sysfs_emit(buf, "%s\n",
				3311	numa_demotion_enabled ? "true" : "false");
				3312	}
				3313
				3314	static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
				3315	struct kobj_attribute *attr,
				3316	const char *buf, size_t count)
				3317	{
				3318	if (!strncmp(buf, "true", 4) \|\| !strncmp(buf, "1", 1))
				3319	numa_demotion_enabled = true;
				3320	else if (!strncmp(buf, "false", 5) \|\| !strncmp(buf, "0", 1))
				3321	numa_demotion_enabled = false;
				3322	else
				3323	return -EINVAL;
				3324
				3325	return count;
				3326	}
				3327
				3328	static struct kobj_attribute numa_demotion_enabled_attr =
				3329	__ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
				3330	numa_demotion_enabled_store);
				3331
				3332	static struct attribute *numa_attrs[] = {
				3333	&numa_demotion_enabled_attr.attr,
				3334	NULL,
				3335	};
				3336
				3337	static const struct attribute_group numa_attr_group = {
				3338	.attrs = numa_attrs,
				3339	};
				3340
				3341	static int __init numa_init_sysfs(void)
				3342	{
				3343	int err;
				3344	struct kobject *numa_kobj;
				3345
				3346	numa_kobj = kobject_create_and_add("numa", mm_kobj);
				3347	if (!numa_kobj) {
				3348	pr_err("failed to create numa kobject\n");
				3349	return -ENOMEM;
				3350	}
				3351	err = sysfs_create_group(numa_kobj, &numa_attr_group);
				3352	if (err) {
				3353	pr_err("failed to register numa group\n");
				3354	goto delete_obj;
				3355	}
				3356	return 0;
				3357
				3358	delete_obj:
				3359	kobject_put(numa_kobj);
				3360	return err;
				3361	}
				3362	subsys_initcall(numa_init_sysfs);
				3363	#endif