Blame - mm/memory_hotplug.c - SHIFTPHONES/mainline/linux

blob: 2a9627dc784c31072f39527f0a52953f72257aff [file] [log] [blame]

Thomas Gleixner	457c899	2019-05-19 13:08:55 +0100	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-only
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	2	/*
				3	* linux/mm/memory_hotplug.c
				4	*
				5	* Copyright (C)
				6	*/
				7
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	8	#include <linux/stddef.h>
				9	#include <linux/mm.h>
Ingo Molnar	174cd4b	2017-02-02 19:15:33 +0100	[diff] [blame]	10	#include <linux/sched/signal.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	11	#include <linux/swap.h>
				12	#include <linux/interrupt.h>
				13	#include <linux/pagemap.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	14	#include <linux/compiler.h>
Paul Gortmaker	b95f1b31	2011-10-16 02:01:52 -0400	[diff] [blame]	15	#include <linux/export.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	16	#include <linux/pagevec.h>
Chandra Seetharaman	2d1d43f	2006-09-29 02:01:25 -0700	[diff] [blame]	17	#include <linux/writeback.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	18	#include <linux/slab.h>
				19	#include <linux/sysctl.h>
				20	#include <linux/cpu.h>
				21	#include <linux/memory.h>
Dan Williams	4b94ffd	2016-01-15 16:56:22 -0800	[diff] [blame]	22	#include <linux/memremap.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	23	#include <linux/memory_hotplug.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	24	#include <linux/vmalloc.h>
KAMEZAWA Hiroyuki	0a54703	2006-06-27 02:53:35 -0700	[diff] [blame]	25	#include <linux/ioport.h>
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	26	#include <linux/delay.h>
				27	#include <linux/migrate.h>
				28	#include <linux/page-isolation.h>
Badari Pulavarty	7108878	2008-10-18 20:25:58 -0700	[diff] [blame]	29	#include <linux/pfn.h>
Andi Kleen	6ad696d	2009-11-17 14:06:22 -0800	[diff] [blame]	30	#include <linux/suspend.h>
KOSAKI Motohiro	6d9c285	2009-12-14 17:58:11 -0800	[diff] [blame]	31	#include <linux/mm_inline.h>
akpm@linux-foundation.org	d96ae53	2010-03-05 13:41:58 -0800	[diff] [blame]	32	#include <linux/firmware-map.h>
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	33	#include <linux/stop_machine.h>
Naoya Horiguchi	c8721bb	2013-09-11 14:22:09 -0700	[diff] [blame]	34	#include <linux/hugetlb.h>
Tang Chen	c532092	2013-11-12 15:08:10 -0800	[diff] [blame]	35	#include <linux/memblock.h>
Vlastimil Babka	698b1b3	2016-03-17 14:18:08 -0700	[diff] [blame]	36	#include <linux/compaction.h>
Michal Hocko	b15c872	2018-12-28 00:38:01 -0800	[diff] [blame]	37	#include <linux/rmap.h>
Jakub Kicinski	8581fd4	2021-12-02 12:34:00 -0800	[diff] [blame]	38	#include <linux/module.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	39
				40	#include <asm/tlbflush.h>
				41
Adrian Bunk	1e5ad9a	2008-04-28 20:40:08 +0300	[diff] [blame]	42	#include "internal.h"
Dan Williams	e900a91	2019-05-14 15:41:28 -0700	[diff] [blame]	43	#include "shuffle.h"
Adrian Bunk	1e5ad9a	2008-04-28 20:40:08 +0300	[diff] [blame]	44
Oscar Salvador	e3a9d9f	2021-05-04 18:39:48 -0700	[diff] [blame]	45
				46	/*
				47	* memory_hotplug.memmap_on_memory parameter
				48	*/
				49	static bool memmap_on_memory __ro_after_init;
				50	#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
				51	module_param(memmap_on_memory, bool, 0444);
				52	MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
				53	#endif
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	54
/* Values accepted by the memory_hotplug.online_policy parameter. */
enum {
	ONLINE_POLICY_CONTIG_ZONES = 0,
	ONLINE_POLICY_AUTO_MOVABLE,
};

/* User-visible name of each online policy, indexed by the enum above. */
static const char * const online_policy_to_str[] = {
	[ONLINE_POLICY_CONTIG_ZONES]	= "contig-zones",
	[ONLINE_POLICY_AUTO_MOVABLE]	= "auto-movable",
};
				64
				65	static int set_online_policy(const char val, const struct kernel_param kp)
				66	{
				67	int ret = sysfs_match_string(online_policy_to_str, val);
				68
				69	if (ret < 0)
				70	return ret;
				71	((int )kp->arg) = ret;
				72	return 0;
				73	}
				74
				75	static int get_online_policy(char buffer, const struct kernel_param kp)
				76	{
				77	return sprintf(buffer, "%s\n", online_policy_to_str[((int )kp->arg)]);
				78	}
				79
				80	/*
				81	* memory_hotplug.online_policy: configure online behavior when onlining without
				82	* specifying a zone (MMOP_ONLINE)
				83	*
				84	* "contig-zones": keep zone contiguous
				85	* "auto-movable": online memory to ZONE_MOVABLE if the configuration
				86	* (auto_movable_ratio, auto_movable_numa_aware) allows for it
				87	*/
				88	static int online_policy __read_mostly = ONLINE_POLICY_CONTIG_ZONES;
				89	static const struct kernel_param_ops online_policy_ops = {
				90	.set = set_online_policy,
				91	.get = get_online_policy,
				92	};
				93	module_param_cb(online_policy, &online_policy_ops, &online_policy, 0644);
				94	MODULE_PARM_DESC(online_policy,
				95	"Set the online policy (\"contig-zones\", \"auto-movable\") "
				96	"Default: \"contig-zones\"");
				97
				98	/*
				99	* memory_hotplug.auto_movable_ratio: specify maximum MOVABLE:KERNEL ratio
				100	*
				101	* The ratio represent an upper limit and the kernel might decide to not
				102	* online some memory to ZONE_MOVABLE -- e.g., because hotplugged KERNEL memory
				103	* doesn't allow for more MOVABLE memory.
				104	*/
				105	static unsigned int auto_movable_ratio __read_mostly = 301;
				106	module_param(auto_movable_ratio, uint, 0644);
				107	MODULE_PARM_DESC(auto_movable_ratio,
				108	"Set the maximum ratio of MOVABLE:KERNEL memory in the system "
				109	"in percent for \"auto-movable\" online policy. Default: 301");
				110
				111	/*
				112	* memory_hotplug.auto_movable_numa_aware: consider numa node stats
				113	*/
				114	#ifdef CONFIG_NUMA
				115	static bool auto_movable_numa_aware __read_mostly = true;
				116	module_param(auto_movable_numa_aware, bool, 0644);
				117	MODULE_PARM_DESC(auto_movable_numa_aware,
				118	"Consider numa node stats in addition to global stats in "
				119	"\"auto-movable\" online policy. Default: true");
				120	#endif /* CONFIG_NUMA */
				121
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	122	/*
				123	* online_page_callback contains pointer to current page onlining function.
				124	* Initially it is generic_online_page(). If it is required it could be
				125	* changed by calling set_online_page_callback() for callback registration
				126	* and restore_online_page_callback() for generic callback restore.
				127	*/
				128
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	129	static online_page_callback_t online_page_callback = generic_online_page;
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	130	static DEFINE_MUTEX(online_page_callback_lock);
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	131
Thomas Gleixner	3f906ba	2017-07-10 15:50:09 -0700	[diff] [blame]	132	DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock);
KOSAKI Motohiro	20d6c96	2010-12-02 14:31:19 -0800	[diff] [blame]	133
Thomas Gleixner	3f906ba	2017-07-10 15:50:09 -0700	[diff] [blame]	134	void get_online_mems(void)
				135	{
				136	percpu_down_read(&mem_hotplug_lock);
				137	}
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	138
Thomas Gleixner	3f906ba	2017-07-10 15:50:09 -0700	[diff] [blame]	139	void put_online_mems(void)
				140	{
				141	percpu_up_read(&mem_hotplug_lock);
				142	}
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	143
Michal Hocko	4932381	2017-07-06 15:41:05 -0700	[diff] [blame]	144	bool movable_node_enabled = false;
				145
Vitaly Kuznetsov	8604d9e	2016-05-19 17:13:03 -0700	[diff] [blame]	146	#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
Anshuman Khandual	1adf8b4	2021-02-25 17:17:13 -0800	[diff] [blame]	147	int mhp_default_online_type = MMOP_OFFLINE;
Vitaly Kuznetsov	8604d9e	2016-05-19 17:13:03 -0700	[diff] [blame]	148	#else
Anshuman Khandual	1adf8b4	2021-02-25 17:17:13 -0800	[diff] [blame]	149	int mhp_default_online_type = MMOP_ONLINE;
Vitaly Kuznetsov	8604d9e	2016-05-19 17:13:03 -0700	[diff] [blame]	150	#endif
Vitaly Kuznetsov	31bc385	2016-03-15 14:56:48 -0700	[diff] [blame]	151
Vitaly Kuznetsov	86dd995	2016-05-19 17:13:06 -0700	[diff] [blame]	152	static int __init setup_memhp_default_state(char *str)
				153	{
Anshuman Khandual	1adf8b4	2021-02-25 17:17:13 -0800	[diff] [blame]	154	const int online_type = mhp_online_type_from_str(str);
David Hildenbrand	5f47adf	2020-04-06 20:07:44 -0700	[diff] [blame]	155
				156	if (online_type >= 0)
Anshuman Khandual	1adf8b4	2021-02-25 17:17:13 -0800	[diff] [blame]	157	mhp_default_online_type = online_type;
Vitaly Kuznetsov	86dd995	2016-05-19 17:13:06 -0700	[diff] [blame]	158
				159	return 1;
				160	}
				161	__setup("memhp_default_state=", setup_memhp_default_state);
				162
David Rientjes	30467e0	2015-04-14 15:45:11 -0700	[diff] [blame]	163	void mem_hotplug_begin(void)
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	164	{
Thomas Gleixner	3f906ba	2017-07-10 15:50:09 -0700	[diff] [blame]	165	cpus_read_lock();
				166	percpu_down_write(&mem_hotplug_lock);
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	167	}
				168
David Rientjes	30467e0	2015-04-14 15:45:11 -0700	[diff] [blame]	169	void mem_hotplug_done(void)
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	170	{
Thomas Gleixner	3f906ba	2017-07-10 15:50:09 -0700	[diff] [blame]	171	percpu_up_write(&mem_hotplug_lock);
				172	cpus_read_unlock();
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	173	}
KOSAKI Motohiro	20d6c96	2010-12-02 14:31:19 -0800	[diff] [blame]	174
Juergen Gross	357b4da	2019-02-14 11:42:39 +0100	[diff] [blame]	175	u64 max_mem_size = U64_MAX;
				176
Keith Mannthey	45e0b78	2006-09-30 23:27:09 -0700	[diff] [blame]	177	/* add this memory to iomem resource */
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	178	static struct resource *register_memory_resource(u64 start, u64 size,
				179	const char *resource_name)
Keith Mannthey	45e0b78	2006-09-30 23:27:09 -0700	[diff] [blame]	180	{
Dave Hansen	2794129	2019-02-25 10:57:36 -0800	[diff] [blame]	181	struct resource *res;
				182	unsigned long flags = IORESOURCE_SYSTEM_RAM \| IORESOURCE_BUSY;
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	183
				184	if (strcmp(resource_name, "System RAM"))
David Hildenbrand	7cf603d	2020-10-15 20:08:33 -0700	[diff] [blame]	185	flags \|= IORESOURCE_SYSRAM_DRIVER_MANAGED;
Juergen Gross	357b4da	2019-02-14 11:42:39 +0100	[diff] [blame]	186
Anshuman Khandual	bca3fea	2021-02-25 17:17:33 -0800	[diff] [blame]	187	if (!mhp_range_allowed(start, size, true))
				188	return ERR_PTR(-E2BIG);
				189
Baoquan He	f3cd4c8	2020-04-06 20:06:50 -0700	[diff] [blame]	190	/*
				191	* Make sure value parsed from 'mem=' only restricts memory adding
				192	* while booting, so that memory hotplug won't be impacted. Please
				193	* refer to document of 'mem=' in kernel-parameters.txt for more
				194	* details.
				195	*/
				196	if (start + size > max_mem_size && system_state < SYSTEM_RUNNING)
Juergen Gross	357b4da	2019-02-14 11:42:39 +0100	[diff] [blame]	197	return ERR_PTR(-E2BIG);
				198
Dave Hansen	2794129	2019-02-25 10:57:36 -0800	[diff] [blame]	199	/*
				200	* Request ownership of the new memory range. This might be
				201	* a child of an existing resource that was present but
				202	* not marked as busy.
				203	*/
				204	res = __request_region(&iomem_resource, start, size,
				205	resource_name, flags);
Keith Mannthey	45e0b78	2006-09-30 23:27:09 -0700	[diff] [blame]	206
Dave Hansen	2794129	2019-02-25 10:57:36 -0800	[diff] [blame]	207	if (!res) {
				208	pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n",
				209	start, start + size);
Vitaly Kuznetsov	6f754ba	2016-01-14 15:21:55 -0800	[diff] [blame]	210	return ERR_PTR(-EEXIST);
Keith Mannthey	45e0b78	2006-09-30 23:27:09 -0700	[diff] [blame]	211	}
				212	return res;
				213	}
				214
				215	static void release_memory_resource(struct resource *res)
				216	{
				217	if (!res)
				218	return;
				219	release_resource(res);
				220	kfree(res);
Keith Mannthey	45e0b78	2006-09-30 23:27:09 -0700	[diff] [blame]	221	}
				222
Dan Williams	7ea6216	2019-07-18 15:58:22 -0700	[diff] [blame]	223	static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
				224	const char *reason)
				225	{
				226	/*
				227	* Disallow all operations smaller than a sub-section and only
				228	* allow operations smaller than a section for
				229	* SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range()
				230	* enforces a larger memory_block_size_bytes() granularity for
				231	* memory that will be marked online, so this check should only
				232	* fire for direct arch_{add,remove}_memory() users outside of
				233	* add_memory_resource().
				234	*/
				235	unsigned long min_align;
				236
				237	if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
				238	min_align = PAGES_PER_SUBSECTION;
				239	else
				240	min_align = PAGES_PER_SECTION;
				241	if (!IS_ALIGNED(pfn, min_align)
				242	\|\| !IS_ALIGNED(nr_pages, min_align)) {
				243	WARN(1, "Misaligned __%s_pages start: %#lx end: #%lx\n",
				244	reason, pfn, pfn + nr_pages - 1);
				245	return -EINVAL;
				246	}
				247	return 0;
				248	}
				249
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	250	/*
Dan Williams	9f605f2	2021-02-25 17:16:57 -0800	[diff] [blame]	251	* Return page for the valid pfn only if the page is online. All pfn
				252	* walkers which rely on the fully initialized page->flags and others
				253	* should use this rather than pfn_valid && pfn_to_page
				254	*/
				255	struct page *pfn_to_online_page(unsigned long pfn)
				256	{
				257	unsigned long nr = pfn_to_section_nr(pfn);
Dan Williams	1f90a34	2021-02-25 17:17:05 -0800	[diff] [blame]	258	struct dev_pagemap *pgmap;
Dan Williams	9f9b02e	2021-02-25 17:17:01 -0800	[diff] [blame]	259	struct mem_section *ms;
Dan Williams	9f605f2	2021-02-25 17:16:57 -0800	[diff] [blame]	260
Dan Williams	9f9b02e	2021-02-25 17:17:01 -0800	[diff] [blame]	261	if (nr >= NR_MEM_SECTIONS)
				262	return NULL;
				263
				264	ms = __nr_to_section(nr);
				265	if (!online_section(ms))
				266	return NULL;
				267
				268	/*
				269	* Save some code text when online_section() +
				270	* pfn_section_valid() are sufficient.
				271	*/
				272	if (IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) && !pfn_valid(pfn))
				273	return NULL;
				274
				275	if (!pfn_section_valid(ms, pfn))
				276	return NULL;
				277
Dan Williams	1f90a34	2021-02-25 17:17:05 -0800	[diff] [blame]	278	if (!online_device_section(ms))
				279	return pfn_to_page(pfn);
				280
				281	/*
				282	* Slowpath: when ZONE_DEVICE collides with
				283	* ZONE_{NORMAL,MOVABLE} within the same section some pfns in
				284	* the section may be 'offline' but 'valid'. Only
				285	* get_dev_pagemap() can determine sub-section online status.
				286	*/
				287	pgmap = get_dev_pagemap(pfn, NULL);
				288	put_dev_pagemap(pgmap);
				289
				290	/* The presence of a pgmap indicates ZONE_DEVICE offline pfn */
				291	if (pgmap)
				292	return NULL;
				293
Dan Williams	9f9b02e	2021-02-25 17:17:01 -0800	[diff] [blame]	294	return pfn_to_page(pfn);
Dan Williams	9f605f2	2021-02-25 17:16:57 -0800	[diff] [blame]	295	}
				296	EXPORT_SYMBOL_GPL(pfn_to_online_page);
				297
				298	/*
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	299	* Reasonably generic function for adding memory. It is
				300	* expected that archs that support memory hotplug will
				301	* call this function after deciding the zone to which to
				302	* add the new pages.
				303	*/
Dan Williams	7ea6216	2019-07-18 15:58:22 -0700	[diff] [blame]	304	int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
Logan Gunthorpe	f5637d3	2020-04-10 14:33:21 -0700	[diff] [blame]	305	struct mhp_params *params)
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	306	{
David Hildenbrand	6cdd0b3	2020-04-06 20:06:56 -0700	[diff] [blame]	307	const unsigned long end_pfn = pfn + nr_pages;
				308	unsigned long cur_nr_pages;
Dan Williams	9a84503	2019-07-18 15:58:43 -0700	[diff] [blame]	309	int err;
Logan Gunthorpe	f5637d3	2020-04-10 14:33:21 -0700	[diff] [blame]	310	struct vmem_altmap *altmap = params->altmap;
Dan Williams	4b94ffd	2016-01-15 16:56:22 -0800	[diff] [blame]	311
Logan Gunthorpe	bfeb022	2020-04-10 14:33:36 -0700	[diff] [blame]	312	if (WARN_ON_ONCE(!params->pgprot.pgprot))
				313	return -EINVAL;
				314
Anshuman Khandual	bca3fea	2021-02-25 17:17:33 -0800	[diff] [blame]	315	VM_BUG_ON(!mhp_range_allowed(PFN_PHYS(pfn), nr_pages * PAGE_SIZE, false));
Alastair D'Silva	dca4436	2019-11-30 17:53:48 -0800	[diff] [blame]	316
Dan Williams	4b94ffd	2016-01-15 16:56:22 -0800	[diff] [blame]	317	if (altmap) {
				318	/*
				319	* Validate altmap is within bounds of the total request
				320	*/
Dan Williams	7ea6216	2019-07-18 15:58:22 -0700	[diff] [blame]	321	if (altmap->base_pfn != pfn
Dan Williams	4b94ffd	2016-01-15 16:56:22 -0800	[diff] [blame]	322	\|\| vmem_altmap_offset(altmap) > nr_pages) {
				323	pr_warn_once("memory add fail, invalid altmap\n");
Dan Williams	7ea6216	2019-07-18 15:58:22 -0700	[diff] [blame]	324	return -EINVAL;
Dan Williams	4b94ffd	2016-01-15 16:56:22 -0800	[diff] [blame]	325	}
				326	altmap->alloc = 0;
				327	}
				328
Dan Williams	7ea6216	2019-07-18 15:58:22 -0700	[diff] [blame]	329	err = check_pfn_span(pfn, nr_pages, "add");
				330	if (err)
				331	return err;
				332
David Hildenbrand	6cdd0b3	2020-04-06 20:06:56 -0700	[diff] [blame]	333	for (; pfn < end_pfn; pfn += cur_nr_pages) {
				334	/* Select all remaining pages up to the next section boundary */
				335	cur_nr_pages = min(end_pfn - pfn,
				336	SECTION_ALIGN_UP(pfn + 1) - pfn);
				337	err = sparse_add_section(nid, pfn, cur_nr_pages, altmap);
Dan Williams	ba72b4c	2019-07-18 15:58:26 -0700	[diff] [blame]	338	if (err)
				339	break;
Michal Hocko	f64ac5e	2017-10-03 16:16:16 -0700	[diff] [blame]	340	cond_resched();
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	341	}
Zhu Guihua	c435a39	2015-06-24 16:58:42 -0700	[diff] [blame]	342	vmemmap_populate_print_last();
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	343	return err;
				344	}
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	345
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	346	/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
YASUAKI ISHIMATSU	d09b013	2017-10-03 16:16:32 -0700	[diff] [blame]	347	static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	348	unsigned long start_pfn,
				349	unsigned long end_pfn)
				350	{
Dan Williams	49ba3c6	2019-07-18 15:58:07 -0700	[diff] [blame]	351	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) {
David Hildenbrand	7ce700b	2019-11-21 17:53:56 -0800	[diff] [blame]	352	if (unlikely(!pfn_to_online_page(start_pfn)))
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	353	continue;
				354
				355	if (unlikely(pfn_to_nid(start_pfn) != nid))
				356	continue;
				357
David Hildenbrand	9b05158	2020-02-03 17:34:12 -0800	[diff] [blame]	358	if (zone != page_zone(pfn_to_page(start_pfn)))
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	359	continue;
				360
				361	return start_pfn;
				362	}
				363
				364	return 0;
				365	}
				366
				367	/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
YASUAKI ISHIMATSU	d09b013	2017-10-03 16:16:32 -0700	[diff] [blame]	368	static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	369	unsigned long start_pfn,
				370	unsigned long end_pfn)
				371	{
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	372	unsigned long pfn;
				373
				374	/* pfn is the end pfn of a memory section. */
				375	pfn = end_pfn - 1;
Dan Williams	49ba3c6	2019-07-18 15:58:07 -0700	[diff] [blame]	376	for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) {
David Hildenbrand	7ce700b	2019-11-21 17:53:56 -0800	[diff] [blame]	377	if (unlikely(!pfn_to_online_page(pfn)))
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	378	continue;
				379
				380	if (unlikely(pfn_to_nid(pfn) != nid))
				381	continue;
				382
David Hildenbrand	9b05158	2020-02-03 17:34:12 -0800	[diff] [blame]	383	if (zone != page_zone(pfn_to_page(pfn)))
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	384	continue;
				385
				386	return pfn;
				387	}
				388
				389	return 0;
				390	}
				391
				392	static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
				393	unsigned long end_pfn)
				394	{
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	395	unsigned long pfn;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	396	int nid = zone_to_nid(zone);
				397
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	398	if (zone->zone_start_pfn == start_pfn) {
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	399	/*
				400	* If the section is smallest section in the zone, it need
				401	* shrink zone->zone_start_pfn and zone->zone_spanned_pages.
				402	* In this case, we find second smallest valid mem_section
				403	* for shrinking zone.
				404	*/
				405	pfn = find_smallest_section_pfn(nid, zone, end_pfn,
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	406	zone_end_pfn(zone));
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	407	if (pfn) {
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	408	zone->spanned_pages = zone_end_pfn(zone) - pfn;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	409	zone->zone_start_pfn = pfn;
David Hildenbrand	950b68d	2020-02-03 17:34:16 -0800	[diff] [blame]	410	} else {
				411	zone->zone_start_pfn = 0;
				412	zone->spanned_pages = 0;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	413	}
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	414	} else if (zone_end_pfn(zone) == end_pfn) {
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	415	/*
				416	* If the section is biggest section in the zone, it need
				417	* shrink zone->spanned_pages.
				418	* In this case, we find second biggest valid mem_section for
				419	* shrinking zone.
				420	*/
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	421	pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	422	start_pfn);
				423	if (pfn)
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	424	zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
David Hildenbrand	950b68d	2020-02-03 17:34:16 -0800	[diff] [blame]	425	else {
				426	zone->zone_start_pfn = 0;
				427	zone->spanned_pages = 0;
				428	}
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	429	}
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	430	}
				431
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	432	static void update_pgdat_span(struct pglist_data *pgdat)
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	433	{
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	434	unsigned long node_start_pfn = 0, node_end_pfn = 0;
				435	struct zone *zone;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	436
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	437	for (zone = pgdat->node_zones;
				438	zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
Miaohe Lin	6c922cf	2021-02-25 17:17:21 -0800	[diff] [blame]	439	unsigned long end_pfn = zone_end_pfn(zone);
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	440
				441	/* No need to lock the zones, they can't change. */
David Hildenbrand	656d571	2019-11-05 21:17:10 -0800	[diff] [blame]	442	if (!zone->spanned_pages)
				443	continue;
				444	if (!node_end_pfn) {
				445	node_start_pfn = zone->zone_start_pfn;
Miaohe Lin	6c922cf	2021-02-25 17:17:21 -0800	[diff] [blame]	446	node_end_pfn = end_pfn;
David Hildenbrand	656d571	2019-11-05 21:17:10 -0800	[diff] [blame]	447	continue;
				448	}
				449
Miaohe Lin	6c922cf	2021-02-25 17:17:21 -0800	[diff] [blame]	450	if (end_pfn > node_end_pfn)
				451	node_end_pfn = end_pfn;
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	452	if (zone->zone_start_pfn < node_start_pfn)
				453	node_start_pfn = zone->zone_start_pfn;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	454	}
				455
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	456	pgdat->node_start_pfn = node_start_pfn;
				457	pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	458	}
				459
David Hildenbrand	feee6b2	2020-01-04 12:59:33 -0800	[diff] [blame]	460	void __ref remove_pfn_range_from_zone(struct zone *zone,
				461	unsigned long start_pfn,
				462	unsigned long nr_pages)
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	463	{
Ben Widawsky	b7e3deb	2020-06-25 20:30:51 -0700	[diff] [blame]	464	const unsigned long end_pfn = start_pfn + nr_pages;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	465	struct pglist_data *pgdat = zone->zone_pgdat;
Oscar Salvador	27cacaa	2021-06-30 18:52:46 -0700	[diff] [blame]	466	unsigned long pfn, cur_nr_pages;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	467
David Hildenbrand	d33695b	2020-02-03 17:34:09 -0800	[diff] [blame]	468	/* Poison struct pages because they are now uninitialized again. */
Ben Widawsky	b7e3deb	2020-06-25 20:30:51 -0700	[diff] [blame]	469	for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
				470	cond_resched();
				471
				472	/* Select all remaining pages up to the next section boundary */
				473	cur_nr_pages =
				474	min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn);
				475	page_init_poison(pfn_to_page(pfn),
				476	sizeof(struct page) * cur_nr_pages);
				477	}
David Hildenbrand	d33695b	2020-02-03 17:34:09 -0800	[diff] [blame]	478
David Hildenbrand	7ce700b	2019-11-21 17:53:56 -0800	[diff] [blame]	479	/*
				480	* Zone shrinking code cannot properly deal with ZONE_DEVICE. So
				481	* we will not try to shrink the zones - which is okay as
				482	* set_zone_contiguous() cannot deal with ZONE_DEVICE either way.
				483	*/
Miaohe Lin	5ef5f81	2021-09-07 19:55:52 -0700	[diff] [blame]	484	if (zone_is_zone_device(zone))
David Hildenbrand	7ce700b	2019-11-21 17:53:56 -0800	[diff] [blame]	485	return;
David Hildenbrand	7ce700b	2019-11-21 17:53:56 -0800	[diff] [blame]	486
David Hildenbrand	feee6b2	2020-01-04 12:59:33 -0800	[diff] [blame]	487	clear_zone_contiguous(zone);
				488
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	489	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	490	update_pgdat_span(pgdat);
David Hildenbrand	feee6b2	2020-01-04 12:59:33 -0800	[diff] [blame]	491
				492	set_zone_contiguous(zone);
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	493	}
				494
/*
 * Remove @nr_pages starting at @pfn from the section containing @pfn.
 * Warns once and does nothing if that section is not valid.
 */
static void __remove_section(unsigned long pfn, unsigned long nr_pages,
			     unsigned long map_offset,
			     struct vmem_altmap *altmap)
{
	struct mem_section *section = __pfn_to_section(pfn);

	if (WARN_ON_ONCE(!valid_section(section)))
		return;

	sparse_remove_section(section, pfn, nr_pages, map_offset, altmap);
}
				506
/**
 * __remove_pages() - remove sections of pages
 * @pfn: starting pageframe (alignment is enforced by check_pfn_span())
 * @nr_pages: number of pages to remove (same alignment requirement)
 * @altmap: alternative device page map or %NULL if default memmap is used
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 *
 * A misaligned range is rejected (with a warning) and nothing is removed.
 */
void __remove_pages(unsigned long pfn, unsigned long nr_pages,
		    struct vmem_altmap *altmap)
{
	const unsigned long end_pfn = pfn + nr_pages;
	/* The altmap offset only applies to the first chunk. */
	unsigned long map_offset = vmem_altmap_offset(altmap);
	unsigned long chunk;

	if (check_pfn_span(pfn, nr_pages, "remove"))
		return;

	while (pfn < end_pfn) {
		cond_resched();
		/* Select all remaining pages up to the next section boundary */
		chunk = min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn);
		__remove_section(pfn, chunk, map_offset, altmap);
		map_offset = 0;
		pfn += chunk;
	}
}
Badari Pulavarty	ea01ea9	2008-04-28 02:12:01 -0700	[diff] [blame]	539
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	540	int set_online_page_callback(online_page_callback_t callback)
				541	{
				542	int rc = -EINVAL;
				543
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	544	get_online_mems();
				545	mutex_lock(&online_page_callback_lock);
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	546
				547	if (online_page_callback == generic_online_page) {
				548	online_page_callback = callback;
				549	rc = 0;
				550	}
				551
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	552	mutex_unlock(&online_page_callback_lock);
				553	put_online_mems();
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	554
				555	return rc;
				556	}
				557	EXPORT_SYMBOL_GPL(set_online_page_callback);
				558
				559	int restore_online_page_callback(online_page_callback_t callback)
				560	{
				561	int rc = -EINVAL;
				562
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	563	get_online_mems();
				564	mutex_lock(&online_page_callback_lock);
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	565
				566	if (online_page_callback == callback) {
				567	online_page_callback = generic_online_page;
				568	rc = 0;
				569	}
				570
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	571	mutex_unlock(&online_page_callback_lock);
				572	put_online_mems();
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	573
				574	return rc;
				575	}
				576	EXPORT_SYMBOL_GPL(restore_online_page_callback);
				577
/*
 * Default online-page callback: hand a chunk of 2^@order pages starting at
 * @page to __free_pages_core() and account the pages in the total RAM
 * counter.
 */
void generic_online_page(struct page *page, unsigned int order)
{
	/*
	 * With debug_pagealloc enabled, freeing a page tries to unmap it.
	 * Map the chunk up front instead of special-casing the page freeing
	 * fast path.
	 */
	debug_pagealloc_map_pages(page, 1 << order);

	__free_pages_core(page, order);
	totalram_pages_add(1UL << order);
}
EXPORT_SYMBOL_GPL(generic_online_page);
Arun KS	a9cd410	2019-03-05 15:42:14 -0800	[diff] [blame]	590
David Hildenbrand	aac6532	2020-10-15 20:08:11 -0700	[diff] [blame]	591	static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	592	{
David Hildenbrand	b2c2ab2	2019-09-23 15:36:02 -0700	[diff] [blame]	593	const unsigned long end_pfn = start_pfn + nr_pages;
				594	unsigned long pfn;
Michal Hocko	2d070ea	2017-07-06 15:37:56 -0700	[diff] [blame]	595
David Hildenbrand	b2c2ab2	2019-09-23 15:36:02 -0700	[diff] [blame]	596	/*
David Hildenbrand	aac6532	2020-10-15 20:08:11 -0700	[diff] [blame]	597	* Online the pages in MAX_ORDER - 1 aligned chunks. The callback might
				598	* decide to not expose all pages to the buddy (e.g., expose them
				599	* later). We account all pages as being online and belonging to this
				600	* zone ("present").
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	601	* When using memmap_on_memory, the range might not be aligned to
				602	* MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect
				603	* this and the first chunk to online will be pageblock_nr_pages.
David Hildenbrand	b2c2ab2	2019-09-23 15:36:02 -0700	[diff] [blame]	604	*/
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	605	for (pfn = start_pfn; pfn < end_pfn;) {
				606	int order = min(MAX_ORDER - 1UL, __ffs(pfn));
				607
				608	(*online_page_callback)(pfn_to_page(pfn), order);
				609	pfn += (1UL << order);
				610	}
Michal Hocko	2d070ea	2017-07-06 15:37:56 -0700	[diff] [blame]	611
David Hildenbrand	b2c2ab2	2019-09-23 15:36:02 -0700	[diff] [blame]	612	/* mark all involved sections as online */
				613	online_mem_sections(start_pfn, end_pfn);
KAMEZAWA Hiroyuki	75884fb	2007-10-16 01:26:10 -0700	[diff] [blame]	614	}
				615
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	616	/* check which state of node_states will be changed when online memory */
				617	static void node_states_check_changes_online(unsigned long nr_pages,
				618	struct zone zone, struct memory_notify arg)
				619	{
				620	int nid = zone_to_nid(zone);
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	621
Anshuman Khandual	98fa15f	2019-03-05 15:42:58 -0800	[diff] [blame]	622	arg->status_change_nid = NUMA_NO_NODE;
				623	arg->status_change_nid_normal = NUMA_NO_NODE;
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	624
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	625	if (!node_state(nid, N_MEMORY))
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	626	arg->status_change_nid = nid;
Oscar Salvador	8efe33f	2018-10-26 15:07:34 -0700	[diff] [blame]	627	if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
				628	arg->status_change_nid_normal = nid;
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	629	}
				630
				631	static void node_states_set_node(int node, struct memory_notify *arg)
				632	{
				633	if (arg->status_change_nid_normal >= 0)
				634	node_set_state(node, N_NORMAL_MEMORY);
				635
Oscar Salvador	83d8361	2018-10-26 15:07:25 -0700	[diff] [blame]	636	if (arg->status_change_nid >= 0)
				637	node_set_state(node, N_MEMORY);
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	638	}
				639
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	640	static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
				641	unsigned long nr_pages)
				642	{
				643	unsigned long old_end_pfn = zone_end_pfn(zone);
				644
				645	if (zone_is_empty(zone) \|\| start_pfn < zone->zone_start_pfn)
				646	zone->zone_start_pfn = start_pfn;
				647
				648	zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
				649	}
				650
				651	static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
				652	unsigned long nr_pages)
				653	{
				654	unsigned long old_end_pfn = pgdat_end_pfn(pgdat);
				655
				656	if (!pgdat->node_spanned_pages \|\| start_pfn < pgdat->node_start_pfn)
				657	pgdat->node_start_pfn = start_pfn;
				658
				659	pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	660
David Hildenbrand	3fccb74	2019-09-23 15:35:37 -0700	[diff] [blame]	661	}
Dan Williams	1f90a34	2021-02-25 17:17:05 -0800	[diff] [blame]	662
				663	static void section_taint_zone_device(unsigned long pfn)
				664	{
				665	struct mem_section *ms = __pfn_to_section(pfn);
				666
				667	ms->section_mem_map \|= SECTION_TAINT_ZONE_DEVICE;
				668	}
				669
David Hildenbrand	3fccb74	2019-09-23 15:35:37 -0700	[diff] [blame]	670	/*
				671	* Associate the pfn range with the given zone, initializing the memmaps
				672	* and resizing the pgdat/zone data to span the added pages. After this
				673	* call, all affected pages are PG_reserved.
David Hildenbrand	d882c00	2020-10-15 20:08:19 -0700	[diff] [blame]	674	*
				675	* All aligned pageblocks are initialized to the specified migratetype
				676	* (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
				677	* zone stats (e.g., nr_isolate_pageblock) are touched.
David Hildenbrand	3fccb74	2019-09-23 15:35:37 -0700	[diff] [blame]	678	*/
Christoph Hellwig	a99583e	2017-12-29 08:53:57 +0100	[diff] [blame]	679	void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
David Hildenbrand	d882c00	2020-10-15 20:08:19 -0700	[diff] [blame]	680	unsigned long nr_pages,
				681	struct vmem_altmap *altmap, int migratetype)
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	682	{
				683	struct pglist_data *pgdat = zone->zone_pgdat;
				684	int nid = pgdat->node_id;
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	685
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	686	clear_zone_contiguous(zone);
				687
Wei Yang	fa004ab	2018-12-28 00:37:10 -0800	[diff] [blame]	688	if (zone_is_empty(zone))
				689	init_currently_empty_zone(zone, start_pfn, nr_pages);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	690	resize_zone_range(zone, start_pfn, nr_pages);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	691	resize_pgdat_range(pgdat, start_pfn, nr_pages);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	692
				693	/*
Dan Williams	1f90a34	2021-02-25 17:17:05 -0800	[diff] [blame]	694	* Subsection population requires care in pfn_to_online_page().
				695	* Set the taint to enable the slow path detection of
				696	* ZONE_DEVICE pages in an otherwise ZONE_{NORMAL,MOVABLE}
				697	* section.
				698	*/
				699	if (zone_is_zone_device(zone)) {
				700	if (!IS_ALIGNED(start_pfn, PAGES_PER_SECTION))
				701	section_taint_zone_device(start_pfn);
				702	if (!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))
				703	section_taint_zone_device(start_pfn + nr_pages);
				704	}
				705
				706	/*
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	707	* TODO now we have a visible range of pages which are not associated
				708	* with their zone properly. Not nice but set_pfnblock_flags_mask
				709	* expects the zone spans the pfn range. All the pages in the range
				710	* are reserved so nobody should be touching them so we should be safe
				711	*/
Baoquan He	ab28cb6	2021-02-24 12:06:14 -0800	[diff] [blame]	712	memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0,
David Hildenbrand	d882c00	2020-10-15 20:08:19 -0700	[diff] [blame]	713	MEMINIT_HOTPLUG, altmap, migratetype);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	714
				715	set_zone_contiguous(zone);
				716	}
				717
/* MOVABLE vs. KERNEL_EARLY page totals collected over a set of zones. */
struct auto_movable_stats {
	/* present_early_pages of kernel zones, minus their CMA pages */
	unsigned long kernel_early_pages;
	/* present_pages of ZONE_MOVABLE, plus CMA pages of kernel zones */
	unsigned long movable_pages;
};
				722
				723	static void auto_movable_stats_account_zone(struct auto_movable_stats *stats,
				724	struct zone *zone)
				725	{
				726	if (zone_idx(zone) == ZONE_MOVABLE) {
				727	stats->movable_pages += zone->present_pages;
				728	} else {
				729	stats->kernel_early_pages += zone->present_early_pages;
				730	#ifdef CONFIG_CMA
				731	/*
				732	* CMA pages (never on hotplugged memory) behave like
				733	* ZONE_MOVABLE.
				734	*/
				735	stats->movable_pages += zone->cma_pages;
				736	stats->kernel_early_pages -= zone->cma_pages;
				737	#endif /* CONFIG_CMA */
				738	}
				739	}
/* Totals accumulated while walking dynamic memory groups. */
struct auto_movable_group_stats {
	/* sum of present_movable_pages over the walked groups */
	unsigned long movable_pages;
	/* early kernel pages the walked groups still need to meet the ratio */
	unsigned long req_kernel_early_pages;
};
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	744
David Hildenbrand	3fcebf9	2021-09-07 19:55:48 -0700	[diff] [blame]	745	static int auto_movable_stats_account_group(struct memory_group *group,
				746	void *arg)
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	747	{
David Hildenbrand	3fcebf9	2021-09-07 19:55:48 -0700	[diff] [blame]	748	const int ratio = READ_ONCE(auto_movable_ratio);
				749	struct auto_movable_group_stats *stats = arg;
				750	long pages;
				751
				752	/*
				753	* We don't support modifying the config while the auto-movable online
				754	* policy is already enabled. Just avoid the division by zero below.
				755	*/
				756	if (!ratio)
				757	return 0;
				758
				759	/*
				760	* Calculate how many early kernel pages this group requires to
				761	* satisfy the configured zone ratio.
				762	*/
				763	pages = group->present_movable_pages * 100 / ratio;
				764	pages -= group->present_kernel_pages;
				765
				766	if (pages > 0)
				767	stats->req_kernel_early_pages += pages;
				768	stats->movable_pages += group->present_movable_pages;
				769	return 0;
				770	}
				771
				772	static bool auto_movable_can_online_movable(int nid, struct memory_group *group,
				773	unsigned long nr_pages)
				774	{
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	775	unsigned long kernel_early_pages, movable_pages;
David Hildenbrand	3fcebf9	2021-09-07 19:55:48 -0700	[diff] [blame]	776	struct auto_movable_group_stats group_stats = {};
				777	struct auto_movable_stats stats = {};
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	778	pg_data_t *pgdat = NODE_DATA(nid);
				779	struct zone *zone;
				780	int i;
				781
				782	/* Walk all relevant zones and collect MOVABLE vs. KERNEL stats. */
				783	if (nid == NUMA_NO_NODE) {
				784	/* TODO: cache values */
				785	for_each_populated_zone(zone)
				786	auto_movable_stats_account_zone(&stats, zone);
				787	} else {
				788	for (i = 0; i < MAX_NR_ZONES; i++) {
				789	zone = pgdat->node_zones + i;
				790	if (populated_zone(zone))
				791	auto_movable_stats_account_zone(&stats, zone);
				792	}
				793	}
				794
				795	kernel_early_pages = stats.kernel_early_pages;
				796	movable_pages = stats.movable_pages;
				797
				798	/*
David Hildenbrand	3fcebf9	2021-09-07 19:55:48 -0700	[diff] [blame]	799	* Kernel memory inside dynamic memory group allows for more MOVABLE
				800	* memory within the same group. Remove the effect of all but the
				801	* current group from the stats.
				802	*/
				803	walk_dynamic_memory_groups(nid, auto_movable_stats_account_group,
				804	group, &group_stats);
				805	if (kernel_early_pages <= group_stats.req_kernel_early_pages)
				806	return false;
				807	kernel_early_pages -= group_stats.req_kernel_early_pages;
				808	movable_pages -= group_stats.movable_pages;
				809
				810	if (group && group->is_dynamic)
				811	kernel_early_pages += group->present_kernel_pages;
				812
				813	/*
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	814	* Test if we could online the given number of pages to ZONE_MOVABLE
				815	* and still stay in the configured ratio.
				816	*/
				817	movable_pages += nr_pages;
				818	return movable_pages <= (auto_movable_ratio * kernel_early_pages) / 100;
				819	}
				820
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	821	/*
Michal Hocko	c246a21	2017-07-06 15:38:18 -0700	[diff] [blame]	822	* Returns a default kernel memory zone for the given pfn range.
				823	* If no kernel zone covers this pfn range it will automatically go
				824	* to the ZONE_NORMAL.
				825	*/
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	826	static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn,
Michal Hocko	c246a21	2017-07-06 15:38:18 -0700	[diff] [blame]	827	unsigned long nr_pages)
				828	{
				829	struct pglist_data *pgdat = NODE_DATA(nid);
				830	int zid;
				831
				832	for (zid = 0; zid <= ZONE_NORMAL; zid++) {
				833	struct zone *zone = &pgdat->node_zones[zid];
				834
				835	if (zone_intersects(zone, start_pfn, nr_pages))
				836	return zone;
				837	}
				838
				839	return &pgdat->node_zones[ZONE_NORMAL];
				840	}
				841
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	842	/*
				843	* Determine to which zone to online memory dynamically based on user
				844	* configuration and system stats. We care about the following ratio:
				845	*
				846	* MOVABLE : KERNEL
				847	*
				848	* Whereby MOVABLE is memory in ZONE_MOVABLE and KERNEL is memory in
				849	* one of the kernel zones. CMA pages inside one of the kernel zones really
				850	* behaves like ZONE_MOVABLE, so we treat them accordingly.
				851	*
				852	* We don't allow for hotplugged memory in a KERNEL zone to increase the
				853	* amount of MOVABLE memory we can have, so we end up with:
				854	*
				855	* MOVABLE : KERNEL_EARLY
				856	*
				857	* Whereby KERNEL_EARLY is memory in one of the kernel zones, available since
				858	* boot. We base our calculation on KERNEL_EARLY internally, because:
				859	*
				860	* a) Hotplugged memory in one of the kernel zones can sometimes still get
				861	* hotunplugged, especially when hot(un)plugging individual memory blocks.
				862	* There is no coordination across memory devices, therefore "automatic"
				863	* hotunplugging, as implemented in hypervisors, could result in zone
				864	* imbalances.
				865	* b) Early/boot memory in one of the kernel zones can usually not get
				866	* hotunplugged again (e.g., no firmware interface to unplug, fragmented
				867	* with unmovable allocations). While there are corner cases where it might
				868	* still work, it is barely relevant in practice.
				869	*
David Hildenbrand	3fcebf9	2021-09-07 19:55:48 -0700	[diff] [blame]	870	* Exceptions are dynamic memory groups, which allow for more MOVABLE
				871	* memory within the same memory group -- because in that case, there is
				872	* coordination within the single memory device managed by a single driver.
				873	*
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	874	* We rely on "present pages" instead of "managed pages", as the latter is
				875	* highly unreliable and dynamic in virtualized environments, and does not
				876	* consider boot time allocations. For example, memory ballooning adjusts the
				877	* managed pages when inflating/deflating the balloon, and balloon compaction
				878	* can even migrate inflated pages between zones.
				879	*
				880	* Using "present pages" is better but some things to keep in mind are:
				881	*
				882	* a) Some memblock allocations, such as for the crashkernel area, are
				883	* effectively unused by the kernel, yet they account to "present pages".
				884	* Fortunately, these allocations are comparatively small in relevant setups
				885	* (e.g., fraction of system memory).
				886	* b) Some hotplugged memory blocks in virtualized environments, especially
				887	* hotplugged by virtio-mem, look like they are completely present, however,
				888	* only parts of the memory block are actually currently usable.
				889	* "present pages" is an upper limit that can get reached at runtime. As
				890	* we base our calculations on KERNEL_EARLY, this is not an issue.
				891	*/
David Hildenbrand	445fcf7	2021-09-07 19:55:45 -0700	[diff] [blame]	892	static struct zone *auto_movable_zone_for_pfn(int nid,
				893	struct memory_group *group,
				894	unsigned long pfn,
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	895	unsigned long nr_pages)
				896	{
David Hildenbrand	445fcf7	2021-09-07 19:55:45 -0700	[diff] [blame]	897	unsigned long online_pages = 0, max_pages, end_pfn;
				898	struct page *page;
				899
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	900	if (!auto_movable_ratio)
				901	goto kernel_zone;
				902
David Hildenbrand	445fcf7	2021-09-07 19:55:45 -0700	[diff] [blame]	903	if (group && !group->is_dynamic) {
				904	max_pages = group->s.max_pages;
				905	online_pages = group->present_movable_pages;
				906
				907	/* If anything is !MOVABLE online the rest !MOVABLE. */
				908	if (group->present_kernel_pages)
				909	goto kernel_zone;
				910	} else if (!group \|\| group->d.unit_pages == nr_pages) {
				911	max_pages = nr_pages;
				912	} else {
				913	max_pages = group->d.unit_pages;
				914	/*
				915	* Take a look at all online sections in the current unit.
				916	* We can safely assume that all pages within a section belong
				917	* to the same zone, because dynamic memory groups only deal
				918	* with hotplugged memory.
				919	*/
				920	pfn = ALIGN_DOWN(pfn, group->d.unit_pages);
				921	end_pfn = pfn + group->d.unit_pages;
				922	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
				923	page = pfn_to_online_page(pfn);
				924	if (!page)
				925	continue;
				926	/* If anything is !MOVABLE online the rest !MOVABLE. */
				927	if (page_zonenum(page) != ZONE_MOVABLE)
				928	goto kernel_zone;
				929	online_pages += PAGES_PER_SECTION;
				930	}
				931	}
				932
				933	/*
				934	* Online MOVABLE if we could currently online all remaining parts
				935	* MOVABLE. We expect to (add+) online them immediately next, so if
				936	* nobody interferes, all will be MOVABLE if possible.
				937	*/
				938	nr_pages = max_pages - online_pages;
David Hildenbrand	3fcebf9	2021-09-07 19:55:48 -0700	[diff] [blame]	939	if (!auto_movable_can_online_movable(NUMA_NO_NODE, group, nr_pages))
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	940	goto kernel_zone;
				941
				942	#ifdef CONFIG_NUMA
				943	if (auto_movable_numa_aware &&
David Hildenbrand	3fcebf9	2021-09-07 19:55:48 -0700	[diff] [blame]	944	!auto_movable_can_online_movable(nid, group, nr_pages))
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	945	goto kernel_zone;
				946	#endif /* CONFIG_NUMA */
				947
				948	return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
				949	kernel_zone:
				950	return default_kernel_zone_for_pfn(nid, pfn, nr_pages);
				951	}
				952
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	953	static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
				954	unsigned long nr_pages)
Michal Hocko	e5e6893	2017-09-06 16:19:37 -0700	[diff] [blame]	955	{
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	956	struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn,
				957	nr_pages);
				958	struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
				959	bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages);
				960	bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages);
Michal Hocko	e5e6893	2017-09-06 16:19:37 -0700	[diff] [blame]	961
				962	/*
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	963	* We inherit the existing zone in a simple case where zones do not
				964	* overlap in the given range
Michal Hocko	e5e6893	2017-09-06 16:19:37 -0700	[diff] [blame]	965	*/
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	966	if (in_kernel ^ in_movable)
				967	return (in_kernel) ? kernel_zone : movable_zone;
Michal Hocko	e5e6893	2017-09-06 16:19:37 -0700	[diff] [blame]	968
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	969	/*
				970	* If the range doesn't belong to any zone or two zones overlap in the
				971	* given range then we use movable zone only if movable_node is
				972	* enabled because we always online to a kernel zone by default.
				973	*/
				974	return movable_node_enabled ? movable_zone : kernel_zone;
Michal Hocko	9f123ab	2017-07-10 15:48:37 -0700	[diff] [blame]	975	}
				976
David Hildenbrand	7cf209b	2021-09-07 19:54:59 -0700	[diff] [blame]	977	struct zone *zone_for_pfn_range(int online_type, int nid,
David Hildenbrand	445fcf7	2021-09-07 19:55:45 -0700	[diff] [blame]	978	struct memory_group *group, unsigned long start_pfn,
Michal Hocko	e5e6893	2017-09-06 16:19:37 -0700	[diff] [blame]	979	unsigned long nr_pages)
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	980	{
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	981	if (online_type == MMOP_ONLINE_KERNEL)
				982	return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	983
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	984	if (online_type == MMOP_ONLINE_MOVABLE)
				985	return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
Reza Arbab	df429ac	2016-07-26 15:22:23 -0700	[diff] [blame]	986
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	987	if (online_policy == ONLINE_POLICY_AUTO_MOVABLE)
David Hildenbrand	445fcf7	2021-09-07 19:55:45 -0700	[diff] [blame]	988	return auto_movable_zone_for_pfn(nid, group, start_pfn, nr_pages);
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	989
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	990	return default_zone_for_pfn(nid, start_pfn, nr_pages);
Michal Hocko	e5e6893	2017-09-06 16:19:37 -0700	[diff] [blame]	991	}
				992
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	993	/*
				994	* This function should only be called by memory_block_{online,offline},
				995	* and {online,offline}_pages.
				996	*/
David Hildenbrand	836809e	2021-09-07 19:55:30 -0700	[diff] [blame]	997	void adjust_present_page_count(struct page page, struct memory_group group,
				998	long nr_pages)
David Hildenbrand	f990114	2021-05-04 18:39:39 -0700	[diff] [blame]	999	{
David Hildenbrand	4b09700	2021-09-07 19:55:19 -0700	[diff] [blame]	1000	struct zone *zone = page_zone(page);
David Hildenbrand	836809e	2021-09-07 19:55:30 -0700	[diff] [blame]	1001	const bool movable = zone_idx(zone) == ZONE_MOVABLE;
David Hildenbrand	4b09700	2021-09-07 19:55:19 -0700	[diff] [blame]	1002
				1003	/*
				1004	* We only support onlining/offlining/adding/removing of complete
				1005	* memory blocks; therefore, either all is either early or hotplugged.
				1006	*/
				1007	if (early_section(__pfn_to_section(page_to_pfn(page))))
				1008	zone->present_early_pages += nr_pages;
David Hildenbrand	f990114	2021-05-04 18:39:39 -0700	[diff] [blame]	1009	zone->present_pages += nr_pages;
David Hildenbrand	f990114	2021-05-04 18:39:39 -0700	[diff] [blame]	1010	zone->zone_pgdat->node_present_pages += nr_pages;
David Hildenbrand	836809e	2021-09-07 19:55:30 -0700	[diff] [blame]	1011
				1012	if (group && movable)
				1013	group->present_movable_pages += nr_pages;
				1014	else if (group && !movable)
				1015	group->present_kernel_pages += nr_pages;
David Hildenbrand	f990114	2021-05-04 18:39:39 -0700	[diff] [blame]	1016	}
				1017
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1018	int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
				1019	struct zone *zone)
				1020	{
				1021	unsigned long end_pfn = pfn + nr_pages;
				1022	int ret;
				1023
				1024	ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
				1025	if (ret)
				1026	return ret;
				1027
				1028	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
				1029
				1030	/*
				1031	* It might be that the vmemmap_pages fully span sections. If that is
				1032	* the case, mark those sections online here as otherwise they will be
				1033	* left offline.
				1034	*/
				1035	if (nr_pages >= PAGES_PER_SECTION)
				1036	online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
				1037
				1038	return ret;
				1039	}
				1040
				1041	void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
				1042	{
				1043	unsigned long end_pfn = pfn + nr_pages;
				1044
				1045	/*
				1046	* It might be that the vmemmap_pages fully span sections. If that is
				1047	* the case, mark those sections offline here as otherwise they will be
				1048	* left online.
				1049	*/
				1050	if (nr_pages >= PAGES_PER_SECTION)
				1051	offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
				1052
				1053	/*
				1054	* The pages associated with this vmemmap have been offlined, so
				1055	* we can reset its state here.
				1056	*/
				1057	remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages);
				1058	kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
				1059	}
				1060
David Hildenbrand	836809e	2021-09-07 19:55:30 -0700	[diff] [blame]	1061	int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
				1062	struct zone zone, struct memory_group group)
KAMEZAWA Hiroyuki	75884fb	2007-10-16 01:26:10 -0700	[diff] [blame]	1063	{
Cody P Schafer	aa47228	2013-07-03 15:02:10 -0700	[diff] [blame]	1064	unsigned long flags;
Yasunori Goto	6811378	2006-06-23 02:03:11 -0700	[diff] [blame]	1065	int need_zonelists_rebuild = 0;
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1066	const int nid = zone_to_nid(zone);
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1067	int ret;
				1068	struct memory_notify arg;
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	1069
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	1070	/*
				1071	* {on,off}lining is constrained to full memory sections (or more
Zhen Lei	041711c	2021-06-30 18:53:17 -0700	[diff] [blame]	1072	* precisely to memory blocks from the user space POV).
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	1073	* memmap_on_memory is an exception because it reserves initial part
				1074	* of the physical memory space for vmemmaps. That space is pageblock
				1075	* aligned.
				1076	*/
David Hildenbrand	4986fac	2020-10-15 20:07:50 -0700	[diff] [blame]	1077	if (WARN_ON_ONCE(!nr_pages \|\|
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	1078	!IS_ALIGNED(pfn, pageblock_nr_pages) \|\|
				1079	!IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION)))
David Hildenbrand	4986fac	2020-10-15 20:07:50 -0700	[diff] [blame]	1080	return -EINVAL;
				1081
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1082	mem_hotplug_begin();
				1083
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	1084	/* associate pfn range with the zone */
David Hildenbrand	b30c592	2020-10-15 20:08:23 -0700	[diff] [blame]	1085	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	1086
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1087	arg.start_pfn = pfn;
				1088	arg.nr_pages = nr_pages;
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1089	node_states_check_changes_online(nr_pages, zone, &arg);
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1090
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1091	ret = memory_notify(MEM_GOING_ONLINE, &arg);
				1092	ret = notifier_to_errno(ret);
Chen Yucong	e33e33b	2016-03-17 14:19:35 -0700	[diff] [blame]	1093	if (ret)
				1094	goto failed_addition;
				1095
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	1096	/*
David Hildenbrand	b30c592	2020-10-15 20:08:23 -0700	[diff] [blame]	1097	* Fixup the number of isolated pageblocks before marking the sections
				1098	* onlining, such that undo_isolate_page_range() works correctly.
				1099	*/
				1100	spin_lock_irqsave(&zone->lock, flags);
				1101	zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
				1102	spin_unlock_irqrestore(&zone->lock, flags);
				1103
				1104	/*
Yasunori Goto	6811378	2006-06-23 02:03:11 -0700	[diff] [blame]	1105	* If this zone is not populated, then it is not in zonelist.
				1106	* This means the page allocator ignores this zone.
				1107	* So, zonelist must be updated after online.
				1108	*/
Wen Congyang	6dcd73d	2012-12-11 16:01:01 -0800	[diff] [blame]	1109	if (!populated_zone(zone)) {
Yasunori Goto	6811378	2006-06-23 02:03:11 -0700	[diff] [blame]	1110	need_zonelists_rebuild = 1;
Michal Hocko	72675e1	2017-09-06 16:20:24 -0700	[diff] [blame]	1111	setup_zone_pageset(zone);
Wen Congyang	6dcd73d	2012-12-11 16:01:01 -0800	[diff] [blame]	1112	}
Yasunori Goto	6811378	2006-06-23 02:03:11 -0700	[diff] [blame]	1113
David Hildenbrand	aac6532	2020-10-15 20:08:11 -0700	[diff] [blame]	1114	online_pages_range(pfn, nr_pages);
David Hildenbrand	836809e	2021-09-07 19:55:30 -0700	[diff] [blame]	1115	adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
Cody P Schafer	aa47228	2013-07-03 15:02:10 -0700	[diff] [blame]	1116
David Hildenbrand	b30c592	2020-10-15 20:08:23 -0700	[diff] [blame]	1117	node_states_set_node(nid, &arg);
				1118	if (need_zonelists_rebuild)
				1119	build_all_zonelists(NULL);
David Hildenbrand	b30c592	2020-10-15 20:08:23 -0700	[diff] [blame]	1120
				1121	/* Basic onlining is complete, allow allocation of onlined pages. */
				1122	undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
				1123
David Hildenbrand	93146d9	2020-08-06 23:25:35 -0700	[diff] [blame]	1124	/*
David Hildenbrand	b86c5fc	2020-10-15 20:09:39 -0700	[diff] [blame]	1125	* Freshly onlined pages aren't shuffled (e.g., all pages are placed to
				1126	* the tail of the freelist when undoing isolation). Shuffle the whole
				1127	* zone to make sure the just onlined pages are properly distributed
				1128	* across the whole freelist - to create an initial shuffle.
David Hildenbrand	93146d9	2020-08-06 23:25:35 -0700	[diff] [blame]	1129	*/
Dan Williams	e900a91	2019-05-14 15:41:28 -0700	[diff] [blame]	1130	shuffle_zone(zone);
				1131
Mel Gorman	b92ca18	2021-06-28 19:42:12 -0700	[diff] [blame]	1132	/* reinitialise watermarks and update pcp limits */
KOSAKI Motohiro	1b79acc	2011-05-24 17:11:32 -0700	[diff] [blame]	1133	init_per_zone_wmark_min();
				1134
David Hildenbrand	ca9a46f	2019-09-23 15:36:08 -0700	[diff] [blame]	1135	kswapd_run(nid);
				1136	kcompactd_run(nid);
Dave Hansen	61b1399	2005-10-29 18:16:56 -0700	[diff] [blame]	1137
Chandra Seetharaman	2d1d43f	2006-09-29 02:01:25 -0700	[diff] [blame]	1138	writeback_set_ratelimit();
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1139
David Hildenbrand	ca9a46f	2019-09-23 15:36:08 -0700	[diff] [blame]	1140	memory_notify(MEM_ONLINE, &arg);
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1141	mem_hotplug_done();
David Rientjes	30467e0	2015-04-14 15:45:11 -0700	[diff] [blame]	1142	return 0;
Chen Yucong	e33e33b	2016-03-17 14:19:35 -0700	[diff] [blame]	1143
				1144	failed_addition:
				1145	pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
				1146	(unsigned long long) pfn << PAGE_SHIFT,
				1147	(((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
				1148	memory_notify(MEM_CANCEL_ONLINE, &arg);
David Hildenbrand	feee6b2	2020-01-04 12:59:33 -0800	[diff] [blame]	1149	remove_pfn_range_from_zone(zone, pfn, nr_pages);
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1150	mem_hotplug_done();
Chen Yucong	e33e33b	2016-03-17 14:19:35 -0700	[diff] [blame]	1151	return ret;
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	1152	}
Yasunori Goto	bc02af9	2006-06-27 02:53:30 -0700	[diff] [blame]	1153
Tang Chen	0bd8542	2014-11-13 15:19:41 -0800	[diff] [blame]	1154	static void reset_node_present_pages(pg_data_t *pgdat)
				1155	{
				1156	struct zone *z;
				1157
				1158	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
				1159	z->present_pages = 0;
				1160
				1161	pgdat->node_present_pages = 0;
				1162	}
				1163
Hidetoshi Seto	e131933	2009-11-17 14:06:18 -0800	[diff] [blame]	1164	/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
David Hildenbrand	c68ab18	2020-06-04 16:48:35 -0700	[diff] [blame]	1165	static pg_data_t __ref *hotadd_new_pgdat(int nid)
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1166	{
				1167	struct pglist_data *pgdat;
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1168
Tang Chen	a1e565a	2013-02-22 16:33:18 -0800	[diff] [blame]	1169	pgdat = NODE_DATA(nid);
				1170	if (!pgdat) {
				1171	pgdat = arch_alloc_nodedata(nid);
				1172	if (!pgdat)
				1173	return NULL;
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1174
Wei Yang	33fce01	2019-09-23 15:35:52 -0700	[diff] [blame]	1175	pgdat->per_cpu_nodestats =
				1176	alloc_percpu(struct per_cpu_nodestat);
Tang Chen	a1e565a	2013-02-22 16:33:18 -0800	[diff] [blame]	1177	arch_refresh_nodedata(nid, pgdat);
Gu Zheng	b0dc3a3	2015-03-25 15:55:20 -0700	[diff] [blame]	1178	} else {
Wei Yang	33fce01	2019-09-23 15:35:52 -0700	[diff] [blame]	1179	int cpu;
Mel Gorman	e716f2e	2017-05-03 14:53:45 -0700	[diff] [blame]	1180	/*
Joonsoo Kim	97a225e	2020-06-03 15:59:01 -0700	[diff] [blame]	1181	* Reset the nr_zones, order and highest_zoneidx before reuse.
				1182	* Note that kswapd will init kswapd_highest_zoneidx properly
Mel Gorman	e716f2e	2017-05-03 14:53:45 -0700	[diff] [blame]	1183	* when it starts in the near future.
				1184	*/
Gu Zheng	b0dc3a3	2015-03-25 15:55:20 -0700	[diff] [blame]	1185	pgdat->nr_zones = 0;
Mel Gorman	38087d9	2016-07-28 15:45:49 -0700	[diff] [blame]	1186	pgdat->kswapd_order = 0;
Joonsoo Kim	97a225e	2020-06-03 15:59:01 -0700	[diff] [blame]	1187	pgdat->kswapd_highest_zoneidx = 0;
Wei Yang	33fce01	2019-09-23 15:35:52 -0700	[diff] [blame]	1188	for_each_online_cpu(cpu) {
				1189	struct per_cpu_nodestat *p;
				1190
				1191	p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
				1192	memset(p, 0, sizeof(*p));
				1193	}
Tang Chen	a1e565a	2013-02-22 16:33:18 -0800	[diff] [blame]	1194	}
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1195
				1196	/* we can use NODE_DATA(nid) from here */
Oscar Salvador	03e85f9	2018-08-21 21:53:43 -0700	[diff] [blame]	1197	pgdat->node_id = nid;
David Hildenbrand	c68ab18	2020-06-04 16:48:35 -0700	[diff] [blame]	1198	pgdat->node_start_pfn = 0;
Oscar Salvador	03e85f9	2018-08-21 21:53:43 -0700	[diff] [blame]	1199
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1200	/* init node's zones as empty zones, we don't have any present pages.*/
Oscar Salvador	03e85f9	2018-08-21 21:53:43 -0700	[diff] [blame]	1201	free_area_init_core_hotplug(nid);
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1202
KAMEZAWA Hiroyuki	959ecc4	2011-06-15 15:08:38 -0700	[diff] [blame]	1203	/*
				1204	* The node we allocated has no zone fallback lists. For avoiding
				1205	* to access not-initialized zonelist, build here.
				1206	*/
Michal Hocko	72675e1	2017-09-06 16:20:24 -0700	[diff] [blame]	1207	build_all_zonelists(pgdat);
KAMEZAWA Hiroyuki	959ecc4	2011-06-15 15:08:38 -0700	[diff] [blame]	1208
Tang Chen	f784a3f	2014-11-13 15:19:39 -0800	[diff] [blame]	1209	/*
Tang Chen	0bd8542	2014-11-13 15:19:41 -0800	[diff] [blame]	1210	* When memory is hot-added, all the memory is in offline state. So
				1211	* clear all zones' present_pages because they will be updated in
				1212	* online_pages() and offline_pages().
				1213	*/
Oscar Salvador	03e85f9	2018-08-21 21:53:43 -0700	[diff] [blame]	1214	reset_node_managed_pages(pgdat);
Tang Chen	0bd8542	2014-11-13 15:19:41 -0800	[diff] [blame]	1215	reset_node_present_pages(pgdat);
				1216
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1217	return pgdat;
				1218	}
				1219
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1220	static void rollback_node_hotadd(int nid)
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1221	{
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1222	pg_data_t *pgdat = NODE_DATA(nid);
				1223
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1224	arch_refresh_nodedata(nid, NULL);
Reza Arbab	5830169	2016-08-11 15:33:12 -0700	[diff] [blame]	1225	free_percpu(pgdat->per_cpu_nodestats);
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1226	arch_free_nodedata(pgdat);
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1227	}
				1228
KAMEZAWA Hiroyuki	0a54703	2006-06-27 02:53:35 -0700	[diff] [blame]	1229
Mel Gorman	ba2d266	2021-06-30 18:53:35 -0700	[diff] [blame]	1230	/*
				1231	* __try_online_node - online a node if offlined
Mike Rapoport	e8b098f	2018-04-05 16:24:57 -0700	[diff] [blame]	1232	* @nid: the node ID
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1233	* @set_node_online: Whether we want to online the node
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1234	* called by cpu_up() to online a node without onlined memory.
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1235	*
				1236	* Returns:
				1237	* 1 -> a new node has been allocated
				1238	* 0 -> the node is already online
				1239	* -ENOMEM -> the node could not be allocated
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1240	*/
David Hildenbrand	c68ab18	2020-06-04 16:48:35 -0700	[diff] [blame]	1241	static int __try_online_node(int nid, bool set_node_online)
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1242	{
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1243	pg_data_t *pgdat;
				1244	int ret = 1;
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1245
Toshi Kani	01b0f19	2013-11-12 15:07:25 -0800	[diff] [blame]	1246	if (node_online(nid))
				1247	return 0;
				1248
David Hildenbrand	c68ab18	2020-06-04 16:48:35 -0700	[diff] [blame]	1249	pgdat = hotadd_new_pgdat(nid);
David Rientjes	7553e8f	2011-06-22 18:13:01 -0700	[diff] [blame]	1250	if (!pgdat) {
Toshi Kani	01b0f19	2013-11-12 15:07:25 -0800	[diff] [blame]	1251	pr_err("Cannot online node %d due to NULL pgdat\n", nid);
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1252	ret = -ENOMEM;
				1253	goto out;
				1254	}
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1255
				1256	if (set_node_online) {
				1257	node_set_online(nid);
				1258	ret = register_one_node(nid);
				1259	BUG_ON(ret);
				1260	}
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1261	out:
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1262	return ret;
				1263	}
				1264
				1265	/*
				1266	* Users of this function always want to online/register the node
				1267	*/
				1268	int try_online_node(int nid)
				1269	{
				1270	int ret;
				1271
				1272	mem_hotplug_begin();
David Hildenbrand	c68ab18	2020-06-04 16:48:35 -0700	[diff] [blame]	1273	ret = __try_online_node(nid, true);
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	1274	mem_hotplug_done();
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1275	return ret;
				1276	}
				1277
Toshi Kani	27356f5	2013-09-11 14:21:49 -0700	[diff] [blame]	1278	static int check_hotplug_memory_range(u64 start, u64 size)
				1279	{
Pavel Tatashin	ba32558	2018-04-05 16:22:39 -0700	[diff] [blame]	1280	/* memory range must be block size aligned */
David Hildenbrand	cec3ebd	2019-07-18 15:56:25 -0700	[diff] [blame]	1281	if (!size \|\| !IS_ALIGNED(start, memory_block_size_bytes()) \|\|
				1282	!IS_ALIGNED(size, memory_block_size_bytes())) {
Pavel Tatashin	ba32558	2018-04-05 16:22:39 -0700	[diff] [blame]	1283	pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx",
David Hildenbrand	cec3ebd	2019-07-18 15:56:25 -0700	[diff] [blame]	1284	memory_block_size_bytes(), start, size);
Toshi Kani	27356f5	2013-09-11 14:21:49 -0700	[diff] [blame]	1285	return -EINVAL;
				1286	}
				1287
				1288	return 0;
				1289	}
				1290
Vitaly Kuznetsov	31bc385	2016-03-15 14:56:48 -0700	[diff] [blame]	1291	static int online_memory_block(struct memory_block mem, void arg)
				1292	{
Anshuman Khandual	1adf8b4	2021-02-25 17:17:13 -0800	[diff] [blame]	1293	mem->online_type = mhp_default_online_type;
Nathan Fontenot	dc18d70	2017-02-24 15:00:02 -0800	[diff] [blame]	1294	return device_online(&mem->dev);
Vitaly Kuznetsov	31bc385	2016-03-15 14:56:48 -0700	[diff] [blame]	1295	}
				1296
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1297	bool mhp_supports_memmap_on_memory(unsigned long size)
				1298	{
				1299	unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
				1300	unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
				1301	unsigned long remaining_size = size - vmemmap_size;
				1302
				1303	/*
				1304	* Besides having arch support and the feature enabled at runtime, we
				1305	* need a few more assumptions to hold true:
				1306	*
				1307	* a) We span a single memory block: memory onlining/offlinin;g happens
				1308	* in memory block granularity. We don't want the vmemmap of online
				1309	* memory blocks to reside on offline memory blocks. In the future,
				1310	* we might want to support variable-sized memory blocks to make the
				1311	* feature more versatile.
				1312	*
				1313	* b) The vmemmap pages span complete PMDs: We don't want vmemmap code
				1314	* to populate memory from the altmap for unrelated parts (i.e.,
				1315	* other memory blocks)
				1316	*
				1317	* c) The vmemmap pages (and thereby the pages that will be exposed to
				1318	* the buddy) have to cover full pageblocks: memory onlining/offlining
				1319	* code requires applicable ranges to be page-aligned, for example, to
				1320	* set the migratetypes properly.
				1321	*
				1322	* TODO: Although we have a check here to make sure that vmemmap pages
				1323	* fully populate a PMD, it is not the right place to check for
				1324	* this. A much better solution involves improving vmemmap code
				1325	* to fallback to base pages when trying to populate vmemmap using
				1326	* altmap as an alternative source of memory, and we do not exactly
				1327	* populate a single PMD.
				1328	*/
				1329	return memmap_on_memory &&
Muchun Song	2d7a217	2021-06-30 18:48:25 -0700	[diff] [blame]	1330	!hugetlb_free_vmemmap_enabled &&
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1331	IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
				1332	size == memory_block_size_bytes() &&
				1333	IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
				1334	IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
				1335	}
				1336
/*
 * add_memory_resource - hot-add the memory range described by @res.
 * @nid: node ID, or a memory group ID if MHP_NID_IS_MGID is set in @mhp_flags
 * @res: resource describing the range (res->start, resource_size(res))
 * @mhp_flags: MHP_* flags; MHP_NID_IS_MGID, MHP_MEMMAP_ON_MEMORY and
 *	       MHP_MERGE_RESOURCE are evaluated here
 *
 * Returns 0 on success (the value left in ret by the last setup step) or a
 * negative error code. On the error paths taken after mem_hotplug_begin(),
 * a pgdat set up by this call and the memblock registration are rolled back.
 *
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations (triggered e.g. by sysfs).
 *
 * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
 */
int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{
	struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
	enum memblock_flags memblock_flags = MEMBLOCK_NONE;
	struct vmem_altmap mhp_altmap = {};
	struct memory_group *group = NULL;
	u64 start, size;
	bool new_node = false;
	int ret;

	start = res->start;
	size = resource_size(res);

	/* Reject empty or non-block-aligned ranges before taking any lock. */
	ret = check_hotplug_memory_range(start, size);
	if (ret)
		return ret;

	/* @nid may actually name a memory group; translate it to its node. */
	if (mhp_flags & MHP_NID_IS_MGID) {
		group = memory_group_find_by_id(nid);
		if (!group)
			return -EINVAL;
		nid = group->nid;
	}

	if (!node_possible(nid)) {
		WARN(1, "node %d was absent from the node_possible_map\n", nid);
		return -EINVAL;
	}

	mem_hotplug_begin();

	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
		if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED)
			memblock_flags = MEMBLOCK_DRIVER_MANAGED;
		ret = memblock_add_node(start, size, nid, memblock_flags);
		/* Nothing to roll back yet except the hotplug lock. */
		if (ret)
			goto error_mem_hotplug_end;
	}

	/*
	 * Set up the pgdat without onlining the node; a positive return means
	 * this call prepared the node and must undo that on failure.
	 */
	ret = __try_online_node(nid, false);
	if (ret < 0)
		goto error;
	new_node = ret;

	/*
	 * Self hosted memmap array
	 */
	if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
		if (!mhp_supports_memmap_on_memory(size)) {
			ret = -EINVAL;
			goto error;
		}
		mhp_altmap.free = PHYS_PFN(size);
		mhp_altmap.base_pfn = PHYS_PFN(start);
		params.altmap = &mhp_altmap;
	}

	/* call arch's memory hotadd */
	ret = arch_add_memory(nid, start, size, &params);
	if (ret < 0)
		goto error;

	/*
	 * create memory block devices after memory was added
	 *
	 * NOTE(review): mhp_altmap.alloc is presumably the number of pages
	 * arch_add_memory() consumed from the altmap - confirm.
	 */
	ret = create_memory_block_devices(start, size, mhp_altmap.alloc,
					  group);
	if (ret) {
		arch_remove_memory(start, size, NULL);
		goto error;
	}

	if (new_node) {
		/* If sysfs file of new node can't be created, cpu on the node
		 * can't be hot-added. There is no rollback way now.
		 * So, check by BUG_ON() to catch it reluctantly..
		 * We online node here. We can't roll back from here.
		 */
		node_set_online(nid);
		ret = __register_one_node(nid);
		BUG_ON(ret);
	}

	/* link memory sections under this node.*/
	link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
			  MEMINIT_HOTPLUG);

	/* create new memmap entry */
	if (!strcmp(res->name, "System RAM"))
		firmware_map_add_hotplug(start, start + size, "System RAM");

	/* device_online() will take the lock when calling online_pages() */
	mem_hotplug_done();

	/*
	 * In case we're allowed to merge the resource, flag it and trigger
	 * merging now that adding succeeded.
	 */
	if (mhp_flags & MHP_MERGE_RESOURCE)
		merge_system_ram_resource(res);

	/*
	 * online pages if requested; the result of walk_memory_blocks() is
	 * not checked, so an onlining failure does not fail the hot-add.
	 */
	if (mhp_default_online_type != MMOP_OFFLINE)
		walk_memory_blocks(start, size, NULL, online_memory_block);

	return ret;
error:
	/* rollback pgdat allocation and others */
	if (new_node)
		rollback_node_hotadd(nid);
	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
		memblock_remove(start, size);
error_mem_hotplug_end:
	mem_hotplug_done();
	return ret;
}
David Vrabel	62cedb9	2015-06-25 16:35:49 +0100	[diff] [blame]	1457
David Hildenbrand	8df1d0e	2018-10-30 15:10:24 -0700	[diff] [blame]	1458	/* requires device_hotplug_lock, see add_memory_resource() */
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1459	int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
David Vrabel	62cedb9	2015-06-25 16:35:49 +0100	[diff] [blame]	1460	{
				1461	struct resource *res;
				1462	int ret;
				1463
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	1464	res = register_memory_resource(start, size, "System RAM");
Vitaly Kuznetsov	6f754ba	2016-01-14 15:21:55 -0800	[diff] [blame]	1465	if (IS_ERR(res))
				1466	return PTR_ERR(res);
David Vrabel	62cedb9	2015-06-25 16:35:49 +0100	[diff] [blame]	1467
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1468	ret = add_memory_resource(nid, res, mhp_flags);
David Vrabel	62cedb9	2015-06-25 16:35:49 +0100	[diff] [blame]	1469	if (ret < 0)
				1470	release_memory_resource(res);
				1471	return ret;
				1472	}
David Hildenbrand	8df1d0e	2018-10-30 15:10:24 -0700	[diff] [blame]	1473
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1474	int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
David Hildenbrand	8df1d0e	2018-10-30 15:10:24 -0700	[diff] [blame]	1475	{
				1476	int rc;
				1477
				1478	lock_device_hotplug();
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1479	rc = __add_memory(nid, start, size, mhp_flags);
David Hildenbrand	8df1d0e	2018-10-30 15:10:24 -0700	[diff] [blame]	1480	unlock_device_hotplug();
				1481
				1482	return rc;
				1483	}
Yasunori Goto	bc02af9	2006-06-27 02:53:30 -0700	[diff] [blame]	1484	EXPORT_SYMBOL_GPL(add_memory);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1485
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	1486	/*
				1487	* Add special, driver-managed memory to the system as system RAM. Such
				1488	* memory is not exposed via the raw firmware-provided memmap as system
				1489	* RAM, instead, it is detected and added by a driver - during cold boot,
				1490	* after a reboot, and after kexec.
				1491	*
				1492	* Reasons why this memory should not be used for the initial memmap of a
				1493	* kexec kernel or for placing kexec images:
				1494	* - The booting kernel is in charge of determining how this memory will be
				1495	* used (e.g., use persistent memory as system RAM)
				1496	* - Coordination with a hypervisor is required before this memory
				1497	* can be used (e.g., inaccessible parts).
				1498	*
				1499	* For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided
				1500	* memory map") are created. Also, the created memory resource is flagged
David Hildenbrand	7cf603d	2020-10-15 20:08:33 -0700	[diff] [blame]	1501	* with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	1502	* this memory as well (esp., not place kexec images onto it).
				1503	*
				1504	* The resource_name (visible via /proc/iomem) has to have the format
				1505	* "System RAM ($DRIVER)".
				1506	*/
				1507	int add_memory_driver_managed(int nid, u64 start, u64 size,
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1508	const char *resource_name, mhp_t mhp_flags)
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	1509	{
				1510	struct resource *res;
				1511	int rc;
				1512
				1513	if (!resource_name \|\|
				1514	strstr(resource_name, "System RAM (") != resource_name \|\|
				1515	resource_name[strlen(resource_name) - 1] != ')')
				1516	return -EINVAL;
				1517
				1518	lock_device_hotplug();
				1519
				1520	res = register_memory_resource(start, size, resource_name);
				1521	if (IS_ERR(res)) {
				1522	rc = PTR_ERR(res);
				1523	goto out_unlock;
				1524	}
				1525
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1526	rc = add_memory_resource(nid, res, mhp_flags);
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	1527	if (rc < 0)
				1528	release_memory_resource(res);
				1529
				1530	out_unlock:
				1531	unlock_device_hotplug();
				1532	return rc;
				1533	}
				1534	EXPORT_SYMBOL_GPL(add_memory_driver_managed);
				1535
Anshuman Khandual	bca3fea	2021-02-25 17:17:33 -0800	[diff] [blame]	1536	/*
				1537	* Platforms should define arch_get_mappable_range() that provides
				1538	* maximum possible addressable physical memory range for which the
				1539	* linear mapping could be created. The platform returned address
				1540	* range must adhere to these following semantics.
				1541	*
				1542	* - range.start <= range.end
				1543	* - Range includes both end points [range.start..range.end]
				1544	*
				1545	* There is also a fallback definition provided here, allowing the
				1546	* entire possible physical address range in case any platform does
				1547	* not define arch_get_mappable_range().
				1548	*/
				1549	struct range __weak arch_get_mappable_range(void)
				1550	{
				1551	struct range mhp_range = {
				1552	.start = 0UL,
				1553	.end = -1ULL,
				1554	};
				1555	return mhp_range;
				1556	}
				1557
				1558	struct range mhp_get_pluggable_range(bool need_mapping)
				1559	{
				1560	const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1;
				1561	struct range mhp_range;
				1562
				1563	if (need_mapping) {
				1564	mhp_range = arch_get_mappable_range();
				1565	if (mhp_range.start > max_phys) {
				1566	mhp_range.start = 0;
				1567	mhp_range.end = 0;
				1568	}
				1569	mhp_range.end = min_t(u64, mhp_range.end, max_phys);
				1570	} else {
				1571	mhp_range.start = 0;
				1572	mhp_range.end = max_phys;
				1573	}
				1574	return mhp_range;
				1575	}
				1576	EXPORT_SYMBOL_GPL(mhp_get_pluggable_range);
				1577
				1578	bool mhp_range_allowed(u64 start, u64 size, bool need_mapping)
				1579	{
				1580	struct range mhp_range = mhp_get_pluggable_range(need_mapping);
				1581	u64 end = start + size;
				1582
				1583	if (start < end && start >= mhp_range.start && (end - 1) <= mhp_range.end)
				1584	return true;
				1585
				1586	pr_warn("Hotplug memory [%#llx-%#llx] exceeds maximum addressable range [%#llx-%#llx]\n",
				1587	start, end, mhp_range.start, mhp_range.end);
				1588	return false;
				1589	}
				1590
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1591	#ifdef CONFIG_MEMORY_HOTREMOVE
				1592	/*
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1593	* Confirm all pages in a range [start, end) belong to the same zone (skipping
				1594	* memory holes). When true, return the zone.
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1595	*/
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1596	struct zone *test_pages_in_a_zone(unsigned long start_pfn,
				1597	unsigned long end_pfn)
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1598	{
Andrew Banman	5f0f288	2015-12-29 14:54:25 -0800	[diff] [blame]	1599	unsigned long pfn, sec_end_pfn;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1600	struct zone *zone = NULL;
				1601	struct page *page;
Mike Rapoport	673d40c	2021-09-07 19:54:55 -0700	[diff] [blame]	1602
Toshi Kani	deb88a2	2017-02-03 13:13:20 -0800	[diff] [blame]	1603	for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1604	pfn < end_pfn;
Toshi Kani	deb88a2	2017-02-03 13:13:20 -0800	[diff] [blame]	1605	pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
Andrew Banman	5f0f288	2015-12-29 14:54:25 -0800	[diff] [blame]	1606	/* Make sure the memory section is present first */
				1607	if (!present_section_nr(pfn_to_section_nr(pfn)))
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1608	continue;
Andrew Banman	5f0f288	2015-12-29 14:54:25 -0800	[diff] [blame]	1609	for (; pfn < sec_end_pfn && pfn < end_pfn;
				1610	pfn += MAX_ORDER_NR_PAGES) {
Mikhail Zaslonko	24feb47	2019-02-01 14:20:38 -0800	[diff] [blame]	1611	/* Check if we got outside of the zone */
Mike Rapoport	673d40c	2021-09-07 19:54:55 -0700	[diff] [blame]	1612	if (zone && !zone_spans_pfn(zone, pfn))
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1613	return NULL;
Mike Rapoport	673d40c	2021-09-07 19:54:55 -0700	[diff] [blame]	1614	page = pfn_to_page(pfn);
Andrew Banman	5f0f288	2015-12-29 14:54:25 -0800	[diff] [blame]	1615	if (zone && page_zone(page) != zone)
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1616	return NULL;
Andrew Banman	5f0f288	2015-12-29 14:54:25 -0800	[diff] [blame]	1617	zone = page_zone(page);
				1618	}
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1619	}
Toshi Kani	deb88a2	2017-02-03 13:13:20 -0800	[diff] [blame]	1620
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1621	return zone;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1622	}
				1623
				1624	/*
Yisheng Xie	0efadf4	2017-02-24 14:57:39 -0800	[diff] [blame]	1625	* Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1626	* non-lru movable pages and hugepages). Will skip over most unmovable
				1627	* pages (esp., pages that can be skipped when offlining), but bail out on
				1628	* definitely unmovable pages.
				1629	*
				1630	* Returns:
				1631	* 0 in case a movable page is found and movable_pfn was updated.
				1632	* -ENOENT in case no movable page was found.
				1633	* -EBUSY in case a definitely unmovable page was found.
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1634	*/
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1635	static int scan_movable_pages(unsigned long start, unsigned long end,
				1636	unsigned long *movable_pfn)
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1637	{
				1638	unsigned long pfn;
Oscar Salvador	eeb0efd	2019-02-01 14:20:47 -0800	[diff] [blame]	1639
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1640	for (pfn = start; pfn < end; pfn++) {
Oscar Salvador	eeb0efd	2019-02-01 14:20:47 -0800	[diff] [blame]	1641	struct page page, head;
				1642	unsigned long skip;
				1643
				1644	if (!pfn_valid(pfn))
				1645	continue;
				1646	page = pfn_to_page(pfn);
				1647	if (PageLRU(page))
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1648	goto found;
Oscar Salvador	eeb0efd	2019-02-01 14:20:47 -0800	[diff] [blame]	1649	if (__PageMovable(page))
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1650	goto found;
				1651
				1652	/*
				1653	* PageOffline() pages that are not marked __PageMovable() and
				1654	* have a reference count > 0 (after MEM_GOING_OFFLINE) are
				1655	* definitely unmovable. If their reference count would be 0,
				1656	* they could at least be skipped when offlining memory.
				1657	*/
				1658	if (PageOffline(page) && page_count(page))
				1659	return -EBUSY;
Oscar Salvador	eeb0efd	2019-02-01 14:20:47 -0800	[diff] [blame]	1660
				1661	if (!PageHuge(page))
				1662	continue;
				1663	head = compound_head(page);
Mike Kravetz	8f251a3	2021-02-24 12:08:56 -0800	[diff] [blame]	1664	/*
				1665	* This test is racy as we hold no reference or lock. The
				1666	* hugetlb page could have been free'ed and head is no longer
				1667	* a hugetlb page before the following check. In such unlikely
				1668	* cases false positives and negatives are possible. Calling
				1669	* code must deal with these scenarios.
				1670	*/
				1671	if (HPageMigratable(head))
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1672	goto found;
Matthew Wilcox (Oracle)	d8c6546	2019-09-23 15:34:30 -0700	[diff] [blame]	1673	skip = compound_nr(head) - (page - head);
Oscar Salvador	eeb0efd	2019-02-01 14:20:47 -0800	[diff] [blame]	1674	pfn += skip - 1;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1675	}
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1676	return -ENOENT;
				1677	found:
				1678	*movable_pfn = pfn;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1679	return 0;
				1680	}
				1681
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1682	static int
				1683	do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
				1684	{
				1685	unsigned long pfn;
Matthew Wilcox (Oracle)	6c35784	2020-08-14 17:30:37 -0700	[diff] [blame]	1686	struct page page, head;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1687	int ret = 0;
				1688	LIST_HEAD(source);
Liam Mark	786dee8	2021-06-30 18:52:43 -0700	[diff] [blame]	1689	static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
				1690	DEFAULT_RATELIMIT_BURST);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1691
Michal Hocko	a85009c	2018-12-28 00:38:29 -0800	[diff] [blame]	1692	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1693	if (!pfn_valid(pfn))
				1694	continue;
				1695	page = pfn_to_page(pfn);
Matthew Wilcox (Oracle)	6c35784	2020-08-14 17:30:37 -0700	[diff] [blame]	1696	head = compound_head(page);
Naoya Horiguchi	c8721bb	2013-09-11 14:22:09 -0700	[diff] [blame]	1697
				1698	if (PageHuge(page)) {
Matthew Wilcox (Oracle)	d8c6546	2019-09-23 15:34:30 -0700	[diff] [blame]	1699	pfn = page_to_pfn(head) + compound_nr(head) - 1;
Oscar Salvador	daf3538	2019-03-05 15:48:53 -0800	[diff] [blame]	1700	isolate_huge_page(head, &source);
Naoya Horiguchi	c8721bb	2013-09-11 14:22:09 -0700	[diff] [blame]	1701	continue;
Michal Hocko	94723aa	2018-04-10 16:30:07 -0700	[diff] [blame]	1702	} else if (PageTransHuge(page))
Matthew Wilcox (Oracle)	6c35784	2020-08-14 17:30:37 -0700	[diff] [blame]	1703	pfn = page_to_pfn(head) + thp_nr_pages(page) - 1;
Naoya Horiguchi	c8721bb	2013-09-11 14:22:09 -0700	[diff] [blame]	1704
Michal Hocko	b15c872	2018-12-28 00:38:01 -0800	[diff] [blame]	1705	/*
				1706	* HWPoison pages have elevated reference counts so the migration would
				1707	* fail on them. It also doesn't make any sense to migrate them in the
				1708	* first place. Still try to unmap such a page in case it is still mapped
				1709	* (e.g. current hwpoison implementation doesn't unmap KSM pages but keep
				1710	* the unmap as the catch all safety net).
				1711	*/
				1712	if (PageHWPoison(page)) {
				1713	if (WARN_ON(PageLRU(page)))
				1714	isolate_lru_page(page);
				1715	if (page_mapped(page))
Shakeel Butt	013339d	2020-12-14 19:06:39 -0800	[diff] [blame]	1716	try_to_unmap(page, TTU_IGNORE_MLOCK);
Michal Hocko	b15c872	2018-12-28 00:38:01 -0800	[diff] [blame]	1717	continue;
				1718	}
				1719
Konstantin Khlebnikov	700c2a4	2011-05-24 17:12:19 -0700	[diff] [blame]	1720	if (!get_page_unless_zero(page))
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1721	continue;
				1722	/*
Yisheng Xie	0efadf4	2017-02-24 14:57:39 -0800	[diff] [blame]	1723	* We can skip free pages. And we can deal with pages on
				1724	* LRU and non-lru movable pages.
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1725	*/
Yisheng Xie	0efadf4	2017-02-24 14:57:39 -0800	[diff] [blame]	1726	if (PageLRU(page))
				1727	ret = isolate_lru_page(page);
				1728	else
				1729	ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1730	if (!ret) { /* Success */
Nick Piggin	62695a8	2008-10-18 20:26:09 -0700	[diff] [blame]	1731	list_add_tail(&page->lru, &source);
Yisheng Xie	0efadf4	2017-02-24 14:57:39 -0800	[diff] [blame]	1732	if (!__PageMovable(page))
				1733	inc_node_page_state(page, NR_ISOLATED_ANON +
Huang Ying	9de4f22	2020-04-06 20:04:41 -0700	[diff] [blame]	1734	page_is_file_lru(page));
KOSAKI Motohiro	6d9c285	2009-12-14 17:58:11 -0800	[diff] [blame]	1735
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1736	} else {
Liam Mark	786dee8	2021-06-30 18:52:43 -0700	[diff] [blame]	1737	if (__ratelimit(&migrate_rs)) {
				1738	pr_warn("failed to isolate pfn %lx\n", pfn);
				1739	dump_page(page, "isolation failed");
				1740	}
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1741	}
Oscar Salvador	1723058	2019-02-01 14:19:57 -0800	[diff] [blame]	1742	put_page(page);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1743	}
Bob Liu	f3ab263	2010-10-26 14:22:10 -0700	[diff] [blame]	1744	if (!list_empty(&source)) {
Joonsoo Kim	203e6e5	2020-10-17 16:14:00 -0700	[diff] [blame]	1745	nodemask_t nmask = node_states[N_MEMORY];
				1746	struct migration_target_control mtc = {
				1747	.nmask = &nmask,
				1748	.gfp_mask = GFP_USER \| __GFP_MOVABLE \| __GFP_RETRY_MAYFAIL,
				1749	};
				1750
				1751	/*
				1752	* We have checked that migration range is on a single zone so
				1753	* we can use the nid of the first page to all the others.
				1754	*/
				1755	mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));
				1756
				1757	/*
				1758	* try to allocate from a different node but reuse this node
				1759	* if there are no other online nodes to be used (e.g. we are
				1760	* offlining a part of the only existing node)
				1761	*/
				1762	node_clear(mtc.nid, nmask);
				1763	if (nodes_empty(nmask))
				1764	node_set(mtc.nid, nmask);
				1765	ret = migrate_pages(&source, alloc_migration_target, NULL,
Yang Shi	5ac9588	2021-09-02 14:59:13 -0700	[diff] [blame]	1766	(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL);
Michal Hocko	2932c8b	2018-12-28 00:33:53 -0800	[diff] [blame]	1767	if (ret) {
				1768	list_for_each_entry(page, &source, lru) {
Liam Mark	786dee8	2021-06-30 18:52:43 -0700	[diff] [blame]	1769	if (__ratelimit(&migrate_rs)) {
				1770	pr_warn("migrating pfn %lx failed ret:%d\n",
				1771	page_to_pfn(page), ret);
				1772	dump_page(page, "migration failure");
				1773	}
Michal Hocko	2932c8b	2018-12-28 00:33:53 -0800	[diff] [blame]	1774	}
Naoya Horiguchi	c8721bb	2013-09-11 14:22:09 -0700	[diff] [blame]	1775	putback_movable_pages(&source);
Michal Hocko	2932c8b	2018-12-28 00:33:53 -0800	[diff] [blame]	1776	}
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1777	}
Oscar Salvador	1723058	2019-02-01 14:19:57 -0800	[diff] [blame]	1778
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1779	return ret;
				1780	}
				1781
Tang Chen	c532092	2013-11-12 15:08:10 -0800	[diff] [blame]	1782	static int __init cmdline_parse_movable_node(char *p)
				1783	{
Tang Chen	55ac590	2014-01-21 15:49:35 -0800	[diff] [blame]	1784	movable_node_enabled = true;
Tang Chen	c532092	2013-11-12 15:08:10 -0800	[diff] [blame]	1785	return 0;
				1786	}
				1787	early_param("movable_node", cmdline_parse_movable_node);
				1788
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1789	/* check which state of node_states will be changed when offline memory */
				1790	static void node_states_check_changes_offline(unsigned long nr_pages,
				1791	struct zone zone, struct memory_notify arg)
				1792	{
				1793	struct pglist_data *pgdat = zone->zone_pgdat;
				1794	unsigned long present_pages = 0;
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1795	enum zone_type zt;
				1796
Anshuman Khandual	98fa15f	2019-03-05 15:42:58 -0800	[diff] [blame]	1797	arg->status_change_nid = NUMA_NO_NODE;
				1798	arg->status_change_nid_normal = NUMA_NO_NODE;
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1799
				1800	/*
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1801	* Check whether node_states[N_NORMAL_MEMORY] will be changed.
				1802	* If the memory to be offline is within the range
				1803	* [0..ZONE_NORMAL], and it is the last present memory there,
				1804	* the zones in that range will become empty after the offlining,
				1805	* thus we can determine that we need to clear the node from
				1806	* node_states[N_NORMAL_MEMORY].
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1807	*/
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1808	for (zt = 0; zt <= ZONE_NORMAL; zt++)
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1809	present_pages += pgdat->node_zones[zt].present_pages;
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1810	if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1811	arg->status_change_nid_normal = zone_to_nid(zone);
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1812
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	1813	/*
David Hildenbrand	6b740c6	2021-11-05 13:44:31 -0700	[diff] [blame]	1814	* We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM
				1815	* does not apply as we don't support 32bit.
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1816	* Here we count the possible pages from ZONE_MOVABLE.
				1817	* If after having accounted all the pages, we see that the nr_pages
				1818	* to be offlined is over or equal to the accounted pages,
				1819	* we know that the node will become empty, and so, we can clear
				1820	* it for N_MEMORY as well.
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1821	*/
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1822	present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1823
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1824	if (nr_pages >= present_pages)
				1825	arg->status_change_nid = zone_to_nid(zone);
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1826	}
				1827
				1828	static void node_states_clear_node(int node, struct memory_notify *arg)
				1829	{
				1830	if (arg->status_change_nid_normal >= 0)
				1831	node_clear_state(node, N_NORMAL_MEMORY);
				1832
Oscar Salvador	cf01f6f5	2018-10-26 15:07:28 -0700	[diff] [blame]	1833	if (arg->status_change_nid >= 0)
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	1834	node_clear_state(node, N_MEMORY);
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1835	}
				1836
/*
 * Range-walk callback that accumulates page counts.
 *
 * @start_pfn: first pfn of the reported range (not used)
 * @nr_pages: number of pages in the reported range
 * @data: points to the unsigned long running total to add @nr_pages to
 *
 * Always returns 0.
 */
static int count_system_ram_pages_cb(unsigned long start_pfn,
				     unsigned long nr_pages, void *data)
{
	unsigned long *total = data;

	*total = *total + nr_pages;

	return 0;
}
				1845
David Hildenbrand	836809e	2021-09-07 19:55:30 -0700	[diff] [blame]	1846	int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
				1847	struct memory_group *group)
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1848	{
David Hildenbrand	73a11c9	2020-10-15 20:07:46 -0700	[diff] [blame]	1849	const unsigned long end_pfn = start_pfn + nr_pages;
David Hildenbrand	0a1a9a0	2020-10-15 20:07:54 -0700	[diff] [blame]	1850	unsigned long pfn, system_ram_pages = 0;
Cody P Schafer	d702909	2013-07-03 15:02:11 -0700	[diff] [blame]	1851	unsigned long flags;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1852	struct zone *zone;
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1853	struct memory_notify arg;
David Hildenbrand	ea15153	2020-10-15 20:08:03 -0700	[diff] [blame]	1854	int ret, node;
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	1855	char *reason;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1856
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	1857	/*
				1858	* {on,off}lining is constrained to full memory sections (or more
Zhen Lei	041711c	2021-06-30 18:53:17 -0700	[diff] [blame]	1859	* precisely to memory blocks from the user space POV).
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	1860	* memmap_on_memory is an exception because it reserves initial part
				1861	* of the physical memory space for vmemmaps. That space is pageblock
				1862	* aligned.
				1863	*/
David Hildenbrand	4986fac	2020-10-15 20:07:50 -0700	[diff] [blame]	1864	if (WARN_ON_ONCE(!nr_pages \|\|
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	1865	!IS_ALIGNED(start_pfn, pageblock_nr_pages) \|\|
				1866	!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)))
David Hildenbrand	4986fac	2020-10-15 20:07:50 -0700	[diff] [blame]	1867	return -EINVAL;
				1868
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1869	mem_hotplug_begin();
				1870
David Hildenbrand	c5e79ef	2019-11-30 17:54:17 -0800	[diff] [blame]	1871	/*
				1872	* Don't allow to offline memory blocks that contain holes.
				1873	* Consequently, memory blocks with holes can never get onlined
				1874	* via the hotplug path - online_pages() - as hotplugged memory has
				1875	* no holes. This way, we e.g., don't have to worry about marking
				1876	* memory holes PG_reserved, don't need pfn_valid() checks, and can
				1877	* avoid using walk_system_ram_range() later.
				1878	*/
David Hildenbrand	73a11c9	2020-10-15 20:07:46 -0700	[diff] [blame]	1879	walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages,
David Hildenbrand	c5e79ef	2019-11-30 17:54:17 -0800	[diff] [blame]	1880	count_system_ram_pages_cb);
David Hildenbrand	73a11c9	2020-10-15 20:07:46 -0700	[diff] [blame]	1881	if (system_ram_pages != nr_pages) {
David Hildenbrand	c5e79ef	2019-11-30 17:54:17 -0800	[diff] [blame]	1882	ret = -EINVAL;
				1883	reason = "memory holes";
				1884	goto failed_removal;
				1885	}
				1886
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1887	/* This makes hotplug much easier...and readable.
				1888	we assume this for now. .*/
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1889	zone = test_pages_in_a_zone(start_pfn, end_pfn);
				1890	if (!zone) {
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	1891	ret = -EINVAL;
				1892	reason = "multizone range";
				1893	goto failed_removal;
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1894	}
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1895	node = zone_to_nid(zone);
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1896
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	1897	/*
				1898	* Disable pcplists so that page isolation cannot race with freeing
				1899	* in a way that pages from isolated pageblock are left on pcplists.
				1900	*/
				1901	zone_pcp_disable(zone);
Minchan Kim	d479960e	2021-05-04 18:36:54 -0700	[diff] [blame]	1902	lru_cache_disable();
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	1903
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1904	/* set above range as isolated */
Wen Congyang	b023f46	2012-12-11 16:00:45 -0800	[diff] [blame]	1905	ret = start_isolate_page_range(start_pfn, end_pfn,
Michal Hocko	d381c54	2018-12-28 00:33:56 -0800	[diff] [blame]	1906	MIGRATE_MOVABLE,
David Hildenbrand	756d25b	2019-11-30 17:54:07 -0800	[diff] [blame]	1907	MEMORY_OFFLINE \| REPORT_FAILURE);
David Hildenbrand	3fa0c7c	2020-10-15 20:08:07 -0700	[diff] [blame]	1908	if (ret) {
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	1909	reason = "failure to isolate range";
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	1910	goto failed_removal_pcplists_disabled;
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1911	}
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1912
				1913	arg.start_pfn = start_pfn;
				1914	arg.nr_pages = nr_pages;
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1915	node_states_check_changes_offline(nr_pages, zone, &arg);
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1916
				1917	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
				1918	ret = notifier_to_errno(ret);
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	1919	if (ret) {
				1920	reason = "notifier failure";
				1921	goto failed_removal_isolated;
				1922	}
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1923
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	1924	do {
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1925	pfn = start_pfn;
				1926	do {
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	1927	if (signal_pending(current)) {
				1928	ret = -EINTR;
				1929	reason = "signal backoff";
				1930	goto failed_removal_isolated;
				1931	}
Michal Hocko	72b39cf	2017-11-15 17:33:34 -0800	[diff] [blame]	1932
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	1933	cond_resched();
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1934
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1935	ret = scan_movable_pages(pfn, end_pfn, &pfn);
				1936	if (!ret) {
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	1937	/*
				1938	* TODO: fatal migration failures should bail
				1939	* out
				1940	*/
				1941	do_migrate_range(pfn, end_pfn);
				1942	}
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1943	} while (!ret);
				1944
				1945	if (ret != -ENOENT) {
				1946	reason = "unmovable page";
				1947	goto failed_removal_isolated;
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	1948	}
Michal Hocko	72b39cf	2017-11-15 17:33:34 -0800	[diff] [blame]	1949
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	1950	/*
				1951	* Dissolve free hugepages in the memory block before doing
				1952	* offlining actually in order to make hugetlbfs's object
				1953	* counting consistent.
				1954	*/
				1955	ret = dissolve_free_huge_pages(start_pfn, end_pfn);
				1956	if (ret) {
				1957	reason = "failure to dissolve huge pages";
				1958	goto failed_removal_isolated;
				1959	}
David Hildenbrand	0a1a9a0	2020-10-15 20:07:54 -0700	[diff] [blame]	1960
David Hildenbrand	0a1a9a0	2020-10-15 20:07:54 -0700	[diff] [blame]	1961	ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	1962
Michal Hocko	5557c76	2019-05-13 17:21:24 -0700	[diff] [blame]	1963	} while (ret);
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	1964
David Hildenbrand	0a1a9a0	2020-10-15 20:07:54 -0700	[diff] [blame]	1965	/* Mark all sections offline and remove free pages from the buddy. */
				1966	__offline_isolated_pages(start_pfn, end_pfn);
Laurent Dufour	7c33023	2020-12-15 20:42:26 -0800	[diff] [blame]	1967	pr_debug("Offlined Pages %ld\n", nr_pages);
David Hildenbrand	0a1a9a0	2020-10-15 20:07:54 -0700	[diff] [blame]	1968
Qian Cai	9b7ea46	2019-03-28 20:43:34 -0700	[diff] [blame]	1969	/*
David Hildenbrand	b30c592	2020-10-15 20:08:23 -0700	[diff] [blame]	1970	* The memory sections are marked offline, and the pageblock flags
				1971	* effectively stale; nobody should be touching them. Fixup the number
				1972	* of isolated pageblocks, memory onlining will properly revert this.
Qian Cai	9b7ea46	2019-03-28 20:43:34 -0700	[diff] [blame]	1973	*/
				1974	spin_lock_irqsave(&zone->lock, flags);
David Hildenbrand	ea15153	2020-10-15 20:08:03 -0700	[diff] [blame]	1975	zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
Qian Cai	9b7ea46	2019-03-28 20:43:34 -0700	[diff] [blame]	1976	spin_unlock_irqrestore(&zone->lock, flags);
				1977
Minchan Kim	d479960e	2021-05-04 18:36:54 -0700	[diff] [blame]	1978	lru_cache_enable();
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	1979	zone_pcp_enable(zone);
				1980
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1981	/* removal success */
David Hildenbrand	0a1a9a0	2020-10-15 20:07:54 -0700	[diff] [blame]	1982	adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
David Hildenbrand	836809e	2021-09-07 19:55:30 -0700	[diff] [blame]	1983	adjust_present_page_count(pfn_to_page(start_pfn), group, -nr_pages);
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1984
Mel Gorman	b92ca18	2021-06-28 19:42:12 -0700	[diff] [blame]	1985	/* reinitialise watermarks and update pcp limits */
KOSAKI Motohiro	1b79acc	2011-05-24 17:11:32 -0700	[diff] [blame]	1986	init_per_zone_wmark_min();
				1987
Xishi Qiu	1e8537b	2012-10-08 16:31:51 -0700	[diff] [blame]	1988	if (!populated_zone(zone)) {
Jiang Liu	340175b	2012-07-31 16:43:32 -0700	[diff] [blame]	1989	zone_pcp_reset(zone);
Michal Hocko	72675e1	2017-09-06 16:20:24 -0700	[diff] [blame]	1990	build_all_zonelists(NULL);
Mel Gorman	b92ca18	2021-06-28 19:42:12 -0700	[diff] [blame]	1991	}
Jiang Liu	340175b	2012-07-31 16:43:32 -0700	[diff] [blame]	1992
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1993	node_states_clear_node(node, &arg);
Vlastimil Babka	698b1b3	2016-03-17 14:18:08 -0700	[diff] [blame]	1994	if (arg.status_change_nid >= 0) {
David Rientjes	8fe23e0	2009-12-14 17:58:33 -0800	[diff] [blame]	1995	kswapd_stop(node);
Vlastimil Babka	698b1b3	2016-03-17 14:18:08 -0700	[diff] [blame]	1996	kcompactd_stop(node);
				1997	}
Minchan Kim	bce7394	2009-06-16 15:32:50 -0700	[diff] [blame]	1998
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1999	writeback_set_ratelimit();
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	2000
				2001	memory_notify(MEM_OFFLINE, &arg);
David Hildenbrand	feee6b2	2020-01-04 12:59:33 -0800	[diff] [blame]	2002	remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	2003	mem_hotplug_done();
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	2004	return 0;
				2005
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	2006	failed_removal_isolated:
				2007	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
Qian Cai	c4efe48	2019-03-28 20:44:16 -0700	[diff] [blame]	2008	memory_notify(MEM_CANCEL_OFFLINE, &arg);
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	2009	failed_removal_pcplists_disabled:
Miaohe Lin	946746d1	2021-08-25 12:17:55 -0700	[diff] [blame]	2010	lru_cache_enable();
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	2011	zone_pcp_enable(zone);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	2012	failed_removal:
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	2013	pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
Chen Yucong	e33e33b	2016-03-17 14:19:35 -0700	[diff] [blame]	2014	(unsigned long long) start_pfn << PAGE_SHIFT,
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	2015	((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
				2016	reason);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	2017	/* pushback to free area */
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	2018	mem_hotplug_done();
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	2019	return ret;
				2020	}
Badari Pulavarty	7108878	2008-10-18 20:25:58 -0700	[diff] [blame]	2021
Xishi Qiu	d6de9d5	2013-11-12 15:07:20 -0800	[diff] [blame]	2022	static int check_memblock_offlined_cb(struct memory_block mem, void arg)
Wen Congyang	bbc76be	2013-02-22 16:32:54 -0800	[diff] [blame]	2023	{
				2024	int ret = !is_memblock_offlined(mem);
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2025	int *nid = arg;
Wen Congyang	bbc76be	2013-02-22 16:32:54 -0800	[diff] [blame]	2026
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2027	*nid = mem->nid;
Randy Dunlap	349daa0	2013-04-29 15:08:49 -0700	[diff] [blame]	2028	if (unlikely(ret)) {
				2029	phys_addr_t beginpa, endpa;
				2030
				2031	beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
David Hildenbrand	b6c88d3	2019-09-23 15:35:49 -0700	[diff] [blame]	2032	endpa = beginpa + memory_block_size_bytes() - 1;
Joe Perches	756a025	2016-03-17 14:19:47 -0700	[diff] [blame]	2033	pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
Randy Dunlap	349daa0	2013-04-29 15:08:49 -0700	[diff] [blame]	2034	&beginpa, &endpa);
Wen Congyang	bbc76be	2013-02-22 16:32:54 -0800	[diff] [blame]	2035
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2036	return -EBUSY;
				2037	}
				2038	return 0;
Wen Congyang	bbc76be	2013-02-22 16:32:54 -0800	[diff] [blame]	2039	}
				2040
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	2041	static int get_nr_vmemmap_pages_cb(struct memory_block mem, void arg)
				2042	{
				2043	/*
				2044	* If not set, continue with the next block.
				2045	*/
				2046	return mem->nr_vmemmap_pages;
				2047	}
				2048
Toshi Kani	0f1cfe9	2013-09-11 14:21:50 -0700	[diff] [blame]	2049	static int check_cpu_on_node(pg_data_t *pgdat)
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2050	{
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2051	int cpu;
				2052
				2053	for_each_present_cpu(cpu) {
				2054	if (cpu_to_node(cpu) == pgdat->node_id)
				2055	/*
				2056	* the cpu on this node isn't removed, and we can't
				2057	* offline this node.
				2058	*/
				2059	return -EBUSY;
				2060	}
				2061
				2062	return 0;
				2063	}
				2064
David Hildenbrand	2c91f8f	2019-11-15 17:34:57 -0800	[diff] [blame]	2065	static int check_no_memblock_for_node_cb(struct memory_block mem, void arg)
				2066	{
				2067	int nid = (int )arg;
				2068
				2069	/*
				2070	* If a memory block belongs to multiple nodes, the stored nid is not
				2071	* reliable. However, such blocks are always online (e.g., cannot get
				2072	* offlined) and, therefore, are still spanned by the node.
				2073	*/
				2074	return mem->nid == nid ? -EEXIST : 0;
				2075	}
				2076
Toshi Kani	0f1cfe9	2013-09-11 14:21:50 -0700	[diff] [blame]	2077	/**
				2078	* try_offline_node
Mike Rapoport	e8b098f	2018-04-05 16:24:57 -0700	[diff] [blame]	2079	* @nid: the node ID
Toshi Kani	0f1cfe9	2013-09-11 14:21:50 -0700	[diff] [blame]	2080	*
				2081	* Offline a node if all memory sections and cpus of the node are removed.
				2082	*
				2083	* NOTE: The caller must call lock_device_hotplug() to serialize hotplug
				2084	* and online/offline operations before this call.
				2085	*/
Wen Congyang	90b30cd	2013-02-22 16:33:27 -0800	[diff] [blame]	2086	void try_offline_node(int nid)
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2087	{
Wen Congyang	d822b86	2013-02-22 16:33:16 -0800	[diff] [blame]	2088	pg_data_t *pgdat = NODE_DATA(nid);
David Hildenbrand	2c91f8f	2019-11-15 17:34:57 -0800	[diff] [blame]	2089	int rc;
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2090
David Hildenbrand	2c91f8f	2019-11-15 17:34:57 -0800	[diff] [blame]	2091	/*
				2092	* If the node still spans pages (especially ZONE_DEVICE), don't
				2093	* offline it. A node spans memory after move_pfn_range_to_zone(),
				2094	* e.g., after the memory block was onlined.
				2095	*/
				2096	if (pgdat->node_spanned_pages)
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2097	return;
David Hildenbrand	2c91f8f	2019-11-15 17:34:57 -0800	[diff] [blame]	2098
				2099	/*
				2100	* Especially offline memory blocks might not be spanned by the
				2101	* node. They will get spanned by the node once they get onlined.
				2102	* However, they link to the node in sysfs and can get onlined later.
				2103	*/
				2104	rc = for_each_memory_block(&nid, check_no_memblock_for_node_cb);
				2105	if (rc)
				2106	return;
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2107
Michal Hocko	46a3679	2018-12-28 00:34:13 -0800	[diff] [blame]	2108	if (check_cpu_on_node(pgdat))
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2109	return;
				2110
				2111	/*
				2112	* all memory/cpu of this node are removed, we can offline this
				2113	* node now.
				2114	*/
				2115	node_set_offline(nid);
				2116	unregister_one_node(nid);
				2117	}
Wen Congyang	90b30cd	2013-02-22 16:33:27 -0800	[diff] [blame]	2118	EXPORT_SYMBOL(try_offline_node);
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2119
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2120	static int __ref try_remove_memory(u64 start, u64 size)
Wen Congyang	bbc76be	2013-02-22 16:32:54 -0800	[diff] [blame]	2121	{
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	2122	struct vmem_altmap mhp_altmap = {};
				2123	struct vmem_altmap *altmap = NULL;
				2124	unsigned long nr_vmemmap_pages;
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2125	int rc = 0, nid = NUMA_NO_NODE;
Wen Congyang	993c1aa	2013-02-22 16:32:50 -0800	[diff] [blame]	2126
Toshi Kani	27356f5	2013-09-11 14:21:49 -0700	[diff] [blame]	2127	BUG_ON(check_hotplug_memory_range(start, size));
				2128
Yasuaki Ishimatsu	6677e3e	2013-02-22 16:32:52 -0800	[diff] [blame]	2129	/*
Rafael J. Wysocki	242831e	2013-05-27 12:58:46 +0200	[diff] [blame]	2130	* All memory blocks must be offlined before removing memory. Check
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2131	* whether all memory blocks in question are offline and return error
Rafael J. Wysocki	242831e	2013-05-27 12:58:46 +0200	[diff] [blame]	2132	* if this is not the case.
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2133	*
				2134	* While at it, determine the nid. Note that if we'd have mixed nodes,
				2135	* we'd only try to offline the last determined one -- which is good
				2136	* enough for the cases we care about.
Yasuaki Ishimatsu	6677e3e	2013-02-22 16:32:52 -0800	[diff] [blame]	2137	*/
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2138	rc = walk_memory_blocks(start, size, &nid, check_memblock_offlined_cb);
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2139	if (rc)
Jia He	b4223a5	2020-08-11 18:32:20 -0700	[diff] [blame]	2140	return rc;
Yasuaki Ishimatsu	6677e3e	2013-02-22 16:32:52 -0800	[diff] [blame]	2141
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	2142	/*
				2143	* We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
				2144	* the same granularity it was added - a single memory block.
				2145	*/
				2146	if (memmap_on_memory) {
				2147	nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
				2148	get_nr_vmemmap_pages_cb);
				2149	if (nr_vmemmap_pages) {
				2150	if (size != memory_block_size_bytes()) {
				2151	pr_warn("Refuse to remove %#llx - %#llx,"
				2152	"wrong granularity\n",
				2153	start, start + size);
				2154	return -EINVAL;
				2155	}
				2156
				2157	/*
				2158	* Let remove_pmd_table->free_hugepage_table do the
				2159	* right thing if we used vmem_altmap when hot-adding
				2160	* the range.
				2161	*/
				2162	mhp_altmap.alloc = nr_vmemmap_pages;
				2163	altmap = &mhp_altmap;
				2164	}
				2165	}
				2166
Yasuaki Ishimatsu	46c66c4	2013-02-22 16:32:56 -0800	[diff] [blame]	2167	/* remove memmap entry */
				2168	firmware_map_remove(start, start + size, "System RAM");
				2169
Dan Williams	f1037ec	2020-01-30 22:11:17 -0800	[diff] [blame]	2170	/*
				2171	* Memory block device removal under the device_hotplug_lock is
				2172	* a barrier against racing online attempts.
				2173	*/
David Hildenbrand	4c4b7f9	2019-07-18 15:57:06 -0700	[diff] [blame]	2174	remove_memory_block_devices(start, size);
				2175
Dan Williams	f1037ec	2020-01-30 22:11:17 -0800	[diff] [blame]	2176	mem_hotplug_begin();
				2177
David Hildenbrand	65a2aa5	2021-09-07 19:55:04 -0700	[diff] [blame]	2178	arch_remove_memory(start, size, altmap);
David Hildenbrand	52219ae	2020-06-04 16:48:38 -0700	[diff] [blame]	2179
				2180	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
Mike Rapoport	3ecc683	2021-11-05 13:43:19 -0700	[diff] [blame]	2181	memblock_phys_free(start, size);
David Hildenbrand	52219ae	2020-06-04 16:48:38 -0700	[diff] [blame]	2182	memblock_remove(start, size);
				2183	}
				2184
David Hildenbrand	cb8e3c8	2020-10-15 20:09:12 -0700	[diff] [blame]	2185	release_mem_region_adjustable(start, size);
Wen Congyang	24d335c	2013-02-22 16:32:58 -0800	[diff] [blame]	2186
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2187	if (nid != NUMA_NO_NODE)
				2188	try_offline_node(nid);
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2189
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	2190	mem_hotplug_done();
Jia He	b4223a5	2020-08-11 18:32:20 -0700	[diff] [blame]	2191	return 0;
Badari Pulavarty	7108878	2008-10-18 20:25:58 -0700	[diff] [blame]	2192	}
David Hildenbrand	d15e592	2018-10-30 15:10:18 -0700	[diff] [blame]	2193
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2194	/**
Mel Gorman	5640c9c	2021-06-30 18:53:38 -0700	[diff] [blame]	2195	* __remove_memory - Remove memory if every memory block is offline
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2196	* @start: physical address of the region to remove
				2197	* @size: size of the region to remove
				2198	*
				2199	* NOTE: The caller must call lock_device_hotplug() to serialize hotplug
				2200	* and online/offline operations before this call, as required by
				2201	* try_offline_node().
				2202	*/
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2203	void __remove_memory(u64 start, u64 size)
David Hildenbrand	d15e592	2018-10-30 15:10:18 -0700	[diff] [blame]	2204	{
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2205
				2206	/*
Souptick Joarder	29a90db	2019-09-23 15:36:18 -0700	[diff] [blame]	2207	* trigger BUG() if some memory is not offlined prior to calling this
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2208	* function
				2209	*/
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2210	if (try_remove_memory(start, size))
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2211	BUG();
				2212	}
				2213
				2214	/*
				2215	* Remove memory if every memory block is offline, otherwise return -EBUSY is
				2216	* some memory is not offline
				2217	*/
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2218	int remove_memory(u64 start, u64 size)
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2219	{
				2220	int rc;
				2221
David Hildenbrand	d15e592	2018-10-30 15:10:18 -0700	[diff] [blame]	2222	lock_device_hotplug();
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2223	rc = try_remove_memory(start, size);
David Hildenbrand	d15e592	2018-10-30 15:10:18 -0700	[diff] [blame]	2224	unlock_device_hotplug();
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2225
				2226	return rc;
David Hildenbrand	d15e592	2018-10-30 15:10:18 -0700	[diff] [blame]	2227	}
Badari Pulavarty	7108878	2008-10-18 20:25:58 -0700	[diff] [blame]	2228	EXPORT_SYMBOL_GPL(remove_memory);
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2229
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2230	static int try_offline_memory_block(struct memory_block mem, void arg)
				2231	{
				2232	uint8_t online_type = MMOP_ONLINE_KERNEL;
				2233	uint8_t **online_types = arg;
				2234	struct page *page;
				2235	int rc;
				2236
				2237	/*
				2238	* Sense the online_type via the zone of the memory block. Offlining
				2239	* with multiple zones within one memory block will be rejected
				2240	* by offlining code ... so we don't care about that.
				2241	*/
				2242	page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr));
				2243	if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE)
				2244	online_type = MMOP_ONLINE_MOVABLE;
				2245
				2246	rc = device_offline(&mem->dev);
				2247	/*
				2248	* Default is MMOP_OFFLINE - change it only if offlining succeeded,
				2249	* so try_reonline_memory_block() can do the right thing.
				2250	*/
				2251	if (!rc)
				2252	**online_types = online_type;
				2253
				2254	(*online_types)++;
				2255	/* Ignore if already offline. */
				2256	return rc < 0 ? rc : 0;
				2257	}
				2258
				2259	static int try_reonline_memory_block(struct memory_block mem, void arg)
				2260	{
				2261	uint8_t **online_types = arg;
				2262	int rc;
				2263
				2264	if (**online_types != MMOP_OFFLINE) {
				2265	mem->online_type = **online_types;
				2266	rc = device_online(&mem->dev);
				2267	if (rc < 0)
				2268	pr_warn("%s: Failed to re-online memory: %d",
				2269	__func__, rc);
				2270	}
				2271
				2272	/* Continue processing all remaining memory blocks. */
				2273	(*online_types)++;
				2274	return 0;
				2275	}
				2276
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2277	/*
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2278	* Try to offline and remove memory. Might take a long time to finish in case
				2279	* memory is still in use. Primarily useful for memory devices that logically
				2280	* unplugged all memory (so it's no longer in use) and want to offline + remove
				2281	* that memory.
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2282	*/
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2283	int offline_and_remove_memory(u64 start, u64 size)
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2284	{
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2285	const unsigned long mb_count = size / memory_block_size_bytes();
				2286	uint8_t online_types, tmp;
				2287	int rc;
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2288
				2289	if (!IS_ALIGNED(start, memory_block_size_bytes()) \|\|
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2290	!IS_ALIGNED(size, memory_block_size_bytes()) \|\| !size)
				2291	return -EINVAL;
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2292
				2293	/*
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2294	* We'll remember the old online type of each memory block, so we can
				2295	* try to revert whatever we did when offlining one memory block fails
				2296	* after offlining some others succeeded.
				2297	*/
				2298	online_types = kmalloc_array(mb_count, sizeof(*online_types),
				2299	GFP_KERNEL);
				2300	if (!online_types)
				2301	return -ENOMEM;
				2302	/*
				2303	* Initialize all states to MMOP_OFFLINE, so when we abort processing in
				2304	* try_offline_memory_block(), we'll skip all unprocessed blocks in
				2305	* try_reonline_memory_block().
				2306	*/
				2307	memset(online_types, MMOP_OFFLINE, mb_count);
				2308
				2309	lock_device_hotplug();
				2310
				2311	tmp = online_types;
				2312	rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block);
				2313
				2314	/*
				2315	* In case we succeeded to offline all memory, remove it.
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2316	* This cannot fail as it cannot get onlined in the meantime.
				2317	*/
				2318	if (!rc) {
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2319	rc = try_remove_memory(start, size);
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2320	if (rc)
				2321	pr_err("%s: Failed to remove memory: %d", __func__, rc);
				2322	}
				2323
				2324	/*
				2325	* Rollback what we did. While memory onlining might theoretically fail
				2326	* (nacked by a notifier), it barely ever happens.
				2327	*/
				2328	if (rc) {
				2329	tmp = online_types;
				2330	walk_memory_blocks(start, size, &tmp,
				2331	try_reonline_memory_block);
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2332	}
				2333	unlock_device_hotplug();
				2334
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2335	kfree(online_types);
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2336	return rc;
				2337	}
				2338	EXPORT_SYMBOL_GPL(offline_and_remove_memory);
Rafael J. Wysocki	aba6efc	2013-06-01 22:24:07 +0200	[diff] [blame]	2339	#endif /* CONFIG_MEMORY_HOTREMOVE */