Blame - mm/page_reporting.c - SHIFTPHONES/mainline/linux

blob: c50d93ffa2526924deea2433a9405a1f5f785ac1 [file] [log] [blame]

Alexander Duyck	36e66c5	2020-04-06 20:04:56 -0700	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	#include <linux/mm.h>
				3	#include <linux/mmzone.h>
				4	#include <linux/page_reporting.h>
				5	#include <linux/gfp.h>
				6	#include <linux/export.h>
				7	#include <linux/delay.h>
				8	#include <linux/scatterlist.h>
				9
				10	#include "page_reporting.h"
				11	#include "internal.h"
				12
				13	#define PAGE_REPORTING_DELAY (2 * HZ)
				14	static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
				15
				16	enum {
				17	PAGE_REPORTING_IDLE = 0,
				18	PAGE_REPORTING_REQUESTED,
				19	PAGE_REPORTING_ACTIVE
				20	};
				21
				22	/* request page reporting */
				23	static void
				24	__page_reporting_request(struct page_reporting_dev_info *prdev)
				25	{
				26	unsigned int state;
				27
				28	/* Check to see if we are in desired state */
				29	state = atomic_read(&prdev->state);
				30	if (state == PAGE_REPORTING_REQUESTED)
				31	return;
				32
				33	/*
				34	* If reporting is already active there is nothing we need to do.
				35	* Test against 0 as that represents PAGE_REPORTING_IDLE.
				36	*/
				37	state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED);
				38	if (state != PAGE_REPORTING_IDLE)
				39	return;
				40
				41	/*
				42	* Delay the start of work to allow a sizable queue to build. For
				43	* now we are limiting this to running no more than once every
				44	* couple of seconds.
				45	*/
				46	schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
				47	}
				48
				49	/* notify prdev of free page reporting request */
				50	void __page_reporting_notify(void)
				51	{
				52	struct page_reporting_dev_info *prdev;
				53
				54	/*
				55	* We use RCU to protect the pr_dev_info pointer. In almost all
				56	* cases this should be present, however in the unlikely case of
				57	* a shutdown this will be NULL and we should exit.
				58	*/
				59	rcu_read_lock();
				60	prdev = rcu_dereference(pr_dev_info);
				61	if (likely(prdev))
				62	__page_reporting_request(prdev);
				63
				64	rcu_read_unlock();
				65	}
				66
				67	static void
				68	page_reporting_drain(struct page_reporting_dev_info *prdev,
				69	struct scatterlist *sgl, unsigned int nents, bool reported)
				70	{
				71	struct scatterlist *sg = sgl;
				72
				73	/*
				74	* Drain the now reported pages back into their respective
				75	* free lists/areas. We assume at least one page is populated.
				76	*/
				77	do {
				78	struct page *page = sg_page(sg);
				79	int mt = get_pageblock_migratetype(page);
				80	unsigned int order = get_order(sg->length);
				81
				82	__putback_isolated_page(page, order, mt);
				83
				84	/* If the pages were not reported due to error skip flagging */
				85	if (!reported)
				86	continue;
				87
				88	/*
				89	* If page was not comingled with another page we can
				90	* consider the result to be "reported" since the page
				91	* hasn't been modified, otherwise we will need to
				92	* report on the new larger page when we make our way
				93	* up to that higher order.
				94	*/
Matthew Wilcox (Oracle)	ab130f91	2020-10-15 20:10:15 -0700	[diff] [blame]	95	if (PageBuddy(page) && buddy_order(page) == order)
Alexander Duyck	36e66c5	2020-04-06 20:04:56 -0700	[diff] [blame]	96	__SetPageReported(page);
				97	} while ((sg = sg_next(sg)));
				98
				99	/* reinitialize scatterlist now that it is empty */
				100	sg_init_table(sgl, nents);
				101	}
				102
				103	/*
				104	* The page reporting cycle consists of 4 stages, fill, report, drain, and
				105	* idle. We will cycle through the first 3 stages until we cannot obtain a
				106	* full scatterlist of pages, in that case we will switch to idle.
				107	*/
				108	static int
				109	page_reporting_cycle(struct page_reporting_dev_info prdev, struct zone zone,
				110	unsigned int order, unsigned int mt,
				111	struct scatterlist sgl, unsigned int offset)
				112	{
				113	struct free_area *area = &zone->free_area[order];
				114	struct list_head *list = &area->free_list[mt];
				115	unsigned int page_len = PAGE_SIZE << order;
				116	struct page page, next;
Alexander Duyck	43b76f2	2020-04-06 20:05:14 -0700	[diff] [blame]	117	long budget;
Alexander Duyck	36e66c5	2020-04-06 20:04:56 -0700	[diff] [blame]	118	int err = 0;
				119
				120	/*
				121	* Perform early check, if free area is empty there is
				122	* nothing to process so we can skip this free_list.
				123	*/
				124	if (list_empty(list))
				125	return err;
				126
				127	spin_lock_irq(&zone->lock);
				128
Alexander Duyck	43b76f2	2020-04-06 20:05:14 -0700	[diff] [blame]	129	/*
				130	* Limit how many calls we will be making to the page reporting
				131	* device for this list. By doing this we avoid processing any
				132	* given list for too long.
				133	*
				134	* The current value used allows us enough calls to process over a
				135	* sixteenth of the current list plus one additional call to handle
				136	* any pages that may have already been present from the previous
				137	* list processed. This should result in us reporting all pages on
				138	* an idle system in about 30 seconds.
				139	*
				140	* The division here should be cheap since PAGE_REPORTING_CAPACITY
				141	* should always be a power of 2.
				142	*/
				143	budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16);
				144
Alexander Duyck	36e66c5	2020-04-06 20:04:56 -0700	[diff] [blame]	145	/* loop through free list adding unreported pages to sg list */
				146	list_for_each_entry_safe(page, next, list, lru) {
				147	/* We are going to skip over the reported pages. */
				148	if (PageReported(page))
				149	continue;
				150
Alexander Duyck	43b76f2	2020-04-06 20:05:14 -0700	[diff] [blame]	151	/*
				152	* If we fully consumed our budget then update our
				153	* state to indicate that we are requesting additional
				154	* processing and exit this list.
				155	*/
				156	if (budget < 0) {
				157	atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
				158	next = page;
				159	break;
				160	}
				161
Alexander Duyck	02cf871	2020-04-06 20:05:10 -0700	[diff] [blame]	162	/* Attempt to pull page from list and place in scatterlist */
				163	if (*offset) {
				164	if (!__isolate_free_page(page, order)) {
				165	next = page;
				166	break;
				167	}
Alexander Duyck	36e66c5	2020-04-06 20:04:56 -0700	[diff] [blame]	168
Alexander Duyck	02cf871	2020-04-06 20:05:10 -0700	[diff] [blame]	169	/* Add page to scatter list */
				170	--(*offset);
				171	sg_set_page(&sgl[*offset], page, page_len, 0);
Alexander Duyck	36e66c5	2020-04-06 20:04:56 -0700	[diff] [blame]	172
Alexander Duyck	36e66c5	2020-04-06 20:04:56 -0700	[diff] [blame]	173	continue;
Alexander Duyck	02cf871	2020-04-06 20:05:10 -0700	[diff] [blame]	174	}
				175
				176	/*
Alexander Duyck	43b76f2	2020-04-06 20:05:14 -0700	[diff] [blame]	177	* Make the first non-reported page in the free list
Alexander Duyck	02cf871	2020-04-06 20:05:10 -0700	[diff] [blame]	178	* the new head of the free list before we release the
				179	* zone lock.
				180	*/
Wei Yang	58f6f03	2020-10-15 20:09:49 -0700	[diff] [blame]	181	if (!list_is_first(&page->lru, list))
Alexander Duyck	02cf871	2020-04-06 20:05:10 -0700	[diff] [blame]	182	list_rotate_to_front(&page->lru, list);
Alexander Duyck	36e66c5	2020-04-06 20:04:56 -0700	[diff] [blame]	183
				184	/* release lock before waiting on report processing */
				185	spin_unlock_irq(&zone->lock);
				186
				187	/* begin processing pages in local list */
				188	err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);
				189
				190	/* reset offset since the full list was reported */
				191	*offset = PAGE_REPORTING_CAPACITY;
				192
Alexander Duyck	43b76f2	2020-04-06 20:05:14 -0700	[diff] [blame]	193	/* update budget to reflect call to report function */
				194	budget--;
				195
Alexander Duyck	36e66c5	2020-04-06 20:04:56 -0700	[diff] [blame]	196	/* reacquire zone lock and resume processing */
				197	spin_lock_irq(&zone->lock);
				198
				199	/* flush reported pages from the sg list */
				200	page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err);
				201
				202	/*
				203	* Reset next to first entry, the old next isn't valid
				204	* since we dropped the lock to report the pages
				205	*/
				206	next = list_first_entry(list, struct page, lru);
				207
				208	/* exit on error */
				209	if (err)
				210	break;
				211	}
				212
Alexander Duyck	02cf871	2020-04-06 20:05:10 -0700	[diff] [blame]	213	/* Rotate any leftover pages to the head of the freelist */
sh_def@163.com	5df6d79	2021-02-24 12:04:57 -0800	[diff] [blame]	214	if (!list_entry_is_head(next, list, lru) && !list_is_first(&next->lru, list))
Alexander Duyck	02cf871	2020-04-06 20:05:10 -0700	[diff] [blame]	215	list_rotate_to_front(&next->lru, list);
				216
Alexander Duyck	36e66c5	2020-04-06 20:04:56 -0700	[diff] [blame]	217	spin_unlock_irq(&zone->lock);
				218
				219	return err;
				220	}
				221
				222	static int
				223	page_reporting_process_zone(struct page_reporting_dev_info *prdev,
				224	struct scatterlist sgl, struct zone zone)
				225	{
				226	unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
				227	unsigned long watermark;
				228	int err = 0;
				229
				230	/* Generate minimum watermark to be able to guarantee progress */
				231	watermark = low_wmark_pages(zone) +
				232	(PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER);
				233
				234	/*
				235	* Cancel request if insufficient free memory or if we failed
				236	* to allocate page reporting statistics for the zone.
				237	*/
				238	if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
				239	return err;
				240
				241	/* Process each free list starting from lowest order/mt */
				242	for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) {
				243	for (mt = 0; mt < MIGRATE_TYPES; mt++) {
				244	/* We do not pull pages from the isolate free list */
				245	if (is_migrate_isolate(mt))
				246	continue;
				247
				248	err = page_reporting_cycle(prdev, zone, order, mt,
				249	sgl, &offset);
				250	if (err)
				251	return err;
				252	}
				253	}
				254
				255	/* report the leftover pages before going idle */
				256	leftover = PAGE_REPORTING_CAPACITY - offset;
				257	if (leftover) {
				258	sgl = &sgl[offset];
				259	err = prdev->report(prdev, sgl, leftover);
				260
				261	/* flush any remaining pages out from the last report */
				262	spin_lock_irq(&zone->lock);
				263	page_reporting_drain(prdev, sgl, leftover, !err);
				264	spin_unlock_irq(&zone->lock);
				265	}
				266
				267	return err;
				268	}
				269
				270	static void page_reporting_process(struct work_struct *work)
				271	{
				272	struct delayed_work *d_work = to_delayed_work(work);
				273	struct page_reporting_dev_info *prdev =
				274	container_of(d_work, struct page_reporting_dev_info, work);
				275	int err = 0, state = PAGE_REPORTING_ACTIVE;
				276	struct scatterlist *sgl;
				277	struct zone *zone;
				278
				279	/*
				280	* Change the state to "Active" so that we can track if there is
				281	* anyone requests page reporting after we complete our pass. If
				282	* the state is not altered by the end of the pass we will switch
				283	* to idle and quit scheduling reporting runs.
				284	*/
				285	atomic_set(&prdev->state, state);
				286
				287	/* allocate scatterlist to store pages being reported on */
				288	sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);
				289	if (!sgl)
				290	goto err_out;
				291
				292	sg_init_table(sgl, PAGE_REPORTING_CAPACITY);
				293
				294	for_each_zone(zone) {
				295	err = page_reporting_process_zone(prdev, sgl, zone);
				296	if (err)
				297	break;
				298	}
				299
				300	kfree(sgl);
				301	err_out:
				302	/*
				303	* If the state has reverted back to requested then there may be
				304	* additional pages to be processed. We will defer for 2s to allow
				305	* more pages to accumulate.
				306	*/
				307	state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
				308	if (state == PAGE_REPORTING_REQUESTED)
				309	schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
				310	}
				311
				312	static DEFINE_MUTEX(page_reporting_mutex);
				313	DEFINE_STATIC_KEY_FALSE(page_reporting_enabled);
				314
				315	int page_reporting_register(struct page_reporting_dev_info *prdev)
				316	{
				317	int err = 0;
				318
				319	mutex_lock(&page_reporting_mutex);
				320
				321	/* nothing to do if already in use */
				322	if (rcu_access_pointer(pr_dev_info)) {
				323	err = -EBUSY;
				324	goto err_out;
				325	}
				326
				327	/* initialize state and work structures */
				328	atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
				329	INIT_DELAYED_WORK(&prdev->work, &page_reporting_process);
				330
				331	/* Begin initial flush of zones */
				332	__page_reporting_request(prdev);
				333
				334	/* Assign device to allow notifications */
				335	rcu_assign_pointer(pr_dev_info, prdev);
				336
				337	/* enable page reporting notification */
				338	if (!static_key_enabled(&page_reporting_enabled)) {
				339	static_branch_enable(&page_reporting_enabled);
				340	pr_info("Free page reporting enabled\n");
				341	}
				342	err_out:
				343	mutex_unlock(&page_reporting_mutex);
				344
				345	return err;
				346	}
				347	EXPORT_SYMBOL_GPL(page_reporting_register);
				348
				349	void page_reporting_unregister(struct page_reporting_dev_info *prdev)
				350	{
				351	mutex_lock(&page_reporting_mutex);
				352
				353	if (rcu_access_pointer(pr_dev_info) == prdev) {
				354	/* Disable page reporting notification */
				355	RCU_INIT_POINTER(pr_dev_info, NULL);
				356	synchronize_rcu();
				357
				358	/* Flush any existing work, and lock it out */
				359	cancel_delayed_work_sync(&prdev->work);
				360	}
				361
				362	mutex_unlock(&page_reporting_mutex);
				363	}
				364	EXPORT_SYMBOL_GPL(page_reporting_unregister);