Blame - mm/ksm.c - SHIFTPHONES/kernel/shift/mainline

blob: 8960f6ecbc12bbf952f0dee41be2a2e35eda078d [file] [log] [blame]

Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1	/*
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	2	* Memory merging support.
				3	*
				4	* This code enables dynamic sharing of identical pages found in different
				5	* memory areas, even if they are not shared by fork()
				6	*
Izik Eidus	36b2528	2009-09-21 17:02:06 -0700	[diff] [blame]	7	* Copyright (C) 2008-2009 Red Hat, Inc.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	8	* Authors:
				9	* Izik Eidus
				10	* Andrea Arcangeli
				11	* Chris Wright
Izik Eidus	36b2528	2009-09-21 17:02:06 -0700	[diff] [blame]	12	* Hugh Dickins
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	13	*
				14	* This work is licensed under the terms of the GNU GPL, version 2.
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	15	*/
				16
				17	#include <linux/errno.h>
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	18	#include <linux/mm.h>
				19	#include <linux/fs.h>
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	20	#include <linux/mman.h>
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	21	#include <linux/sched.h>
				22	#include <linux/rwsem.h>
				23	#include <linux/pagemap.h>
				24	#include <linux/rmap.h>
				25	#include <linux/spinlock.h>
				26	#include <linux/jhash.h>
				27	#include <linux/delay.h>
				28	#include <linux/kthread.h>
				29	#include <linux/wait.h>
				30	#include <linux/slab.h>
				31	#include <linux/rbtree.h>
Hugh Dickins	62b61f6	2009-12-14 17:59:33 -0800	[diff] [blame]	32	#include <linux/memory.h>
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	33	#include <linux/mmu_notifier.h>
Izik Eidus	2c6854f	2009-09-23 15:56:04 -0700	[diff] [blame]	34	#include <linux/swap.h>
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	35	#include <linux/ksm.h>
Sasha Levin	4ca3a69	2013-02-22 16:32:28 -0800	[diff] [blame]	36	#include <linux/hashtable.h>
Andrea Arcangeli	878aee7	2011-01-13 15:47:10 -0800	[diff] [blame]	37	#include <linux/freezer.h>
David Rientjes	72788c3	2011-05-24 17:11:40 -0700	[diff] [blame]	38	#include <linux/oom.h>
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	39	#include <linux/numa.h>
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	40
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	41	#include <asm/tlbflush.h>
Hugh Dickins	73848b4	2009-12-14 17:59:22 -0800	[diff] [blame]	42	#include "internal.h"
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	43
Hugh Dickins	e850dcf	2013-02-22 16:35:03 -0800	[diff] [blame]	44	#ifdef CONFIG_NUMA
				45	#define NUMA(x) (x)
				46	#define DO_NUMA(x) do { (x); } while (0)
				47	#else
				48	#define NUMA(x) (0)
				49	#define DO_NUMA(x) do { } while (0)
				50	#endif
				51
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	52	/*
				53	* A few notes about the KSM scanning process,
				54	* to make it easier to understand the data structures below:
				55	*
				56	* In order to reduce excessive scanning, KSM sorts the memory pages by their
				57	* contents into a data structure that holds pointers to the pages' locations.
				58	*
				59	* Since the contents of the pages may change at any moment, KSM cannot just
				60	* insert the pages into a normal sorted tree and expect it to find anything.
				61	* Therefore KSM uses two data structures - the stable and the unstable tree.
				62	*
				63	* The stable tree holds pointers to all the merged pages (ksm pages), sorted
				64	* by their contents. Because each such page is write-protected, searching on
				65	* this tree is fully assured to be working (except when pages are unmapped),
				66	* and therefore this tree is called the stable tree.
				67	*
				68	* In addition to the stable tree, KSM uses a second data structure called the
				69	* unstable tree: this tree holds pointers to pages which have been found to
				70	* be "unchanged for a period of time". The unstable tree sorts these pages
				71	* by their contents, but since they are not write-protected, KSM cannot rely
				72	* upon the unstable tree to work correctly - the unstable tree is liable to
				73	* be corrupted as its contents are modified, and so it is called unstable.
				74	*
				75	* KSM solves this problem by several techniques:
				76	*
				77	* 1) The unstable tree is flushed every time KSM completes scanning all
				78	* memory areas, and then the tree is rebuilt again from the beginning.
				79	* 2) KSM will only insert into the unstable tree, pages whose hash value
				80	* has not changed since the previous scan of all memory areas.
				81	* 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
				82	* colors of the nodes and not on their contents, assuring that even when
				83	* the tree gets "corrupted" it won't get out of balance, so scanning time
				84	* remains the same (also, searching and inserting nodes in an rbtree uses
				85	* the same algorithm, so we have no overhead when we flush and rebuild).
				86	* 4) KSM never flushes the stable tree, which means that even if it were to
				87	* take 10 attempts to find a page in the unstable tree, once it is found,
				88	* it is secured in the stable tree. (When we scan a new page, we first
				89	* compare it against the stable tree, and then against the unstable tree.)
Hugh Dickins	8fdb3db	2013-02-22 16:36:03 -0800	[diff] [blame]	90	*
				91	* If the merge_across_nodes tunable is unset, then KSM maintains multiple
				92	* stable trees and multiple unstable trees: one of each for each NUMA node.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	93	*/
				94
				95	/**
				96	* struct mm_slot - ksm information per mm that is being scanned
				97	* @link: link to the mm_slots hash list
				98	* @mm_list: link into the mm_slots list, rooted in ksm_mm_head
Hugh Dickins	6514d51	2009-12-14 17:59:19 -0800	[diff] [blame]	99	* @rmap_list: head for this mm_slot's singly-linked list of rmap_items
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	100	* @mm: the mm that this information is valid for
				101	*/
				102	struct mm_slot {
				103	struct hlist_node link;
				104	struct list_head mm_list;
Hugh Dickins	6514d51	2009-12-14 17:59:19 -0800	[diff] [blame]	105	struct rmap_item *rmap_list;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	106	struct mm_struct *mm;
				107	};
				108
				109	/**
				110	* struct ksm_scan - cursor for scanning
				111	* @mm_slot: the current mm_slot we are scanning
				112	* @address: the next address inside that to be scanned
Hugh Dickins	6514d51	2009-12-14 17:59:19 -0800	[diff] [blame]	113	* @rmap_list: link to the next rmap to be scanned in the rmap_list
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	114	* @seqnr: count of completed full scans (needed when removing unstable node)
				115	*
				116	* There is only the one ksm_scan instance of this cursor structure.
				117	*/
				118	struct ksm_scan {
				119	struct mm_slot *mm_slot;
				120	unsigned long address;
Hugh Dickins	6514d51	2009-12-14 17:59:19 -0800	[diff] [blame]	121	struct rmap_item **rmap_list;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	122	unsigned long seqnr;
				123	};
				124
				125	/**
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	126	* struct stable_node - node of the stable rbtree
				127	* @node: rb node of this ksm page in the stable tree
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	128	* @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
				129	* @list: linked into migrate_nodes, pending placement in the proper node tree
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	130	* @hlist: hlist head of rmap_items using this ksm page
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	131	* @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
				132	* @nid: NUMA node id of stable tree in which linked (may not match kpfn)
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	133	*/
				134	struct stable_node {
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	135	union {
				136	struct rb_node node; /* when node of stable tree */
				137	struct { /* when listed for migration */
				138	struct list_head *head;
				139	struct list_head list;
				140	};
				141	};
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	142	struct hlist_head hlist;
Hugh Dickins	62b61f6	2009-12-14 17:59:33 -0800	[diff] [blame]	143	unsigned long kpfn;
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	144	#ifdef CONFIG_NUMA
				145	int nid;
				146	#endif
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	147	};
				148
				149	/**
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	150	* struct rmap_item - reverse mapping item for virtual addresses
Hugh Dickins	6514d51	2009-12-14 17:59:19 -0800	[diff] [blame]	151	* @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
Hugh Dickins	db114b8	2009-12-14 17:59:25 -0800	[diff] [blame]	152	* @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
Hugh Dickins	bc56620	2013-02-22 16:36:06 -0800	[diff] [blame]	153	* @nid: NUMA node id of unstable tree in which linked (may not match page)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	154	* @mm: the memory structure this rmap_item is pointing into
				155	* @address: the virtual address this rmap_item tracks (+ flags in low bits)
				156	* @oldchecksum: previous checksum of the page at that virtual address
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	157	* @node: rb node of this rmap_item in the unstable tree
				158	* @head: pointer to stable_node heading this list in the stable tree
				159	* @hlist: link into hlist of rmap_items hanging off that stable_node
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	160	*/
				161	struct rmap_item {
Hugh Dickins	6514d51	2009-12-14 17:59:19 -0800	[diff] [blame]	162	struct rmap_item *rmap_list;
Hugh Dickins	bc56620	2013-02-22 16:36:06 -0800	[diff] [blame]	163	union {
				164	struct anon_vma *anon_vma; /* when stable */
				165	#ifdef CONFIG_NUMA
				166	int nid; /* when node of unstable tree */
				167	#endif
				168	};
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	169	struct mm_struct *mm;
				170	unsigned long address; /* + low bits used for flags below */
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	171	unsigned int oldchecksum; /* when unstable */
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	172	union {
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	173	struct rb_node node; /* when node of unstable tree */
				174	struct { /* when listed from stable tree */
				175	struct stable_node *head;
				176	struct hlist_node hlist;
				177	};
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	178	};
				179	};
				180
				181	#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	182	#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
				183	#define STABLE_FLAG 0x200 /* is listed from the stable tree */
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	184
				185	/* The stable and unstable tree heads */
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	186	static struct rb_root one_stable_tree[1] = { RB_ROOT };
				187	static struct rb_root one_unstable_tree[1] = { RB_ROOT };
				188	static struct rb_root *root_stable_tree = one_stable_tree;
				189	static struct rb_root *root_unstable_tree = one_unstable_tree;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	190
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	191	/* Recently migrated nodes of stable tree, pending proper placement */
				192	static LIST_HEAD(migrate_nodes);
				193
Sasha Levin	4ca3a69	2013-02-22 16:32:28 -0800	[diff] [blame]	194	#define MM_SLOTS_HASH_BITS 10
				195	static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	196
				197	static struct mm_slot ksm_mm_head = {
				198	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
				199	};
				200	static struct ksm_scan ksm_scan = {
				201	.mm_slot = &ksm_mm_head,
				202	};
				203
				204	static struct kmem_cache *rmap_item_cache;
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	205	static struct kmem_cache *stable_node_cache;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	206	static struct kmem_cache *mm_slot_cache;
				207
				208	/* The number of nodes in the stable tree */
Hugh Dickins	b402826	2009-09-21 17:02:09 -0700	[diff] [blame]	209	static unsigned long ksm_pages_shared;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	210
Hugh Dickins	e178dfd	2009-09-21 17:02:10 -0700	[diff] [blame]	211	/* The number of page slots additionally sharing those nodes */
Hugh Dickins	b402826	2009-09-21 17:02:09 -0700	[diff] [blame]	212	static unsigned long ksm_pages_sharing;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	213
Hugh Dickins	473b0ce	2009-09-21 17:02:11 -0700	[diff] [blame]	214	/* The number of nodes in the unstable tree */
				215	static unsigned long ksm_pages_unshared;
				216
				217	/* The number of rmap_items in use: to calculate pages_volatile */
				218	static unsigned long ksm_rmap_items;
				219
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	220	/* Number of pages ksmd should scan in one batch */
Izik Eidus	2c6854f	2009-09-23 15:56:04 -0700	[diff] [blame]	221	static unsigned int ksm_thread_pages_to_scan = 100;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	222
				223	/* Milliseconds ksmd should sleep between batches */
Hugh Dickins	2ffd867	2009-09-21 17:02:23 -0700	[diff] [blame]	224	static unsigned int ksm_thread_sleep_millisecs = 20;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	225
Claudio Imbrenda	e86c59b	2017-02-24 14:55:39 -0800	[diff] [blame]	226	/* Checksum of an empty (zeroed) page */
				227	static unsigned int zero_checksum __read_mostly;
				228
				229	/* Whether to merge empty (zeroed) pages with actual zero pages */
				230	static bool ksm_use_zero_pages __read_mostly;
				231
Hugh Dickins	e850dcf	2013-02-22 16:35:03 -0800	[diff] [blame]	232	#ifdef CONFIG_NUMA
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	233	/* Zeroed when merging across nodes is not allowed */
				234	static unsigned int ksm_merge_across_nodes = 1;
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	235	static int ksm_nr_node_ids = 1;
Hugh Dickins	e850dcf	2013-02-22 16:35:03 -0800	[diff] [blame]	236	#else
				237	#define ksm_merge_across_nodes 1U
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	238	#define ksm_nr_node_ids 1
Hugh Dickins	e850dcf	2013-02-22 16:35:03 -0800	[diff] [blame]	239	#endif
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	240
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	241	#define KSM_RUN_STOP 0
				242	#define KSM_RUN_MERGE 1
				243	#define KSM_RUN_UNMERGE 2
Hugh Dickins	ef4d43a	2013-02-22 16:35:16 -0800	[diff] [blame]	244	#define KSM_RUN_OFFLINE 4
				245	static unsigned long ksm_run = KSM_RUN_STOP;
				246	static void wait_while_offlining(void);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	247
				248	static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
				249	static DEFINE_MUTEX(ksm_thread_mutex);
				250	static DEFINE_SPINLOCK(ksm_mmlist_lock);
				251
				252	#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
				253	sizeof(struct __struct), __alignof__(struct __struct),\
				254	(__flags), NULL)
				255
				256	static int __init ksm_slab_init(void)
				257	{
				258	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
				259	if (!rmap_item_cache)
				260	goto out;
				261
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	262	stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
				263	if (!stable_node_cache)
				264	goto out_free1;
				265
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	266	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
				267	if (!mm_slot_cache)
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	268	goto out_free2;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	269
				270	return 0;
				271
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	272	out_free2:
				273	kmem_cache_destroy(stable_node_cache);
				274	out_free1:
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	275	kmem_cache_destroy(rmap_item_cache);
				276	out:
				277	return -ENOMEM;
				278	}
				279
				280	static void __init ksm_slab_free(void)
				281	{
				282	kmem_cache_destroy(mm_slot_cache);
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	283	kmem_cache_destroy(stable_node_cache);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	284	kmem_cache_destroy(rmap_item_cache);
				285	mm_slot_cache = NULL;
				286	}
				287
				288	static inline struct rmap_item *alloc_rmap_item(void)
				289	{
Hugh Dickins	473b0ce	2009-09-21 17:02:11 -0700	[diff] [blame]	290	struct rmap_item *rmap_item;
				291
zhong jiang	5b398e4	2016-09-28 15:22:30 -0700	[diff] [blame]	292	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
				293	__GFP_NORETRY | __GFP_NOWARN);
Hugh Dickins	473b0ce	2009-09-21 17:02:11 -0700	[diff] [blame]	294	if (rmap_item)
				295	ksm_rmap_items++;
				296	return rmap_item;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	297	}
				298
				299	static inline void free_rmap_item(struct rmap_item *rmap_item)
				300	{
Hugh Dickins	473b0ce	2009-09-21 17:02:11 -0700	[diff] [blame]	301	ksm_rmap_items--;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	302	rmap_item->mm = NULL; /* debug safety */
				303	kmem_cache_free(rmap_item_cache, rmap_item);
				304	}
				305
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	306	static inline struct stable_node *alloc_stable_node(void)
				307	{
zhong jiang	6213055	2016-10-07 17:01:19 -0700	[diff] [blame]	308	/*
				309	* The allocation can take too long with GFP_KERNEL when memory is under
				310	* pressure, which may lead to hung task warnings. Adding __GFP_HIGH
				311	* grants access to memory reserves, helping to avoid this problem.
				312	*/
				313	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	314	}
				315
				316	static inline void free_stable_node(struct stable_node *stable_node)
				317	{
				318	kmem_cache_free(stable_node_cache, stable_node);
				319	}
				320
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	321	static inline struct mm_slot *alloc_mm_slot(void)
				322	{
				323	if (!mm_slot_cache) /* initialization failed */
				324	return NULL;
				325	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
				326	}
				327
				328	static inline void free_mm_slot(struct mm_slot *mm_slot)
				329	{
				330	kmem_cache_free(mm_slot_cache, mm_slot);
				331	}
				332
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	333	static struct mm_slot *get_mm_slot(struct mm_struct *mm)
				334	{
Sasha Levin	4ca3a69	2013-02-22 16:32:28 -0800	[diff] [blame]	335	struct mm_slot *slot;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	336
Sasha Levin	b67bfe0	2013-02-27 17:06:00 -0800	[diff] [blame]	337	hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
Sasha Levin	4ca3a69	2013-02-22 16:32:28 -0800	[diff] [blame]	338	if (slot->mm == mm)
				339	return slot;
				340
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	341	return NULL;
				342	}
				343
				344	static void insert_to_mm_slots_hash(struct mm_struct *mm,
				345	struct mm_slot *mm_slot)
				346	{
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	347	mm_slot->mm = mm;
Sasha Levin	4ca3a69	2013-02-22 16:32:28 -0800	[diff] [blame]	348	hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	349	}
				350
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	351	/*
Hugh Dickins	a913e18	2009-09-21 17:02:26 -0700	[diff] [blame]	352	* ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
				353	* page tables after it has passed through ksm_exit() - which, if necessary,
				354	* takes mmap_sem briefly to serialize against them. ksm_exit() does not set
				355	* a special flag: they can just back out as soon as mm_users goes to zero.
				356	* ksm_test_exit() is used throughout to make this test for exit: in some
				357	* places for correctness, in some places just to avoid unnecessary work.
				358	*/
				359	static inline bool ksm_test_exit(struct mm_struct *mm)
				360	{
				361	return atomic_read(&mm->mm_users) == 0;
				362	}
				363
				364	/*
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	365	* We use break_ksm to break COW on a ksm page: it's a stripped down
				366	*
Dave Hansen	d4edcf0	2016-02-12 13:01:56 -0800	[diff] [blame]	367	* if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	368	* put_page(page);
				369	*
				370	* but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
				371	* in case the application has unmapped and remapped mm,addr meanwhile.
				372	* Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
				373	* mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
Dave Hansen	1b2ee12	2016-02-12 13:02:21 -0800	[diff] [blame]	374	*
				375	* FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
				376	* of the process that owns 'vma'. We also do not want to enforce
				377	* protection keys here anyway.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	378	*/
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	379	static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	380	{
				381	struct page *page;
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	382	int ret = 0;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	383
				384	do {
				385	cond_resched();
Dave Hansen	1b2ee12	2016-02-12 13:02:21 -0800	[diff] [blame]	386	page = follow_page(vma, addr,
				387	FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
Dan Carpenter	22eccdd	2010-04-23 13:18:10 -0400	[diff] [blame]	388	if (IS_ERR_OR_NULL(page))
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	389	break;
				390	if (PageKsm(page))
Kirill A. Shutemov	dcddffd	2016-07-26 15:25:18 -0700	[diff] [blame]	391	ret = handle_mm_fault(vma, addr,
				392	FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	393	else
				394	ret = VM_FAULT_WRITE;
				395	put_page(page);
Linus Torvalds	33692f2	2015-01-29 10:51:32 -0800	[diff] [blame]	396	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	397	/*
				398	* We must loop because handle_mm_fault() may back out if there's
				399	* any difficulty e.g. if pte accessed bit gets updated concurrently.
				400	*
				401	* VM_FAULT_WRITE is what we have been hoping for: it indicates that
				402	* COW has been broken, even if the vma does not permit VM_WRITE;
				403	* but note that a concurrent fault might break PageKsm for us.
				404	*
				405	* VM_FAULT_SIGBUS could occur if we race with truncation of the
				406	* backing file, which also invalidates anonymous pages: that's
				407	* okay, that truncation will have unmapped the PageKsm for us.
				408	*
				409	* VM_FAULT_OOM: at the time of writing (late July 2009), setting
				410	* aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
				411	* current task has TIF_MEMDIE set, and will be OOM killed on return
				412	* to user; and ksmd, having no mm, would never be chosen for that.
				413	*
				414	* But if the mm is in a limited mem_cgroup, then the fault may fail
				415	* with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
				416	* even ksmd can fail in this way - though it's usually breaking ksm
				417	* just to undo a merge it made a moment before, so unlikely to oom.
				418	*
				419	* That's a pity: we might therefore have more kernel pages allocated
				420	* than we're counting as nodes in the stable tree; but ksm_do_scan
				421	* will retry to break_cow on each pass, so should recover the page
				422	* in due course. The important thing is to not let VM_MERGEABLE
				423	* be cleared while any such pages might remain in the area.
				424	*/
				425	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	426	}
				427
Bob Liu	ef69422	2012-03-21 16:34:11 -0700	[diff] [blame]	428	static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
				429	unsigned long addr)
				430	{
				431	struct vm_area_struct *vma;
				432	if (ksm_test_exit(mm))
				433	return NULL;
				434	vma = find_vma(mm, addr);
				435	if (!vma || vma->vm_start > addr)
				436	return NULL;
				437	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				438	return NULL;
				439	return vma;
				440	}
				441
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	442	static void break_cow(struct rmap_item *rmap_item)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	443	{
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	444	struct mm_struct *mm = rmap_item->mm;
				445	unsigned long addr = rmap_item->address;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	446	struct vm_area_struct *vma;
				447
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	448	/*
				449	* It is not an accident that whenever we want to break COW
				450	* to undo, we also need to drop a reference to the anon_vma.
				451	*/
Peter Zijlstra	9e60109	2011-03-22 16:32:46 -0700	[diff] [blame]	452	put_anon_vma(rmap_item->anon_vma);
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	453
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame]	454	down_read(&mm->mmap_sem);
Bob Liu	ef69422	2012-03-21 16:34:11 -0700	[diff] [blame]	455	vma = find_mergeable_vma(mm, addr);
				456	if (vma)
				457	break_ksm(vma, addr);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	458	up_read(&mm->mmap_sem);
				459	}
				460
				461	static struct page *get_mergeable_page(struct rmap_item *rmap_item)
				462	{
				463	struct mm_struct *mm = rmap_item->mm;
				464	unsigned long addr = rmap_item->address;
				465	struct vm_area_struct *vma;
				466	struct page *page;
				467
				468	down_read(&mm->mmap_sem);
Bob Liu	ef69422	2012-03-21 16:34:11 -0700	[diff] [blame]	469	vma = find_mergeable_vma(mm, addr);
				470	if (!vma)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	471	goto out;
				472
				473	page = follow_page(vma, addr, FOLL_GET);
Dan Carpenter	22eccdd	2010-04-23 13:18:10 -0400	[diff] [blame]	474	if (IS_ERR_OR_NULL(page))
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	475	goto out;
Kirill A. Shutemov	f765f54	2016-01-15 16:53:03 -0800	[diff] [blame]	476	if (PageAnon(page)) {
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	477	flush_anon_page(vma, page, addr);
				478	flush_dcache_page(page);
				479	} else {
				480	put_page(page);
Andrea Arcangeli	c8f95ed	2015-11-05 18:49:19 -0800	[diff] [blame]	481	out:
				482	page = NULL;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	483	}
				484	up_read(&mm->mmap_sem);
				485	return page;
				486	}
				487
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	488	/*
				489	* This helper is used for getting right index into array of tree roots.
				490	* When merge_across_nodes knob is set to 1, there are only two rb-trees for
				491	* stable and unstable pages from all nodes with roots in index 0. Otherwise,
				492	* every node has its own stable and unstable tree.
				493	*/
				494	static inline int get_kpfn_nid(unsigned long kpfn)
				495	{
Hugh Dickins	d8fc16a	2013-03-08 12:43:34 -0800	[diff] [blame]	496	return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	497	}
				498
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	499	static void remove_node_from_stable_tree(struct stable_node *stable_node)
				500	{
				501	struct rmap_item *rmap_item;
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	502
Sasha Levin	b67bfe0	2013-02-27 17:06:00 -0800	[diff] [blame]	503	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	504	if (rmap_item->hlist.next)
				505	ksm_pages_sharing--;
				506	else
				507	ksm_pages_shared--;
Peter Zijlstra	9e60109	2011-03-22 16:32:46 -0700	[diff] [blame]	508	put_anon_vma(rmap_item->anon_vma);
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	509	rmap_item->address &= PAGE_MASK;
				510	cond_resched();
				511	}
				512
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	513	if (stable_node->head == &migrate_nodes)
				514	list_del(&stable_node->list);
				515	else
				516	rb_erase(&stable_node->node,
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	517	root_stable_tree + NUMA(stable_node->nid));
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	518	free_stable_node(stable_node);
				519	}
				520
				521	/*
				522	* get_ksm_page: checks if the page indicated by the stable node
				523	* is still its ksm page, despite having held no reference to it.
				524	* In which case we can trust the content of the page, and it
				525	* returns the gotten page; but if the page has now been zapped,
				526	* remove the stale node from the stable tree and return NULL.
Hugh Dickins	c8d6553	2013-02-22 16:35:10 -0800	[diff] [blame]	527	* But beware, the stable node's page might be being migrated.
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	528	*
				529	* You would expect the stable_node to hold a reference to the ksm page.
				530	* But if it increments the page's count, swapping out has to wait for
				531	* ksmd to come around again before it can free the page, which may take
				532	* seconds or even minutes: much too unresponsive. So instead we use a
				533	* "keyhole reference": access to the ksm page from the stable node peeps
				534	* out through its keyhole to see if that page still holds the right key,
				535	* pointing back to this stable node. This relies on freeing a PageAnon
				536	* page to reset its page->mapping to NULL, and relies on no other use of
				537	* a page to put something that might look like our key in page->mapping.
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	538	* is on its way to being freed; but it is an anomaly to bear in mind.
				539	*/
Hugh Dickins	8fdb3db	2013-02-22 16:36:03 -0800	[diff] [blame]	540	static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	541	{
				542	struct page *page;
				543	void *expected_mapping;
Hugh Dickins	c8d6553	2013-02-22 16:35:10 -0800	[diff] [blame]	544	unsigned long kpfn;
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	545
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	546	expected_mapping = (void *)((unsigned long)stable_node |
				547	PAGE_MAPPING_KSM);
Hugh Dickins	c8d6553	2013-02-22 16:35:10 -0800	[diff] [blame]	548	again:
Jason Low	4db0c3c	2015-04-15 16:14:08 -0700	[diff] [blame]	549	kpfn = READ_ONCE(stable_node->kpfn);
Hugh Dickins	c8d6553	2013-02-22 16:35:10 -0800	[diff] [blame]	550	page = pfn_to_page(kpfn);
				551
				552	/*
				553	* page is computed from kpfn, so on most architectures reading
				554	* page->mapping is naturally ordered after reading node->kpfn,
				555	* but on Alpha we need to be more careful.
				556	*/
				557	smp_read_barrier_depends();
Jason Low	4db0c3c	2015-04-15 16:14:08 -0700	[diff] [blame]	558	if (READ_ONCE(page->mapping) != expected_mapping)
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	559	goto stale;
Hugh Dickins	c8d6553	2013-02-22 16:35:10 -0800	[diff] [blame]	560
				561	/*
				562	* We cannot do anything with the page while its refcount is 0.
				563	* Usually 0 means free, or tail of a higher-order page: in which
				564	* case this node is no longer referenced, and should be freed;
				565	* however, it might mean that the page is under page_freeze_refs().
				566	* The __remove_mapping() case is easy, again the node is now stale;
				567	* but if page is swapcache in migrate_page_move_mapping(), it might
				568	* still be our page, in which case it's essential to keep the node.
				569	*/
				570	while (!get_page_unless_zero(page)) {
				571	/*
				572	* Another check for page->mapping != expected_mapping would
				573	* work here too. We have chosen the !PageSwapCache test to
				574	* optimize the common case, when the page is or is about to
				575	* be freed: PageSwapCache is cleared (under spin_lock_irq)
				576	* in the freeze_refs section of __remove_mapping(); but Anon
				577	* page->mapping reset to NULL later, in free_pages_prepare().
				578	*/
				579	if (!PageSwapCache(page))
				580	goto stale;
				581	cpu_relax();
				582	}
				583
Jason Low	4db0c3c	2015-04-15 16:14:08 -0700	[diff] [blame]	584	if (READ_ONCE(page->mapping) != expected_mapping) {
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	585	put_page(page);
				586	goto stale;
				587	}
Hugh Dickins	c8d6553	2013-02-22 16:35:10 -0800	[diff] [blame]	588
Hugh Dickins	8fdb3db	2013-02-22 16:36:03 -0800	[diff] [blame]	589	if (lock_it) {
Hugh Dickins	8aafa6a	2013-02-22 16:35:06 -0800	[diff] [blame]	590	lock_page(page);
Jason Low	4db0c3c	2015-04-15 16:14:08 -0700	[diff] [blame]	591	if (READ_ONCE(page->mapping) != expected_mapping) {
Hugh Dickins	8aafa6a	2013-02-22 16:35:06 -0800	[diff] [blame]	592	unlock_page(page);
				593	put_page(page);
				594	goto stale;
				595	}
				596	}
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	597	return page;
Hugh Dickins	c8d6553	2013-02-22 16:35:10 -0800	[diff] [blame]	598
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	599	stale:
Hugh Dickins	c8d6553	2013-02-22 16:35:10 -0800	[diff] [blame]	600	/*
				601	* We come here from above when page->mapping or !PageSwapCache
				602	* suggests that the node is stale; but it might be under migration.
				603	* We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
				604	* before checking whether node->kpfn has been changed.
				605	*/
				606	smp_rmb();
Jason Low	4db0c3c	2015-04-15 16:14:08 -0700	[diff] [blame]	607	if (READ_ONCE(stable_node->kpfn) != kpfn)
Hugh Dickins	c8d6553	2013-02-22 16:35:10 -0800	[diff] [blame]	608	goto again;
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	609	remove_node_from_stable_tree(stable_node);
				610	return NULL;
				611	}
				612
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	613	/*
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	614	* Removing rmap_item from stable or unstable tree.
				615	* This function will clean the information from the stable/unstable tree.
				616	*/
				617	static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
				618	{
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	619	if (rmap_item->address & STABLE_FLAG) {
				620	struct stable_node *stable_node;
Hugh Dickins	5ad6468	2009-12-14 17:59:24 -0800	[diff] [blame]	621	struct page *page;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	622
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	623	stable_node = rmap_item->head;
Hugh Dickins	8aafa6a	2013-02-22 16:35:06 -0800	[diff] [blame]	624	page = get_ksm_page(stable_node, true);
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	625	if (!page)
				626	goto out;
				627
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	628	hlist_del(&rmap_item->hlist);
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	629	unlock_page(page);
				630	put_page(page);
Hugh Dickins	08beca4	2009-12-14 17:59:21 -0800	[diff] [blame]	631
Andrea Arcangeli	98666f8a	2015-11-05 18:49:13 -0800	[diff] [blame]	632	if (!hlist_empty(&stable_node->hlist))
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	633	ksm_pages_sharing--;
				634	else
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	635	ksm_pages_shared--;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	636
Peter Zijlstra	9e60109	2011-03-22 16:32:46 -0700	[diff] [blame]	637	put_anon_vma(rmap_item->anon_vma);
Hugh Dickins	93d1771	2009-12-14 17:59:16 -0800	[diff] [blame]	638	rmap_item->address &= PAGE_MASK;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	639
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	640	} else if (rmap_item->address & UNSTABLE_FLAG) {
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	641	unsigned char age;
				642	/*
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	643	* Usually ksmd can and must skip the rb_erase, because
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	644	* root_unstable_tree was already reset to RB_ROOT.
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	645	* But be careful when an mm is exiting: do the rb_erase
				646	* if this rmap_item was inserted by this scan, rather
				647	* than left over from before.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	648	*/
				649	age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
Hugh Dickins	cd551f9	2009-09-21 17:02:17 -0700	[diff] [blame]	650	BUG_ON(age > 1);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	651	if (!age)
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	652	rb_erase(&rmap_item->node,
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	653	root_unstable_tree + NUMA(rmap_item->nid));
Hugh Dickins	93d1771	2009-12-14 17:59:16 -0800	[diff] [blame]	654	ksm_pages_unshared--;
				655	rmap_item->address &= PAGE_MASK;
				656	}
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	657	out:
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	658	cond_resched(); /* we're called from many long loops */
				659	}
				660
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	661	static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
Hugh Dickins	6514d51	2009-12-14 17:59:19 -0800	[diff] [blame]	662	struct rmap_item **rmap_list)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	663	{
Hugh Dickins	6514d51	2009-12-14 17:59:19 -0800	[diff] [blame]	664	while (*rmap_list) {
				665	struct rmap_item rmap_item = rmap_list;
				666	*rmap_list = rmap_item->rmap_list;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	667	remove_rmap_item_from_tree(rmap_item);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	668	free_rmap_item(rmap_item);
				669	}
				670	}
				671
				672	/*
Hugh Dickins	e850dcf	2013-02-22 16:35:03 -0800	[diff] [blame]	673	* Though it's very tempting to unmerge rmap_items from stable tree rather
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	674	* than check every pte of a given vma, the locking doesn't quite work for
				675	* that - an rmap_item is assigned to the stable tree after inserting ksm
				676	* page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
				677	* rmap_items from parent to child at fork time (so as not to waste time
				678	* if exit comes before the next scan reaches it).
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame]	679	*
				680	* Similarly, although we'd like to remove rmap_items (so updating counts
				681	* and freeing memory) when unmerging an area, it's easier to leave that
				682	* to the next pass of ksmd - consider, for example, how ksmd might be
				683	* in cmp_and_merge_page on one of the rmap_items we would be removing.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	684	*/
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	685	static int unmerge_ksm_pages(struct vm_area_struct *vma,
				686	unsigned long start, unsigned long end)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	687	{
				688	unsigned long addr;
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	689	int err = 0;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	690
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	691	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	692	if (ksm_test_exit(vma->vm_mm))
				693	break;
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	694	if (signal_pending(current))
				695	err = -ERESTARTSYS;
				696	else
				697	err = break_ksm(vma, addr);
				698	}
				699	return err;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	700	}
				701
Hugh Dickins	2ffd867	2009-09-21 17:02:23 -0700	[diff] [blame]	702	#ifdef CONFIG_SYSFS
				703	/*
				704	* Only called through the sysfs control interface:
				705	*/
Hugh Dickins	cbf86cf	2013-02-22 16:35:08 -0800	[diff] [blame]	706	static int remove_stable_node(struct stable_node *stable_node)
				707	{
				708	struct page *page;
				709	int err;
				710
				711	page = get_ksm_page(stable_node, true);
				712	if (!page) {
				713	/*
				714	* get_ksm_page did remove_node_from_stable_tree itself.
				715	*/
				716	return 0;
				717	}
				718
Hugh Dickins	8fdb3db	2013-02-22 16:36:03 -0800	[diff] [blame]	719	if (WARN_ON_ONCE(page_mapped(page))) {
Hugh Dickins	cbf86cf	2013-02-22 16:35:08 -0800	[diff] [blame]	720	/*
Hugh Dickins	8fdb3db	2013-02-22 16:36:03 -0800	[diff] [blame]	721	* This should not happen: but if it does, just refuse to let
				722	* merge_across_nodes be switched - there is no need to panic.
				723	*/
				724	err = -EBUSY;
				725	} else {
				726	/*
				727	* The stable node did not yet appear stale to get_ksm_page(),
				728	* since that allows for an unmapped ksm page to be recognized
				729	* right up until it is freed; but the node is safe to remove.
Hugh Dickins	cbf86cf	2013-02-22 16:35:08 -0800	[diff] [blame]	730	* This page might be in a pagevec waiting to be freed,
				731	* or it might be PageSwapCache (perhaps under writeback),
				732	* or it might have been removed from swapcache a moment ago.
				733	*/
				734	set_page_stable_node(page, NULL);
				735	remove_node_from_stable_tree(stable_node);
				736	err = 0;
				737	}
				738
				739	unlock_page(page);
				740	put_page(page);
				741	return err;
				742	}
				743
				744	static int remove_all_stable_nodes(void)
				745	{
Geliang Tang	0364041	2016-01-14 15:20:54 -0800	[diff] [blame]	746	struct stable_node stable_node, next;
Hugh Dickins	cbf86cf	2013-02-22 16:35:08 -0800	[diff] [blame]	747	int nid;
				748	int err = 0;
				749
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	750	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
Hugh Dickins	cbf86cf	2013-02-22 16:35:08 -0800	[diff] [blame]	751	while (root_stable_tree[nid].rb_node) {
				752	stable_node = rb_entry(root_stable_tree[nid].rb_node,
				753	struct stable_node, node);
				754	if (remove_stable_node(stable_node)) {
				755	err = -EBUSY;
				756	break; /* proceed to next nid */
				757	}
				758	cond_resched();
				759	}
				760	}
Geliang Tang	0364041	2016-01-14 15:20:54 -0800	[diff] [blame]	761	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	762	if (remove_stable_node(stable_node))
				763	err = -EBUSY;
				764	cond_resched();
				765	}
Hugh Dickins	cbf86cf	2013-02-22 16:35:08 -0800	[diff] [blame]	766	return err;
				767	}
				768
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	769	static int unmerge_and_remove_all_rmap_items(void)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	770	{
				771	struct mm_slot *mm_slot;
				772	struct mm_struct *mm;
				773	struct vm_area_struct *vma;
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	774	int err = 0;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	775
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	776	spin_lock(&ksm_mmlist_lock);
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	777	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	778	struct mm_slot, mm_list);
				779	spin_unlock(&ksm_mmlist_lock);
				780
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	781	for (mm_slot = ksm_scan.mm_slot;
				782	mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	783	mm = mm_slot->mm;
				784	down_read(&mm->mmap_sem);
				785	for (vma = mm->mmap; vma; vma = vma->vm_next) {
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	786	if (ksm_test_exit(mm))
				787	break;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	788	if (!(vma->vm_flags & VM_MERGEABLE) \|\| !vma->anon_vma)
				789	continue;
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	790	err = unmerge_ksm_pages(vma,
				791	vma->vm_start, vma->vm_end);
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	792	if (err)
				793	goto error;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	794	}
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	795
Hugh Dickins	6514d51	2009-12-14 17:59:19 -0800	[diff] [blame]	796	remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
Zhou Chengming	7496fea	2016-05-12 15:42:21 -0700	[diff] [blame]	797	up_read(&mm->mmap_sem);
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	798
				799	spin_lock(&ksm_mmlist_lock);
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	800	ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	801	struct mm_slot, mm_list);
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	802	if (ksm_test_exit(mm)) {
Sasha Levin	4ca3a69	2013-02-22 16:32:28 -0800	[diff] [blame]	803	hash_del(&mm_slot->link);
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	804	list_del(&mm_slot->mm_list);
				805	spin_unlock(&ksm_mmlist_lock);
				806
				807	free_mm_slot(mm_slot);
				808	clear_bit(MMF_VM_MERGEABLE, &mm->flags);
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	809	mmdrop(mm);
Zhou Chengming	7496fea	2016-05-12 15:42:21 -0700	[diff] [blame]	810	} else
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	811	spin_unlock(&ksm_mmlist_lock);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	812	}
				813
Hugh Dickins	cbf86cf	2013-02-22 16:35:08 -0800	[diff] [blame]	814	/* Clean up stable nodes, but don't worry if some are still busy */
				815	remove_all_stable_nodes();
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	816	ksm_scan.seqnr = 0;
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	817	return 0;
				818
				819	error:
				820	up_read(&mm->mmap_sem);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	821	spin_lock(&ksm_mmlist_lock);
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	822	ksm_scan.mm_slot = &ksm_mm_head;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	823	spin_unlock(&ksm_mmlist_lock);
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	824	return err;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	825	}
Hugh Dickins	2ffd867	2009-09-21 17:02:23 -0700	[diff] [blame]	826	#endif /* CONFIG_SYSFS */
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	827
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	828	static u32 calc_checksum(struct page *page)
				829	{
				830	u32 checksum;
Cong Wang	9b04c5f	2011-11-25 23:14:39 +0800	[diff] [blame]	831	void *addr = kmap_atomic(page);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	832	checksum = jhash2(addr, PAGE_SIZE / 4, 17);
Cong Wang	9b04c5f	2011-11-25 23:14:39 +0800	[diff] [blame]	833	kunmap_atomic(addr);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	834	return checksum;
				835	}
				836
				837	static int memcmp_pages(struct page page1, struct page page2)
				838	{
				839	char addr1, addr2;
				840	int ret;
				841
Cong Wang	9b04c5f	2011-11-25 23:14:39 +0800	[diff] [blame]	842	addr1 = kmap_atomic(page1);
				843	addr2 = kmap_atomic(page2);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	844	ret = memcmp(addr1, addr2, PAGE_SIZE);
Cong Wang	9b04c5f	2011-11-25 23:14:39 +0800	[diff] [blame]	845	kunmap_atomic(addr2);
				846	kunmap_atomic(addr1);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	847	return ret;
				848	}
				849
				850	static inline int pages_identical(struct page page1, struct page page2)
				851	{
				852	return !memcmp_pages(page1, page2);
				853	}
				854
				855	static int write_protect_page(struct vm_area_struct vma, struct page page,
				856	pte_t *orig_pte)
				857	{
				858	struct mm_struct *mm = vma->vm_mm;
Kirill A. Shutemov	36eaff3	2017-02-24 14:58:04 -0800	[diff] [blame^]	859	struct page_vma_mapped_walk pvmw = {
				860	.page = page,
				861	.vma = vma,
				862	};
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	863	int swapped;
				864	int err = -EFAULT;
Haggai Eran	6bdb913	2012-10-08 16:33:35 -0700	[diff] [blame]	865	unsigned long mmun_start; /* For mmu_notifiers */
				866	unsigned long mmun_end; /* For mmu_notifiers */
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	867
Kirill A. Shutemov	36eaff3	2017-02-24 14:58:04 -0800	[diff] [blame^]	868	pvmw.address = page_address_in_vma(page, vma);
				869	if (pvmw.address == -EFAULT)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	870	goto out;
				871
Andrea Arcangeli	29ad768	2011-01-13 15:47:19 -0800	[diff] [blame]	872	BUG_ON(PageTransCompound(page));
Haggai Eran	6bdb913	2012-10-08 16:33:35 -0700	[diff] [blame]	873
Kirill A. Shutemov	36eaff3	2017-02-24 14:58:04 -0800	[diff] [blame^]	874	mmun_start = pvmw.address;
				875	mmun_end = pvmw.address + PAGE_SIZE;
Haggai Eran	6bdb913	2012-10-08 16:33:35 -0700	[diff] [blame]	876	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
				877
Kirill A. Shutemov	36eaff3	2017-02-24 14:58:04 -0800	[diff] [blame^]	878	if (!page_vma_mapped_walk(&pvmw))
Haggai Eran	6bdb913	2012-10-08 16:33:35 -0700	[diff] [blame]	879	goto out_mn;
Kirill A. Shutemov	36eaff3	2017-02-24 14:58:04 -0800	[diff] [blame^]	880	if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
				881	goto out_unlock;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	882
Kirill A. Shutemov	36eaff3	2017-02-24 14:58:04 -0800	[diff] [blame^]	883	if (pte_write(pvmw.pte) \|\| pte_dirty(pvmw.pte)) {
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	884	pte_t entry;
				885
				886	swapped = PageSwapCache(page);
Kirill A. Shutemov	36eaff3	2017-02-24 14:58:04 -0800	[diff] [blame^]	887	flush_cache_page(vma, pvmw.address, page_to_pfn(page));
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	888	/*
Lucas De Marchi	25985ed	2011-03-30 22:57:33 -0300	[diff] [blame]	889	* Ok this is tricky, when get_user_pages_fast() run it doesn't
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	890	* take any lock, therefore the check that we are going to make
				891	* with the pagecount against the mapcount is racey and
				892	* O_DIRECT can happen right after the check.
				893	* So we clear the pte and flush the tlb before the check
				894	* this assure us that no O_DIRECT can happen after the check
				895	* or in the middle of the check.
				896	*/
Kirill A. Shutemov	36eaff3	2017-02-24 14:58:04 -0800	[diff] [blame^]	897	entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	898	/*
				899	* Check that no O_DIRECT or similar I/O is in progress on the
				900	* page
				901	*/
Hugh Dickins	31e855e	2009-12-14 17:59:17 -0800	[diff] [blame]	902	if (page_mapcount(page) + 1 + swapped != page_count(page)) {
Kirill A. Shutemov	36eaff3	2017-02-24 14:58:04 -0800	[diff] [blame^]	903	set_pte_at(mm, pvmw.address, pvmw.pte, entry);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	904	goto out_unlock;
				905	}
Hugh Dickins	4e31635	2010-10-02 17:49:08 -0700	[diff] [blame]	906	if (pte_dirty(entry))
				907	set_page_dirty(page);
				908	entry = pte_mkclean(pte_wrprotect(entry));
Kirill A. Shutemov	36eaff3	2017-02-24 14:58:04 -0800	[diff] [blame^]	909	set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	910	}
Kirill A. Shutemov	36eaff3	2017-02-24 14:58:04 -0800	[diff] [blame^]	911	orig_pte = pvmw.pte;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	912	err = 0;
				913
				914	out_unlock:
Kirill A. Shutemov	36eaff3	2017-02-24 14:58:04 -0800	[diff] [blame^]	915	page_vma_mapped_walk_done(&pvmw);
Haggai Eran	6bdb913	2012-10-08 16:33:35 -0700	[diff] [blame]	916	out_mn:
				917	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	918	out:
				919	return err;
				920	}
				921
				922	/**
				923	* replace_page - replace page in vma by new ksm page
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	924	* @vma: vma that holds the pte pointing to page
				925	* @page: the page we are replacing by kpage
				926	* @kpage: the ksm page we replace page by
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	927	* @orig_pte: the original value of the pte
				928	*
				929	* Returns 0 on success, -EFAULT on failure.
				930	*/
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	931	static int replace_page(struct vm_area_struct vma, struct page page,
				932	struct page *kpage, pte_t orig_pte)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	933	{
				934	struct mm_struct *mm = vma->vm_mm;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	935	pmd_t *pmd;
				936	pte_t *ptep;
Claudio Imbrenda	e86c59b	2017-02-24 14:55:39 -0800	[diff] [blame]	937	pte_t newpte;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	938	spinlock_t *ptl;
				939	unsigned long addr;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	940	int err = -EFAULT;
Haggai Eran	6bdb913	2012-10-08 16:33:35 -0700	[diff] [blame]	941	unsigned long mmun_start; /* For mmu_notifiers */
				942	unsigned long mmun_end; /* For mmu_notifiers */
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	943
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	944	addr = page_address_in_vma(page, vma);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	945	if (addr == -EFAULT)
				946	goto out;
				947
Bob Liu	6219049	2012-12-11 16:00:37 -0800	[diff] [blame]	948	pmd = mm_find_pmd(mm, addr);
				949	if (!pmd)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	950	goto out;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	951
Haggai Eran	6bdb913	2012-10-08 16:33:35 -0700	[diff] [blame]	952	mmun_start = addr;
				953	mmun_end = addr + PAGE_SIZE;
				954	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
				955
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	956	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
				957	if (!pte_same(*ptep, orig_pte)) {
				958	pte_unmap_unlock(ptep, ptl);
Haggai Eran	6bdb913	2012-10-08 16:33:35 -0700	[diff] [blame]	959	goto out_mn;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	960	}
				961
Claudio Imbrenda	e86c59b	2017-02-24 14:55:39 -0800	[diff] [blame]	962	/*
				963	* No need to check ksm_use_zero_pages here: we can only have a
				964	* zero_page here if ksm_use_zero_pages was enabled alreaady.
				965	*/
				966	if (!is_zero_pfn(page_to_pfn(kpage))) {
				967	get_page(kpage);
				968	page_add_anon_rmap(kpage, vma, addr, false);
				969	newpte = mk_pte(kpage, vma->vm_page_prot);
				970	} else {
				971	newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
				972	vma->vm_page_prot));
				973	}
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	974
				975	flush_cache_page(vma, addr, pte_pfn(*ptep));
Joerg Roedel	34ee645	2014-11-13 13:46:09 +1100	[diff] [blame]	976	ptep_clear_flush_notify(vma, addr, ptep);
Claudio Imbrenda	e86c59b	2017-02-24 14:55:39 -0800	[diff] [blame]	977	set_pte_at_notify(mm, addr, ptep, newpte);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	978
Kirill A. Shutemov	d281ee6	2016-01-15 16:52:16 -0800	[diff] [blame]	979	page_remove_rmap(page, false);
Hugh Dickins	ae52a2a	2011-01-13 15:46:28 -0800	[diff] [blame]	980	if (!page_mapped(page))
				981	try_to_free_swap(page);
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	982	put_page(page);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	983
				984	pte_unmap_unlock(ptep, ptl);
				985	err = 0;
Haggai Eran	6bdb913	2012-10-08 16:33:35 -0700	[diff] [blame]	986	out_mn:
				987	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	988	out:
				989	return err;
				990	}
				991
				992	/*
				993	* try_to_merge_one_page - take two pages and merge them into one
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	994	* @vma: the vma that holds the pte pointing to page
				995	* @page: the PageAnon page that we want to replace with kpage
Hugh Dickins	80e14822	2009-12-14 17:59:29 -0800	[diff] [blame]	996	* @kpage: the PageKsm page that we want to map instead of page,
				997	* or NULL the first time when we want to use page as kpage.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	998	*
				999	* This function returns 0 if the pages were merged, -EFAULT otherwise.
				1000	*/
				1001	static int try_to_merge_one_page(struct vm_area_struct *vma,
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1002	struct page page, struct page kpage)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1003	{
				1004	pte_t orig_pte = __pte(0);
				1005	int err = -EFAULT;
				1006
Hugh Dickins	db114b8	2009-12-14 17:59:25 -0800	[diff] [blame]	1007	if (page == kpage) /* ksm page forked */
				1008	return 0;
				1009
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1010	if (!PageAnon(page))
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1011	goto out;
				1012
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1013	/*
				1014	* We need the page lock to read a stable PageSwapCache in
				1015	* write_protect_page(). We use trylock_page() instead of
				1016	* lock_page() because we don't want to wait here - we
				1017	* prefer to continue scanning and merging different pages,
				1018	* then come back to this page when it is unlocked.
				1019	*/
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1020	if (!trylock_page(page))
Hugh Dickins	31e855e	2009-12-14 17:59:17 -0800	[diff] [blame]	1021	goto out;
Kirill A. Shutemov	f765f54	2016-01-15 16:53:03 -0800	[diff] [blame]	1022
				1023	if (PageTransCompound(page)) {
				1024	err = split_huge_page(page);
				1025	if (err)
				1026	goto out_unlock;
				1027	}
				1028
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1029	/*
				1030	* If this anonymous page is mapped only here, its pte may need
				1031	* to be write-protected. If it's mapped elsewhere, all of its
				1032	* ptes are necessarily already write-protected. But in either
				1033	* case, we need to lock and check page_count is not raised.
				1034	*/
Hugh Dickins	80e14822	2009-12-14 17:59:29 -0800	[diff] [blame]	1035	if (write_protect_page(vma, page, &orig_pte) == 0) {
				1036	if (!kpage) {
				1037	/*
				1038	* While we hold page lock, upgrade page from
				1039	* PageAnon+anon_vma to PageKsm+NULL stable_node:
				1040	* stable_tree_insert() will update stable_node.
				1041	*/
				1042	set_page_stable_node(page, NULL);
				1043	mark_page_accessed(page);
Minchan Kim	337ed7e	2016-01-15 16:55:15 -0800	[diff] [blame]	1044	/*
				1045	* Page reclaim just frees a clean page with no dirty
				1046	* ptes: make sure that the ksm page would be swapped.
				1047	*/
				1048	if (!PageDirty(page))
				1049	SetPageDirty(page);
Hugh Dickins	80e14822	2009-12-14 17:59:29 -0800	[diff] [blame]	1050	err = 0;
				1051	} else if (pages_identical(page, kpage))
				1052	err = replace_page(vma, page, kpage, orig_pte);
				1053	}
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1054
Hugh Dickins	80e14822	2009-12-14 17:59:29 -0800	[diff] [blame]	1055	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
Hugh Dickins	73848b4	2009-12-14 17:59:22 -0800	[diff] [blame]	1056	munlock_vma_page(page);
Hugh Dickins	5ad6468	2009-12-14 17:59:24 -0800	[diff] [blame]	1057	if (!PageMlocked(kpage)) {
				1058	unlock_page(page);
Hugh Dickins	5ad6468	2009-12-14 17:59:24 -0800	[diff] [blame]	1059	lock_page(kpage);
				1060	mlock_vma_page(kpage);
				1061	page = kpage; /* for final unlock */
				1062	}
				1063	}
Hugh Dickins	73848b4	2009-12-14 17:59:22 -0800	[diff] [blame]	1064
Kirill A. Shutemov	f765f54	2016-01-15 16:53:03 -0800	[diff] [blame]	1065	out_unlock:
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1066	unlock_page(page);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1067	out:
				1068	return err;
				1069	}
				1070
				1071	/*
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame]	1072	* try_to_merge_with_ksm_page - like try_to_merge_two_pages,
				1073	* but no new kernel page is allocated: kpage must already be a ksm page.
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1074	*
				1075	* This function returns 0 if the pages were merged, -EFAULT otherwise.
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame]	1076	*/
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1077	static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
				1078	struct page page, struct page kpage)
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame]	1079	{
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1080	struct mm_struct *mm = rmap_item->mm;
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame]	1081	struct vm_area_struct *vma;
				1082	int err = -EFAULT;
				1083
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1084	down_read(&mm->mmap_sem);
Andrea Arcangeli	85c6e8d	2015-11-05 18:49:16 -0800	[diff] [blame]	1085	vma = find_mergeable_vma(mm, rmap_item->address);
				1086	if (!vma)
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1087	goto out;
				1088
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1089	err = try_to_merge_one_page(vma, page, kpage);
Hugh Dickins	db114b8	2009-12-14 17:59:25 -0800	[diff] [blame]	1090	if (err)
				1091	goto out;
				1092
Hugh Dickins	bc56620	2013-02-22 16:36:06 -0800	[diff] [blame]	1093	/* Unstable nid is in union with stable anon_vma: remove first */
				1094	remove_rmap_item_from_tree(rmap_item);
				1095
Hugh Dickins	db114b8	2009-12-14 17:59:25 -0800	[diff] [blame]	1096	/* Must get reference to anon_vma while still holding mmap_sem */
Peter Zijlstra	9e60109	2011-03-22 16:32:46 -0700	[diff] [blame]	1097	rmap_item->anon_vma = vma->anon_vma;
				1098	get_anon_vma(vma->anon_vma);
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame]	1099	out:
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1100	up_read(&mm->mmap_sem);
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame]	1101	return err;
				1102	}
				1103
				1104	/*
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1105	* try_to_merge_two_pages - take two identical pages and prepare them
				1106	* to be merged into one page.
				1107	*
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1108	* This function returns the kpage if we successfully merged two identical
				1109	* pages into one ksm page, NULL otherwise.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1110	*
Hugh Dickins	80e14822	2009-12-14 17:59:29 -0800	[diff] [blame]	1111	* Note that this function upgrades page to ksm page: if one of the pages
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1112	* is already a ksm page, try_to_merge_with_ksm_page should be used.
				1113	*/
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1114	static struct page try_to_merge_two_pages(struct rmap_item rmap_item,
				1115	struct page *page,
				1116	struct rmap_item *tree_rmap_item,
				1117	struct page *tree_page)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1118	{
Hugh Dickins	80e14822	2009-12-14 17:59:29 -0800	[diff] [blame]	1119	int err;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1120
Hugh Dickins	80e14822	2009-12-14 17:59:29 -0800	[diff] [blame]	1121	err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1122	if (!err) {
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1123	err = try_to_merge_with_ksm_page(tree_rmap_item,
Hugh Dickins	80e14822	2009-12-14 17:59:29 -0800	[diff] [blame]	1124	tree_page, page);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1125	/*
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame]	1126	* If that fails, we have a ksm page with only one pte
				1127	* pointing to it: so break it.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1128	*/
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	1129	if (err)
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1130	break_cow(rmap_item);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1131	}
Hugh Dickins	80e14822	2009-12-14 17:59:29 -0800	[diff] [blame]	1132	return err ? NULL : page;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1133	}
				1134
				1135	/*
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1136	* stable_tree_search - search for page inside the stable tree
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1137	*
				1138	* This function checks if there is a page inside the stable tree
				1139	* with identical content to the page that we are scanning right now.
				1140	*
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	1141	* This function returns the stable tree node of identical content if found,
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1142	* NULL otherwise.
				1143	*/
Hugh Dickins	62b61f6	2009-12-14 17:59:33 -0800	[diff] [blame]	1144	static struct page stable_tree_search(struct page page)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1145	{
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	1146	int nid;
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	1147	struct rb_root *root;
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	1148	struct rb_node **new;
				1149	struct rb_node *parent;
				1150	struct stable_node *stable_node;
				1151	struct stable_node *page_node;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1152
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	1153	page_node = page_stable_node(page);
				1154	if (page_node && page_node->head != &migrate_nodes) {
				1155	/* ksm page forked */
Hugh Dickins	08beca4	2009-12-14 17:59:21 -0800	[diff] [blame]	1156	get_page(page);
Hugh Dickins	62b61f6	2009-12-14 17:59:33 -0800	[diff] [blame]	1157	return page;
Hugh Dickins	08beca4	2009-12-14 17:59:21 -0800	[diff] [blame]	1158	}
				1159
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	1160	nid = get_kpfn_nid(page_to_pfn(page));
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	1161	root = root_stable_tree + nid;
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	1162	again:
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	1163	new = &root->rb_node;
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	1164	parent = NULL;
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	1165
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	1166	while (*new) {
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	1167	struct page *tree_page;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1168	int ret;
				1169
Hugh Dickins	08beca4	2009-12-14 17:59:21 -0800	[diff] [blame]	1170	cond_resched();
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	1171	stable_node = rb_entry(*new, struct stable_node, node);
Hugh Dickins	8aafa6a	2013-02-22 16:35:06 -0800	[diff] [blame]	1172	tree_page = get_ksm_page(stable_node, false);
Andrea Arcangeli	f2e5ff8	2015-11-05 18:49:10 -0800	[diff] [blame]	1173	if (!tree_page) {
				1174	/*
				1175	* If we walked over a stale stable_node,
				1176	* get_ksm_page() will call rb_erase() and it
				1177	* may rebalance the tree from under us. So
				1178	* restart the search from scratch. Returning
				1179	* NULL would be safe too, but we'd generate
				1180	* false negative insertions just because some
				1181	* stable_node was stale.
				1182	*/
				1183	goto again;
				1184	}
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1185
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	1186	ret = memcmp_pages(page, tree_page);
Hugh Dickins	c8d6553	2013-02-22 16:35:10 -0800	[diff] [blame]	1187	put_page(tree_page);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1188
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	1189	parent = *new;
Hugh Dickins	c8d6553	2013-02-22 16:35:10 -0800	[diff] [blame]	1190	if (ret < 0)
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	1191	new = &parent->rb_left;
Hugh Dickins	c8d6553	2013-02-22 16:35:10 -0800	[diff] [blame]	1192	else if (ret > 0)
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	1193	new = &parent->rb_right;
Hugh Dickins	c8d6553	2013-02-22 16:35:10 -0800	[diff] [blame]	1194	else {
				1195	/*
				1196	* Lock and unlock the stable_node's page (which
				1197	* might already have been migrated) so that page
				1198	* migration is sure to notice its raised count.
				1199	* It would be more elegant to return stable_node
				1200	* than kpage, but that involves more changes.
				1201	*/
				1202	tree_page = get_ksm_page(stable_node, true);
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	1203	if (tree_page) {
Hugh Dickins	c8d6553	2013-02-22 16:35:10 -0800	[diff] [blame]	1204	unlock_page(tree_page);
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	1205	if (get_kpfn_nid(stable_node->kpfn) !=
				1206	NUMA(stable_node->nid)) {
				1207	put_page(tree_page);
				1208	goto replace;
				1209	}
				1210	return tree_page;
				1211	}
				1212	/*
				1213	* There is now a place for page_node, but the tree may
				1214	* have been rebalanced, so re-evaluate parent and new.
				1215	*/
				1216	if (page_node)
				1217	goto again;
				1218	return NULL;
Hugh Dickins	c8d6553	2013-02-22 16:35:10 -0800	[diff] [blame]	1219	}
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1220	}
				1221
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	1222	if (!page_node)
				1223	return NULL;
				1224
				1225	list_del(&page_node->list);
				1226	DO_NUMA(page_node->nid = nid);
				1227	rb_link_node(&page_node->node, parent, new);
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	1228	rb_insert_color(&page_node->node, root);
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	1229	get_page(page);
				1230	return page;
				1231
				1232	replace:
				1233	if (page_node) {
				1234	list_del(&page_node->list);
				1235	DO_NUMA(page_node->nid = nid);
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	1236	rb_replace_node(&stable_node->node, &page_node->node, root);
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	1237	get_page(page);
				1238	} else {
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	1239	rb_erase(&stable_node->node, root);
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	1240	page = NULL;
				1241	}
				1242	stable_node->head = &migrate_nodes;
				1243	list_add(&stable_node->list, stable_node->head);
				1244	return page;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1245	}
				1246
				1247	/*
Hugh Dickins	e850dcf	2013-02-22 16:35:03 -0800	[diff] [blame]	1248	* stable_tree_insert - insert stable tree node pointing to new ksm page
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1249	* into the stable tree.
				1250	*
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	1251	* This function returns the stable tree node just allocated on success,
				1252	* NULL otherwise.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1253	*/
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	1254	static struct stable_node stable_tree_insert(struct page kpage)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1255	{
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	1256	int nid;
				1257	unsigned long kpfn;
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	1258	struct rb_root *root;
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	1259	struct rb_node **new;
Andrea Arcangeli	f2e5ff8	2015-11-05 18:49:10 -0800	[diff] [blame]	1260	struct rb_node *parent;
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	1261	struct stable_node *stable_node;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1262
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	1263	kpfn = page_to_pfn(kpage);
				1264	nid = get_kpfn_nid(kpfn);
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	1265	root = root_stable_tree + nid;
Andrea Arcangeli	f2e5ff8	2015-11-05 18:49:10 -0800	[diff] [blame]	1266	again:
				1267	parent = NULL;
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	1268	new = &root->rb_node;
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	1269
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1270	while (*new) {
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	1271	struct page *tree_page;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1272	int ret;
				1273
Hugh Dickins	08beca4	2009-12-14 17:59:21 -0800	[diff] [blame]	1274	cond_resched();
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	1275	stable_node = rb_entry(*new, struct stable_node, node);
Hugh Dickins	8aafa6a	2013-02-22 16:35:06 -0800	[diff] [blame]	1276	tree_page = get_ksm_page(stable_node, false);
Andrea Arcangeli	f2e5ff8	2015-11-05 18:49:10 -0800	[diff] [blame]	1277	if (!tree_page) {
				1278	/*
				1279	* If we walked over a stale stable_node,
				1280	* get_ksm_page() will call rb_erase() and it
				1281	* may rebalance the tree from under us. So
				1282	* restart the search from scratch. Returning
				1283	* NULL would be safe too, but we'd generate
				1284	* false negative insertions just because some
				1285	* stable_node was stale.
				1286	*/
				1287	goto again;
				1288	}
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1289
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	1290	ret = memcmp_pages(kpage, tree_page);
				1291	put_page(tree_page);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1292
				1293	parent = *new;
				1294	if (ret < 0)
				1295	new = &parent->rb_left;
				1296	else if (ret > 0)
				1297	new = &parent->rb_right;
				1298	else {
				1299	/*
				1300	* It is not a bug that stable_tree_search() didn't
				1301	* find this node: because at that time our page was
				1302	* not yet write-protected, so may have changed since.
				1303	*/
				1304	return NULL;
				1305	}
				1306	}
				1307
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	1308	stable_node = alloc_stable_node();
				1309	if (!stable_node)
				1310	return NULL;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1311
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	1312	INIT_HLIST_HEAD(&stable_node->hlist);
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	1313	stable_node->kpfn = kpfn;
Hugh Dickins	08beca4	2009-12-14 17:59:21 -0800	[diff] [blame]	1314	set_page_stable_node(kpage, stable_node);
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	1315	DO_NUMA(stable_node->nid = nid);
Hugh Dickins	e850dcf	2013-02-22 16:35:03 -0800	[diff] [blame]	1316	rb_link_node(&stable_node->node, parent, new);
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	1317	rb_insert_color(&stable_node->node, root);
Hugh Dickins	08beca4	2009-12-14 17:59:21 -0800	[diff] [blame]	1318
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	1319	return stable_node;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1320	}
				1321
				1322	/*
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1323	* unstable_tree_search_insert - search for identical page,
				1324	* else insert rmap_item into the unstable tree.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1325	*
				1326	* This function searches for a page in the unstable tree identical to the
				1327	* page currently being scanned; and if no identical page is found in the
				1328	* tree, we insert rmap_item as a new object into the unstable tree.
				1329	*
				1330	* This function returns pointer to rmap_item found to be identical
				1331	* to the currently scanned page, NULL otherwise.
				1332	*
				1333	* This function does both searching and inserting, because they share
				1334	* the same walking algorithm in an rbtree.
				1335	*/
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1336	static
				1337	struct rmap_item unstable_tree_search_insert(struct rmap_item rmap_item,
				1338	struct page *page,
				1339	struct page **tree_pagep)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1340	{
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	1341	struct rb_node **new;
				1342	struct rb_root *root;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1343	struct rb_node *parent = NULL;
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	1344	int nid;
				1345
				1346	nid = get_kpfn_nid(page_to_pfn(page));
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	1347	root = root_unstable_tree + nid;
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	1348	new = &root->rb_node;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1349
				1350	while (*new) {
				1351	struct rmap_item *tree_rmap_item;
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1352	struct page *tree_page;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1353	int ret;
				1354
Hugh Dickins	d178f27	2009-11-09 15:58:23 +0000	[diff] [blame]	1355	cond_resched();
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1356	tree_rmap_item = rb_entry(*new, struct rmap_item, node);
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1357	tree_page = get_mergeable_page(tree_rmap_item);
Andrea Arcangeli	c8f95ed	2015-11-05 18:49:19 -0800	[diff] [blame]	1358	if (!tree_page)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1359	return NULL;
				1360
				1361	/*
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1362	* Don't substitute a ksm page for a forked page.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1363	*/
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1364	if (page == tree_page) {
				1365	put_page(tree_page);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1366	return NULL;
				1367	}
				1368
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1369	ret = memcmp_pages(page, tree_page);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1370
				1371	parent = *new;
				1372	if (ret < 0) {
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1373	put_page(tree_page);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1374	new = &parent->rb_left;
				1375	} else if (ret > 0) {
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1376	put_page(tree_page);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1377	new = &parent->rb_right;
Hugh Dickins	b599cbd	2013-02-22 16:36:05 -0800	[diff] [blame]	1378	} else if (!ksm_merge_across_nodes &&
				1379	page_to_nid(tree_page) != nid) {
				1380	/*
				1381	* If tree_page has been migrated to another NUMA node,
				1382	* it will be flushed out and put in the right unstable
				1383	* tree next time: only merge with it when across_nodes.
				1384	*/
				1385	put_page(tree_page);
				1386	return NULL;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1387	} else {
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1388	*tree_pagep = tree_page;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1389	return tree_rmap_item;
				1390	}
				1391	}
				1392
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	1393	rmap_item->address \|= UNSTABLE_FLAG;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1394	rmap_item->address \|= (ksm_scan.seqnr & SEQNR_MASK);
Hugh Dickins	e850dcf	2013-02-22 16:35:03 -0800	[diff] [blame]	1395	DO_NUMA(rmap_item->nid = nid);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1396	rb_link_node(&rmap_item->node, parent, new);
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	1397	rb_insert_color(&rmap_item->node, root);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1398
Hugh Dickins	473b0ce	2009-09-21 17:02:11 -0700	[diff] [blame]	1399	ksm_pages_unshared++;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1400	return NULL;
				1401	}
				1402
				1403	/*
				1404	* stable_tree_append - add another rmap_item to the linked list of
				1405	* rmap_items hanging off a given node of the stable tree, all sharing
				1406	* the same ksm page.
				1407	*/
				1408	static void stable_tree_append(struct rmap_item *rmap_item,
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	1409	struct stable_node *stable_node)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1410	{
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	1411	rmap_item->head = stable_node;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1412	rmap_item->address \|= STABLE_FLAG;
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	1413	hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
Hugh Dickins	e178dfd	2009-09-21 17:02:10 -0700	[diff] [blame]	1414
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	1415	if (rmap_item->hlist.next)
				1416	ksm_pages_sharing++;
				1417	else
				1418	ksm_pages_shared++;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1419	}
				1420
				1421	/*
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame]	1422	* cmp_and_merge_page - first see if page can be merged into the stable tree;
				1423	* if not, compare checksum to previous and if it's the same, see if page can
				1424	* be inserted into the unstable tree, or merged with a page already there and
				1425	* both transferred to the stable tree.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1426	*
				1427	* @page: the page that we are searching identical page to.
				1428	* @rmap_item: the reverse mapping into the virtual address of this page
				1429	*/
				1430	static void cmp_and_merge_page(struct page page, struct rmap_item rmap_item)
				1431	{
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1432	struct rmap_item *tree_rmap_item;
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1433	struct page *tree_page = NULL;
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	1434	struct stable_node *stable_node;
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1435	struct page *kpage;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1436	unsigned int checksum;
				1437	int err;
				1438
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	1439	stable_node = page_stable_node(page);
				1440	if (stable_node) {
				1441	if (stable_node->head != &migrate_nodes &&
				1442	get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
				1443	rb_erase(&stable_node->node,
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	1444	root_stable_tree + NUMA(stable_node->nid));
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	1445	stable_node->head = &migrate_nodes;
				1446	list_add(&stable_node->list, stable_node->head);
				1447	}
				1448	if (stable_node->head != &migrate_nodes &&
				1449	rmap_item->head == stable_node)
				1450	return;
				1451	}
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1452
				1453	/* We first start with searching the page inside the stable tree */
Hugh Dickins	62b61f6	2009-12-14 17:59:33 -0800	[diff] [blame]	1454	kpage = stable_tree_search(page);
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	1455	if (kpage == page && rmap_item->head == stable_node) {
				1456	put_page(kpage);
				1457	return;
				1458	}
				1459
				1460	remove_rmap_item_from_tree(rmap_item);
				1461
Hugh Dickins	62b61f6	2009-12-14 17:59:33 -0800	[diff] [blame]	1462	if (kpage) {
Hugh Dickins	08beca4	2009-12-14 17:59:21 -0800	[diff] [blame]	1463	err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1464	if (!err) {
				1465	/*
				1466	* The page was successfully merged:
				1467	* add its rmap_item to the stable tree.
				1468	*/
Hugh Dickins	5ad6468	2009-12-14 17:59:24 -0800	[diff] [blame]	1469	lock_page(kpage);
Hugh Dickins	62b61f6	2009-12-14 17:59:33 -0800	[diff] [blame]	1470	stable_tree_append(rmap_item, page_stable_node(kpage));
Hugh Dickins	5ad6468	2009-12-14 17:59:24 -0800	[diff] [blame]	1471	unlock_page(kpage);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1472	}
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1473	put_page(kpage);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1474	return;
				1475	}
				1476
				1477	/*
Hugh Dickins	4035c07a	2009-12-14 17:59:27 -0800	[diff] [blame]	1478	* If the hash value of the page has changed from the last time
				1479	* we calculated it, this page is changing frequently: therefore we
				1480	* don't want to insert it in the unstable tree, and we don't want
				1481	* to waste our time searching for something identical to it there.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1482	*/
				1483	checksum = calc_checksum(page);
				1484	if (rmap_item->oldchecksum != checksum) {
				1485	rmap_item->oldchecksum = checksum;
				1486	return;
				1487	}
				1488
Claudio Imbrenda	e86c59b	2017-02-24 14:55:39 -0800	[diff] [blame]	1489	/*
				1490	* Same checksum as an empty page. We attempt to merge it with the
				1491	* appropriate zero page if the user enabled this via sysfs.
				1492	*/
				1493	if (ksm_use_zero_pages && (checksum == zero_checksum)) {
				1494	struct vm_area_struct *vma;
				1495
				1496	vma = find_mergeable_vma(rmap_item->mm, rmap_item->address);
				1497	err = try_to_merge_one_page(vma, page,
				1498	ZERO_PAGE(rmap_item->address));
				1499	/*
				1500	* In case of failure, the page was not really empty, so we
				1501	* need to continue. Otherwise we're done.
				1502	*/
				1503	if (!err)
				1504	return;
				1505	}
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1506	tree_rmap_item =
				1507	unstable_tree_search_insert(rmap_item, page, &tree_page);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1508	if (tree_rmap_item) {
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1509	kpage = try_to_merge_two_pages(rmap_item, page,
				1510	tree_rmap_item, tree_page);
				1511	put_page(tree_page);
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1512	if (kpage) {
Hugh Dickins	bc56620	2013-02-22 16:36:06 -0800	[diff] [blame]	1513	/*
				1514	* The pages were successfully merged: insert new
				1515	* node in the stable tree and add both rmap_items.
				1516	*/
Hugh Dickins	5ad6468	2009-12-14 17:59:24 -0800	[diff] [blame]	1517	lock_page(kpage);
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	1518	stable_node = stable_tree_insert(kpage);
				1519	if (stable_node) {
				1520	stable_tree_append(tree_rmap_item, stable_node);
				1521	stable_tree_append(rmap_item, stable_node);
				1522	}
Hugh Dickins	5ad6468	2009-12-14 17:59:24 -0800	[diff] [blame]	1523	unlock_page(kpage);
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	1524
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1525	/*
				1526	* If we fail to insert the page into the stable tree,
				1527	* we will have 2 virtual addresses that are pointing
				1528	* to a ksm page left outside the stable tree,
				1529	* in which case we need to break_cow on both.
				1530	*/
Hugh Dickins	7b6ba2c	2009-12-14 17:59:20 -0800	[diff] [blame]	1531	if (!stable_node) {
Hugh Dickins	8dd3557	2009-12-14 17:59:18 -0800	[diff] [blame]	1532	break_cow(tree_rmap_item);
				1533	break_cow(rmap_item);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1534	}
				1535	}
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1536	}
				1537	}
				1538
				1539	static struct rmap_item get_next_rmap_item(struct mm_slot mm_slot,
Hugh Dickins	6514d51	2009-12-14 17:59:19 -0800	[diff] [blame]	1540	struct rmap_item **rmap_list,
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1541	unsigned long addr)
				1542	{
				1543	struct rmap_item *rmap_item;
				1544
Hugh Dickins	6514d51	2009-12-14 17:59:19 -0800	[diff] [blame]	1545	while (*rmap_list) {
				1546	rmap_item = *rmap_list;
Hugh Dickins	93d1771	2009-12-14 17:59:16 -0800	[diff] [blame]	1547	if ((rmap_item->address & PAGE_MASK) == addr)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1548	return rmap_item;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1549	if (rmap_item->address > addr)
				1550	break;
Hugh Dickins	6514d51	2009-12-14 17:59:19 -0800	[diff] [blame]	1551	*rmap_list = rmap_item->rmap_list;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1552	remove_rmap_item_from_tree(rmap_item);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1553	free_rmap_item(rmap_item);
				1554	}
				1555
				1556	rmap_item = alloc_rmap_item();
				1557	if (rmap_item) {
				1558	/* It has already been zeroed */
				1559	rmap_item->mm = mm_slot->mm;
				1560	rmap_item->address = addr;
Hugh Dickins	6514d51	2009-12-14 17:59:19 -0800	[diff] [blame]	1561	rmap_item->rmap_list = *rmap_list;
				1562	*rmap_list = rmap_item;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1563	}
				1564	return rmap_item;
				1565	}
				1566
				1567	static struct rmap_item scan_get_next_rmap_item(struct page *page)
				1568	{
				1569	struct mm_struct *mm;
				1570	struct mm_slot *slot;
				1571	struct vm_area_struct *vma;
				1572	struct rmap_item *rmap_item;
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	1573	int nid;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1574
				1575	if (list_empty(&ksm_mm_head.mm_list))
				1576	return NULL;
				1577
				1578	slot = ksm_scan.mm_slot;
				1579	if (slot == &ksm_mm_head) {
Hugh Dickins	2919bfd	2011-01-13 15:47:29 -0800	[diff] [blame]	1580	/*
				1581	* A number of pages can hang around indefinitely on per-cpu
				1582	* pagevecs, raised page count preventing write_protect_page
				1583	* from merging them. Though it doesn't really matter much,
				1584	* it is puzzling to see some stuck in pages_volatile until
				1585	* other activity jostles them out, and they also prevented
				1586	* LTP's KSM test from succeeding deterministically; so drain
				1587	* them here (here rather than on entry to ksm_do_scan(),
				1588	* so we don't IPI too often when pages_to_scan is set low).
				1589	*/
				1590	lru_add_drain_all();
				1591
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	1592	/*
				1593	* Whereas stale stable_nodes on the stable_tree itself
				1594	* get pruned in the regular course of stable_tree_search(),
				1595	* those moved out to the migrate_nodes list can accumulate:
				1596	* so prune them once before each full scan.
				1597	*/
				1598	if (!ksm_merge_across_nodes) {
Geliang Tang	0364041	2016-01-14 15:20:54 -0800	[diff] [blame]	1599	struct stable_node stable_node, next;
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	1600	struct page *page;
				1601
Geliang Tang	0364041	2016-01-14 15:20:54 -0800	[diff] [blame]	1602	list_for_each_entry_safe(stable_node, next,
				1603	&migrate_nodes, list) {
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	1604	page = get_ksm_page(stable_node, false);
				1605	if (page)
				1606	put_page(page);
				1607	cond_resched();
				1608	}
				1609	}
				1610
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	1611	for (nid = 0; nid < ksm_nr_node_ids; nid++)
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	1612	root_unstable_tree[nid] = RB_ROOT;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1613
				1614	spin_lock(&ksm_mmlist_lock);
				1615	slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
				1616	ksm_scan.mm_slot = slot;
				1617	spin_unlock(&ksm_mmlist_lock);
Hugh Dickins	2b47261	2011-06-15 15:08:58 -0700	[diff] [blame]	1618	/*
				1619	* Although we tested list_empty() above, a racing __ksm_exit
				1620	* of the last mm on the list may have removed it since then.
				1621	*/
				1622	if (slot == &ksm_mm_head)
				1623	return NULL;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1624	next_mm:
				1625	ksm_scan.address = 0;
Hugh Dickins	6514d51	2009-12-14 17:59:19 -0800	[diff] [blame]	1626	ksm_scan.rmap_list = &slot->rmap_list;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1627	}
				1628
				1629	mm = slot->mm;
				1630	down_read(&mm->mmap_sem);
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1631	if (ksm_test_exit(mm))
				1632	vma = NULL;
				1633	else
				1634	vma = find_vma(mm, ksm_scan.address);
				1635
				1636	for (; vma; vma = vma->vm_next) {
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1637	if (!(vma->vm_flags & VM_MERGEABLE))
				1638	continue;
				1639	if (ksm_scan.address < vma->vm_start)
				1640	ksm_scan.address = vma->vm_start;
				1641	if (!vma->anon_vma)
				1642	ksm_scan.address = vma->vm_end;
				1643
				1644	while (ksm_scan.address < vma->vm_end) {
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1645	if (ksm_test_exit(mm))
				1646	break;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1647	*page = follow_page(vma, ksm_scan.address, FOLL_GET);
Andrea Arcangeli	21ae5b0	2011-01-13 15:47:00 -0800	[diff] [blame]	1648	if (IS_ERR_OR_NULL(*page)) {
				1649	ksm_scan.address += PAGE_SIZE;
				1650	cond_resched();
				1651	continue;
				1652	}
Kirill A. Shutemov	f765f54	2016-01-15 16:53:03 -0800	[diff] [blame]	1653	if (PageAnon(*page)) {
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1654	flush_anon_page(vma, *page, ksm_scan.address);
				1655	flush_dcache_page(*page);
				1656	rmap_item = get_next_rmap_item(slot,
Hugh Dickins	6514d51	2009-12-14 17:59:19 -0800	[diff] [blame]	1657	ksm_scan.rmap_list, ksm_scan.address);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1658	if (rmap_item) {
Hugh Dickins	6514d51	2009-12-14 17:59:19 -0800	[diff] [blame]	1659	ksm_scan.rmap_list =
				1660	&rmap_item->rmap_list;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1661	ksm_scan.address += PAGE_SIZE;
				1662	} else
				1663	put_page(*page);
				1664	up_read(&mm->mmap_sem);
				1665	return rmap_item;
				1666	}
Andrea Arcangeli	21ae5b0	2011-01-13 15:47:00 -0800	[diff] [blame]	1667	put_page(*page);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1668	ksm_scan.address += PAGE_SIZE;
				1669	cond_resched();
				1670	}
				1671	}
				1672
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1673	if (ksm_test_exit(mm)) {
				1674	ksm_scan.address = 0;
Hugh Dickins	6514d51	2009-12-14 17:59:19 -0800	[diff] [blame]	1675	ksm_scan.rmap_list = &slot->rmap_list;
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1676	}
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1677	/*
				1678	* Nuke all the rmap_items that are above this current rmap:
				1679	* because there were no VM_MERGEABLE vmas with such addresses.
				1680	*/
Hugh Dickins	6514d51	2009-12-14 17:59:19 -0800	[diff] [blame]	1681	remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1682
				1683	spin_lock(&ksm_mmlist_lock);
Hugh Dickins	cd551f9	2009-09-21 17:02:17 -0700	[diff] [blame]	1684	ksm_scan.mm_slot = list_entry(slot->mm_list.next,
				1685	struct mm_slot, mm_list);
				1686	if (ksm_scan.address == 0) {
				1687	/*
				1688	* We've completed a full scan of all vmas, holding mmap_sem
				1689	* throughout, and found no VM_MERGEABLE: so do the same as
				1690	* __ksm_exit does to remove this mm from all our lists now.
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1691	* This applies either when cleaning up after __ksm_exit
				1692	* (but beware: we can reach here even before __ksm_exit),
				1693	* or when all VM_MERGEABLE areas have been unmapped (and
				1694	* mmap_sem then protects against race with MADV_MERGEABLE).
Hugh Dickins	cd551f9	2009-09-21 17:02:17 -0700	[diff] [blame]	1695	*/
Sasha Levin	4ca3a69	2013-02-22 16:32:28 -0800	[diff] [blame]	1696	hash_del(&slot->link);
Hugh Dickins	cd551f9	2009-09-21 17:02:17 -0700	[diff] [blame]	1697	list_del(&slot->mm_list);
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1698	spin_unlock(&ksm_mmlist_lock);
				1699
Hugh Dickins	cd551f9	2009-09-21 17:02:17 -0700	[diff] [blame]	1700	free_mm_slot(slot);
				1701	clear_bit(MMF_VM_MERGEABLE, &mm->flags);
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1702	up_read(&mm->mmap_sem);
				1703	mmdrop(mm);
				1704	} else {
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1705	up_read(&mm->mmap_sem);
Zhou Chengming	7496fea	2016-05-12 15:42:21 -0700	[diff] [blame]	1706	/*
				1707	* up_read(&mm->mmap_sem) first because after
				1708	* spin_unlock(&ksm_mmlist_lock) run, the "mm" may
				1709	* already have been freed under us by __ksm_exit()
				1710	* because the "mm_slot" is still hashed and
				1711	* ksm_scan.mm_slot doesn't point to it anymore.
				1712	*/
				1713	spin_unlock(&ksm_mmlist_lock);
Hugh Dickins	cd551f9	2009-09-21 17:02:17 -0700	[diff] [blame]	1714	}
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1715
				1716	/* Repeat until we've completed scanning the whole list */
Hugh Dickins	cd551f9	2009-09-21 17:02:17 -0700	[diff] [blame]	1717	slot = ksm_scan.mm_slot;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1718	if (slot != &ksm_mm_head)
				1719	goto next_mm;
				1720
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1721	ksm_scan.seqnr++;
				1722	return NULL;
				1723	}
				1724
				1725	/**
				1726	* ksm_do_scan - the ksm scanner main worker function.
				1727	* @scan_npages - number of pages we want to scan before we return.
				1728	*/
				1729	static void ksm_do_scan(unsigned int scan_npages)
				1730	{
				1731	struct rmap_item *rmap_item;
Dan Carpenter	22eccdd	2010-04-23 13:18:10 -0400	[diff] [blame]	1732	struct page *uninitialized_var(page);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1733
Andrea Arcangeli	878aee7	2011-01-13 15:47:10 -0800	[diff] [blame]	1734	while (scan_npages-- && likely(!freezing(current))) {
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1735	cond_resched();
				1736	rmap_item = scan_get_next_rmap_item(&page);
				1737	if (!rmap_item)
				1738	return;
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	1739	cmp_and_merge_page(page, rmap_item);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1740	put_page(page);
				1741	}
				1742	}
				1743
Hugh Dickins	6e158384	2009-09-21 17:02:14 -0700	[diff] [blame]	1744	static int ksmd_should_run(void)
				1745	{
				1746	return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
				1747	}
				1748
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1749	static int ksm_scan_thread(void *nothing)
				1750	{
Andrea Arcangeli	878aee7	2011-01-13 15:47:10 -0800	[diff] [blame]	1751	set_freezable();
Izik Eidus	339aa62	2009-09-21 17:02:07 -0700	[diff] [blame]	1752	set_user_nice(current, 5);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1753
				1754	while (!kthread_should_stop()) {
Hugh Dickins	6e158384	2009-09-21 17:02:14 -0700	[diff] [blame]	1755	mutex_lock(&ksm_thread_mutex);
Hugh Dickins	ef4d43a	2013-02-22 16:35:16 -0800	[diff] [blame]	1756	wait_while_offlining();
Hugh Dickins	6e158384	2009-09-21 17:02:14 -0700	[diff] [blame]	1757	if (ksmd_should_run())
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1758	ksm_do_scan(ksm_thread_pages_to_scan);
Hugh Dickins	6e158384	2009-09-21 17:02:14 -0700	[diff] [blame]	1759	mutex_unlock(&ksm_thread_mutex);
				1760
Andrea Arcangeli	878aee7	2011-01-13 15:47:10 -0800	[diff] [blame]	1761	try_to_freeze();
				1762
Hugh Dickins	6e158384	2009-09-21 17:02:14 -0700	[diff] [blame]	1763	if (ksmd_should_run()) {
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1764	schedule_timeout_interruptible(
				1765	msecs_to_jiffies(ksm_thread_sleep_millisecs));
				1766	} else {
Andrea Arcangeli	878aee7	2011-01-13 15:47:10 -0800	[diff] [blame]	1767	wait_event_freezable(ksm_thread_wait,
Hugh Dickins	6e158384	2009-09-21 17:02:14 -0700	[diff] [blame]	1768	ksmd_should_run() \|\| kthread_should_stop());
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1769	}
				1770	}
				1771	return 0;
				1772	}
				1773
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1774	int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
				1775	unsigned long end, int advice, unsigned long *vm_flags)
				1776	{
				1777	struct mm_struct *mm = vma->vm_mm;
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	1778	int err;
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1779
				1780	switch (advice) {
				1781	case MADV_MERGEABLE:
				1782	/*
				1783	* Be somewhat over-protective for now!
				1784	*/
				1785	if (*vm_flags & (VM_MERGEABLE \| VM_SHARED \| VM_MAYSHARE \|
				1786	VM_PFNMAP \| VM_IO \| VM_DONTEXPAND \|
Kirill A. Shutemov	0661a33	2015-02-10 14:10:04 -0800	[diff] [blame]	1787	VM_HUGETLB \| VM_MIXEDMAP))
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1788	return 0; /* just ignore the advice */
				1789
Konstantin Khlebnikov	cc2383e	2012-10-08 16:28:37 -0700	[diff] [blame]	1790	#ifdef VM_SAO
				1791	if (*vm_flags & VM_SAO)
				1792	return 0;
				1793	#endif
				1794
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	1795	if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
				1796	err = __ksm_enter(mm);
				1797	if (err)
				1798	return err;
				1799	}
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1800
				1801	*vm_flags \|= VM_MERGEABLE;
				1802	break;
				1803
				1804	case MADV_UNMERGEABLE:
				1805	if (!(*vm_flags & VM_MERGEABLE))
				1806	return 0; /* just ignore the advice */
				1807
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	1808	if (vma->anon_vma) {
				1809	err = unmerge_ksm_pages(vma, start, end);
				1810	if (err)
				1811	return err;
				1812	}
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1813
				1814	*vm_flags &= ~VM_MERGEABLE;
				1815	break;
				1816	}
				1817
				1818	return 0;
				1819	}
				1820
				1821	int __ksm_enter(struct mm_struct *mm)
				1822	{
Hugh Dickins	6e158384	2009-09-21 17:02:14 -0700	[diff] [blame]	1823	struct mm_slot *mm_slot;
				1824	int needs_wakeup;
				1825
				1826	mm_slot = alloc_mm_slot();
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1827	if (!mm_slot)
				1828	return -ENOMEM;
				1829
Hugh Dickins	6e158384	2009-09-21 17:02:14 -0700	[diff] [blame]	1830	/* Check ksm_run too? Would need tighter locking */
				1831	needs_wakeup = list_empty(&ksm_mm_head.mm_list);
				1832
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1833	spin_lock(&ksm_mmlist_lock);
				1834	insert_to_mm_slots_hash(mm, mm_slot);
				1835	/*
Hugh Dickins	cbf86cf	2013-02-22 16:35:08 -0800	[diff] [blame]	1836	* When KSM_RUN_MERGE (or KSM_RUN_STOP),
				1837	* insert just behind the scanning cursor, to let the area settle
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1838	* down a little; when fork is followed by immediate exec, we don't
				1839	* want ksmd to waste time setting up and tearing down an rmap_list.
Hugh Dickins	cbf86cf	2013-02-22 16:35:08 -0800	[diff] [blame]	1840	*
				1841	* But when KSM_RUN_UNMERGE, it's important to insert ahead of its
				1842	* scanning cursor, otherwise KSM pages in newly forked mms will be
				1843	* missed: then we might as well insert at the end of the list.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1844	*/
Hugh Dickins	cbf86cf	2013-02-22 16:35:08 -0800	[diff] [blame]	1845	if (ksm_run & KSM_RUN_UNMERGE)
				1846	list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
				1847	else
				1848	list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1849	spin_unlock(&ksm_mmlist_lock);
				1850
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1851	set_bit(MMF_VM_MERGEABLE, &mm->flags);
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1852	atomic_inc(&mm->mm_count);
Hugh Dickins	6e158384	2009-09-21 17:02:14 -0700	[diff] [blame]	1853
				1854	if (needs_wakeup)
				1855	wake_up_interruptible(&ksm_thread_wait);
				1856
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1857	return 0;
				1858	}
				1859
Andrea Arcangeli	1c2fb7a	2009-09-21 17:02:22 -0700	[diff] [blame]	1860	void __ksm_exit(struct mm_struct *mm)
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1861	{
Hugh Dickins	cd551f9	2009-09-21 17:02:17 -0700	[diff] [blame]	1862	struct mm_slot *mm_slot;
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1863	int easy_to_free = 0;
Hugh Dickins	cd551f9	2009-09-21 17:02:17 -0700	[diff] [blame]	1864
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1865	/*
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1866	* This process is exiting: if it's straightforward (as is the
				1867	* case when ksmd was never running), free mm_slot immediately.
				1868	* But if it's at the cursor or has rmap_items linked to it, use
				1869	* mmap_sem to synchronize with any break_cows before pagetables
				1870	* are freed, and leave the mm_slot on the list for ksmd to free.
				1871	* Beware: ksm may already have noticed it exiting and freed the slot.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1872	*/
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1873
Hugh Dickins	cd551f9	2009-09-21 17:02:17 -0700	[diff] [blame]	1874	spin_lock(&ksm_mmlist_lock);
				1875	mm_slot = get_mm_slot(mm);
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1876	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
Hugh Dickins	6514d51	2009-12-14 17:59:19 -0800	[diff] [blame]	1877	if (!mm_slot->rmap_list) {
Sasha Levin	4ca3a69	2013-02-22 16:32:28 -0800	[diff] [blame]	1878	hash_del(&mm_slot->link);
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1879	list_del(&mm_slot->mm_list);
				1880	easy_to_free = 1;
				1881	} else {
				1882	list_move(&mm_slot->mm_list,
				1883	&ksm_scan.mm_slot->mm_list);
				1884	}
Hugh Dickins	cd551f9	2009-09-21 17:02:17 -0700	[diff] [blame]	1885	}
Hugh Dickins	cd551f9	2009-09-21 17:02:17 -0700	[diff] [blame]	1886	spin_unlock(&ksm_mmlist_lock);
				1887
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1888	if (easy_to_free) {
				1889	free_mm_slot(mm_slot);
				1890	clear_bit(MMF_VM_MERGEABLE, &mm->flags);
				1891	mmdrop(mm);
				1892	} else if (mm_slot) {
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1893	down_write(&mm->mmap_sem);
				1894	up_write(&mm->mmap_sem);
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1895	}
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1896	}
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1897
Hugh Dickins	cbf86cf	2013-02-22 16:35:08 -0800	[diff] [blame]	1898	struct page ksm_might_need_to_copy(struct page page,
Hugh Dickins	5ad6468	2009-12-14 17:59:24 -0800	[diff] [blame]	1899	struct vm_area_struct *vma, unsigned long address)
				1900	{
Hugh Dickins	cbf86cf	2013-02-22 16:35:08 -0800	[diff] [blame]	1901	struct anon_vma *anon_vma = page_anon_vma(page);
Hugh Dickins	5ad6468	2009-12-14 17:59:24 -0800	[diff] [blame]	1902	struct page *new_page;
				1903
Hugh Dickins	cbf86cf	2013-02-22 16:35:08 -0800	[diff] [blame]	1904	if (PageKsm(page)) {
				1905	if (page_stable_node(page) &&
				1906	!(ksm_run & KSM_RUN_UNMERGE))
				1907	return page; /* no need to copy it */
				1908	} else if (!anon_vma) {
				1909	return page; /* no need to copy it */
				1910	} else if (anon_vma->root == vma->anon_vma->root &&
				1911	page->index == linear_page_index(vma, address)) {
				1912	return page; /* still no need to copy it */
				1913	}
				1914	if (!PageUptodate(page))
				1915	return page; /* let do_swap_page report the error */
				1916
Hugh Dickins	5ad6468	2009-12-14 17:59:24 -0800	[diff] [blame]	1917	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
				1918	if (new_page) {
				1919	copy_user_highpage(new_page, page, address, vma);
				1920
				1921	SetPageDirty(new_page);
				1922	__SetPageUptodate(new_page);
Kirill A. Shutemov	48c935a	2016-01-15 16:51:24 -0800	[diff] [blame]	1923	__SetPageLocked(new_page);
Hugh Dickins	5ad6468	2009-12-14 17:59:24 -0800	[diff] [blame]	1924	}
				1925
Hugh Dickins	5ad6468	2009-12-14 17:59:24 -0800	[diff] [blame]	1926	return new_page;
				1927	}
				1928
Joonsoo Kim	051ac83	2014-01-21 15:49:48 -0800	[diff] [blame]	1929	int rmap_walk_ksm(struct page page, struct rmap_walk_control rwc)
Hugh Dickins	e9995ef	2009-12-14 17:59:31 -0800	[diff] [blame]	1930	{
				1931	struct stable_node *stable_node;
Hugh Dickins	e9995ef	2009-12-14 17:59:31 -0800	[diff] [blame]	1932	struct rmap_item *rmap_item;
				1933	int ret = SWAP_AGAIN;
				1934	int search_new_forks = 0;
				1935
Sasha Levin	309381fea	2014-01-23 15:52:54 -0800	[diff] [blame]	1936	VM_BUG_ON_PAGE(!PageKsm(page), page);
Joonsoo Kim	9f32624	2014-01-21 15:49:53 -0800	[diff] [blame]	1937
				1938	/*
				1939	* Rely on the page lock to protect against concurrent modifications
				1940	* to that page's node of the stable tree.
				1941	*/
Sasha Levin	309381fea	2014-01-23 15:52:54 -0800	[diff] [blame]	1942	VM_BUG_ON_PAGE(!PageLocked(page), page);
Hugh Dickins	e9995ef	2009-12-14 17:59:31 -0800	[diff] [blame]	1943
				1944	stable_node = page_stable_node(page);
				1945	if (!stable_node)
				1946	return ret;
				1947	again:
Sasha Levin	b67bfe0	2013-02-27 17:06:00 -0800	[diff] [blame]	1948	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
Hugh Dickins	e9995ef	2009-12-14 17:59:31 -0800	[diff] [blame]	1949	struct anon_vma *anon_vma = rmap_item->anon_vma;
Rik van Riel	5beb493	2010-03-05 13:42:07 -0800	[diff] [blame]	1950	struct anon_vma_chain *vmac;
Hugh Dickins	e9995ef	2009-12-14 17:59:31 -0800	[diff] [blame]	1951	struct vm_area_struct *vma;
				1952
Andrea Arcangeli	ad12695	2015-11-05 18:49:07 -0800	[diff] [blame]	1953	cond_resched();
Hugh Dickins	b6b19f2	2012-12-19 17:44:29 -0800	[diff] [blame]	1954	anon_vma_lock_read(anon_vma);
Michel Lespinasse	bf181b9	2012-10-08 16:31:39 -0700	[diff] [blame]	1955	anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
				1956	0, ULONG_MAX) {
Andrea Arcangeli	ad12695	2015-11-05 18:49:07 -0800	[diff] [blame]	1957	cond_resched();
Rik van Riel	5beb493	2010-03-05 13:42:07 -0800	[diff] [blame]	1958	vma = vmac->vma;
Hugh Dickins	e9995ef	2009-12-14 17:59:31 -0800	[diff] [blame]	1959	if (rmap_item->address < vma->vm_start \|\|
				1960	rmap_item->address >= vma->vm_end)
				1961	continue;
				1962	/*
				1963	* Initially we examine only the vma which covers this
				1964	* rmap_item; but later, if there is still work to do,
				1965	* we examine covering vmas in other mms: in case they
				1966	* were forked from the original since ksmd passed.
				1967	*/
				1968	if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
				1969	continue;
				1970
Joonsoo Kim	0dd1c7b	2014-01-21 15:49:49 -0800	[diff] [blame]	1971	if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
				1972	continue;
				1973
Joonsoo Kim	051ac83	2014-01-21 15:49:48 -0800	[diff] [blame]	1974	ret = rwc->rmap_one(page, vma,
				1975	rmap_item->address, rwc->arg);
Hugh Dickins	e9995ef	2009-12-14 17:59:31 -0800	[diff] [blame]	1976	if (ret != SWAP_AGAIN) {
Hugh Dickins	b6b19f2	2012-12-19 17:44:29 -0800	[diff] [blame]	1977	anon_vma_unlock_read(anon_vma);
Hugh Dickins	e9995ef	2009-12-14 17:59:31 -0800	[diff] [blame]	1978	goto out;
				1979	}
Joonsoo Kim	0dd1c7b	2014-01-21 15:49:49 -0800	[diff] [blame]	1980	if (rwc->done && rwc->done(page)) {
				1981	anon_vma_unlock_read(anon_vma);
				1982	goto out;
				1983	}
Hugh Dickins	e9995ef	2009-12-14 17:59:31 -0800	[diff] [blame]	1984	}
Hugh Dickins	b6b19f2	2012-12-19 17:44:29 -0800	[diff] [blame]	1985	anon_vma_unlock_read(anon_vma);
Hugh Dickins	e9995ef	2009-12-14 17:59:31 -0800	[diff] [blame]	1986	}
				1987	if (!search_new_forks++)
				1988	goto again;
				1989	out:
				1990	return ret;
				1991	}
				1992
Joonsoo Kim	5262950	2014-01-21 15:49:50 -0800	[diff] [blame]	1993	#ifdef CONFIG_MIGRATION
Hugh Dickins	e9995ef	2009-12-14 17:59:31 -0800	[diff] [blame]	1994	void ksm_migrate_page(struct page newpage, struct page oldpage)
				1995	{
				1996	struct stable_node *stable_node;
				1997
Sasha Levin	309381fea	2014-01-23 15:52:54 -0800	[diff] [blame]	1998	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
				1999	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
				2000	VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
Hugh Dickins	e9995ef	2009-12-14 17:59:31 -0800	[diff] [blame]	2001
				2002	stable_node = page_stable_node(newpage);
				2003	if (stable_node) {
Sasha Levin	309381fea	2014-01-23 15:52:54 -0800	[diff] [blame]	2004	VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
Hugh Dickins	62b61f6	2009-12-14 17:59:33 -0800	[diff] [blame]	2005	stable_node->kpfn = page_to_pfn(newpage);
Hugh Dickins	c8d6553	2013-02-22 16:35:10 -0800	[diff] [blame]	2006	/*
				2007	* newpage->mapping was set in advance; now we need smp_wmb()
				2008	* to make sure that the new stable_node->kpfn is visible
				2009	* to get_ksm_page() before it can see that oldpage->mapping
				2010	* has gone stale (or that PageSwapCache has been cleared).
				2011	*/
				2012	smp_wmb();
				2013	set_page_stable_node(oldpage, NULL);
Hugh Dickins	e9995ef	2009-12-14 17:59:31 -0800	[diff] [blame]	2014	}
				2015	}
				2016	#endif /* CONFIG_MIGRATION */
				2017
Hugh Dickins	62b61f6	2009-12-14 17:59:33 -0800	[diff] [blame]	2018	#ifdef CONFIG_MEMORY_HOTREMOVE
Hugh Dickins	ef4d43a	2013-02-22 16:35:16 -0800	[diff] [blame]	2019	static void wait_while_offlining(void)
				2020	{
				2021	while (ksm_run & KSM_RUN_OFFLINE) {
				2022	mutex_unlock(&ksm_thread_mutex);
				2023	wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
NeilBrown	7431620	2014-07-07 15:16:04 +1000	[diff] [blame]	2024	TASK_UNINTERRUPTIBLE);
Hugh Dickins	ef4d43a	2013-02-22 16:35:16 -0800	[diff] [blame]	2025	mutex_lock(&ksm_thread_mutex);
				2026	}
				2027	}
				2028
Hugh Dickins	ee0ea59	2013-02-22 16:35:05 -0800	[diff] [blame]	2029	static void ksm_check_stable_tree(unsigned long start_pfn,
				2030	unsigned long end_pfn)
Hugh Dickins	62b61f6	2009-12-14 17:59:33 -0800	[diff] [blame]	2031	{
Geliang Tang	0364041	2016-01-14 15:20:54 -0800	[diff] [blame]	2032	struct stable_node stable_node, next;
Hugh Dickins	62b61f6	2009-12-14 17:59:33 -0800	[diff] [blame]	2033	struct rb_node *node;
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	2034	int nid;
Hugh Dickins	62b61f6	2009-12-14 17:59:33 -0800	[diff] [blame]	2035
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	2036	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
				2037	node = rb_first(root_stable_tree + nid);
Hugh Dickins	ee0ea59	2013-02-22 16:35:05 -0800	[diff] [blame]	2038	while (node) {
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	2039	stable_node = rb_entry(node, struct stable_node, node);
				2040	if (stable_node->kpfn >= start_pfn &&
Hugh Dickins	ee0ea59	2013-02-22 16:35:05 -0800	[diff] [blame]	2041	stable_node->kpfn < end_pfn) {
				2042	/*
				2043	* Don't get_ksm_page, page has already gone:
				2044	* which is why we keep kpfn instead of page*
				2045	*/
				2046	remove_node_from_stable_tree(stable_node);
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	2047	node = rb_first(root_stable_tree + nid);
Hugh Dickins	ee0ea59	2013-02-22 16:35:05 -0800	[diff] [blame]	2048	} else
				2049	node = rb_next(node);
				2050	cond_resched();
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	2051	}
Hugh Dickins	ee0ea59	2013-02-22 16:35:05 -0800	[diff] [blame]	2052	}
Geliang Tang	0364041	2016-01-14 15:20:54 -0800	[diff] [blame]	2053	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
Hugh Dickins	4146d2d	2013-02-22 16:35:11 -0800	[diff] [blame]	2054	if (stable_node->kpfn >= start_pfn &&
				2055	stable_node->kpfn < end_pfn)
				2056	remove_node_from_stable_tree(stable_node);
				2057	cond_resched();
				2058	}
Hugh Dickins	62b61f6	2009-12-14 17:59:33 -0800	[diff] [blame]	2059	}
				2060
				2061	static int ksm_memory_callback(struct notifier_block *self,
				2062	unsigned long action, void *arg)
				2063	{
				2064	struct memory_notify *mn = arg;
Hugh Dickins	62b61f6	2009-12-14 17:59:33 -0800	[diff] [blame]	2065
				2066	switch (action) {
				2067	case MEM_GOING_OFFLINE:
				2068	/*
Hugh Dickins	ef4d43a	2013-02-22 16:35:16 -0800	[diff] [blame]	2069	* Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
				2070	* and remove_all_stable_nodes() while memory is going offline:
				2071	* it is unsafe for them to touch the stable tree at this time.
				2072	* But unmerge_ksm_pages(), rmap lookups and other entry points
				2073	* which do not need the ksm_thread_mutex are all safe.
Hugh Dickins	62b61f6	2009-12-14 17:59:33 -0800	[diff] [blame]	2074	*/
Hugh Dickins	ef4d43a	2013-02-22 16:35:16 -0800	[diff] [blame]	2075	mutex_lock(&ksm_thread_mutex);
				2076	ksm_run \|= KSM_RUN_OFFLINE;
				2077	mutex_unlock(&ksm_thread_mutex);
Hugh Dickins	62b61f6	2009-12-14 17:59:33 -0800	[diff] [blame]	2078	break;
				2079
				2080	case MEM_OFFLINE:
				2081	/*
				2082	* Most of the work is done by page migration; but there might
				2083	* be a few stable_nodes left over, still pointing to struct
Hugh Dickins	ee0ea59	2013-02-22 16:35:05 -0800	[diff] [blame]	2084	* pages which have been offlined: prune those from the tree,
				2085	* otherwise get_ksm_page() might later try to access a
				2086	* non-existent struct page.
Hugh Dickins	62b61f6	2009-12-14 17:59:33 -0800	[diff] [blame]	2087	*/
Hugh Dickins	ee0ea59	2013-02-22 16:35:05 -0800	[diff] [blame]	2088	ksm_check_stable_tree(mn->start_pfn,
				2089	mn->start_pfn + mn->nr_pages);
Hugh Dickins	62b61f6	2009-12-14 17:59:33 -0800	[diff] [blame]	2090	/* fallthrough */
				2091
				2092	case MEM_CANCEL_OFFLINE:
Hugh Dickins	ef4d43a	2013-02-22 16:35:16 -0800	[diff] [blame]	2093	mutex_lock(&ksm_thread_mutex);
				2094	ksm_run &= ~KSM_RUN_OFFLINE;
Hugh Dickins	62b61f6	2009-12-14 17:59:33 -0800	[diff] [blame]	2095	mutex_unlock(&ksm_thread_mutex);
Hugh Dickins	ef4d43a	2013-02-22 16:35:16 -0800	[diff] [blame]	2096
				2097	smp_mb(); /* wake_up_bit advises this */
				2098	wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
Hugh Dickins	62b61f6	2009-12-14 17:59:33 -0800	[diff] [blame]	2099	break;
				2100	}
				2101	return NOTIFY_OK;
				2102	}
Hugh Dickins	ef4d43a	2013-02-22 16:35:16 -0800	[diff] [blame]	2103	#else
				2104	static void wait_while_offlining(void)
				2105	{
				2106	}
Hugh Dickins	62b61f6	2009-12-14 17:59:33 -0800	[diff] [blame]	2107	#endif /* CONFIG_MEMORY_HOTREMOVE */
				2108
Hugh Dickins	2ffd867	2009-09-21 17:02:23 -0700	[diff] [blame]	2109	#ifdef CONFIG_SYSFS
				2110	/*
				2111	* This all compiles without CONFIG_SYSFS, but is a waste of space.
				2112	*/
				2113
/*
 * KSM_ATTR_RO(name): define name##_attr via __ATTR_RO(name)
 * (presumably wired to name##_show -- confirm against __ATTR_RO).
 */
#define KSM_ATTR_RO(_name)						\
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

/*
 * KSM_ATTR(name): define name##_attr with mode 0644, using name##_show
 * and name##_store as its handlers.
 */
#define KSM_ATTR(_name)							\
	static struct kobj_attribute _name##_attr =			\
		__ATTR(_name, 0644, _name##_show, _name##_store)
				2119
				2120	static ssize_t sleep_millisecs_show(struct kobject *kobj,
				2121	struct kobj_attribute attr, char buf)
				2122	{
				2123	return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
				2124	}
				2125
				2126	static ssize_t sleep_millisecs_store(struct kobject *kobj,
				2127	struct kobj_attribute *attr,
				2128	const char *buf, size_t count)
				2129	{
				2130	unsigned long msecs;
				2131	int err;
				2132
Jingoo Han	3dbb95f	2013-09-11 14:20:25 -0700	[diff] [blame]	2133	err = kstrtoul(buf, 10, &msecs);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	2134	if (err \|\| msecs > UINT_MAX)
				2135	return -EINVAL;
				2136
				2137	ksm_thread_sleep_millisecs = msecs;
				2138
				2139	return count;
				2140	}
				2141	KSM_ATTR(sleep_millisecs);
				2142
				2143	static ssize_t pages_to_scan_show(struct kobject *kobj,
				2144	struct kobj_attribute attr, char buf)
				2145	{
				2146	return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
				2147	}
				2148
				2149	static ssize_t pages_to_scan_store(struct kobject *kobj,
				2150	struct kobj_attribute *attr,
				2151	const char *buf, size_t count)
				2152	{
				2153	int err;
				2154	unsigned long nr_pages;
				2155
Jingoo Han	3dbb95f	2013-09-11 14:20:25 -0700	[diff] [blame]	2156	err = kstrtoul(buf, 10, &nr_pages);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	2157	if (err \|\| nr_pages > UINT_MAX)
				2158	return -EINVAL;
				2159
				2160	ksm_thread_pages_to_scan = nr_pages;
				2161
				2162	return count;
				2163	}
				2164	KSM_ATTR(pages_to_scan);
				2165
				2166	static ssize_t run_show(struct kobject kobj, struct kobj_attribute attr,
				2167	char *buf)
				2168	{
Hugh Dickins	ef4d43a	2013-02-22 16:35:16 -0800	[diff] [blame]	2169	return sprintf(buf, "%lu\n", ksm_run);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	2170	}
				2171
				2172	static ssize_t run_store(struct kobject kobj, struct kobj_attribute attr,
				2173	const char *buf, size_t count)
				2174	{
				2175	int err;
				2176	unsigned long flags;
				2177
Jingoo Han	3dbb95f	2013-09-11 14:20:25 -0700	[diff] [blame]	2178	err = kstrtoul(buf, 10, &flags);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	2179	if (err \|\| flags > UINT_MAX)
				2180	return -EINVAL;
				2181	if (flags > KSM_RUN_UNMERGE)
				2182	return -EINVAL;
				2183
				2184	/*
				2185	* KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
				2186	* KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
Hugh Dickins	d0f209f	2009-12-14 17:59:34 -0800	[diff] [blame]	2187	* breaking COW to free the pages_shared (but leaves mm_slots
				2188	* on the list for when ksmd may be set running again).
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	2189	*/
				2190
				2191	mutex_lock(&ksm_thread_mutex);
Hugh Dickins	ef4d43a	2013-02-22 16:35:16 -0800	[diff] [blame]	2192	wait_while_offlining();
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	2193	if (ksm_run != flags) {
				2194	ksm_run = flags;
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	2195	if (flags & KSM_RUN_UNMERGE) {
David Rientjes	e1e12d2	2012-12-11 16:02:56 -0800	[diff] [blame]	2196	set_current_oom_origin();
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	2197	err = unmerge_and_remove_all_rmap_items();
David Rientjes	e1e12d2	2012-12-11 16:02:56 -0800	[diff] [blame]	2198	clear_current_oom_origin();
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	2199	if (err) {
				2200	ksm_run = KSM_RUN_STOP;
				2201	count = err;
				2202	}
				2203	}
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	2204	}
				2205	mutex_unlock(&ksm_thread_mutex);
				2206
				2207	if (flags & KSM_RUN_MERGE)
				2208	wake_up_interruptible(&ksm_thread_wait);
				2209
				2210	return count;
				2211	}
				2212	KSM_ATTR(run);
				2213
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	2214	#ifdef CONFIG_NUMA
				2215	static ssize_t merge_across_nodes_show(struct kobject *kobj,
				2216	struct kobj_attribute attr, char buf)
				2217	{
				2218	return sprintf(buf, "%u\n", ksm_merge_across_nodes);
				2219	}
				2220
				2221	static ssize_t merge_across_nodes_store(struct kobject *kobj,
				2222	struct kobj_attribute *attr,
				2223	const char *buf, size_t count)
				2224	{
				2225	int err;
				2226	unsigned long knob;
				2227
				2228	err = kstrtoul(buf, 10, &knob);
				2229	if (err)
				2230	return err;
				2231	if (knob > 1)
				2232	return -EINVAL;
				2233
				2234	mutex_lock(&ksm_thread_mutex);
Hugh Dickins	ef4d43a	2013-02-22 16:35:16 -0800	[diff] [blame]	2235	wait_while_offlining();
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	2236	if (ksm_merge_across_nodes != knob) {
Hugh Dickins	cbf86cf	2013-02-22 16:35:08 -0800	[diff] [blame]	2237	if (ksm_pages_shared \|\| remove_all_stable_nodes())
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	2238	err = -EBUSY;
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	2239	else if (root_stable_tree == one_stable_tree) {
				2240	struct rb_root *buf;
				2241	/*
				2242	* This is the first time that we switch away from the
				2243	* default of merging across nodes: must now allocate
				2244	* a buffer to hold as many roots as may be needed.
				2245	* Allocate stable and unstable together:
				2246	* MAXSMP NODES_SHIFT 10 will use 16kB.
				2247	*/
Joe Perches	bafe1e1	2013-11-12 15:07:10 -0800	[diff] [blame]	2248	buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
				2249	GFP_KERNEL);
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	2250	/* Let us assume that RB_ROOT is NULL is zero */
				2251	if (!buf)
				2252	err = -ENOMEM;
				2253	else {
				2254	root_stable_tree = buf;
				2255	root_unstable_tree = buf + nr_node_ids;
				2256	/* Stable tree is empty but not the unstable */
				2257	root_unstable_tree[0] = one_unstable_tree[0];
				2258	}
				2259	}
				2260	if (!err) {
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	2261	ksm_merge_across_nodes = knob;
Hugh Dickins	ef53d16	2013-02-22 16:36:12 -0800	[diff] [blame]	2262	ksm_nr_node_ids = knob ? 1 : nr_node_ids;
				2263	}
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	2264	}
				2265	mutex_unlock(&ksm_thread_mutex);
				2266
				2267	return err ? err : count;
				2268	}
				2269	KSM_ATTR(merge_across_nodes);
				2270	#endif
				2271
Claudio Imbrenda	e86c59b	2017-02-24 14:55:39 -0800	[diff] [blame]	2272	static ssize_t use_zero_pages_show(struct kobject *kobj,
				2273	struct kobj_attribute attr, char buf)
				2274	{
				2275	return sprintf(buf, "%u\n", ksm_use_zero_pages);
				2276	}
				2277	static ssize_t use_zero_pages_store(struct kobject *kobj,
				2278	struct kobj_attribute *attr,
				2279	const char *buf, size_t count)
				2280	{
				2281	int err;
				2282	bool value;
				2283
				2284	err = kstrtobool(buf, &value);
				2285	if (err)
				2286	return -EINVAL;
				2287
				2288	ksm_use_zero_pages = value;
				2289
				2290	return count;
				2291	}
				2292	KSM_ATTR(use_zero_pages);
				2293
Hugh Dickins	b402826	2009-09-21 17:02:09 -0700	[diff] [blame]	2294	static ssize_t pages_shared_show(struct kobject *kobj,
				2295	struct kobj_attribute attr, char buf)
				2296	{
				2297	return sprintf(buf, "%lu\n", ksm_pages_shared);
				2298	}
				2299	KSM_ATTR_RO(pages_shared);
				2300
				2301	static ssize_t pages_sharing_show(struct kobject *kobj,
				2302	struct kobj_attribute attr, char buf)
				2303	{
Hugh Dickins	e178dfd	2009-09-21 17:02:10 -0700	[diff] [blame]	2304	return sprintf(buf, "%lu\n", ksm_pages_sharing);
Hugh Dickins	b402826	2009-09-21 17:02:09 -0700	[diff] [blame]	2305	}
				2306	KSM_ATTR_RO(pages_sharing);
				2307
Hugh Dickins	473b0ce	2009-09-21 17:02:11 -0700	[diff] [blame]	2308	static ssize_t pages_unshared_show(struct kobject *kobj,
				2309	struct kobj_attribute attr, char buf)
				2310	{
				2311	return sprintf(buf, "%lu\n", ksm_pages_unshared);
				2312	}
				2313	KSM_ATTR_RO(pages_unshared);
				2314
				2315	static ssize_t pages_volatile_show(struct kobject *kobj,
				2316	struct kobj_attribute attr, char buf)
				2317	{
				2318	long ksm_pages_volatile;
				2319
				2320	ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
				2321	- ksm_pages_sharing - ksm_pages_unshared;
				2322	/*
				2323	* It was not worth any locking to calculate that statistic,
				2324	* but it might therefore sometimes be negative: conceal that.
				2325	*/
				2326	if (ksm_pages_volatile < 0)
				2327	ksm_pages_volatile = 0;
				2328	return sprintf(buf, "%ld\n", ksm_pages_volatile);
				2329	}
				2330	KSM_ATTR_RO(pages_volatile);
				2331
				2332	static ssize_t full_scans_show(struct kobject *kobj,
				2333	struct kobj_attribute attr, char buf)
				2334	{
				2335	return sprintf(buf, "%lu\n", ksm_scan.seqnr);
				2336	}
				2337	KSM_ATTR_RO(full_scans);
				2338
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	2339	static struct attribute *ksm_attrs[] = {
				2340	&sleep_millisecs_attr.attr,
				2341	&pages_to_scan_attr.attr,
				2342	&run_attr.attr,
Hugh Dickins	b402826	2009-09-21 17:02:09 -0700	[diff] [blame]	2343	&pages_shared_attr.attr,
				2344	&pages_sharing_attr.attr,
Hugh Dickins	473b0ce	2009-09-21 17:02:11 -0700	[diff] [blame]	2345	&pages_unshared_attr.attr,
				2346	&pages_volatile_attr.attr,
				2347	&full_scans_attr.attr,
Petr Holasek	90bd6fd	2013-02-22 16:35:00 -0800	[diff] [blame]	2348	#ifdef CONFIG_NUMA
				2349	&merge_across_nodes_attr.attr,
				2350	#endif
Claudio Imbrenda	e86c59b	2017-02-24 14:55:39 -0800	[diff] [blame]	2351	&use_zero_pages_attr.attr,
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	2352	NULL,
				2353	};
				2354
				2355	static struct attribute_group ksm_attr_group = {
				2356	.attrs = ksm_attrs,
				2357	.name = "ksm",
				2358	};
Hugh Dickins	2ffd867	2009-09-21 17:02:23 -0700	[diff] [blame]	2359	#endif /* CONFIG_SYSFS */
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	2360
				2361	static int __init ksm_init(void)
				2362	{
				2363	struct task_struct *ksm_thread;
				2364	int err;
				2365
Claudio Imbrenda	e86c59b	2017-02-24 14:55:39 -0800	[diff] [blame]	2366	/* The correct value depends on page size and endianness */
				2367	zero_checksum = calc_checksum(ZERO_PAGE(0));
				2368	/* Default to false for backwards compatibility */
				2369	ksm_use_zero_pages = false;
				2370
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	2371	err = ksm_slab_init();
				2372	if (err)
				2373	goto out;
				2374
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	2375	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
				2376	if (IS_ERR(ksm_thread)) {
Paul McQuade	25acde3	2014-10-09 15:29:09 -0700	[diff] [blame]	2377	pr_err("ksm: creating kthread failed\n");
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	2378	err = PTR_ERR(ksm_thread);
Lai Jiangshan	d9f8984	2010-08-09 17:20:02 -0700	[diff] [blame]	2379	goto out_free;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	2380	}
				2381
Hugh Dickins	2ffd867	2009-09-21 17:02:23 -0700	[diff] [blame]	2382	#ifdef CONFIG_SYSFS
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	2383	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
				2384	if (err) {
Paul McQuade	25acde3	2014-10-09 15:29:09 -0700	[diff] [blame]	2385	pr_err("ksm: register sysfs failed\n");
Hugh Dickins	2ffd867	2009-09-21 17:02:23 -0700	[diff] [blame]	2386	kthread_stop(ksm_thread);
Lai Jiangshan	d9f8984	2010-08-09 17:20:02 -0700	[diff] [blame]	2387	goto out_free;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	2388	}
Hugh Dickins	c73602a	2009-10-07 16:32:22 -0700	[diff] [blame]	2389	#else
				2390	ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
				2391
Hugh Dickins	2ffd867	2009-09-21 17:02:23 -0700	[diff] [blame]	2392	#endif /* CONFIG_SYSFS */
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	2393
Hugh Dickins	62b61f6	2009-12-14 17:59:33 -0800	[diff] [blame]	2394	#ifdef CONFIG_MEMORY_HOTREMOVE
Hugh Dickins	ef4d43a	2013-02-22 16:35:16 -0800	[diff] [blame]	2395	/* There is no significance to this priority 100 */
Hugh Dickins	62b61f6	2009-12-14 17:59:33 -0800	[diff] [blame]	2396	hotplug_memory_notifier(ksm_memory_callback, 100);
				2397	#endif
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	2398	return 0;
				2399
Lai Jiangshan	d9f8984	2010-08-09 17:20:02 -0700	[diff] [blame]	2400	out_free:
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	2401	ksm_slab_free();
				2402	out:
				2403	return err;
				2404	}
Paul Gortmaker	a64fb3c	2014-01-23 15:53:30 -0800	[diff] [blame]	2405	subsys_initcall(ksm_init);