Blame - mm/oom_kill.c - SHIFTPHONES/mainline/linux

blob: f7ed6ece0719d1508129aee70551b2b13434e40a [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/mm/oom_kill.c
				3	*
				4	* Copyright (C) 1998,2000 Rik van Riel
				5	* Thanks go out to Claus Fischer for some serious inspiration and
				6	* for goading me into coding this file...
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	7	* Copyright (C) 2010 Google, Inc.
				8	* Rewritten by David Rientjes
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	9	*
				10	* The routines in this file are used to kill a process when
Paul Jackson	a49335c	2005-09-06 15:18:09 -0700	[diff] [blame]	11	* we're seriously out of memory. This gets called from __alloc_pages()
				12	* in mm/page_alloc.c when we really run out of memory.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	13	*
				14	* Since we won't call these routines often (on a well-configured
				15	* machine) this file will double as a 'coding guide' and a signpost
				16	* for newbie kernel hackers. It features several pointers to major
				17	* kernel subsystems and hints as to where to find out what things do.
				18	*/
				19
Alexey Dobriyan	8ac773b	2006-10-19 23:28:32 -0700	[diff] [blame]	20	#include <linux/oom.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	21	#include <linux/mm.h>
Alexey Dobriyan	4e950f6	2007-07-30 02:36:13 +0400	[diff] [blame]	22	#include <linux/err.h>
Tejun Heo	5a0e3ad	2010-03-24 17:04:11 +0900	[diff] [blame]	23	#include <linux/gfp.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	24	#include <linux/sched.h>
				25	#include <linux/swap.h>
				26	#include <linux/timex.h>
				27	#include <linux/jiffies.h>
Paul Jackson	ef08e3b	2005-09-06 15:18:13 -0700	[diff] [blame]	28	#include <linux/cpuset.h>
Paul Gortmaker	b95f1b31	2011-10-16 02:01:52 -0400	[diff] [blame]	29	#include <linux/export.h>
Martin Schwidefsky	8bc719d	2006-09-25 23:31:20 -0700	[diff] [blame]	30	#include <linux/notifier.h>
Pavel Emelianov	c7ba5c9	2008-02-07 00:13:58 -0800	[diff] [blame]	31	#include <linux/memcontrol.h>
David Rientjes	6f48d0eb	2010-08-09 17:18:52 -0700	[diff] [blame]	32	#include <linux/mempolicy.h>
David Howells	5cd9c58	2008-08-14 11:37:28 +0100	[diff] [blame]	33	#include <linux/security.h>
David Rientjes	edd4554	2011-03-22 16:30:12 -0700	[diff] [blame]	34	#include <linux/ptrace.h>
David Rientjes	f660daa	2011-10-31 17:07:07 -0700	[diff] [blame]	35	#include <linux/freezer.h>
KAMEZAWA Hiroyuki	43d2b11	2012-01-10 15:08:09 -0800	[diff] [blame]	36	#include <linux/ftrace.h>
David Rientjes	dc3f21e	2012-03-21 16:33:47 -0700	[diff] [blame]	37	#include <linux/ratelimit.h>
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame^]	38	#include <linux/kthread.h>
				39	#include <linux/init.h>
				40
				41	#include <asm/tlb.h>
				42	#include "internal.h"
KAMEZAWA Hiroyuki	43d2b11	2012-01-10 15:08:09 -0800	[diff] [blame]	43
				44	#define CREATE_TRACE_POINTS
				45	#include <trace/events/oom.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	46
/*
 * Tunables exposed to the rest of the kernel.
 * NOTE(review): the sysctl_ prefix suggests these are wired to sysctl
 * entries elsewhere -- confirm against the sysctl table.
 */
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
/* Non-zero (the default) makes dump_header() also call dump_tasks(). */
int sysctl_oom_dump_tasks = 1;

/* mark_oom_victim() documents that it must be called with this held. */
DEFINE_MUTEX(oom_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	52
David Rientjes	6f48d0eb	2010-08-09 17:18:52 -0700	[diff] [blame]	53	#ifdef CONFIG_NUMA
				54	/**
				55	* has_intersects_mems_allowed() - check task eligiblity for kill
Oleg Nesterov	ad96244	2014-01-21 15:50:00 -0800	[diff] [blame]	56	* @start: task struct of which task to consider
David Rientjes	6f48d0eb	2010-08-09 17:18:52 -0700	[diff] [blame]	57	* @mask: nodemask passed to page allocator for mempolicy ooms
				58	*
				59	* Task eligibility is determined by whether or not a candidate task, @tsk,
				60	* shares the same mempolicy nodes as current if it is bound by such a policy
				61	* and whether or not it has the same set of allowed cpuset nodes.
KOSAKI Motohiro	495789a	2009-09-21 17:03:14 -0700	[diff] [blame]	62	*/
Oleg Nesterov	ad96244	2014-01-21 15:50:00 -0800	[diff] [blame]	63	static bool has_intersects_mems_allowed(struct task_struct *start,
David Rientjes	6f48d0eb	2010-08-09 17:18:52 -0700	[diff] [blame]	64	const nodemask_t *mask)
KOSAKI Motohiro	495789a	2009-09-21 17:03:14 -0700	[diff] [blame]	65	{
Oleg Nesterov	ad96244	2014-01-21 15:50:00 -0800	[diff] [blame]	66	struct task_struct *tsk;
				67	bool ret = false;
KOSAKI Motohiro	495789a	2009-09-21 17:03:14 -0700	[diff] [blame]	68
Oleg Nesterov	ad96244	2014-01-21 15:50:00 -0800	[diff] [blame]	69	rcu_read_lock();
Oleg Nesterov	1da4db0	2014-01-21 15:49:58 -0800	[diff] [blame]	70	for_each_thread(start, tsk) {
David Rientjes	6f48d0eb	2010-08-09 17:18:52 -0700	[diff] [blame]	71	if (mask) {
				72	/*
				73	* If this is a mempolicy constrained oom, tsk's
				74	* cpuset is irrelevant. Only return true if its
				75	* mempolicy intersects current, otherwise it may be
				76	* needlessly killed.
				77	*/
Oleg Nesterov	ad96244	2014-01-21 15:50:00 -0800	[diff] [blame]	78	ret = mempolicy_nodemask_intersects(tsk, mask);
David Rientjes	6f48d0eb	2010-08-09 17:18:52 -0700	[diff] [blame]	79	} else {
				80	/*
				81	* This is not a mempolicy constrained oom, so only
				82	* check the mems of tsk's cpuset.
				83	*/
Oleg Nesterov	ad96244	2014-01-21 15:50:00 -0800	[diff] [blame]	84	ret = cpuset_mems_allowed_intersects(current, tsk);
David Rientjes	6f48d0eb	2010-08-09 17:18:52 -0700	[diff] [blame]	85	}
Oleg Nesterov	ad96244	2014-01-21 15:50:00 -0800	[diff] [blame]	86	if (ret)
				87	break;
Oleg Nesterov	1da4db0	2014-01-21 15:49:58 -0800	[diff] [blame]	88	}
Oleg Nesterov	ad96244	2014-01-21 15:50:00 -0800	[diff] [blame]	89	rcu_read_unlock();
KOSAKI Motohiro	df1090a	2010-08-09 17:19:39 -0700	[diff] [blame]	90
Oleg Nesterov	ad96244	2014-01-21 15:50:00 -0800	[diff] [blame]	91	return ret;
KOSAKI Motohiro	495789a	2009-09-21 17:03:14 -0700	[diff] [blame]	92	}
David Rientjes	6f48d0eb	2010-08-09 17:18:52 -0700	[diff] [blame]	93	#else
				94	static bool has_intersects_mems_allowed(struct task_struct *tsk,
				95	const nodemask_t *mask)
				96	{
				97	return true;
				98	}
				99	#endif /* CONFIG_NUMA */
KOSAKI Motohiro	495789a	2009-09-21 17:03:14 -0700	[diff] [blame]	100
David Rientjes	6f48d0eb	2010-08-09 17:18:52 -0700	[diff] [blame]	101	/*
				102	* The process p may have detached its own ->mm while exiting or through
				103	* use_mm(), but one or more of its subthreads may still have a valid
				104	* pointer. Return p, or any of its subthreads with a valid ->mm, with
				105	* task_lock() held.
				106	*/
KAMEZAWA Hiroyuki	158e0a2	2010-08-10 18:03:00 -0700	[diff] [blame]	107	struct task_struct find_lock_task_mm(struct task_struct p)
Oleg Nesterov	dd8e8f4	2010-08-09 17:18:45 -0700	[diff] [blame]	108	{
Oleg Nesterov	1da4db0	2014-01-21 15:49:58 -0800	[diff] [blame]	109	struct task_struct *t;
Oleg Nesterov	dd8e8f4	2010-08-09 17:18:45 -0700	[diff] [blame]	110
Oleg Nesterov	4d4048b	2014-01-21 15:50:01 -0800	[diff] [blame]	111	rcu_read_lock();
				112
Oleg Nesterov	1da4db0	2014-01-21 15:49:58 -0800	[diff] [blame]	113	for_each_thread(p, t) {
Oleg Nesterov	dd8e8f4	2010-08-09 17:18:45 -0700	[diff] [blame]	114	task_lock(t);
				115	if (likely(t->mm))
Oleg Nesterov	4d4048b	2014-01-21 15:50:01 -0800	[diff] [blame]	116	goto found;
Oleg Nesterov	dd8e8f4	2010-08-09 17:18:45 -0700	[diff] [blame]	117	task_unlock(t);
Oleg Nesterov	1da4db0	2014-01-21 15:49:58 -0800	[diff] [blame]	118	}
Oleg Nesterov	4d4048b	2014-01-21 15:50:01 -0800	[diff] [blame]	119	t = NULL;
				120	found:
				121	rcu_read_unlock();
Oleg Nesterov	dd8e8f4	2010-08-09 17:18:45 -0700	[diff] [blame]	122
Oleg Nesterov	4d4048b	2014-01-21 15:50:01 -0800	[diff] [blame]	123	return t;
Oleg Nesterov	dd8e8f4	2010-08-09 17:18:45 -0700	[diff] [blame]	124	}
				125
Yaowei Bai	db2a0dd	2015-11-06 16:28:06 -0800	[diff] [blame]	126	/*
				127	* order == -1 means the oom kill is required by sysrq, otherwise only
				128	* for display purposes.
				129	*/
				130	static inline bool is_sysrq_oom(struct oom_control *oc)
				131	{
				132	return oc->order == -1;
				133	}
				134
KOSAKI Motohiro	ab290ad	2010-08-09 17:19:35 -0700	[diff] [blame]	135	/* return true if the task is not adequate as candidate victim task. */
David Rientjes	e85bfd3	2010-09-22 13:05:10 -0700	[diff] [blame]	136	static bool oom_unkillable_task(struct task_struct *p,
Johannes Weiner	2314b42	2014-12-10 15:44:33 -0800	[diff] [blame]	137	struct mem_cgroup memcg, const nodemask_t nodemask)
KOSAKI Motohiro	ab290ad	2010-08-09 17:19:35 -0700	[diff] [blame]	138	{
				139	if (is_global_init(p))
				140	return true;
				141	if (p->flags & PF_KTHREAD)
				142	return true;
				143
				144	/* When mem_cgroup_out_of_memory() and p is not member of the group */
Johannes Weiner	72835c8	2012-01-12 17:18:32 -0800	[diff] [blame]	145	if (memcg && !task_in_mem_cgroup(p, memcg))
KOSAKI Motohiro	ab290ad	2010-08-09 17:19:35 -0700	[diff] [blame]	146	return true;
				147
				148	/* p may not have freeable memory in nodemask */
				149	if (!has_intersects_mems_allowed(p, nodemask))
				150	return true;
				151
				152	return false;
				153	}
				154
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	155	/**
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	156	* oom_badness - heuristic function to determine which candidate task to kill
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	157	* @p: task struct of which task we should calculate
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	158	* @totalpages: total present RAM allowed for page allocation
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	159	*
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	160	* The heuristic for determining which task to kill is made to be as simple and
				161	* predictable as possible. The goal is to return the highest value for the
				162	* task consuming the most memory to avoid subsequent oom failures.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	163	*/
David Rientjes	a7f638f	2012-05-29 15:06:47 -0700	[diff] [blame]	164	unsigned long oom_badness(struct task_struct p, struct mem_cgroup memcg,
				165	const nodemask_t *nodemask, unsigned long totalpages)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	166	{
David Rientjes	1e11ad8	2012-06-08 13:21:26 -0700	[diff] [blame]	167	long points;
David Rientjes	61eafb0	2012-06-20 12:52:58 -0700	[diff] [blame]	168	long adj;
KOSAKI Motohiro	28b83c5	2009-09-21 17:03:13 -0700	[diff] [blame]	169
Johannes Weiner	72835c8	2012-01-12 17:18:32 -0800	[diff] [blame]	170	if (oom_unkillable_task(p, memcg, nodemask))
KOSAKI Motohiro	26ebc98	2010-08-09 17:19:37 -0700	[diff] [blame]	171	return 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	172
Oleg Nesterov	dd8e8f4	2010-08-09 17:18:45 -0700	[diff] [blame]	173	p = find_lock_task_mm(p);
				174	if (!p)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	175	return 0;
				176
David Rientjes	a9c58b90	2012-12-11 16:02:54 -0800	[diff] [blame]	177	adj = (long)p->signal->oom_score_adj;
David Rientjes	61eafb0	2012-06-20 12:52:58 -0700	[diff] [blame]	178	if (adj == OOM_SCORE_ADJ_MIN) {
Michal Hocko	5aecc85	2011-11-15 14:36:07 -0800	[diff] [blame]	179	task_unlock(p);
				180	return 0;
				181	}
				182
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	183	/*
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	184	* The baseline for the badness score is the proportion of RAM that each
KOSAKI Motohiro	f755a04	2011-04-27 15:26:50 -0700	[diff] [blame]	185	* task's rss, pagetable and swap space use.
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	186	*/
Kirill A. Shutemov	dc6c9a3	2015-02-11 15:26:50 -0800	[diff] [blame]	187	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
				188	atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
Andrew Morton	97c2c9b8	2006-04-18 22:20:38 -0700	[diff] [blame]	189	task_unlock(p);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	190
				191	/*
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	192	* Root processes get 3% bonus, just like the __vm_enough_memory()
				193	* implementation used by LSMs.
Hugh Dickins	7ba3485	2007-01-05 16:37:03 -0800	[diff] [blame]	194	*/
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	195	if (has_capability_noaudit(p, CAP_SYS_ADMIN))
David Rientjes	778c14a	2014-01-30 15:46:11 -0800	[diff] [blame]	196	points -= (points * 3) / 100;
Hugh Dickins	7ba3485	2007-01-05 16:37:03 -0800	[diff] [blame]	197
David Rientjes	61eafb0	2012-06-20 12:52:58 -0700	[diff] [blame]	198	/* Normalize to oom_score_adj units */
				199	adj *= totalpages / 1000;
				200	points += adj;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	201
David Rientjes	f19e8aa	2010-09-22 13:04:52 -0700	[diff] [blame]	202	/*
David Rientjes	a7f638f	2012-05-29 15:06:47 -0700	[diff] [blame]	203	* Never return 0 for an eligible task regardless of the root bonus and
				204	* oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
David Rientjes	f19e8aa	2010-09-22 13:04:52 -0700	[diff] [blame]	205	*/
David Rientjes	1e11ad8	2012-06-08 13:21:26 -0700	[diff] [blame]	206	return points > 0 ? points : 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	207	}
				208
				209	/*
Christoph Lameter	9b0f8b0	2006-02-20 18:27:52 -0800	[diff] [blame]	210	* Determine the type of allocation constraint.
				211	*/
Christoph Lameter	9b0f8b0	2006-02-20 18:27:52 -0800	[diff] [blame]	212	#ifdef CONFIG_NUMA
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	213	static enum oom_constraint constrained_alloc(struct oom_control *oc,
				214	unsigned long *totalpages)
KAMEZAWA Hiroyuki	4365a56	2009-12-15 16:45:33 -0800	[diff] [blame]	215	{
Mel Gorman	54a6eb5	2008-04-28 02:12:16 -0700	[diff] [blame]	216	struct zone *zone;
Mel Gorman	dd1a239	2008-04-28 02:12:17 -0700	[diff] [blame]	217	struct zoneref *z;
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	218	enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	219	bool cpuset_limited = false;
				220	int nid;
Christoph Lameter	9b0f8b0	2006-02-20 18:27:52 -0800	[diff] [blame]	221
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	222	/* Default to all available memory */
				223	*totalpages = totalram_pages + total_swap_pages;
				224
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	225	if (!oc->zonelist)
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	226	return CONSTRAINT_NONE;
KAMEZAWA Hiroyuki	4365a56	2009-12-15 16:45:33 -0800	[diff] [blame]	227	/*
				228	* Reach here only when __GFP_NOFAIL is used. So, we should avoid
				229	* to kill current.We have to random task kill in this case.
				230	* Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
				231	*/
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	232	if (oc->gfp_mask & __GFP_THISNODE)
KAMEZAWA Hiroyuki	4365a56	2009-12-15 16:45:33 -0800	[diff] [blame]	233	return CONSTRAINT_NONE;
Christoph Lameter	9b0f8b0	2006-02-20 18:27:52 -0800	[diff] [blame]	234
KAMEZAWA Hiroyuki	4365a56	2009-12-15 16:45:33 -0800	[diff] [blame]	235	/*
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	236	* This is not a __GFP_THISNODE allocation, so a truncated nodemask in
				237	* the page allocator means a mempolicy is in effect. Cpuset policy
				238	* is enforced in get_page_from_freelist().
KAMEZAWA Hiroyuki	4365a56	2009-12-15 16:45:33 -0800	[diff] [blame]	239	*/
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	240	if (oc->nodemask &&
				241	!nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	242	*totalpages = total_swap_pages;
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	243	for_each_node_mask(nid, *oc->nodemask)
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	244	*totalpages += node_spanned_pages(nid);
Christoph Lameter	9b0f8b0	2006-02-20 18:27:52 -0800	[diff] [blame]	245	return CONSTRAINT_MEMORY_POLICY;
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	246	}
KAMEZAWA Hiroyuki	4365a56	2009-12-15 16:45:33 -0800	[diff] [blame]	247
				248	/* Check this allocation failure is caused by cpuset's wall function */
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	249	for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
				250	high_zoneidx, oc->nodemask)
				251	if (!cpuset_zone_allowed(zone, oc->gfp_mask))
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	252	cpuset_limited = true;
Christoph Lameter	9b0f8b0	2006-02-20 18:27:52 -0800	[diff] [blame]	253
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	254	if (cpuset_limited) {
				255	*totalpages = total_swap_pages;
				256	for_each_node_mask(nid, cpuset_current_mems_allowed)
				257	*totalpages += node_spanned_pages(nid);
				258	return CONSTRAINT_CPUSET;
				259	}
Christoph Lameter	9b0f8b0	2006-02-20 18:27:52 -0800	[diff] [blame]	260	return CONSTRAINT_NONE;
				261	}
KAMEZAWA Hiroyuki	4365a56	2009-12-15 16:45:33 -0800	[diff] [blame]	262	#else
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	263	static enum oom_constraint constrained_alloc(struct oom_control *oc,
				264	unsigned long *totalpages)
KAMEZAWA Hiroyuki	4365a56	2009-12-15 16:45:33 -0800	[diff] [blame]	265	{
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	266	*totalpages = totalram_pages + total_swap_pages;
KAMEZAWA Hiroyuki	4365a56	2009-12-15 16:45:33 -0800	[diff] [blame]	267	return CONSTRAINT_NONE;
				268	}
				269	#endif
Christoph Lameter	9b0f8b0	2006-02-20 18:27:52 -0800	[diff] [blame]	270
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	271	enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
				272	struct task_struct *task, unsigned long totalpages)
David Rientjes	462607e	2012-07-31 16:43:40 -0700	[diff] [blame]	273	{
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	274	if (oom_unkillable_task(task, NULL, oc->nodemask))
David Rientjes	462607e	2012-07-31 16:43:40 -0700	[diff] [blame]	275	return OOM_SCAN_CONTINUE;
				276
				277	/*
				278	* This task already has access to memory reserves and is being killed.
				279	* Don't allow any other task to have access to the reserves.
				280	*/
				281	if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
Yaowei Bai	db2a0dd	2015-11-06 16:28:06 -0800	[diff] [blame]	282	if (!is_sysrq_oom(oc))
David Rientjes	462607e	2012-07-31 16:43:40 -0700	[diff] [blame]	283	return OOM_SCAN_ABORT;
				284	}
				285	if (!task->mm)
				286	return OOM_SCAN_CONTINUE;
				287
David Rientjes	e1e12d2	2012-12-11 16:02:56 -0800	[diff] [blame]	288	/*
				289	* If task is allocating a lot of memory and has been marked to be
				290	* killed first if it triggers an oom, then select it.
				291	*/
				292	if (oom_task_origin(task))
				293	return OOM_SCAN_SELECT;
				294
David Rientjes	462607e	2012-07-31 16:43:40 -0700	[diff] [blame]	295	return OOM_SCAN_OK;
				296	}
				297
Christoph Lameter	9b0f8b0	2006-02-20 18:27:52 -0800	[diff] [blame]	298	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	299	* Simple selection loop. We chose the process with the highest
Rusty Russell	6b4f2b5	2013-07-15 11:24:08 +0930	[diff] [blame]	300	* number of 'points'. Returns -1 on scan abort.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	301	*/
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	302	static struct task_struct select_bad_process(struct oom_control oc,
				303	unsigned int *ppoints, unsigned long totalpages)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	304	{
David Rientjes	3a5dda7	2011-03-22 16:30:09 -0700	[diff] [blame]	305	struct task_struct g, p;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	306	struct task_struct *chosen = NULL;
David Rientjes	a7f638f	2012-05-29 15:06:47 -0700	[diff] [blame]	307	unsigned long chosen_points = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	308
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	309	rcu_read_lock();
Oleg Nesterov	1da4db0	2014-01-21 15:49:58 -0800	[diff] [blame]	310	for_each_process_thread(g, p) {
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	311	unsigned int points;
Paul Jackson	a49335c	2005-09-06 15:18:09 -0700	[diff] [blame]	312
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	313	switch (oom_scan_process_thread(oc, p, totalpages)) {
David Rientjes	462607e	2012-07-31 16:43:40 -0700	[diff] [blame]	314	case OOM_SCAN_SELECT:
				315	chosen = p;
				316	chosen_points = ULONG_MAX;
				317	/* fall through */
				318	case OOM_SCAN_CONTINUE:
Andrey Vagin	30e2b41	2011-03-22 16:30:11 -0700	[diff] [blame]	319	continue;
David Rientjes	462607e	2012-07-31 16:43:40 -0700	[diff] [blame]	320	case OOM_SCAN_ABORT:
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	321	rcu_read_unlock();
Rusty Russell	6b4f2b5	2013-07-15 11:24:08 +0930	[diff] [blame]	322	return (struct task_struct *)(-1UL);
David Rientjes	462607e	2012-07-31 16:43:40 -0700	[diff] [blame]	323	case OOM_SCAN_OK:
				324	break;
				325	};
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	326	points = oom_badness(p, NULL, oc->nodemask, totalpages);
David Rientjes	d49ad93	2014-01-23 15:53:34 -0800	[diff] [blame]	327	if (!points \|\| points < chosen_points)
				328	continue;
				329	/* Prefer thread group leaders for display purposes */
				330	if (points == chosen_points && thread_group_leader(chosen))
				331	continue;
				332
				333	chosen = p;
				334	chosen_points = points;
Oleg Nesterov	1da4db0	2014-01-21 15:49:58 -0800	[diff] [blame]	335	}
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	336	if (chosen)
				337	get_task_struct(chosen);
				338	rcu_read_unlock();
Oleg Nesterov	972c4ea	2006-09-29 02:01:12 -0700	[diff] [blame]	339
David Rientjes	a7f638f	2012-05-29 15:06:47 -0700	[diff] [blame]	340	ppoints = chosen_points 1000 / totalpages;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	341	return chosen;
				342	}
				343
				344	/**
Randy Dunlap	1b578df	2008-03-19 17:00:42 -0700	[diff] [blame]	345	* dump_tasks - dump current memory state of all system tasks
Wanpeng Li	dad7557	2012-06-20 12:53:01 -0700	[diff] [blame]	346	* @memcg: current's memory controller, if constrained
David Rientjes	e85bfd3	2010-09-22 13:05:10 -0700	[diff] [blame]	347	* @nodemask: nodemask passed to page allocator for mempolicy ooms
Randy Dunlap	1b578df	2008-03-19 17:00:42 -0700	[diff] [blame]	348	*
David Rientjes	e85bfd3	2010-09-22 13:05:10 -0700	[diff] [blame]	349	* Dumps the current memory state of all eligible tasks. Tasks not in the same
				350	* memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
				351	* are not shown.
David Rientjes	de34d96	2012-07-31 16:42:56 -0700	[diff] [blame]	352	* State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
				353	* swapents, oom_score_adj value, and name.
David Rientjes	fef1bdd	2008-02-07 00:14:07 -0800	[diff] [blame]	354	*/
Johannes Weiner	2314b42	2014-12-10 15:44:33 -0800	[diff] [blame]	355	static void dump_tasks(struct mem_cgroup memcg, const nodemask_t nodemask)
David Rientjes	fef1bdd	2008-02-07 00:14:07 -0800	[diff] [blame]	356	{
KOSAKI Motohiro	c55db95	2010-08-09 17:18:46 -0700	[diff] [blame]	357	struct task_struct *p;
				358	struct task_struct *task;
David Rientjes	fef1bdd	2008-02-07 00:14:07 -0800	[diff] [blame]	359
Kirill A. Shutemov	dc6c9a3	2015-02-11 15:26:50 -0800	[diff] [blame]	360	pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n");
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	361	rcu_read_lock();
KOSAKI Motohiro	c55db95	2010-08-09 17:18:46 -0700	[diff] [blame]	362	for_each_process(p) {
Johannes Weiner	72835c8	2012-01-12 17:18:32 -0800	[diff] [blame]	363	if (oom_unkillable_task(p, memcg, nodemask))
David Rientjes	fef1bdd	2008-02-07 00:14:07 -0800	[diff] [blame]	364	continue;
				365
KOSAKI Motohiro	c55db95	2010-08-09 17:18:46 -0700	[diff] [blame]	366	task = find_lock_task_mm(p);
				367	if (!task) {
David Rientjes	6d2661e	2009-05-28 14:34:19 -0700	[diff] [blame]	368	/*
David Rientjes	74ab7f1	2010-08-09 17:18:46 -0700	[diff] [blame]	369	* This is a kthread or all of p's threads have already
				370	* detached their mm's. There's no need to report
KOSAKI Motohiro	c55db95	2010-08-09 17:18:46 -0700	[diff] [blame]	371	* them; they can't be oom killed anyway.
David Rientjes	6d2661e	2009-05-28 14:34:19 -0700	[diff] [blame]	372	*/
David Rientjes	6d2661e	2009-05-28 14:34:19 -0700	[diff] [blame]	373	continue;
				374	}
KOSAKI Motohiro	c55db95	2010-08-09 17:18:46 -0700	[diff] [blame]	375
Kirill A. Shutemov	dc6c9a3	2015-02-11 15:26:50 -0800	[diff] [blame]	376	pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n",
Eric W. Biederman	078de5f	2012-02-08 07:00:08 -0800	[diff] [blame]	377	task->pid, from_kuid(&init_user_ns, task_uid(task)),
				378	task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
Kirill A. Shutemov	e1f56c8	2013-11-14 14:30:48 -0800	[diff] [blame]	379	atomic_long_read(&task->mm->nr_ptes),
Kirill A. Shutemov	dc6c9a3	2015-02-11 15:26:50 -0800	[diff] [blame]	380	mm_nr_pmds(task->mm),
David Rientjes	de34d96	2012-07-31 16:42:56 -0700	[diff] [blame]	381	get_mm_counter(task->mm, MM_SWAPENTS),
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	382	task->signal->oom_score_adj, task->comm);
KOSAKI Motohiro	c55db95	2010-08-09 17:18:46 -0700	[diff] [blame]	383	task_unlock(task);
				384	}
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	385	rcu_read_unlock();
David Rientjes	fef1bdd	2008-02-07 00:14:07 -0800	[diff] [blame]	386	}
				387
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	388	static void dump_header(struct oom_control oc, struct task_struct p,
				389	struct mem_cgroup *memcg)
David Rientjes	1b604d7	2009-12-14 17:57:47 -0800	[diff] [blame]	390	{
Joe Perches	756a025	2016-03-17 14:19:47 -0700	[diff] [blame]	391	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
Vlastimil Babka	a0795cd	2016-03-15 14:56:05 -0700	[diff] [blame]	392	current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	393	current->signal->oom_score_adj);
Vlastimil Babka	a0795cd	2016-03-15 14:56:05 -0700	[diff] [blame]	394
David Rientjes	da39da3	2015-11-05 18:48:05 -0800	[diff] [blame]	395	cpuset_print_current_mems_allowed();
David Rientjes	1b604d7	2009-12-14 17:57:47 -0800	[diff] [blame]	396	dump_stack();
Sha Zhengju	58cf188	2013-02-22 16:32:05 -0800	[diff] [blame]	397	if (memcg)
				398	mem_cgroup_print_oom_info(memcg, p);
				399	else
				400	show_mem(SHOW_MEM_FILTER_NODES);
David Rientjes	1b604d7	2009-12-14 17:57:47 -0800	[diff] [blame]	401	if (sysctl_oom_dump_tasks)
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	402	dump_tasks(memcg, oc->nodemask);
David Rientjes	1b604d7	2009-12-14 17:57:47 -0800	[diff] [blame]	403	}
				404
Michal Hocko	5695be1	2014-10-20 18:12:32 +0200	[diff] [blame]	405	/*
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	406	* Number of OOM victims in flight
Michal Hocko	5695be1	2014-10-20 18:12:32 +0200	[diff] [blame]	407	*/
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	408	static atomic_t oom_victims = ATOMIC_INIT(0);
				409	static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
Michal Hocko	5695be1	2014-10-20 18:12:32 +0200	[diff] [blame]	410
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	411	bool oom_killer_disabled __read_mostly;
Michal Hocko	5695be1	2014-10-20 18:12:32 +0200	[diff] [blame]	412
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame^]	413	#ifdef CONFIG_MMU
				414	/*
				415	* OOM Reaper kernel thread which tries to reap the memory used by the OOM
				416	* victim (if that is possible) to help the OOM killer to move on.
				417	*/
				418	static struct task_struct *oom_reaper_th;
				419	static struct mm_struct *mm_to_reap;
				420	static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
				421
				422	static bool __oom_reap_vmas(struct mm_struct *mm)
				423	{
				424	struct mmu_gather tlb;
				425	struct vm_area_struct *vma;
				426	struct zap_details details = {.check_swap_entries = true,
				427	.ignore_dirty = true};
				428	bool ret = true;
				429
				430	/* We might have raced with exit path */
				431	if (!atomic_inc_not_zero(&mm->mm_users))
				432	return true;
				433
				434	if (!down_read_trylock(&mm->mmap_sem)) {
				435	ret = false;
				436	goto out;
				437	}
				438
				439	tlb_gather_mmu(&tlb, mm, 0, -1);
				440	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
				441	if (is_vm_hugetlb_page(vma))
				442	continue;
				443
				444	/*
				445	* mlocked VMAs require explicit munlocking before unmap.
				446	* Let's keep it simple here and skip such VMAs.
				447	*/
				448	if (vma->vm_flags & VM_LOCKED)
				449	continue;
				450
				451	/*
				452	* Only anonymous pages have a good chance to be dropped
				453	* without additional steps which we cannot afford as we
				454	* are OOM already.
				455	*
				456	* We do not even care about fs backed pages because all
				457	* which are reclaimable have already been reclaimed and
				458	* we do not want to block exit_mmap by keeping mm ref
				459	* count elevated without a good reason.
				460	*/
				461	if (vma_is_anonymous(vma) \|\| !(vma->vm_flags & VM_SHARED))
				462	unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
				463	&details);
				464	}
				465	tlb_finish_mmu(&tlb, 0, -1);
				466	up_read(&mm->mmap_sem);
				467	out:
				468	mmput(mm);
				469	return ret;
				470	}
				471
				472	static void oom_reap_vmas(struct mm_struct *mm)
				473	{
				474	int attempts = 0;
				475
				476	/* Retry the down_read_trylock(mmap_sem) a few times */
				477	while (attempts++ < 10 && !__oom_reap_vmas(mm))
				478	schedule_timeout_idle(HZ/10);
				479
				480	/* Drop a reference taken by wake_oom_reaper */
				481	mmdrop(mm);
				482	}
				483
				484	static int oom_reaper(void *unused)
				485	{
				486	while (true) {
				487	struct mm_struct *mm;
				488
				489	wait_event_freezable(oom_reaper_wait,
				490	(mm = READ_ONCE(mm_to_reap)));
				491	oom_reap_vmas(mm);
				492	WRITE_ONCE(mm_to_reap, NULL);
				493	}
				494
				495	return 0;
				496	}
				497
				498	static void wake_oom_reaper(struct mm_struct *mm)
				499	{
				500	struct mm_struct *old_mm;
				501
				502	if (!oom_reaper_th)
				503	return;
				504
				505	/*
				506	* Pin the given mm. Use mm_count instead of mm_users because
				507	* we do not want to delay the address space tear down.
				508	*/
				509	atomic_inc(&mm->mm_count);
				510
				511	/*
				512	* Make sure that only a single mm is ever queued for the reaper
				513	* because multiple are not necessary and the operation might be
				514	* disruptive so better reduce it to the bare minimum.
				515	*/
				516	old_mm = cmpxchg(&mm_to_reap, NULL, mm);
				517	if (!old_mm)
				518	wake_up(&oom_reaper_wait);
				519	else
				520	mmdrop(mm);
				521	}
				522
				523	static int __init oom_init(void)
				524	{
				525	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
				526	if (IS_ERR(oom_reaper_th)) {
				527	pr_err("Unable to start OOM reaper %ld. Continuing regardless\n",
				528	PTR_ERR(oom_reaper_th));
				529	oom_reaper_th = NULL;
				530	}
				531	return 0;
				532	}
				533	subsys_initcall(oom_init)
				534	#else
				535	static void wake_oom_reaper(struct mm_struct *mm)
				536	{
				537	}
				538	#endif
				539
Michal Hocko	49550b6	2015-02-11 15:26:12 -0800	[diff] [blame]	540	/**
Johannes Weiner	16e9519	2015-06-24 16:57:07 -0700	[diff] [blame]	541	* mark_oom_victim - mark the given task as OOM victim
Michal Hocko	49550b6	2015-02-11 15:26:12 -0800	[diff] [blame]	542	* @tsk: task to mark
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	543	*
Johannes Weiner	dc56401	2015-06-24 16:57:19 -0700	[diff] [blame]	544	* Has to be called with oom_lock held and never after
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	545	* oom has been disabled already.
Michal Hocko	49550b6	2015-02-11 15:26:12 -0800	[diff] [blame]	546	*/
Johannes Weiner	16e9519	2015-06-24 16:57:07 -0700	[diff] [blame]	547	void mark_oom_victim(struct task_struct *tsk)
Michal Hocko	49550b6	2015-02-11 15:26:12 -0800	[diff] [blame]	548	{
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	549	WARN_ON(oom_killer_disabled);
				550	/* OOM killer might race with memcg OOM */
				551	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
				552	return;
Michal Hocko	63a8ca9	2015-02-11 15:26:15 -0800	[diff] [blame]	553	/*
				554	* Make sure that the task is woken up from uninterruptible sleep
				555	* if it is frozen because OOM killer wouldn't be able to free
				556	* any memory and livelock. freezing_slow_path will tell the freezer
				557	* that TIF_MEMDIE tasks should be ignored.
				558	*/
				559	__thaw_task(tsk);
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	560	atomic_inc(&oom_victims);
Michal Hocko	49550b6	2015-02-11 15:26:12 -0800	[diff] [blame]	561	}
				562
				563	/**
Johannes Weiner	16e9519	2015-06-24 16:57:07 -0700	[diff] [blame]	564	* exit_oom_victim - note the exit of an OOM victim
Michal Hocko	49550b6	2015-02-11 15:26:12 -0800	[diff] [blame]	565	*/
Johannes Weiner	16e9519	2015-06-24 16:57:07 -0700	[diff] [blame]	566	void exit_oom_victim(void)
Michal Hocko	49550b6	2015-02-11 15:26:12 -0800	[diff] [blame]	567	{
Johannes Weiner	4640277	2015-06-24 16:57:10 -0700	[diff] [blame]	568	clear_thread_flag(TIF_MEMDIE);
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	569
Johannes Weiner	c38f102	2015-06-24 16:57:13 -0700	[diff] [blame]	570	if (!atomic_dec_return(&oom_victims))
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	571	wake_up_all(&oom_victims_wait);
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	572	}
				573
				574	/**
				575	* oom_killer_disable - disable OOM killer
				576	*
				577	* Forces all page allocations to fail rather than trigger OOM killer.
				578	* Will block and wait until all OOM victims are killed.
				579	*
				580	* The function cannot be called when there are runnable user tasks because
				581	* the userspace would see unexpected allocation failures as a result. Any
				582	* new usage of this function should be consulted with MM people.
				583	*
				584	* Returns true if successful and false if the OOM killer cannot be
				585	* disabled.
				586	*/
				587	bool oom_killer_disable(void)
				588	{
				589	/*
Tetsuo Handa	6afcf28	2016-03-17 14:20:45 -0700	[diff] [blame]	590	* Make sure to not race with an ongoing OOM killer. Check that the
				591	* current is not killed (possibly due to sharing the victim's memory).
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	592	*/
Tetsuo Handa	6afcf28	2016-03-17 14:20:45 -0700	[diff] [blame]	593	if (mutex_lock_killable(&oom_lock))
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	594	return false;
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	595	oom_killer_disabled = true;
Johannes Weiner	dc56401	2015-06-24 16:57:19 -0700	[diff] [blame]	596	mutex_unlock(&oom_lock);
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	597
				598	wait_event(oom_victims_wait, !atomic_read(&oom_victims));
				599
				600	return true;
				601	}
				602
				603	/**
				604	* oom_killer_enable - enable OOM killer
				605	*/
				606	void oom_killer_enable(void)
				607	{
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	608	oom_killer_disabled = false;
Michal Hocko	49550b6	2015-02-11 15:26:12 -0800	[diff] [blame]	609	}
				610
Oleg Nesterov	4d7b339	2015-11-05 18:48:26 -0800	[diff] [blame]	611	/*
				612	* task->mm can be NULL if the task is the exited group leader. So to
				613	* determine whether the task is using a particular mm, we examine all the
				614	* task's threads: if one of those is using this mm then this task was also
				615	* using it.
				616	*/
				617	static bool process_shares_mm(struct task_struct p, struct mm_struct mm)
				618	{
				619	struct task_struct *t;
				620
				621	for_each_thread(p, t) {
				622	struct mm_struct *t_mm = READ_ONCE(t->mm);
				623	if (t_mm)
				624	return t_mm == mm;
				625	}
				626	return false;
				627	}
				628
KOSAKI Motohiro	3b4798c	2009-12-15 16:45:32 -0800	[diff] [blame]	629	#define K(x) ((x) << (PAGE_SHIFT-10))
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	630	/*
				631	* Must be called while holding a reference to p, which will be released upon
				632	* returning.
				633	*/
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	634	void oom_kill_process(struct oom_control oc, struct task_struct p,
David Rientjes	9cbb78b	2012-07-31 16:43:44 -0700	[diff] [blame]	635	unsigned int points, unsigned long totalpages,
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	636	struct mem_cgroup memcg, const char message)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	637	{
Linus Torvalds	52d3c03	2011-03-14 15:17:07 -0700	[diff] [blame]	638	struct task_struct *victim = p;
David Rientjes	5e9d834	2010-08-09 17:18:51 -0700	[diff] [blame]	639	struct task_struct *child;
Oleg Nesterov	1da4db0	2014-01-21 15:49:58 -0800	[diff] [blame]	640	struct task_struct *t;
David Rientjes	647f2bd	2012-03-21 16:33:46 -0700	[diff] [blame]	641	struct mm_struct *mm;
Linus Torvalds	52d3c03	2011-03-14 15:17:07 -0700	[diff] [blame]	642	unsigned int victim_points = 0;
David Rientjes	dc3f21e	2012-03-21 16:33:47 -0700	[diff] [blame]	643	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
				644	DEFAULT_RATELIMIT_BURST);
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame^]	645	bool can_oom_reap = true;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	646
Nick Piggin	50ec3bb	2006-09-25 23:31:29 -0700	[diff] [blame]	647	/*
				648	* If the task is already exiting, don't alarm the sysadmin or kill
				649	* its children or threads, just set TIF_MEMDIE so it can die quickly
				650	*/
Michal Hocko	83363b9	2015-02-11 15:24:56 -0800	[diff] [blame]	651	task_lock(p);
				652	if (p->mm && task_will_free_mem(p)) {
Johannes Weiner	16e9519	2015-06-24 16:57:07 -0700	[diff] [blame]	653	mark_oom_victim(p);
Michal Hocko	83363b9	2015-02-11 15:24:56 -0800	[diff] [blame]	654	task_unlock(p);
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	655	put_task_struct(p);
David Rientjes	2a1c9b1	2012-03-21 16:33:46 -0700	[diff] [blame]	656	return;
Nick Piggin	50ec3bb	2006-09-25 23:31:29 -0700	[diff] [blame]	657	}
Michal Hocko	83363b9	2015-02-11 15:24:56 -0800	[diff] [blame]	658	task_unlock(p);
Nick Piggin	50ec3bb	2006-09-25 23:31:29 -0700	[diff] [blame]	659
David Rientjes	dc3f21e	2012-03-21 16:33:47 -0700	[diff] [blame]	660	if (__ratelimit(&oom_rs))
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	661	dump_header(oc, p, memcg);
David Rientjes	8447d95	2012-03-21 16:33:47 -0700	[diff] [blame]	662
Wang Long	f0d6647	2015-06-24 16:58:01 -0700	[diff] [blame]	663	pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
David Rientjes	5e9d834	2010-08-09 17:18:51 -0700	[diff] [blame]	664	message, task_pid_nr(p), p->comm, points);
Nick Piggin	f3af38d	2006-12-06 20:31:51 -0800	[diff] [blame]	665
David Rientjes	5e9d834	2010-08-09 17:18:51 -0700	[diff] [blame]	666	/*
				667	* If any of p's children has a different mm and is eligible for kill,
David Rientjes	1123983	2011-07-25 17:12:17 -0700	[diff] [blame]	668	* the one with the highest oom_badness() score is sacrificed for its
David Rientjes	5e9d834	2010-08-09 17:18:51 -0700	[diff] [blame]	669	* parent. This attempts to lose the minimal amount of work done while
				670	* still freeing memory.
				671	*/
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	672	read_lock(&tasklist_lock);
Oleg Nesterov	1da4db0	2014-01-21 15:49:58 -0800	[diff] [blame]	673	for_each_thread(p, t) {
David Rientjes	5e9d834	2010-08-09 17:18:51 -0700	[diff] [blame]	674	list_for_each_entry(child, &t->children, sibling) {
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	675	unsigned int child_points;
David Rientjes	5e9d834	2010-08-09 17:18:51 -0700	[diff] [blame]	676
Oleg Nesterov	4d7b339	2015-11-05 18:48:26 -0800	[diff] [blame]	677	if (process_shares_mm(child, p->mm))
David Rientjes	edd4554	2011-03-22 16:30:12 -0700	[diff] [blame]	678	continue;
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	679	/*
				680	* oom_badness() returns 0 if the thread is unkillable
				681	*/
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	682	child_points = oom_badness(child, memcg, oc->nodemask,
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	683	totalpages);
David Rientjes	5e9d834	2010-08-09 17:18:51 -0700	[diff] [blame]	684	if (child_points > victim_points) {
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	685	put_task_struct(victim);
David Rientjes	5e9d834	2010-08-09 17:18:51 -0700	[diff] [blame]	686	victim = child;
				687	victim_points = child_points;
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	688	get_task_struct(victim);
David Rientjes	5e9d834	2010-08-09 17:18:51 -0700	[diff] [blame]	689	}
Oleg Nesterov	dd8e8f4	2010-08-09 17:18:45 -0700	[diff] [blame]	690	}
Oleg Nesterov	1da4db0	2014-01-21 15:49:58 -0800	[diff] [blame]	691	}
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	692	read_unlock(&tasklist_lock);
Oleg Nesterov	dd8e8f4	2010-08-09 17:18:45 -0700	[diff] [blame]	693
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	694	p = find_lock_task_mm(victim);
				695	if (!p) {
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	696	put_task_struct(victim);
David Rientjes	647f2bd	2012-03-21 16:33:46 -0700	[diff] [blame]	697	return;
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	698	} else if (victim != p) {
				699	get_task_struct(p);
				700	put_task_struct(victim);
				701	victim = p;
				702	}
David Rientjes	647f2bd	2012-03-21 16:33:46 -0700	[diff] [blame]	703
Tetsuo Handa	880b7689	2015-11-05 18:47:51 -0800	[diff] [blame]	704	/* Get a reference to safely compare mm after task_unlock(victim) */
David Rientjes	647f2bd	2012-03-21 16:33:46 -0700	[diff] [blame]	705	mm = victim->mm;
Tetsuo Handa	880b7689	2015-11-05 18:47:51 -0800	[diff] [blame]	706	atomic_inc(&mm->mm_count);
Tetsuo Handa	426fb5e	2015-11-05 18:47:44 -0800	[diff] [blame]	707	/*
				708	* We should send SIGKILL before setting TIF_MEMDIE in order to prevent
				709	* the OOM victim from depleting the memory reserves from the user
				710	* space under its control.
				711	*/
				712	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
Johannes Weiner	16e9519	2015-06-24 16:57:07 -0700	[diff] [blame]	713	mark_oom_victim(victim);
Jerome Marchand	eca56ff	2016-01-14 15:19:26 -0800	[diff] [blame]	714	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
David Rientjes	647f2bd	2012-03-21 16:33:46 -0700	[diff] [blame]	715	task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
				716	K(get_mm_counter(victim->mm, MM_ANONPAGES)),
Jerome Marchand	eca56ff	2016-01-14 15:19:26 -0800	[diff] [blame]	717	K(get_mm_counter(victim->mm, MM_FILEPAGES)),
				718	K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
David Rientjes	647f2bd	2012-03-21 16:33:46 -0700	[diff] [blame]	719	task_unlock(victim);
				720
				721	/*
				722	* Kill all user processes sharing victim->mm in other thread groups, if
				723	* any. They don't get access to memory reserves, though, to avoid
				724	* depletion of all memory. This prevents mm->mmap_sem livelock when an
				725	* oom killed thread cannot exit because it requires the semaphore and
				726	* its contended by another thread trying to allocate memory itself.
				727	* That thread will now get access to memory reserves since it has a
				728	* pending fatal signal.
				729	*/
Oleg Nesterov	4d4048b	2014-01-21 15:50:01 -0800	[diff] [blame]	730	rcu_read_lock();
Oleg Nesterov	c319025	2015-11-05 18:48:23 -0800	[diff] [blame]	731	for_each_process(p) {
Oleg Nesterov	4d7b339	2015-11-05 18:48:26 -0800	[diff] [blame]	732	if (!process_shares_mm(p, mm))
Oleg Nesterov	c319025	2015-11-05 18:48:23 -0800	[diff] [blame]	733	continue;
				734	if (same_thread_group(p, victim))
				735	continue;
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame^]	736	if (unlikely(p->flags & PF_KTHREAD) \|\| is_global_init(p) \|\|
				737	p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
				738	/*
				739	* We cannot use oom_reaper for the mm shared by this
				740	* process because it wouldn't get killed and so the
				741	* memory might be still used.
				742	*/
				743	can_oom_reap = false;
Oleg Nesterov	c319025	2015-11-05 18:48:23 -0800	[diff] [blame]	744	continue;
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame^]	745	}
Oleg Nesterov	c319025	2015-11-05 18:48:23 -0800	[diff] [blame]	746	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
				747	}
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	748	rcu_read_unlock();
David Rientjes	647f2bd	2012-03-21 16:33:46 -0700	[diff] [blame]	749
Michal Hocko	aac4536	2016-03-25 14:20:24 -0700	[diff] [blame^]	750	if (can_oom_reap)
				751	wake_oom_reaper(mm);
				752
Tetsuo Handa	880b7689	2015-11-05 18:47:51 -0800	[diff] [blame]	753	mmdrop(mm);
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	754	put_task_struct(victim);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	755	}
David Rientjes	647f2bd	2012-03-21 16:33:46 -0700	[diff] [blame]	756	#undef K
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	757
David Rientjes	309ed88	2010-08-09 17:18:54 -0700	[diff] [blame]	758	/*
				759	* Determines whether the kernel must panic because of the panic_on_oom sysctl.
				760	*/
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	761	void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
Balasubramani Vivekanandan	2415b9f	2015-04-14 15:48:18 -0700	[diff] [blame]	762	struct mem_cgroup *memcg)
David Rientjes	309ed88	2010-08-09 17:18:54 -0700	[diff] [blame]	763	{
				764	if (likely(!sysctl_panic_on_oom))
				765	return;
				766	if (sysctl_panic_on_oom != 2) {
				767	/*
				768	* panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
				769	* does not panic for cpuset, mempolicy, or memcg allocation
				770	* failures.
				771	*/
				772	if (constraint != CONSTRAINT_NONE)
				773	return;
				774	}
David Rientjes	071a4be	2015-09-08 15:00:42 -0700	[diff] [blame]	775	/* Do not panic for oom kills triggered by sysrq */
Yaowei Bai	db2a0dd	2015-11-06 16:28:06 -0800	[diff] [blame]	776	if (is_sysrq_oom(oc))
David Rientjes	071a4be	2015-09-08 15:00:42 -0700	[diff] [blame]	777	return;
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	778	dump_header(oc, NULL, memcg);
David Rientjes	309ed88	2010-08-09 17:18:54 -0700	[diff] [blame]	779	panic("Out of memory: %s panic_on_oom is enabled\n",
				780	sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
				781	}
				782
Martin Schwidefsky	8bc719d	2006-09-25 23:31:20 -0700	[diff] [blame]	783	static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
				784
				785	int register_oom_notifier(struct notifier_block *nb)
				786	{
				787	return blocking_notifier_chain_register(&oom_notify_list, nb);
				788	}
				789	EXPORT_SYMBOL_GPL(register_oom_notifier);
				790
				791	int unregister_oom_notifier(struct notifier_block *nb)
				792	{
				793	return blocking_notifier_chain_unregister(&oom_notify_list, nb);
				794	}
				795	EXPORT_SYMBOL_GPL(unregister_oom_notifier);
				796
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	797	/**
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	798	* out_of_memory - kill the "best" process when we run out of memory
				799	* @oc: pointer to struct oom_control
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	800	*
				801	* If we run out of memory, we have the choice between either
				802	* killing a random task (bad), letting the system crash (worse)
				803	* OR try to be smart about which process to kill. Note that we
				804	* don't have to be perfect here, we just have to be good.
				805	*/
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	806	bool out_of_memory(struct oom_control *oc)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	807	{
David Rientjes	0aad4b3	2010-08-09 17:18:59 -0700	[diff] [blame]	808	struct task_struct *p;
David Rientjes	a63d83f	2010-08-09 17:19:46 -0700	[diff] [blame]	809	unsigned long totalpages;
Martin Schwidefsky	8bc719d	2006-09-25 23:31:20 -0700	[diff] [blame]	810	unsigned long freed = 0;
David Rientjes	9cbb78b	2012-07-31 16:43:44 -0700	[diff] [blame]	811	unsigned int uninitialized_var(points);
David Rientjes	e365893	2010-08-09 17:18:55 -0700	[diff] [blame]	812	enum oom_constraint constraint = CONSTRAINT_NONE;
Martin Schwidefsky	8bc719d	2006-09-25 23:31:20 -0700	[diff] [blame]	813
Johannes Weiner	dc56401	2015-06-24 16:57:19 -0700	[diff] [blame]	814	if (oom_killer_disabled)
				815	return false;
				816
Martin Schwidefsky	8bc719d	2006-09-25 23:31:20 -0700	[diff] [blame]	817	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
				818	if (freed > 0)
				819	/* Got some memory back in the last second. */
David Rientjes	75e8f8b	2015-09-08 15:00:47 -0700	[diff] [blame]	820	return true;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	821
David Rientjes	7b98c2e	2010-08-09 17:18:48 -0700	[diff] [blame]	822	/*
David Rientjes	9ff4868	2012-12-11 16:01:30 -0800	[diff] [blame]	823	* If current has a pending SIGKILL or is exiting, then automatically
				824	* select it. The goal is to allow it to allocate so that it may
				825	* quickly exit and free its memory.
Tetsuo Handa	d7a94e7	2015-02-11 15:24:54 -0800	[diff] [blame]	826	*
				827	* But don't select if current has already released its mm and cleared
				828	* TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur.
David Rientjes	7b98c2e	2010-08-09 17:18:48 -0700	[diff] [blame]	829	*/
Tetsuo Handa	d7a94e7	2015-02-11 15:24:54 -0800	[diff] [blame]	830	if (current->mm &&
				831	(fatal_signal_pending(current) \|\| task_will_free_mem(current))) {
Johannes Weiner	16e9519	2015-06-24 16:57:07 -0700	[diff] [blame]	832	mark_oom_victim(current);
David Rientjes	75e8f8b	2015-09-08 15:00:47 -0700	[diff] [blame]	833	return true;
David Rientjes	7b98c2e	2010-08-09 17:18:48 -0700	[diff] [blame]	834	}
				835
Christoph Lameter	9b0f8b0	2006-02-20 18:27:52 -0800	[diff] [blame]	836	/*
				837	* Check if there were limitations on the allocation (only relevant for
				838	* NUMA) that may require different handling.
				839	*/
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	840	constraint = constrained_alloc(oc, &totalpages);
				841	if (constraint != CONSTRAINT_MEMORY_POLICY)
				842	oc->nodemask = NULL;
				843	check_panic_on_oom(oc, constraint, NULL);
David Rientjes	0aad4b3	2010-08-09 17:18:59 -0700	[diff] [blame]	844
David Rientjes	121d1ba	2012-07-31 16:42:55 -0700	[diff] [blame]	845	if (sysctl_oom_kill_allocating_task && current->mm &&
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	846	!oom_unkillable_task(current, NULL, oc->nodemask) &&
David Rientjes	121d1ba	2012-07-31 16:42:55 -0700	[diff] [blame]	847	current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
David Rientjes	6b0c81b	2012-07-31 16:43:45 -0700	[diff] [blame]	848	get_task_struct(current);
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	849	oom_kill_process(oc, current, 0, totalpages, NULL,
David Rientjes	2a1c9b1	2012-03-21 16:33:46 -0700	[diff] [blame]	850	"Out of memory (oom_kill_allocating_task)");
David Rientjes	75e8f8b	2015-09-08 15:00:47 -0700	[diff] [blame]	851	return true;
David Rientjes	0aad4b3	2010-08-09 17:18:59 -0700	[diff] [blame]	852	}
				853
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	854	p = select_bad_process(oc, &points, totalpages);
David Rientjes	0aad4b3	2010-08-09 17:18:59 -0700	[diff] [blame]	855	/* Found nothing?!?! Either we hang forever, or we panic. */
Yaowei Bai	db2a0dd	2015-11-06 16:28:06 -0800	[diff] [blame]	856	if (!p && !is_sysrq_oom(oc)) {
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	857	dump_header(oc, NULL, NULL);
David Rientjes	0aad4b3	2010-08-09 17:18:59 -0700	[diff] [blame]	858	panic("Out of memory and no killable processes...\n");
				859	}
David Rientjes	071a4be	2015-09-08 15:00:42 -0700	[diff] [blame]	860	if (p && p != (void *)-1UL) {
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	861	oom_kill_process(oc, p, points, totalpages, NULL,
				862	"Out of memory");
David Rientjes	75e8f8b	2015-09-08 15:00:47 -0700	[diff] [blame]	863	/*
				864	* Give the killed process a good chance to exit before trying
				865	* to allocate memory again.
				866	*/
David Rientjes	4f774b9	2012-07-31 16:42:37 -0700	[diff] [blame]	867	schedule_timeout_killable(1);
David Rientjes	75e8f8b	2015-09-08 15:00:47 -0700	[diff] [blame]	868	}
Johannes Weiner	dc56401	2015-06-24 16:57:19 -0700	[diff] [blame]	869	return true;
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	870	}
				871
David Rientjes	e365893	2010-08-09 17:18:55 -0700	[diff] [blame]	872	/*
				873	* The pagefault handler calls here because it is out of memory, so kill a
David Rientjes	efacd02	2012-12-12 13:52:06 -0800	[diff] [blame]	874	* memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a
				875	* parallel oom killing is already in progress so do nothing.
David Rientjes	e365893	2010-08-09 17:18:55 -0700	[diff] [blame]	876	*/
				877	void pagefault_out_of_memory(void)
				878	{
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	879	struct oom_control oc = {
				880	.zonelist = NULL,
				881	.nodemask = NULL,
				882	.gfp_mask = 0,
				883	.order = 0,
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	884	};
				885
Johannes Weiner	4942642	2013-10-16 13:46:59 -0700	[diff] [blame]	886	if (mem_cgroup_oom_synchronize(true))
Johannes Weiner	dc56401	2015-06-24 16:57:19 -0700	[diff] [blame]	887	return;
Johannes Weiner	3812c8c	2013-09-12 15:13:44 -0700	[diff] [blame]	888
Johannes Weiner	dc56401	2015-06-24 16:57:19 -0700	[diff] [blame]	889	if (!mutex_trylock(&oom_lock))
				890	return;
Michal Hocko	c32b3cb	2015-02-11 15:26:24 -0800	[diff] [blame]	891
David Rientjes	6e0fc46	2015-09-08 15:00:36 -0700	[diff] [blame]	892	if (!out_of_memory(&oc)) {
Johannes Weiner	dc56401	2015-06-24 16:57:19 -0700	[diff] [blame]	893	/*
				894	* There shouldn't be any user tasks runnable while the
				895	* OOM killer is disabled, so the current task has to
				896	* be a racing OOM victim for which oom_killer_disable()
				897	* is waiting for.
				898	*/
				899	WARN_ON(test_thread_flag(TIF_MEMDIE));
David Rientjes	e365893	2010-08-09 17:18:55 -0700	[diff] [blame]	900	}
Johannes Weiner	dc56401	2015-06-24 16:57:19 -0700	[diff] [blame]	901
				902	mutex_unlock(&oom_lock);
David Rientjes	e365893	2010-08-09 17:18:55 -0700	[diff] [blame]	903	}