mm: throttle direct reclaimers if PF_MEMALLOC reserves are low and swap is backed by network storage
If swap is backed by network storage such as NBD, there is a risk that a
large number of reclaimers can hang the system by consuming all
PF_MEMALLOC reserves. To avoid these hangs, the administrator must tune
min_free_kbytes in advance which is a bit fragile.
This patch throttles direct reclaimers if half the PF_MEMALLOC reserves
are in use. If the system is routinely getting throttled the system
administrator can increase min_free_kbytes so degradation is smoother but
the system will keep running.
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: David Miller <davem@davemloft.net>
Cc: Neil Brown <neilb@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ef2d1e7..48aee0f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4389,6 +4389,7 @@
pgdat_resize_init(pgdat);
pgdat->nr_zones = 0;
init_waitqueue_head(&pgdat->kswapd_wait);
+ init_waitqueue_head(&pgdat->pfmemalloc_wait);
pgdat->kswapd_max_order = 0;
pgdat_page_cgroup_init(pgdat);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6b1f89a..021a44a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2112,6 +2112,80 @@
return 0;
}
+static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+{
+ struct zone *zone;
+ unsigned long pfmemalloc_reserve = 0;
+ unsigned long free_pages = 0;
+ int i;
+ bool wmark_ok;
+
+ for (i = 0; i <= ZONE_NORMAL; i++) {
+ zone = &pgdat->node_zones[i];
+ pfmemalloc_reserve += min_wmark_pages(zone);
+ free_pages += zone_page_state(zone, NR_FREE_PAGES);
+ }
+
+ wmark_ok = free_pages > pfmemalloc_reserve / 2;
+
+ /* kswapd must be awake if processes are being throttled */
+ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
+ pgdat->classzone_idx = min(pgdat->classzone_idx,
+ (enum zone_type)ZONE_NORMAL);
+ wake_up_interruptible(&pgdat->kswapd_wait);
+ }
+
+ return wmark_ok;
+}
+
+/*
+ * Throttle direct reclaimers if backing storage is backed by the network
+ * and the PFMEMALLOC reserve for the preferred node is getting dangerously
+ * depleted. kswapd will continue to make progress and wake the processes
+ * when the low watermark is reached
+ */
+static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+ nodemask_t *nodemask)
+{
+ struct zone *zone;
+ int high_zoneidx = gfp_zone(gfp_mask);
+ pg_data_t *pgdat;
+
+ /*
+ * Kernel threads should not be throttled as they may be indirectly
+ * responsible for cleaning pages necessary for reclaim to make forward
+ * progress. kjournald for example may enter direct reclaim while
+ * committing a transaction where throttling it could forcing other
+ * processes to block on log_wait_commit().
+ */
+ if (current->flags & PF_KTHREAD)
+ return;
+
+ /* Check if the pfmemalloc reserves are ok */
+ first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
+ pgdat = zone->zone_pgdat;
+ if (pfmemalloc_watermark_ok(pgdat))
+ return;
+
+ /*
+ * If the caller cannot enter the filesystem, it's possible that it
+ * is due to the caller holding an FS lock or performing a journal
+ * transaction in the case of a filesystem like ext[3|4]. In this case,
+ * it is not safe to block on pfmemalloc_wait as kswapd could be
+ * blocked waiting on the same lock. Instead, throttle for up to a
+ * second before continuing.
+ */
+ if (!(gfp_mask & __GFP_FS)) {
+ wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
+ pfmemalloc_watermark_ok(pgdat), HZ);
+ return;
+ }
+
+ /* Throttle until kswapd wakes the process */
+ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+ pfmemalloc_watermark_ok(pgdat));
+}
+
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *nodemask)
{
@@ -2131,6 +2205,15 @@
.gfp_mask = sc.gfp_mask,
};
+ throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
+
+ /*
+ * Do not enter reclaim if fatal signal is pending. 1 is returned so
+ * that the page allocator does not consider triggering OOM
+ */
+ if (fatal_signal_pending(current))
+ return 1;
+
trace_mm_vmscan_direct_reclaim_begin(order,
sc.may_writepage,
gfp_mask);
@@ -2275,8 +2358,13 @@
return balanced_pages >= (present_pages >> 2);
}
-/* is kswapd sleeping prematurely? */
-static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
+/*
+ * Prepare kswapd for sleeping. This verifies that there are no processes
+ * waiting in throttle_direct_reclaim() and that watermarks have been met.
+ *
+ * Returns true if kswapd is ready to sleep
+ */
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
int classzone_idx)
{
int i;
@@ -2285,7 +2373,21 @@
/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
if (remaining)
- return true;
+ return false;
+
+ /*
+ * There is a potential race between when kswapd checks its watermarks
+ * and a process gets throttled. There is also a potential race if
+ * processes get throttled, kswapd wakes, a large process exits therby
+ * balancing the zones that causes kswapd to miss a wakeup. If kswapd
+ * is going to sleep, no process should be sleeping on pfmemalloc_wait
+ * so wake them now if necessary. If necessary, processes will wake
+ * kswapd and get throttled again
+ */
+ if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
+ wake_up(&pgdat->pfmemalloc_wait);
+ return false;
+ }
/* Check the watermark levels */
for (i = 0; i <= classzone_idx; i++) {
@@ -2318,9 +2420,9 @@
* must be balanced
*/
if (order)
- return !pgdat_balanced(pgdat, balanced, classzone_idx);
+ return pgdat_balanced(pgdat, balanced, classzone_idx);
else
- return !all_zones_ok;
+ return all_zones_ok;
}
/*
@@ -2546,6 +2648,16 @@
}
}
+
+ /*
+ * If the low watermark is met there is no need for processes
+ * to be throttled on pfmemalloc_wait as they should not be
+ * able to safely make forward progress. Wake them
+ */
+ if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
+ pfmemalloc_watermark_ok(pgdat))
+ wake_up(&pgdat->pfmemalloc_wait);
+
if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
break; /* kswapd: all done */
/*
@@ -2647,7 +2759,7 @@
}
/*
- * Return the order we were reclaiming at so sleeping_prematurely()
+ * Return the order we were reclaiming at so prepare_kswapd_sleep()
* makes a decision on the order we were last reclaiming at. However,
* if another caller entered the allocator slow path while kswapd
* was awake, order will remain at the higher level
@@ -2667,7 +2779,7 @@
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
/* Try to sleep for a short interval */
- if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+ if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
remaining = schedule_timeout(HZ/10);
finish_wait(&pgdat->kswapd_wait, &wait);
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -2677,7 +2789,7 @@
* After a short sleep, check if it was a premature sleep. If not, then
* go fully to sleep until explicitly woken up.
*/
- if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+ if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
/*