mm/vmscan: don't mess with pgdat->flags in memcg reclaim
memcg reclaim may alter pgdat->flags based on the state of the LRU lists
in the cgroup and its children. PGDAT_WRITEBACK may force kswapd to sleep
in congestion_wait(), PGDAT_DIRTY may force kswapd to write back filesystem
pages. But the worst one is PGDAT_CONGESTED, since it may force all direct
reclaims to stall in wait_iff_congested(). Note that only kswapd has the
power to clear any of these bits. This might simply never happen if the
cgroup limits are configured that way. So all direct reclaims will stall
as long as we have some congested bdi in the system.
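To make that concrete, the pre-patch flow in shrink_node() is roughly the
condensed sketch below (simplified from the code removed and changed in
the hunks further down, not a verbatim quote):

  /* sketch: pre-patch shrink_node(), simplified */
  if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
          /* any reclaimer, memcg or global, may tag the whole node */
          set_bit(PGDAT_CONGESTED, &pgdat->flags);

  if (!sc->hibernation_mode && !current_is_kswapd() && current_may_throttle())
          /* direct reclaimers keep sleeping here while the node-wide
           * bit stays set, and only kswapd ever clears it */
          wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10);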
Leave all pgdat->flags manipulations to kswapd. kswapd scans the whole
pgdat, and only kswapd can clear pgdat->flags once the node is balanced,
so it's reasonable to leave all decisions about the node's state to kswapd.
Why only kswapd? Why not allow global direct reclaim to change these
flags as well? Because currently only kswapd can clear these flags. I'm
less worried about the case where PGDAT_CONGESTED is falsely left unset,
and more worried about the case where it is falsely set. If a direct
reclaimer sets PGDAT_CONGESTED, do we have a guarantee that kswapd will
be woken up and clear the flag after the congestion problem is sorted
out? There seems to be no such guarantee, e.g. direct reclaimers may
eventually balance the pgdat and kswapd will simply never wake up (see
wakeup_kswapd()).
Moving the pgdat->flags manipulation to kswapd means that cgroup2
reclaim now loses its congestion throttling mechanism. Add a per-cgroup
congestion state and throttle cgroup2 reclaimers if the memcg is in a
congested state.
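The state itself is one bool per memcg per node. The
include/linux/memcontrol.h side of the change is not quoted in the hunks
below; roughly (the field name comes from the new vmscan.c accessors,
placement and comment wording are illustrative), struct
mem_cgroup_per_node grows a flag along these lines:

  struct mem_cgroup_per_node {
          /* ... existing fields ... */
          bool            congested;      /* memcg has many dirty pages
                                             backed by a congested BDI */
          /* ... */
  };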
Currently there is no need for per-cgroup PGDAT_WRITEBACK and PGDAT_DIRTY
bits since they only alter kswapd's behavior.
The problem can be easily demonstrated by creating heavy congestion in
one cgroup:
echo "+memory" > /sys/fs/cgroup/cgroup.subtree_control
mkdir -p /sys/fs/cgroup/congester
echo 512M > /sys/fs/cgroup/congester/memory.max
echo $$ > /sys/fs/cgroup/congester/cgroup.procs
/* generate a lot of dirty data on a slow HDD */
while true; do dd if=/dev/zero of=/mnt/sdb/zeroes bs=1M count=1024; done &
....
while true; do dd if=/dev/zero of=/mnt/sdb/zeroes bs=1M count=1024; done &
and some job in another cgroup:
mkdir /sys/fs/cgroup/victim
echo 128M > /sys/fs/cgroup/victim/memory.max
# time cat /dev/sda > /dev/null
real 10m15.054s
user 0m0.487s
sys 1m8.505s
According to the tracepoint in wait_iff_congested(), 'cat' spent 50% of
its time sleeping there.
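For reference, that sleep time can be watched with ftrace; this assumes
tracefs is mounted under /sys/kernel/debug and that the event fired by
wait_iff_congested() is named writeback_wait_iff_congested (names may
differ between kernel versions):

  # echo 1 > /sys/kernel/debug/tracing/events/writeback/writeback_wait_iff_congested/enable
  # cat /sys/kernel/debug/tracing/trace_pipe

Each event line reports usec_timeout= and usec_delayed= values; summing
usec_delayed over the run gives the total time spent sleeping in
wait_iff_congested().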
With the patch, 'cat' doesn't waste time there anymore:
# time cat /dev/sda > /dev/null
real 5m32.911s
user 0m0.411s
sys 0m56.664s
[aryabinin@virtuozzo.com: congestion state should be per-node]
Link: http://lkml.kernel.org/r/20180406135215.10057-1-aryabinin@virtuozzo.com
[aryabinin@virtuozzo.com: make congestion state per-cgroup-per-node instead of just per-cgroup]
Link: http://lkml.kernel.org/r/20180406180254.8970-2-aryabinin@virtuozzo.com
Link: http://lkml.kernel.org/r/20180323152029.11084-5-aryabinin@virtuozzo.com
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Tejun Heo <tj@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1ecc648..e411385 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -200,6 +200,29 @@ static bool sane_reclaim(struct scan_control *sc)
 #endif
 	return false;
 }
+
+static void set_memcg_congestion(pg_data_t *pgdat,
+				struct mem_cgroup *memcg,
+				bool congested)
+{
+	struct mem_cgroup_per_node *mn;
+
+	if (!memcg)
+		return;
+
+	mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+	WRITE_ONCE(mn->congested, congested);
+}
+
+static bool memcg_congested(pg_data_t *pgdat,
+			struct mem_cgroup *memcg)
+{
+	struct mem_cgroup_per_node *mn;
+
+	mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+	return READ_ONCE(mn->congested);
+
+}
 #else
 static bool global_reclaim(struct scan_control *sc)
 {
@@ -210,6 +233,18 @@ static bool sane_reclaim(struct scan_control *sc)
 {
 	return true;
 }
+
+static inline void set_memcg_congestion(struct pglist_data *pgdat,
+				struct mem_cgroup *memcg, bool congested)
+{
+}
+
+static inline bool memcg_congested(struct pglist_data *pgdat,
+				struct mem_cgroup *memcg)
+{
+	return false;
+
+}
 #endif
 
 /*
@@ -2474,6 +2509,12 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
 	return true;
 }
 
+static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
+{
+	return test_bit(PGDAT_CONGESTED, &pgdat->flags) ||
+		(memcg && memcg_congested(pgdat, memcg));
+}
+
 static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 {
 	struct reclaim_state *reclaim_state = current->reclaim_state;
@@ -2556,29 +2597,27 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 		if (sc->nr_reclaimed - nr_reclaimed)
 			reclaimable = true;
 
-		/*
-		 * If reclaim is isolating dirty pages under writeback, it
-		 * implies that the long-lived page allocation rate is exceeding
-		 * the page laundering rate. Either the global limits are not
-		 * being effective at throttling processes due to the page
-		 * distribution throughout zones or there is heavy usage of a
-		 * slow backing device. The only option is to throttle from
-		 * reclaim context which is not ideal as there is no guarantee
-		 * the dirtying process is throttled in the same way
-		 * balance_dirty_pages() manages.
-		 *
-		 * Once a node is flagged PGDAT_WRITEBACK, kswapd will count the
-		 * number of pages under pages flagged for immediate reclaim and
-		 * stall if any are encountered in the nr_immediate check below.
-		 */
-		if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
-			set_bit(PGDAT_WRITEBACK, &pgdat->flags);
+		if (current_is_kswapd()) {
+			/*
+			 * If reclaim is isolating dirty pages under writeback,
+			 * it implies that the long-lived page allocation rate
+			 * is exceeding the page laundering rate. Either the
+			 * global limits are not being effective at throttling
+			 * processes due to the page distribution throughout
+			 * zones or there is heavy usage of a slow backing
+			 * device. The only option is to throttle from reclaim
+			 * context which is not ideal as there is no guarantee
+			 * the dirtying process is throttled in the same way
+			 * balance_dirty_pages() manages.
+			 *
+			 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
+			 * count the number of pages under pages flagged for
+			 * immediate reclaim and stall if any are encountered
+			 * in the nr_immediate check below.
+			 */
+			if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
+				set_bit(PGDAT_WRITEBACK, &pgdat->flags);
 
-		/*
-		 * Legacy memcg will stall in page writeback so avoid forcibly
-		 * stalling here.
-		 */
-		if (sane_reclaim(sc)) {
 			/*
 			 * Tag a node as congested if all the dirty pages
 			 * scanned were backed by a congested BDI and
@@ -2602,14 +2641,22 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 		}
 
 		/*
+		 * Legacy memcg will stall in page writeback so avoid forcibly
+		 * stalling in wait_iff_congested().
+		 */
+		if (!global_reclaim(sc) && sane_reclaim(sc) &&
+		    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
+			set_memcg_congestion(pgdat, root, true);
+
+		/*
 		 * Stall direct reclaim for IO completions if underlying BDIs
 		 * and node is congested. Allow kswapd to continue until it
 		 * starts encountering unqueued dirty pages or cycling through
 		 * the LRU too quickly.
 		 */
 		if (!sc->hibernation_mode && !current_is_kswapd() &&
-		    current_may_throttle())
-			wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10);
+		    current_may_throttle() && pgdat_memcg_congested(pgdat, root))
+			wait_iff_congested(BLK_RW_ASYNC, HZ/10);
 
 	} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
 					 sc->nr_scanned - nr_scanned, sc));
@@ -2826,6 +2873,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 			continue;
 		last_pgdat = zone->zone_pgdat;
 		snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
+		set_memcg_congestion(last_pgdat, sc->target_mem_cgroup, false);
 	}
 
 	delayacct_freepages_end();