mm: vmscan: reduce size of inactive file list
The inactive file list should still be large enough to contain readahead
windows and freshly written file data, but it no longer is the only
source for detecting multiple accesses to file pages. The workingset
refault measurement code causes recently evicted file pages that get
accessed again after a shorter interval to be promoted directly to the
active list.
With that mechanism in place, we can afford to (on a larger system)
dedicate more memory to the active file list, so we can actually cache
more of the frequently used file pages in memory, and not have them
pushed out by streaming writes, once-used streaming file reads, etc.
This can help things like database workloads, where only half the page
cache can currently be used to cache the database working set. This
patch automatically increases that fraction on larger systems, using the
same ratio that has already been used for anonymous memory.
[hannes@cmpxchg.org: cgroup-awareness]
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Andres Freund <andres@anarazel.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dcfdfc1..38d6d06 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1862,83 +1862,63 @@
free_hot_cold_page_list(&l_hold, true);
}
-#ifdef CONFIG_SWAP
-static bool inactive_anon_is_low_global(struct zone *zone)
-{
- unsigned long active, inactive;
-
- active = zone_page_state(zone, NR_ACTIVE_ANON);
- inactive = zone_page_state(zone, NR_INACTIVE_ANON);
-
- return inactive * zone->inactive_ratio < active;
-}
-
-/**
- * inactive_anon_is_low - check if anonymous pages need to be deactivated
- * @lruvec: LRU vector to check
+/*
+ * The inactive anon list should be small enough that the VM never has
+ * to do too much work.
*
- * Returns true if the zone does not have enough inactive anon pages,
- * meaning some active anon pages need to be deactivated.
+ * The inactive file list should be small enough to leave most memory
+ * to the established workingset on the scan-resistant active list,
+ * but large enough to avoid thrashing the aggregate readahead window.
+ *
+ * Both inactive lists should also be large enough that each inactive
+ * page has a chance to be referenced again before it is reclaimed.
+ *
+ * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
+ * on this LRU, maintained by the pageout code. A zone->inactive_ratio
+ * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
+ *
+ * total target max
+ * memory ratio inactive
+ * -------------------------------------
+ * 10MB 1 5MB
+ * 100MB 1 50MB
+ * 1GB 3 250MB
+ * 10GB 10 0.9GB
+ * 100GB 31 3GB
+ * 1TB 101 10GB
+ * 10TB 320 32GB
*/
-static bool inactive_anon_is_low(struct lruvec *lruvec)
+static bool inactive_list_is_low(struct lruvec *lruvec, bool file)
{
+ unsigned long inactive_ratio;
+ unsigned long inactive;
+ unsigned long active;
+ unsigned long gb;
+
/*
* If we don't have swap space, anonymous page deactivation
* is pointless.
*/
- if (!total_swap_pages)
+ if (!file && !total_swap_pages)
return false;
- if (!mem_cgroup_disabled())
- return mem_cgroup_inactive_anon_is_low(lruvec);
+ inactive = lruvec_lru_size(lruvec, file * LRU_FILE);
+ active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
- return inactive_anon_is_low_global(lruvec_zone(lruvec));
-}
-#else
-static inline bool inactive_anon_is_low(struct lruvec *lruvec)
-{
- return false;
-}
-#endif
-
-/**
- * inactive_file_is_low - check if file pages need to be deactivated
- * @lruvec: LRU vector to check
- *
- * When the system is doing streaming IO, memory pressure here
- * ensures that active file pages get deactivated, until more
- * than half of the file pages are on the inactive list.
- *
- * Once we get to that situation, protect the system's working
- * set from being evicted by disabling active file page aging.
- *
- * This uses a different ratio than the anonymous pages, because
- * the page cache uses a use-once replacement algorithm.
- */
-static bool inactive_file_is_low(struct lruvec *lruvec)
-{
- unsigned long inactive;
- unsigned long active;
-
- inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
- active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
-
- return active > inactive;
-}
-
-static bool inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
-{
- if (is_file_lru(lru))
- return inactive_file_is_low(lruvec);
+ gb = (inactive + active) >> (30 - PAGE_SHIFT);
+ if (gb)
+ inactive_ratio = int_sqrt(10 * gb);
else
- return inactive_anon_is_low(lruvec);
+ inactive_ratio = 1;
+
+ return inactive * inactive_ratio < active;
}
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
struct lruvec *lruvec, struct scan_control *sc)
{
if (is_active_lru(lru)) {
- if (inactive_list_is_low(lruvec, lru))
+ if (inactive_list_is_low(lruvec, is_file_lru(lru)))
shrink_active_list(nr_to_scan, lruvec, sc, lru);
return 0;
}
@@ -2059,7 +2039,7 @@
* lruvec even if it has plenty of old anonymous pages unless the
* system is under heavy pressure.
*/
- if (!inactive_file_is_low(lruvec) &&
+ if (!inactive_list_is_low(lruvec, true) &&
lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
@@ -2301,7 +2281,7 @@
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_anon_is_low(lruvec))
+ if (inactive_list_is_low(lruvec, false))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -2962,7 +2942,7 @@
do {
struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
- if (inactive_anon_is_low(lruvec))
+ if (inactive_list_is_low(lruvec, false))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);