SeongJae Park | 43b0536 | 2021-11-05 13:47:57 -0700 | [diff] [blame] | 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | /* |
| 3 | * DAMON-based page reclamation |
| 4 | * |
| 5 | * Author: SeongJae Park <sj@kernel.org> |
| 6 | */ |
| 7 | |
| 8 | #define pr_fmt(fmt) "damon-reclaim: " fmt |
| 9 | |
| 10 | #include <linux/damon.h> |
| 11 | #include <linux/ioport.h> |
| 12 | #include <linux/module.h> |
| 13 | #include <linux/sched.h> |
| 14 | #include <linux/workqueue.h> |
| 15 | |
| 16 | #ifdef MODULE_PARAM_PREFIX |
| 17 | #undef MODULE_PARAM_PREFIX |
| 18 | #endif |
| 19 | #define MODULE_PARAM_PREFIX "damon_reclaim." |
| 20 | |
/*
 * Enable or disable DAMON_RECLAIM.
 *
 * You can enable DAMON_RECLAIM by setting the value of this parameter as
 * ``Y``.  Setting it as ``N`` disables DAMON_RECLAIM.  Note that, even when
 * enabled, DAMON_RECLAIM may do no real monitoring and reclamation due to the
 * watermarks-based activation condition.  Refer to the descriptions of the
 * watermarks parameters below for this.
 */
| 30 | static bool enabled __read_mostly; |
| 31 | module_param(enabled, bool, 0600); |
| 32 | |
| 33 | /* |
| 34 | * Time threshold for cold memory regions identification in microseconds. |
| 35 | * |
| 36 | * If a memory region is not accessed for this or longer time, DAMON_RECLAIM |
| 37 | * identifies the region as cold, and reclaims. 120 seconds by default. |
| 38 | */ |
| 39 | static unsigned long min_age __read_mostly = 120000000; |
| 40 | module_param(min_age, ulong, 0600); |
| 41 | |
| 42 | /* |
| 43 | * Limit of time for trying the reclamation in milliseconds. |
| 44 | * |
| 45 | * DAMON_RECLAIM tries to use only up to this time within a time window |
| 46 | * (quota_reset_interval_ms) for trying reclamation of cold pages. This can be |
| 47 | * used for limiting CPU consumption of DAMON_RECLAIM. If the value is zero, |
| 48 | * the limit is disabled. |
| 49 | * |
| 50 | * 10 ms by default. |
| 51 | */ |
| 52 | static unsigned long quota_ms __read_mostly = 10; |
| 53 | module_param(quota_ms, ulong, 0600); |
| 54 | |
/*
 * Limit of size of memory for the reclamation in bytes.
 *
 * DAMON_RECLAIM charges the amount of memory which it tried to reclaim within
 * a time window (quota_reset_interval_ms) and makes sure that no more than
 * this limit is tried.  This can be used for limiting consumption of CPU and
 * IO.  If this value is zero, the limit is disabled.
 *
 * 128 MiB by default.
 */
| 65 | static unsigned long quota_sz __read_mostly = 128 * 1024 * 1024; |
| 66 | module_param(quota_sz, ulong, 0600); |
| 67 | |
| 68 | /* |
| 69 | * The time/size quota charge reset interval in milliseconds. |
| 70 | * |
| 71 | * The charge reset interval for the quota of time (quota_ms) and size |
| 72 | * (quota_sz). That is, DAMON_RECLAIM does not try reclamation for more than |
| 73 | * quota_ms milliseconds or quota_sz bytes within quota_reset_interval_ms |
| 74 | * milliseconds. |
| 75 | * |
| 76 | * 1 second by default. |
| 77 | */ |
| 78 | static unsigned long quota_reset_interval_ms __read_mostly = 1000; |
| 79 | module_param(quota_reset_interval_ms, ulong, 0600); |
| 80 | |
| 81 | /* |
| 82 | * The watermarks check time interval in microseconds. |
| 83 | * |
| 84 | * Minimal time to wait before checking the watermarks, when DAMON_RECLAIM is |
| 85 | * enabled but inactive due to its watermarks rule. 5 seconds by default. |
| 86 | */ |
| 87 | static unsigned long wmarks_interval __read_mostly = 5000000; |
| 88 | module_param(wmarks_interval, ulong, 0600); |
| 89 | |
| 90 | /* |
| 91 | * Free memory rate (per thousand) for the high watermark. |
| 92 | * |
| 93 | * If free memory of the system in bytes per thousand bytes is higher than |
| 94 | * this, DAMON_RECLAIM becomes inactive, so it does nothing but periodically |
| 95 | * checks the watermarks. 500 (50%) by default. |
| 96 | */ |
| 97 | static unsigned long wmarks_high __read_mostly = 500; |
| 98 | module_param(wmarks_high, ulong, 0600); |
| 99 | |
| 100 | /* |
| 101 | * Free memory rate (per thousand) for the middle watermark. |
| 102 | * |
| 103 | * If free memory of the system in bytes per thousand bytes is between this and |
| 104 | * the low watermark, DAMON_RECLAIM becomes active, so starts the monitoring |
| 105 | * and the reclaiming. 400 (40%) by default. |
| 106 | */ |
| 107 | static unsigned long wmarks_mid __read_mostly = 400; |
| 108 | module_param(wmarks_mid, ulong, 0600); |
| 109 | |
| 110 | /* |
| 111 | * Free memory rate (per thousand) for the low watermark. |
| 112 | * |
| 113 | * If free memory of the system in bytes per thousand bytes is lower than this, |
| 114 | * DAMON_RECLAIM becomes inactive, so it does nothing but periodically checks |
| 115 | * the watermarks. In the case, the system falls back to the LRU-based page |
| 116 | * granularity reclamation logic. 200 (20%) by default. |
| 117 | */ |
| 118 | static unsigned long wmarks_low __read_mostly = 200; |
| 119 | module_param(wmarks_low, ulong, 0600); |
| 120 | |
| 121 | /* |
| 122 | * Sampling interval for the monitoring in microseconds. |
| 123 | * |
| 124 | * The sampling interval of DAMON for the cold memory monitoring. Please refer |
| 125 | * to the DAMON documentation for more detail. 5 ms by default. |
| 126 | */ |
| 127 | static unsigned long sample_interval __read_mostly = 5000; |
| 128 | module_param(sample_interval, ulong, 0600); |
| 129 | |
| 130 | /* |
| 131 | * Aggregation interval for the monitoring in microseconds. |
| 132 | * |
| 133 | * The aggregation interval of DAMON for the cold memory monitoring. Please |
| 134 | * refer to the DAMON documentation for more detail. 100 ms by default. |
| 135 | */ |
| 136 | static unsigned long aggr_interval __read_mostly = 100000; |
| 137 | module_param(aggr_interval, ulong, 0600); |
| 138 | |
| 139 | /* |
| 140 | * Minimum number of monitoring regions. |
| 141 | * |
| 142 | * The minimal number of monitoring regions of DAMON for the cold memory |
| 143 | * monitoring. This can be used to set lower-bound of the monitoring quality. |
| 144 | * But, setting this too high could result in increased monitoring overhead. |
| 145 | * Please refer to the DAMON documentation for more detail. 10 by default. |
| 146 | */ |
| 147 | static unsigned long min_nr_regions __read_mostly = 10; |
| 148 | module_param(min_nr_regions, ulong, 0600); |
| 149 | |
| 150 | /* |
| 151 | * Maximum number of monitoring regions. |
| 152 | * |
| 153 | * The maximum number of monitoring regions of DAMON for the cold memory |
| 154 | * monitoring. This can be used to set upper-bound of the monitoring overhead. |
| 155 | * However, setting this too low could result in bad monitoring quality. |
| 156 | * Please refer to the DAMON documentation for more detail. 1000 by default. |
| 157 | */ |
| 158 | static unsigned long max_nr_regions __read_mostly = 1000; |
| 159 | module_param(max_nr_regions, ulong, 0600); |
| 160 | |
| 161 | /* |
| 162 | * Start of the target memory region in physical address. |
| 163 | * |
| 164 | * The start physical address of memory region that DAMON_RECLAIM will do work |
| 165 | * against. By default, biggest System RAM is used as the region. |
| 166 | */ |
| 167 | static unsigned long monitor_region_start __read_mostly; |
| 168 | module_param(monitor_region_start, ulong, 0600); |
| 169 | |
| 170 | /* |
| 171 | * End of the target memory region in physical address. |
| 172 | * |
| 173 | * The end physical address of memory region that DAMON_RECLAIM will do work |
| 174 | * against. By default, biggest System RAM is used as the region. |
| 175 | */ |
| 176 | static unsigned long monitor_region_end __read_mostly; |
| 177 | module_param(monitor_region_end, ulong, 0600); |
| 178 | |
| 179 | /* |
| 180 | * PID of the DAMON thread |
| 181 | * |
| 182 | * If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread. |
| 183 | * Else, -1. |
| 184 | */ |
| 185 | static int kdamond_pid __read_mostly = -1; |
| 186 | module_param(kdamond_pid, int, 0400); |
| 187 | |
/*
 * Number of memory regions for which reclamation was tried.
 */
| 191 | static unsigned long nr_reclaim_tried_regions __read_mostly; |
| 192 | module_param(nr_reclaim_tried_regions, ulong, 0400); |
| 193 | |
/*
 * Total bytes of memory regions for which reclamation was tried.
 */
| 197 | static unsigned long bytes_reclaim_tried_regions __read_mostly; |
| 198 | module_param(bytes_reclaim_tried_regions, ulong, 0400); |
| 199 | |
/*
 * Number of memory regions that were successfully reclaimed.
 */
| 203 | static unsigned long nr_reclaimed_regions __read_mostly; |
| 204 | module_param(nr_reclaimed_regions, ulong, 0400); |
| 205 | |
/*
 * Total bytes of memory regions that were successfully reclaimed.
 */
| 209 | static unsigned long bytes_reclaimed_regions __read_mostly; |
| 210 | module_param(bytes_reclaimed_regions, ulong, 0400); |
| 211 | |
/*
 * Number of times that the time/space quota limits have been exceeded.
 */
| 215 | static unsigned long nr_quota_exceeds __read_mostly; |
| 216 | module_param(nr_quota_exceeds, ulong, 0400); |
| 217 | |
/* DAMON context of DAMON_RECLAIM; allocated in damon_reclaim_init() */
static struct damon_ctx *ctx;
/* The single monitoring target that is added to 'ctx' at init time */
static struct damon_target *target;

/*
 * Accumulator passed to walk_system_ram() while walking System RAM resources.
 *
 * @start:	start address of the biggest resource seen so far
 * @end:	end address of the biggest resource seen so far
 */
struct damon_reclaim_ram_walk_arg {
	unsigned long start;
	unsigned long end;
};
| 225 | |
| 226 | static int walk_system_ram(struct resource *res, void *arg) |
| 227 | { |
| 228 | struct damon_reclaim_ram_walk_arg *a = arg; |
| 229 | |
| 230 | if (a->end - a->start < res->end - res->start) { |
| 231 | a->start = res->start; |
| 232 | a->end = res->end; |
| 233 | } |
| 234 | return 0; |
| 235 | } |
| 236 | |
| 237 | /* |
| 238 | * Find biggest 'System RAM' resource and store its start and end address in |
| 239 | * @start and @end, respectively. If no System RAM is found, returns false. |
| 240 | */ |
| 241 | static bool get_monitoring_region(unsigned long *start, unsigned long *end) |
| 242 | { |
| 243 | struct damon_reclaim_ram_walk_arg arg = {}; |
| 244 | |
| 245 | walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); |
| 246 | if (arg.end <= arg.start) |
| 247 | return false; |
| 248 | |
| 249 | *start = arg.start; |
| 250 | *end = arg.end; |
| 251 | return true; |
| 252 | } |
| 253 | |
| 254 | static struct damos *damon_reclaim_new_scheme(void) |
| 255 | { |
| 256 | struct damos_watermarks wmarks = { |
| 257 | .metric = DAMOS_WMARK_FREE_MEM_RATE, |
| 258 | .interval = wmarks_interval, |
| 259 | .high = wmarks_high, |
| 260 | .mid = wmarks_mid, |
| 261 | .low = wmarks_low, |
| 262 | }; |
| 263 | struct damos_quota quota = { |
| 264 | /* |
| 265 | * Do not try reclamation for more than quota_ms milliseconds |
| 266 | * or quota_sz bytes within quota_reset_interval_ms. |
| 267 | */ |
| 268 | .ms = quota_ms, |
| 269 | .sz = quota_sz, |
| 270 | .reset_interval = quota_reset_interval_ms, |
| 271 | /* Within the quota, page out older regions first. */ |
| 272 | .weight_sz = 0, |
| 273 | .weight_nr_accesses = 0, |
| 274 | .weight_age = 1 |
| 275 | }; |
| 276 | struct damos *scheme = damon_new_scheme( |
| 277 | /* Find regions having PAGE_SIZE or larger size */ |
| 278 | PAGE_SIZE, ULONG_MAX, |
| 279 | /* and not accessed at all */ |
| 280 | 0, 0, |
| 281 | /* for min_age or more micro-seconds, and */ |
| 282 | min_age / aggr_interval, UINT_MAX, |
| 283 | /* page out those, as soon as found */ |
| 284 | DAMOS_PAGEOUT, |
| 285 | /* under the quota. */ |
| 286 | "a, |
| 287 | /* (De)activate this according to the watermarks. */ |
| 288 | &wmarks); |
| 289 | |
| 290 | return scheme; |
| 291 | } |
| 292 | |
| 293 | static int damon_reclaim_turn(bool on) |
| 294 | { |
| 295 | struct damon_region *region; |
| 296 | struct damos *scheme; |
| 297 | int err; |
| 298 | |
| 299 | if (!on) { |
| 300 | err = damon_stop(&ctx, 1); |
| 301 | if (!err) |
| 302 | kdamond_pid = -1; |
| 303 | return err; |
| 304 | } |
| 305 | |
| 306 | err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0, |
| 307 | min_nr_regions, max_nr_regions); |
| 308 | if (err) |
| 309 | return err; |
| 310 | |
| 311 | if (monitor_region_start > monitor_region_end) |
| 312 | return -EINVAL; |
| 313 | if (!monitor_region_start && !monitor_region_end && |
| 314 | !get_monitoring_region(&monitor_region_start, |
| 315 | &monitor_region_end)) |
| 316 | return -EINVAL; |
| 317 | /* DAMON will free this on its own when finish monitoring */ |
| 318 | region = damon_new_region(monitor_region_start, monitor_region_end); |
| 319 | if (!region) |
| 320 | return -ENOMEM; |
| 321 | damon_add_region(region, target); |
| 322 | |
| 323 | /* Will be freed by 'damon_set_schemes()' below */ |
| 324 | scheme = damon_reclaim_new_scheme(); |
| 325 | if (!scheme) { |
| 326 | err = -ENOMEM; |
| 327 | goto free_region_out; |
| 328 | } |
| 329 | err = damon_set_schemes(ctx, &scheme, 1); |
| 330 | if (err) |
| 331 | goto free_scheme_out; |
| 332 | |
| 333 | err = damon_start(&ctx, 1); |
| 334 | if (!err) { |
| 335 | kdamond_pid = ctx->kdamond->pid; |
| 336 | return 0; |
| 337 | } |
| 338 | |
| 339 | free_scheme_out: |
| 340 | damon_destroy_scheme(scheme); |
| 341 | free_region_out: |
| 342 | damon_destroy_region(region, target); |
| 343 | return err; |
| 344 | } |
| 345 | |
| 346 | #define ENABLE_CHECK_INTERVAL_MS 1000 |
| 347 | static struct delayed_work damon_reclaim_timer; |
| 348 | static void damon_reclaim_timer_fn(struct work_struct *work) |
| 349 | { |
| 350 | static bool last_enabled; |
| 351 | bool now_enabled; |
| 352 | |
| 353 | now_enabled = enabled; |
| 354 | if (last_enabled != now_enabled) { |
| 355 | if (!damon_reclaim_turn(now_enabled)) |
| 356 | last_enabled = now_enabled; |
| 357 | else |
| 358 | enabled = last_enabled; |
| 359 | } |
| 360 | |
| 361 | schedule_delayed_work(&damon_reclaim_timer, |
| 362 | msecs_to_jiffies(ENABLE_CHECK_INTERVAL_MS)); |
| 363 | } |
| 364 | static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); |
| 365 | |
SeongJae Park | 60e52e7 | 2022-01-14 14:10:23 -0800 | [diff] [blame] | 366 | static int damon_reclaim_after_aggregation(struct damon_ctx *c) |
| 367 | { |
| 368 | struct damos *s; |
| 369 | |
| 370 | /* update the stats parameter */ |
| 371 | damon_for_each_scheme(s, c) { |
| 372 | nr_reclaim_tried_regions = s->stat.nr_tried; |
| 373 | bytes_reclaim_tried_regions = s->stat.sz_tried; |
| 374 | nr_reclaimed_regions = s->stat.nr_applied; |
| 375 | bytes_reclaimed_regions = s->stat.sz_applied; |
| 376 | nr_quota_exceeds = s->stat.qt_exceeds; |
| 377 | } |
| 378 | return 0; |
| 379 | } |
| 380 | |
SeongJae Park | 43b0536 | 2021-11-05 13:47:57 -0700 | [diff] [blame] | 381 | static int __init damon_reclaim_init(void) |
| 382 | { |
| 383 | ctx = damon_new_ctx(); |
| 384 | if (!ctx) |
| 385 | return -ENOMEM; |
| 386 | |
| 387 | damon_pa_set_primitives(ctx); |
SeongJae Park | 60e52e7 | 2022-01-14 14:10:23 -0800 | [diff] [blame] | 388 | ctx->callback.after_aggregation = damon_reclaim_after_aggregation; |
SeongJae Park | 43b0536 | 2021-11-05 13:47:57 -0700 | [diff] [blame] | 389 | |
| 390 | /* 4242 means nothing but fun */ |
| 391 | target = damon_new_target(4242); |
| 392 | if (!target) { |
| 393 | damon_destroy_ctx(ctx); |
| 394 | return -ENOMEM; |
| 395 | } |
| 396 | damon_add_target(ctx, target); |
| 397 | |
| 398 | schedule_delayed_work(&damon_reclaim_timer, 0); |
| 399 | return 0; |
| 400 | } |
| 401 | |
| 402 | module_init(damon_reclaim_init); |