ext4: improve cr 0 / cr 1 group scanning
Instead of traversing through groups linearly, scan groups in specific
orders at cr 0 and cr 1. At cr 0, we want to find groups that have the
largest free order >= the order of the request. So, with this patch,
we maintain lists for each possible order and insert each group into a
list based on the largest free order in its buddy bitmap. During cr 0
allocation, we traverse these lists in the increasing order of largest
free orders. This allows us to find a group with the best available cr
0 match in constant time. If nothing can be found, we fallback to cr 1
immediately.
At CR1, the story is slightly different. We want to traverse in the
order of increasing average fragment size. For CR1, we maintain a rb
tree of groupinfos which is sorted by average fragment size. Instead
of traversing linearly, at CR1, we traverse in the order of increasing
average fragment size, starting at the most optimal group. This brings
down cr 1 search complexity to log(num groups).
For cr >= 2, we just perform the linear search as before. Also, in
case of lock contention, we intermittently fallback to linear search
even in CR 0 and CR 1 cases. This allows us to proceed during the
allocation path even in case of high contention.
There is an opportunity to do optimization at CR2 too. That's because
at CR2 we only consider groups where bb_free counter (number of free
blocks) is greater than the request extent size. That's left as future
work.
All the changes introduced in this patch are protected under a new
mount option "mb_optimize_scan".
With this patchset, following experiment was performed:
Created a highly fragmented disk of size 65TB. The disk had no
contiguous 2M regions. Following command was run consecutively for 3
times:
time dd if=/dev/urandom of=file bs=2M count=10
Here are the results with and without cr 0/1 optimizations introduced
in this patch:
|---------+------------------------------+---------------------------|
| | Without CR 0/1 Optimizations | With CR 0/1 Optimizations |
|---------+------------------------------+---------------------------|
| 1st run | 5m1.871s | 2m47.642s |
| 2nd run | 2m28.390s | 0m0.611s |
| 3rd run | 2m26.530s | 0m1.255s |
|---------+------------------------------+---------------------------|
Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
Reported-by: kernel test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Link: https://lore.kernel.org/r/20210401172129.189766-6-harshadshirwadkar@gmail.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0ef82e6..3b12deb 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1688,7 +1688,7 @@ enum {
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
- Opt_prefetch_block_bitmaps,
+ Opt_prefetch_block_bitmaps, Opt_mb_optimize_scan,
#ifdef CONFIG_EXT4_DEBUG
Opt_fc_debug_max_replay, Opt_fc_debug_force
#endif
@@ -1789,6 +1789,7 @@ static const match_table_t tokens = {
{Opt_nombcache, "nombcache"},
{Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
{Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"},
+ {Opt_mb_optimize_scan, "mb_optimize_scan=%d"},
{Opt_removed, "check=none"}, /* mount option from ext2/3 */
{Opt_removed, "nocheck"}, /* mount option from ext2/3 */
{Opt_removed, "reservation"}, /* mount option from ext2/3 */
@@ -1821,6 +1822,8 @@ static ext4_fsblk_t get_sb_block(void **data)
}
#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+#define DEFAULT_MB_OPTIMIZE_SCAN (-1)
+
static const char deprecated_msg[] =
"Mount option \"%s\" will be removed by %s\n"
"Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
@@ -2009,6 +2012,7 @@ static const struct mount_opts {
{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
{Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS,
MOPT_SET},
+ {Opt_mb_optimize_scan, EXT4_MOUNT2_MB_OPTIMIZE_SCAN, MOPT_GTE0},
#ifdef CONFIG_EXT4_DEBUG
{Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
@@ -2093,6 +2097,7 @@ static int ext4_set_test_dummy_encryption(struct super_block *sb,
struct ext4_parsed_options {
unsigned long journal_devnum;
unsigned int journal_ioprio;
+ int mb_optimize_scan;
};
static int handle_mount_opt(struct super_block *sb, char *opt, int token,
@@ -2389,6 +2394,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
sbi->s_mount_opt |= m->mount_opt;
} else if (token == Opt_data_err_ignore) {
sbi->s_mount_opt &= ~m->mount_opt;
+ } else if (token == Opt_mb_optimize_scan) {
+ if (arg != 0 && arg != 1) {
+ ext4_msg(sb, KERN_WARNING,
+ "mb_optimize_scan should be set to 0 or 1.");
+ return -1;
+ }
+ parsed_opts->mb_optimize_scan = arg;
} else {
if (!args->from)
arg = 1;
@@ -4035,6 +4047,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
/* Set defaults for the variables that will be set during parsing */
parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
parsed_opts.journal_devnum = 0;
+ parsed_opts.mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN;
if ((data && !orig_data) || !sbi)
goto out_free_base;
@@ -4979,6 +4992,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
ext4_fc_replay_cleanup(sb);
ext4_ext_init(sb);
+
+ /*
+ * Enable optimize_scan if number of groups is > threshold. This can be
+ * turned off by passing "mb_optimize_scan=0". This can also be
+ * turned on forcefully by passing "mb_optimize_scan=1".
+ */
+ if (parsed_opts.mb_optimize_scan == 1)
+ set_opt2(sb, MB_OPTIMIZE_SCAN);
+ else if (parsed_opts.mb_optimize_scan == 0)
+ clear_opt2(sb, MB_OPTIMIZE_SCAN);
+ else if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD)
+ set_opt2(sb, MB_OPTIMIZE_SCAN);
+
err = ext4_mb_init(sb);
if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",