dm raid: add raid level takeover support
Add raid level takeover support allowing arbitrary takeovers between
raid levels supported by md personalities (i.e. raid0, raid1/10 and
raid4/5/6):
- add rs_config_{backup|restore} function to allow for temporary
storing ctr requested layout changes and restore them for takeover
conersion decision after the superblocks got loaded and analyzed
- add members to store layout to 'struct raid_set' (not mandatory
for takeover but needed for reshape in later patch)
- add rebuild_disks bitfield to 'struct raid_set' and set bits in ctr
to use in setting up takeover (base to address a 'rebuild' related
raid_status() table line bug and needed as well for reshape in future
patch)
- add runtime flags and respective manipulation functions to be able to
control e.g. wrting of superlocks to the preresume function on
takeover and (later) reshape
- add functions to detect takeover, check it's valid (mandatory here to
avoid failing on md_run()), setup for it and use in the ctr; those
will be likely moved out once reshaping gets added to simplify the
ctr
- start raid set readonly in ctr and switch to readwrite, optionally
updating superblocks, in preresume in order to allow suspend to
quiesce any active table before (which involves superblock updates);
this ensures the proper sequence of writing the current and any new
takeover(/reshape) metadata
Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 2842033..89b677c 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -98,6 +98,13 @@
#define ALL_CTR_FLAGS (CTR_FLAG_OPTIONS_NO_ARGS | \
CTR_FLAG_OPTIONS_ONE_ARG)
+/*
+ * All flags which cause a recovery unfreeze once they got stored in the raid metadata
+ */
+#define ALL_FREEZE_FLAGS (ALL_CTR_FLAGS & ~(CTR_FLAG_REGION_SIZE | CTR_FLAGS_ANY_SYNC | \
+ CTR_FLAG_RAID10_FORMAT | CTR_FLAG_RAID10_COPIES | \
+ CTR_FLAG_RAID10_USE_NEAR_SETS))
+
/* Invalid options definitions per raid level... */
/* "raid0" does not accept any options */
@@ -129,14 +136,39 @@
#define RAID6_INVALID_FLAGS (CTR_FLAG_NOSYNC | RAID45_INVALID_FLAGS)
/* ...invalid options definitions per raid level */
+/*
+ * Flags for rs->runtime_flags field
+ * (RT_FLAG prefix meaning "runtime flag")
+ *
+ * These are all internal and used to define runtime state,
+ * e.g. to prevent another resume from preresume processing
+ * the raid set all over again.
+ */
+#define RT_FLAG_RS_PRERESUMED 0x1
+#define RT_FLAG_RS_RESUMED 0x2
+#define RT_FLAG_RS_BITMAP_LOADED 0x4
+#define RT_FLAG_UPDATE_SBS 0x8
+
/* Array elements of 64 bit needed for rebuild/write_mostly bits */
#define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
+/*
+ * raid set level, layout and chunk sectors backup/restore
+ */
+struct rs_layout {
+ int new_level;
+ int new_layout;
+ int new_chunk_sectors;
+};
+
struct raid_set {
struct dm_target *ti;
uint32_t bitmap_loaded;
uint32_t ctr_flags;
+ uint32_t runtime_flags;
+
+ uint64_t rebuild_disks[DISKS_ARRAY_ELEMS];
int raid_disks;
int delta_disks;
@@ -146,10 +178,41 @@
struct mddev md;
struct raid_type *raid_type;
struct dm_target_callbacks callbacks;
+ struct rs_layout rs_layout;
struct raid_dev dev[0];
};
+/* Backup/restore raid set configuration helpers */
+static void _rs_config_backup(struct raid_set *rs, struct rs_layout *l)
+{
+ struct mddev *mddev = &rs->md;
+
+ l->new_level = mddev->new_level;
+ l->new_layout = mddev->new_layout;
+ l->new_chunk_sectors = mddev->new_chunk_sectors;
+}
+
+static void rs_config_backup(struct raid_set *rs)
+{
+ return _rs_config_backup(rs, &rs->rs_layout);
+}
+
+static void _rs_config_restore(struct raid_set *rs, struct rs_layout *l)
+{
+ struct mddev *mddev = &rs->md;
+
+ mddev->new_level = l->new_level;
+ mddev->new_layout = l->new_layout;
+ mddev->new_chunk_sectors = l->new_chunk_sectors;
+}
+
+static void rs_config_restore(struct raid_set *rs)
+{
+ return _rs_config_restore(rs, &rs->rs_layout);
+}
+/* END: backup/restore raid set configuration helpers */
+
/* raid10 algorithms (i.e. formats) */
#define ALGORITHM_RAID10_DEFAULT 0
#define ALGORITHM_RAID10_NEAR 1
@@ -201,6 +264,13 @@
*flags |= flag;
}
+/* Clear single @flag in @flags */
+static void _clear_flag(uint32_t flag, uint32_t *flags)
+{
+ WARN_ON_ONCE(hweight32(flag) != 1);
+ *flags &= ~flag;
+}
+
/* Test single @flag in @flags */
static bool _test_flag(uint32_t flag, uint32_t flags)
{
@@ -229,6 +299,17 @@
_set_flag(flag, flags);
return false;
}
+
+/* Return true if single @flag is set in @*flags and clear it, else return false */
+static bool _test_and_clear_flag(uint32_t flag, uint32_t *flags)
+{
+ if (_test_flag(flag, *flags)) {
+ _clear_flag(flag, flags);
+ return true;
+ }
+
+ return false;
+}
/* ...ctr and runtime flag bit manipulation */
/* All table line arguments are defined here */
@@ -576,7 +657,7 @@
rs->md.layout = raid_type->algorithm;
rs->md.new_layout = rs->md.layout;
rs->md.delta_disks = 0;
- rs->md.recovery_cp = 0;
+ rs->md.recovery_cp = rs_is_raid0(rs) ? MaxSector : 0;
for (i = 0; i < raid_devs; i++)
md_rdev_init(&rs->dev[i].rdev);
@@ -1007,9 +1088,12 @@
* indexes of replaced devices and to set up additional
* devices on raid level takeover.
*/
- if (!_in_range(value, 0, rs->md.raid_disks - 1))
+ if (!_in_range(value, 0, rs->raid_disks - 1))
return ti_error_einval(rs->ti, "Invalid rebuild index given");
+ if (test_and_set_bit(value, (void *) rs->rebuild_disks))
+ return ti_error_einval(rs->ti, "rebuild for this index already given");
+
rd = rs->dev + value;
clear_bit(In_sync, &rd->rdev.flags);
clear_bit(Faulty, &rd->rdev.flags);
@@ -1175,8 +1259,166 @@
return mddev_congested(&rs->md, bits);
}
+/*
+ * Make sure a valid takover (level switch) is being requested on @rs
+ *
+ * Conversions of raid sets from one MD personality to another
+ * have to conform to restrictions which are enforced here.
+ *
+ * Degration is already checked for in rs_check_conversion() below.
+ */
+static int rs_check_takeover(struct raid_set *rs)
+{
+ struct mddev *mddev = &rs->md;
+ unsigned int near_copies;
+
+ switch (mddev->level) {
+ case 0:
+ /* raid0 -> raid1/5 with one disk */
+ if ((mddev->new_level == 1 || mddev->new_level == 5) &&
+ mddev->raid_disks == 1)
+ return 0;
+
+ /* raid0 -> raid10 */
+ if (mddev->new_level == 10 &&
+ !(rs->raid_disks % 2))
+ return 0;
+
+ /* raid0 with multiple disks -> raid4/5/6 */
+ if (_in_range(mddev->new_level, 4, 6) &&
+ mddev->new_layout == ALGORITHM_PARITY_N &&
+ mddev->raid_disks > 1)
+ return 0;
+
+ break;
+
+ case 10:
+ /* Can't takeover raid10_offset! */
+ if (_is_raid10_offset(mddev->layout))
+ break;
+
+ near_copies = _raid10_near_copies(mddev->layout);
+
+ /* raid10* -> raid0 */
+ if (mddev->new_level == 0) {
+ /* Can takeover raid10_near with raid disks divisable by data copies! */
+ if (near_copies > 1 &&
+ !(mddev->raid_disks % near_copies)) {
+ mddev->raid_disks /= near_copies;
+ mddev->delta_disks = mddev->raid_disks;
+ return 0;
+ }
+
+ /* Can takeover raid10_far */
+ if (near_copies == 1 &&
+ _raid10_far_copies(mddev->layout) > 1)
+ return 0;
+
+ break;
+ }
+
+ /* raid10_{near,far} -> raid1 */
+ if (mddev->new_level == 1 &&
+ max(near_copies, _raid10_far_copies(mddev->layout)) == mddev->raid_disks)
+ return 0;
+
+ /* raid10_{near,far} with 2 disks -> raid4/5 */
+ if (_in_range(mddev->new_level, 4, 5) &&
+ mddev->raid_disks == 2)
+ return 0;
+ break;
+
+ case 1:
+ /* raid1 with 2 disks -> raid4/5 */
+ if (_in_range(mddev->new_level, 4, 5) &&
+ mddev->raid_disks == 2) {
+ mddev->degraded = 1;
+ return 0;
+ }
+
+ /* raid1 -> raid0 */
+ if (mddev->new_level == 0 &&
+ mddev->raid_disks == 1)
+ return 0;
+
+ /* raid1 -> raid10 */
+ if (mddev->new_level == 10)
+ return 0;
+
+ break;
+
+ case 4:
+ /* raid4 -> raid0 */
+ if (mddev->new_level == 0)
+ return 0;
+
+ /* raid4 -> raid1/5 with 2 disks */
+ if ((mddev->new_level == 1 || mddev->new_level == 5) &&
+ mddev->raid_disks == 2)
+ return 0;
+
+ /* raid4 -> raid5/6 with parity N */
+ if (_in_range(mddev->new_level, 5, 6) &&
+ mddev->layout == ALGORITHM_PARITY_N)
+ return 0;
+ break;
+
+ case 5:
+ /* raid5 with parity N -> raid0 */
+ if (mddev->new_level == 0 &&
+ mddev->layout == ALGORITHM_PARITY_N)
+ return 0;
+
+ /* raid5 with parity N -> raid4 */
+ if (mddev->new_level == 4 &&
+ mddev->layout == ALGORITHM_PARITY_N)
+ return 0;
+
+ /* raid5 with 2 disks -> raid1/4/10 */
+ if ((mddev->new_level == 1 || mddev->new_level == 4 || mddev->new_level == 10) &&
+ mddev->raid_disks == 2)
+ return 0;
+
+ /* raid5 with parity N -> raid6 with parity N */
+ if (mddev->new_level == 6 &&
+ ((mddev->layout == ALGORITHM_PARITY_N && mddev->new_layout == ALGORITHM_PARITY_N) ||
+ _in_range(mddev->new_layout, ALGORITHM_LEFT_ASYMMETRIC_6, ALGORITHM_RIGHT_SYMMETRIC_6)))
+ return 0;
+ break;
+
+ case 6:
+ /* raid6 with parity N -> raid0 */
+ if (mddev->new_level == 0 &&
+ mddev->layout == ALGORITHM_PARITY_N)
+ return 0;
+
+ /* raid6 with parity N -> raid4 */
+ if (mddev->new_level == 4 &&
+ mddev->layout == ALGORITHM_PARITY_N)
+ return 0;
+
+ /* raid6_*_n with parity N -> raid5_* */
+ if (mddev->new_level == 5 &&
+ ((mddev->layout == ALGORITHM_PARITY_N && mddev->new_layout == ALGORITHM_PARITY_N) ||
+ _in_range(mddev->new_layout, ALGORITHM_LEFT_ASYMMETRIC, ALGORITHM_RIGHT_SYMMETRIC)))
+ return 0;
+
+ default:
+ break;
+ }
+
+ return ti_error_einval(rs->ti, "takeover not possible");
+}
+
+/* True if @rs requested to be taken over */
+static bool rs_takeover_requested(struct raid_set *rs)
+{
+ return rs->md.new_level != rs->md.level;
+}
+
/* Features */
-#define FEATURE_FLAG_SUPPORTS_RESHAPE 0x1
+#define FEATURE_FLAG_SUPPORTS_V180 0x1 /* Supports v1.8.0 extended superblock */
+#define FEATURE_FLAG_SUPPORTS_RESHAPE 0x2 /* Supports v1.8.0 reshaping functionality */
/* State flags for sb->flags */
#define SB_FLAG_RESHAPE_ACTIVE 0x1
@@ -1220,7 +1462,7 @@
/********************************************************************
* BELOW FOLLOW V1.8.0 EXTENSIONS TO THE PRISTINE SUPERBLOCK FORMAT!!!
*
- * FEATURE_FLAG_SUPPORTS_RESHAPE in the features member indicates that those exist
+ * FEATURE_FLAG_SUPPORTS_V180 in the features member indicates that those exist
*/
__le32 flags; /* Flags defining array states for reshaping */
@@ -1287,7 +1529,7 @@
failed_devices[0] = le64_to_cpu(sb->failed_devices);
memset(failed_devices + 1, 0, sizeof(sb->extended_failed_devices));
- if (_test_flag(FEATURE_FLAG_SUPPORTS_RESHAPE, le32_to_cpu(sb->compat_features))) {
+ if (_test_flag(FEATURE_FLAG_SUPPORTS_V180, le32_to_cpu(sb->compat_features))) {
int i = ARRAY_SIZE(sb->extended_failed_devices);
while (i--)
@@ -1337,7 +1579,7 @@
sb_update_failed_devices(sb, failed_devices);
sb->magic = cpu_to_le32(DM_RAID_MAGIC);
- sb->compat_features = cpu_to_le32(0); /* Don't set reshape flag yet */
+ sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V180); /* Don't set reshape flag yet */
sb->num_devices = cpu_to_le32(mddev->raid_disks);
sb->array_position = cpu_to_le32(rdev->raid_disk);
@@ -1416,6 +1658,7 @@
super_sync(rdev->mddev, rdev);
set_bit(FirstUse, &rdev->flags);
+ sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V180); /* Don't set reshape flag yet */
/* Force writing of superblocks to disk */
set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
@@ -1461,7 +1704,7 @@
* Reshaping is supported, e.g. reshape_position is valid
* in superblock and superblock content is authoritative.
*/
- if (_test_flag(FEATURE_FLAG_SUPPORTS_RESHAPE, le32_to_cpu(sb->compat_features))) {
+ if (_test_flag(FEATURE_FLAG_SUPPORTS_V180, le32_to_cpu(sb->compat_features))) {
/* Superblock is authoritative wrt given raid set layout! */
mddev->raid_disks = le32_to_cpu(sb->num_devices);
mddev->level = le32_to_cpu(sb->level);
@@ -1564,6 +1807,7 @@
if (new_devs == rs->raid_disks) {
DMINFO("Superblocks created for new raid set");
set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
+ _set_flag(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
mddev->recovery_cp = 0;
} else if (new_devs && new_devs != rs->raid_disks && !rebuilds) {
DMERR("New device injected into existing raid set without "
@@ -1657,8 +1901,9 @@
if (!mddev->events && super_init_validation(rs, rdev))
return -EINVAL;
- if (sb->compat_features || sb->incompat_features) {
- rs->ti->error = "Unable to assemble array: No feature flags supported yet";
+ if (le32_to_cpu(sb->compat_features) != FEATURE_FLAG_SUPPORTS_V180 ||
+ sb->incompat_features) {
+ rs->ti->error = "Unable to assemble array: No incompatible feature flags supported yet";
return -EINVAL;
}
@@ -1718,8 +1963,6 @@
* that the "sync" directive is disallowed during the
* reshape.
*/
- rdev->sectors = to_sector(i_size_read(rdev->bdev->bd_inode));
-
if (_test_flag(CTR_FLAG_SYNC, rs->ctr_flags))
continue;
@@ -1785,14 +2028,77 @@
return 0;
}
+/* Userpace reordered disks -> adjust raid_disk indexes in @rs */
+static void _reorder_raid_disk_indexes(struct raid_set *rs)
+{
+ int i = 0;
+ struct md_rdev *rdev;
+
+ rdev_for_each(rdev, &rs->md) {
+ rdev->raid_disk = i++;
+ rdev->saved_raid_disk = rdev->new_raid_disk = -1;
+ }
+}
+
+/*
+ * Setup @rs for takeover by a different raid level
+ */
+static int rs_setup_takeover(struct raid_set *rs)
+{
+ struct mddev *mddev = &rs->md;
+ struct md_rdev *rdev;
+ unsigned int d = mddev->raid_disks = rs->raid_disks;
+ sector_t new_data_offset = rs->dev[0].rdev.data_offset ? 0 : rs->data_offset;
+
+ if (rt_is_raid10(rs->raid_type)) {
+ if (mddev->level == 0) {
+ /* Userpace reordered disks -> adjust raid_disk indexes */
+ _reorder_raid_disk_indexes(rs);
+
+ /* raid0 -> raid10_far layout */
+ mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_FAR,
+ rs->raid10_copies);
+ } else if (mddev->level == 1)
+ /* raid1 -> raid10_near layout */
+ mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_NEAR,
+ rs->raid_disks);
+ else
+ return -EINVAL;
+
+ }
+
+ clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
+ mddev->recovery_cp = MaxSector;
+
+ while (d--) {
+ rdev = &rs->dev[d].rdev;
+
+ if (test_bit(d, (void *) rs->rebuild_disks)) {
+ clear_bit(In_sync, &rdev->flags);
+ clear_bit(Faulty, &rdev->flags);
+ mddev->recovery_cp = rdev->recovery_offset = 0;
+ /* Bitmap has to be created when we do an "up" takeover */
+ set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
+ }
+
+ rdev->new_data_offset = new_data_offset;
+ }
+
+ rs_set_new(rs);
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+
+ return 0;
+}
+
/*
* Enable/disable discard support on RAID set depending on
* RAID level and discard properties of underlying RAID members.
*/
-static void configure_discard_support(struct dm_target *ti, struct raid_set *rs)
+static void configure_discard_support(struct raid_set *rs)
{
int i;
bool raid456;
+ struct dm_target *ti = rs->ti;
/* Assume discards not supported until after checks below. */
ti->discards_supported = false;
@@ -1894,6 +2200,14 @@
goto bad;
rs->md.sync_super = super_sync;
+
+ /*
+ * Backup any new raid set level, layout, ...
+ * requested to be able to compare to superblock
+ * members for conversion decisions.
+ */
+ rs_config_backup(rs);
+
r = analyse_superblocks(ti, rs);
if (r)
goto bad;
@@ -1902,10 +2216,29 @@
ti->private = rs;
ti->num_flush_bios = 1;
+ /* Restore any requested new layout for conversion decision */
+ rs_config_restore(rs);
+
/*
- * Disable/enable discard support on RAID set.
+ * If a takeover is needed, just set the level to
+ * the new requested one and allow the raid set to run.
*/
- configure_discard_support(ti, rs);
+ if (rs_takeover_requested(rs)) {
+ r = rs_check_takeover(rs);
+ if (r)
+ return r;
+
+ r = rs_setup_takeover(rs);
+ if (r)
+ return r;
+
+ _set_flag(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+ }
+
+ /* Start raid set read-only and assumed clean to change in raid_resume() */
+ rs->md.ro = 1;
+ rs->md.in_sync = 1;
+ set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
/* Has to be held on running the array */
mddev_lock_nointr(&rs->md);
@@ -2312,29 +2645,92 @@
}
}
+/* Load the dirty region bitmap */
+static int _bitmap_load(struct raid_set *rs)
+{
+ int r = 0;
+
+ /* Try loading the bitmap unless "raid0", which does not have one */
+ if (!rs_is_raid0(rs) &&
+ !_test_and_set_flag(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) {
+ r = bitmap_load(&rs->md);
+ if (r)
+ DMERR("Failed to load bitmap");
+ }
+
+ return r;
+}
+
+static int raid_preresume(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+ struct mddev *mddev = &rs->md;
+
+ /* This is a resume after a suspend of the set -> it's already started */
+ if (_test_and_set_flag(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags))
+ return 0;
+
+ /*
+ * The superblocks need to be updated on disk if the
+ * array is new or _bitmap_load will overwrite them
+ * in core with old data.
+ *
+ * In case the array got modified (takeover/reshape/resize)
+ * or the data offsets on the component devices changed, they
+ * have to be updated as well.
+ *
+ * Have to switch to readwrite and back in order to
+ * allow for the superblock updates.
+ */
+ if (_test_and_clear_flag(RT_FLAG_UPDATE_SBS, &rs->runtime_flags)) {
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ mddev->ro = 0;
+ md_update_sb(mddev, 1);
+ mddev->ro = 1;
+ }
+
+ /*
+ * Disable/enable discard support on raid set after any
+ * conversion, because devices can have been added
+ */
+ configure_discard_support(rs);
+
+ /* Load the bitmap from disk unless raid0 */
+ return _bitmap_load(rs);
+}
+
static void raid_resume(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
+ struct mddev *mddev = &rs->md;
- if (!rt_is_raid0(rs->raid_type)) {
- set_bit(MD_CHANGE_DEVS, &rs->md.flags);
+ if (_test_and_set_flag(RT_FLAG_RS_RESUMED, &rs->runtime_flags)) {
+ /*
+ * A secondary resume while the device is active.
+ * Take this opportunity to check whether any failed
+ * devices are reachable again.
+ */
+ attempt_restore_of_faulty_devices(rs);
- if (!rs->bitmap_loaded) {
- bitmap_load(&rs->md);
- rs->bitmap_loaded = 1;
- } else {
- /*
- * A secondary resume while the device is active.
- * Take this opportunity to check whether any failed
- * devices are reachable again.
- */
- attempt_restore_of_faulty_devices(rs);
- }
+ } else {
+ mddev->in_sync = 0;
- clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
+ /*
+ * If any of the constructor flags got passed in
+ * but "region_size" (gets always passed in for
+ * mappings with bitmap), we expect userspace to
+ * reset them and reload the mapping anyway.
+ *
+ * -> don't unfreeze resynchronization until imminant
+ * reload of the table w/o theses flags
+ */
+ if (!_test_flags(ALL_FREEZE_FLAGS, rs->ctr_flags))
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
}
- mddev_resume(&rs->md);
+ mddev->ro = 0;
+ if (mddev->suspended)
+ mddev_resume(mddev);
}
static struct target_type raid_target = {
@@ -2350,6 +2746,7 @@
.io_hints = raid_io_hints,
.presuspend = raid_presuspend,
.postsuspend = raid_postsuspend,
+ .preresume = raid_preresume,
.resume = raid_resume,
};