md: Protect access to mddev->disks list using RCU
All modifications and most access to the mddev->disks list are made
under the reconfig_mutex lock. However there are three places where
the list is walked without any locking. If a reconfig happens at this
time, havoc (and oops) can ensue.
So use RCU to protect these accesses:
- wrap them in rcu_read_{,un}lock()
- use list_for_each_entry_rcu
- add to the list with list_add_rcu
- delete from the list with list_del_rcu
- delay the 'free' with call_rcu rather than schedule_work
Note that export_rdev did a list_del_init on this list. In almost all
cases the entry was not in the list anymore so it was a no-op and so
safe. It is no longer safe as after list_del_rcu we may not touch
the list_head.
An audit shows that export_rdev is called:
- after unbind_rdev_from_array, in which case the delete has
already been done,
- after bind_rdev_to_array fails, in which case the delete isn't needed.
- before the device has been put on a list at all (e.g. in
add_new_disk where reading the superblock fails).
- and in autorun devices after a failure when the device is on a
different list.
So remove the list_del_init call from export_rdev, and add it back
immediately before the called to export_rdev for that last case.
Note also that ->same_set is sometimes used for lists other than
mddev->list (e.g. candidates). In these cases rcu is not needed.
Signed-off-by: NeilBrown <neilb@suse.de>
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index eba83e2..621a272 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -241,10 +241,10 @@
static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
{
mdk_rdev_t *rdev;
- struct list_head *tmp;
mddev_t *mddev = bitmap->mddev;
- rdev_for_each(rdev, tmp, mddev)
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev)
if (test_bit(In_sync, &rdev->flags)
&& !test_bit(Faulty, &rdev->flags)) {
int size = PAGE_SIZE;
@@ -260,11 +260,11 @@
+ (long)(page->index * (PAGE_SIZE/512))
+ size/512 > 0)
/* bitmap runs in to metadata */
- return -EINVAL;
+ goto bad_alignment;
if (rdev->data_offset + mddev->size*2
> rdev->sb_start + bitmap->offset)
/* data runs in to bitmap */
- return -EINVAL;
+ goto bad_alignment;
} else if (rdev->sb_start < rdev->data_offset) {
/* METADATA BITMAP DATA */
if (rdev->sb_start
@@ -272,7 +272,7 @@
+ page->index*(PAGE_SIZE/512) + size/512
> rdev->data_offset)
/* bitmap runs in to data */
- return -EINVAL;
+ goto bad_alignment;
} else {
/* DATA METADATA BITMAP - no problems */
}
@@ -282,10 +282,15 @@
size,
page);
}
+ rcu_read_unlock();
if (wait)
md_super_wait(mddev);
return 0;
+
+ bad_alignment:
+ rcu_read_unlock();
+ return -EINVAL;
}
static void bitmap_file_kick(struct bitmap *bitmap);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 450f30b..c2ff77c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1395,15 +1395,17 @@
static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
{
- struct list_head *tmp, *tmp2;
mdk_rdev_t *rdev, *rdev2;
- rdev_for_each(rdev, tmp, mddev1)
- rdev_for_each(rdev2, tmp2, mddev2)
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev1)
+ rdev_for_each_rcu(rdev2, mddev2)
if (rdev->bdev->bd_contains ==
- rdev2->bdev->bd_contains)
+ rdev2->bdev->bd_contains) {
+ rcu_read_unlock();
return 1;
-
+ }
+ rcu_read_unlock();
return 0;
}
@@ -1470,7 +1472,7 @@
kobject_del(&rdev->kobj);
goto fail;
}
- list_add(&rdev->same_set, &mddev->disks);
+ list_add_rcu(&rdev->same_set, &mddev->disks);
bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
return 0;
@@ -1495,14 +1497,16 @@
return;
}
bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
- list_del_init(&rdev->same_set);
+ list_del_rcu(&rdev->same_set);
printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
rdev->mddev = NULL;
sysfs_remove_link(&rdev->kobj, "block");
/* We need to delay this, otherwise we can deadlock when
- * writing to 'remove' to "dev/state"
+ * writing to 'remove' to "dev/state". We also need
+ * to delay it due to rcu usage.
*/
+ synchronize_rcu();
INIT_WORK(&rdev->del_work, md_delayed_delete);
kobject_get(&rdev->kobj);
schedule_work(&rdev->del_work);
@@ -1558,7 +1562,6 @@
if (rdev->mddev)
MD_BUG();
free_disk_sb(rdev);
- list_del_init(&rdev->same_set);
#ifndef MODULE
if (test_bit(AutoDetected, &rdev->flags))
md_autodetect_dev(rdev->bdev->bd_dev);
@@ -4062,8 +4065,10 @@
/* on success, candidates will be empty, on error
* it won't...
*/
- rdev_for_each_list(rdev, tmp, candidates)
+ rdev_for_each_list(rdev, tmp, candidates) {
+ list_del_init(&rdev->same_set);
export_rdev(rdev);
+ }
mddev_put(mddev);
}
printk(KERN_INFO "md: ... autorun DONE.\n");
@@ -5529,12 +5534,12 @@
static int is_mddev_idle(mddev_t *mddev)
{
mdk_rdev_t * rdev;
- struct list_head *tmp;
int idle;
long curr_events;
idle = 1;
- rdev_for_each(rdev, tmp, mddev) {
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
curr_events = disk_stat_read(disk, sectors[0]) +
disk_stat_read(disk, sectors[1]) -
@@ -5566,6 +5571,7 @@
idle = 0;
}
}
+ rcu_read_unlock();
return idle;
}