block: fix disk->part[] dereferencing race
disk->part[] is protected by its matching bdev's lock. However,
non-critical accesses like collecting stats and printing out sysfs and
proc information used to be performed without any locking. As
partitions can come and go dynamically, they can go away
underneath those non-critical accesses. As some of those accesses are
writes, this can theoretically lead to silent corruption.
This patch fixes the race by using RCU to protect the partition array
and the partition device's reference counter to hold partitions while
they are in use.
* Rename disk->part[] to disk->__part[] to make sure no one outside
the genhd layer proper accesses it directly.
* Use RCU for disk->__part[] dereferencing.
* Implement disk_{get|put}_part(), which respectively get and put a
reference to one of a gendisk's partitions.
* Iterators are implemented to help iterate through all partitions
safely.
* Functions which require the RCU read lock are marked with an _rcu suffix.
* Use disk_put_part() in __blkdev_put() instead of directly putting
the contained kobject.
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 72e0a28..2f2873b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -929,6 +929,7 @@
{
struct module *owner = NULL;
struct gendisk *disk;
+ struct hd_struct *part = NULL;
int ret;
int partno;
int perm = 0;
@@ -978,7 +979,6 @@
if (bdev->bd_invalidated)
rescan_partitions(disk, bdev);
} else {
- struct hd_struct *p;
struct block_device *whole;
whole = bdget_disk(disk, 0);
ret = -ENOMEM;
@@ -989,16 +989,16 @@
if (ret)
goto out_first;
bdev->bd_contains = whole;
- p = disk->part[partno - 1];
+ part = disk_get_part(disk, partno);
bdev->bd_inode->i_data.backing_dev_info =
whole->bd_inode->i_data.backing_dev_info;
- if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) {
+ if (!(disk->flags & GENHD_FL_UP) ||
+ !part || !part->nr_sects) {
ret = -ENXIO;
goto out_first;
}
- kobject_get(&p->dev.kobj);
- bdev->bd_part = p;
- bd_set_size(bdev, (loff_t) p->nr_sects << 9);
+ bdev->bd_part = part;
+ bd_set_size(bdev, (loff_t)part->nr_sects << 9);
}
} else {
put_disk(disk);
@@ -1027,6 +1027,7 @@
__blkdev_put(bdev->bd_contains, 1);
bdev->bd_contains = NULL;
put_disk(disk);
+ disk_put_part(part);
module_put(owner);
out:
mutex_unlock(&bdev->bd_mutex);
@@ -1119,7 +1120,7 @@
module_put(owner);
if (bdev->bd_contains != bdev) {
- kobject_put(&bdev->bd_part->dev.kobj);
+ disk_put_part(bdev->bd_part);
bdev->bd_part = NULL;
}
bdev->bd_disk = NULL;