implement in-kernel gendisk events handling
Currently, media presence polling for removeable block devices is done
from userland. There are several issues with this.
* Polling is done by periodically opening the device. For SCSI
devices, the command sequence generated by such action involves a
few different commands including TEST_UNIT_READY. This behavior,
while perfectly legal, is different from Windows which only issues
single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some
ATAPI devices lock up after being periodically queried such command
sequences.
* There is no reliable and unintrusive way for a userland program to
tell whether the target device is safe for media presence polling.
For example, polling for media presence during an on-going burning
session can make it fail. The polling program can avoid this by
opening the device with O_EXCL but then it risks making a valid
exclusive user of the device fail w/ -EBUSY.
* Userland polling is unnecessarily heavy and in-kernel implementation
is lighter and better coordinated (workqueue, timer slack).
This patch implements framework for in-kernel disk event handling,
which includes media presence polling.
* bdops->check_events() is added, which supercedes ->media_changed().
It should check whether there's any pending event and return if so.
Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and
DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be
called parallelly.
* gendisk->events and ->async_events are added. These should be
initialized by block driver before passing the device to add_disk().
The former contains the mask of all supported events and the latter
the mask of all events which the device can report without polling.
/sys/block/*/events[_async] export these to userland.
* Kernel parameter block.events_dfl_poll_msecs controls the system
polling interval (default is 0 which means disable) and
/sys/block/*/events_poll_msecs control polling intervals for
individual devices (default is -1 meaning use system setting). Note
that if a device can report all supported events asynchronously and
its polling interval isn't explicitly set, the device won't be
polled regardless of the system polling interval.
* If a device is opened exclusively with write access, event checking
is automatically disabled until all write exclusive accesses are
released.
* There are event 'clearing' events. For example, both of currently
defined events are cleared after the device has been successfully
opened. This information is passed to ->check_events() callback
using @clearing argument as a hint.
* Event checking is always performed from system_nrt_wq and timer
slack is set to 25% for polling.
* Nothing changes for drivers which implement ->media_changed() but
not ->check_events(). Going forward, all drivers will be converted
to ->check_events() and ->media_change() will be dropped.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 83031bc..05667e6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1251,6 +1251,9 @@
int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
int (*direct_access) (struct block_device *, sector_t,
void **, unsigned long *);
+ unsigned int (*check_events) (struct gendisk *disk,
+ unsigned int clearing);
+ /* ->media_changed() is DEPRECATED, use ->check_events() instead */
int (*media_changed) (struct gendisk *);
void (*unlock_native_capacity) (struct gendisk *);
int (*revalidate_disk) (struct gendisk *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f485015..997d22e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -662,6 +662,7 @@
void * bd_claiming;
void * bd_holder;
int bd_holders;
+ bool bd_write_holder;
#ifdef CONFIG_SYSFS
struct gendisk * bd_holder_disk; /* for sysfs slave linkng */
#endif
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 56e17ed..13893aa 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -127,6 +127,11 @@
#define GENHD_FL_EXT_DEVT 64 /* allow extended devt */
#define GENHD_FL_NATIVE_CAPACITY 128
+enum {
+ DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */
+ DISK_EVENT_EJECT_REQUEST = 1 << 1, /* eject requested */
+};
+
#define BLK_SCSI_MAX_CMDS (256)
#define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))
@@ -143,6 +148,8 @@
struct hd_struct __rcu *part[];
};
+struct disk_events;
+
struct gendisk {
/* major, first_minor and minors are input parameters only,
* don't use directly. Use disk_devt() and disk_max_parts().
@@ -154,6 +161,10 @@
char disk_name[DISK_NAME_LEN]; /* name of major driver */
char *(*devnode)(struct gendisk *gd, mode_t *mode);
+
+ unsigned int events; /* supported events */
+ unsigned int async_events; /* async events, subset of all */
+
/* Array of pointers to partitions indexed by partno.
* Protected with matching bdev lock but stat and other
* non-critical accesses use RCU. Always access through
@@ -171,8 +182,8 @@
struct kobject *slave_dir;
struct timer_rand_state *random;
-
atomic_t sync_io; /* RAID */
+ struct disk_events *ev;
#ifdef CONFIG_BLK_DEV_INTEGRITY
struct blk_integrity *integrity;
#endif
@@ -405,6 +416,11 @@
return disk->part0.policy;
}
+extern void disk_block_events(struct gendisk *disk);
+extern void disk_unblock_events(struct gendisk *disk);
+extern void disk_check_events(struct gendisk *disk);
+extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask);
+
/* drivers/char/random.c */
extern void add_disk_randomness(struct gendisk *disk);
extern void rand_initialize_disk(struct gendisk *disk);