dm thin: add read only and fail io modes
Add read-only and fail-io modes to thin provisioning.
If a transaction commit fails the pool's metadata device will transition
to "read-only" mode. If a commit fails once already in read-only mode
the transition to "fail-io" mode occurs.
Once in fail-io mode the pool and all associated thin devices will
report a status of "Fail".
Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt
index f5cfc62..30b8b83b 100644
--- a/Documentation/device-mapper/thin-provisioning.txt
+++ b/Documentation/device-mapper/thin-provisioning.txt
@@ -231,6 +231,9 @@
no_discard_passdown: Don't pass discards down to the underlying
data device, but just remove the mapping.
+ read_only: Don't allow any changes to be made to the pool
+ metadata.
+
Data block size must be between 64KB (128 sectors) and 1GB
(2097152 sectors) inclusive.
@@ -239,7 +242,7 @@
<transaction id> <used metadata blocks>/<total metadata blocks>
<used data blocks>/<total data blocks> <held metadata root>
-
+ [no_]discard_passdown ro|rw
transaction id:
A 64-bit number used by userspace to help synchronise with metadata
@@ -257,6 +260,21 @@
held root. This feature is not yet implemented so '-' is
always returned.
+ discard_passdown|no_discard_passdown
+ Whether or not discards are actually being passed down to the
+ underlying device. When this is enabled when loading the table,
+ it can get disabled if the underlying device doesn't support it.
+
+ ro|rw
+ If the pool encounters certain types of device failures it will
+ drop into a read-only metadata mode in which no changes to
+ the pool metadata (like allocating new blocks) are permitted.
+
+ In serious cases where even a read-only mode is deemed unsafe
+ no further I/O will be permitted and the status will just
+ contain the string 'Fail'. The userspace recovery tools
+ should then be used.
+
iii) Messages
create_thin <dev id>
@@ -329,3 +347,7 @@
ii) Status
<nr mapped sectors> <highest mapped sector>
+
+ If the pool has encountered device errors and failed, the status
+ will just contain the string 'Fail'. The userspace recovery
+ tools should then be used.
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 98c50f9..087e9b3 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2011 Red Hat UK.
+ * Copyright (C) 2011-2012 Red Hat UK.
*
* This file is released under the GPL.
*/
@@ -496,12 +496,27 @@
*/
struct dm_thin_new_mapping;
+/*
+ * The pool runs in 3 modes. Ordered in degraded order for comparisons.
+ */
+enum pool_mode {
+ PM_WRITE, /* metadata may be changed */
+ PM_READ_ONLY, /* metadata may not be changed */
+ PM_FAIL, /* all I/O fails */
+};
+
struct pool_features {
+ enum pool_mode mode;
+
unsigned zero_new_blocks:1;
unsigned discard_enabled:1;
unsigned discard_passdown:1;
};
+struct thin_c;
+typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
+typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
+
struct pool {
struct list_head list;
struct dm_target *ti; /* Only set if a pool target is bound */
@@ -542,8 +557,17 @@
struct dm_thin_new_mapping *next_mapping;
mempool_t *mapping_pool;
mempool_t *endio_hook_pool;
+
+ process_bio_fn process_bio;
+ process_bio_fn process_discard;
+
+ process_mapping_fn process_prepared_mapping;
+ process_mapping_fn process_prepared_discard;
};
+static enum pool_mode get_pool_mode(struct pool *pool);
+static void set_pool_mode(struct pool *pool, enum pool_mode mode);
+
/*
* Target context for a pool.
*/
@@ -718,16 +742,28 @@
struct pool *pool = tc->pool;
unsigned long flags;
- /*
- * Batch together any FUA/FLUSH bios we find and then issue
- * a single commit for them in process_deferred_bios().
- */
- if (bio_triggers_commit(tc, bio)) {
- spin_lock_irqsave(&pool->lock, flags);
- bio_list_add(&pool->deferred_flush_bios, bio);
- spin_unlock_irqrestore(&pool->lock, flags);
- } else
+ if (!bio_triggers_commit(tc, bio)) {
generic_make_request(bio);
+ return;
+ }
+
+ /*
+ * Complete bio with an error if earlier I/O caused changes to
+ * the metadata that can't be committed e.g, due to I/O errors
+ * on the metadata device.
+ */
+ if (dm_thin_aborted_changes(tc->td)) {
+ bio_io_error(bio);
+ return;
+ }
+
+ /*
+ * Batch together any bios that trigger commits and then issue a
+ * single commit for them in process_deferred_bios().
+ */
+ spin_lock_irqsave(&pool->lock, flags);
+ bio_list_add(&pool->deferred_flush_bios, bio);
+ spin_unlock_irqrestore(&pool->lock, flags);
}
static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
@@ -864,6 +900,14 @@
wake_worker(pool);
}
+static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
+{
+ if (m->bio)
+ m->bio->bi_end_io = m->saved_bi_end_io;
+ cell_error(m->cell);
+ list_del(&m->list);
+ mempool_free(m, m->tc->pool->mapping_pool);
+}
static void process_prepared_mapping(struct dm_thin_new_mapping *m)
{
struct thin_c *tc = m->tc;
@@ -908,18 +952,20 @@
mempool_free(m, tc->pool->mapping_pool);
}
-static void process_prepared_discard(struct dm_thin_new_mapping *m)
+static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
{
- int r;
struct thin_c *tc = m->tc;
- r = dm_thin_remove_block(tc->td, m->virt_block);
- if (r)
- DMERR("dm_thin_remove_block() failed");
+ bio_io_error(m->bio);
+ cell_defer_except(tc, m->cell);
+ cell_defer_except(tc, m->cell2);
+ mempool_free(m, tc->pool->mapping_pool);
+}
- /*
- * Pass the discard down to the underlying device?
- */
+static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
+{
+ struct thin_c *tc = m->tc;
+
if (m->pass_discard)
remap_and_issue(tc, m->bio, m->data_block);
else
@@ -930,8 +976,20 @@
mempool_free(m, tc->pool->mapping_pool);
}
+static void process_prepared_discard(struct dm_thin_new_mapping *m)
+{
+ int r;
+ struct thin_c *tc = m->tc;
+
+ r = dm_thin_remove_block(tc->td, m->virt_block);
+ if (r)
+ DMERR("dm_thin_remove_block() failed");
+
+ process_prepared_discard_passdown(m);
+}
+
static void process_prepared(struct pool *pool, struct list_head *head,
- void (*fn)(struct dm_thin_new_mapping *))
+ process_mapping_fn *fn)
{
unsigned long flags;
struct list_head maps;
@@ -943,7 +1001,7 @@
spin_unlock_irqrestore(&pool->lock, flags);
list_for_each_entry_safe(m, tmp, &maps, list)
- fn(m);
+ (*fn)(m);
}
/*
@@ -1109,6 +1167,35 @@
}
}
+static int commit(struct pool *pool)
+{
+ int r;
+
+ r = dm_pool_commit_metadata(pool->pmd);
+ if (r)
+ DMERR("commit failed, error = %d", r);
+
+ return r;
+}
+
+/*
+ * A non-zero return indicates read_only or fail_io mode.
+ * Many callers don't care about the return value.
+ */
+static int commit_or_fallback(struct pool *pool)
+{
+ int r;
+
+ if (get_pool_mode(pool) != PM_WRITE)
+ return -EINVAL;
+
+ r = commit(pool);
+ if (r)
+ set_pool_mode(pool, PM_READ_ONLY);
+
+ return r;
+}
+
static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
{
int r;
@@ -1137,12 +1224,7 @@
* Try to commit to see if that will free up some
* more space.
*/
- r = dm_pool_commit_metadata(pool->pmd);
- if (r) {
- DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
- __func__, r);
- return r;
- }
+ (void) commit_or_fallback(pool);
r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
if (r)
@@ -1373,6 +1455,7 @@
default:
DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
+ set_pool_mode(tc->pool, PM_READ_ONLY);
cell_error(cell);
break;
}
@@ -1430,6 +1513,49 @@
}
}
+static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
+{
+ int r;
+ int rw = bio_data_dir(bio);
+ dm_block_t block = get_bio_block(tc, bio);
+ struct dm_thin_lookup_result lookup_result;
+
+ r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
+ switch (r) {
+ case 0:
+ if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
+ bio_io_error(bio);
+ else
+ remap_and_issue(tc, bio, lookup_result.block);
+ break;
+
+ case -ENODATA:
+ if (rw != READ) {
+ bio_io_error(bio);
+ break;
+ }
+
+ if (tc->origin_dev) {
+ remap_to_origin_and_issue(tc, bio);
+ break;
+ }
+
+ zero_fill_bio(bio);
+ bio_endio(bio, 0);
+ break;
+
+ default:
+ DMERR("dm_thin_find_block() failed, error = %d", r);
+ bio_io_error(bio);
+ break;
+ }
+}
+
+static void process_bio_fail(struct thin_c *tc, struct bio *bio)
+{
+ bio_io_error(bio);
+}
+
static int need_commit_due_to_time(struct pool *pool)
{
return jiffies < pool->last_commit_jiffies ||
@@ -1441,7 +1567,6 @@
unsigned long flags;
struct bio *bio;
struct bio_list bios;
- int r;
bio_list_init(&bios);
@@ -1468,9 +1593,9 @@
}
if (bio->bi_rw & REQ_DISCARD)
- process_discard(tc, bio);
+ pool->process_discard(tc, bio);
else
- process_bio(tc, bio);
+ pool->process_bio(tc, bio);
}
/*
@@ -1486,10 +1611,7 @@
if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
return;
- r = dm_pool_commit_metadata(pool->pmd);
- if (r) {
- DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
- __func__, r);
+ if (commit_or_fallback(pool)) {
while ((bio = bio_list_pop(&bios)))
bio_io_error(bio);
return;
@@ -1504,8 +1626,8 @@
{
struct pool *pool = container_of(ws, struct pool, worker);
- process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping);
- process_prepared(pool, &pool->prepared_discards, process_prepared_discard);
+ process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
+ process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
process_deferred_bios(pool);
}
@@ -1522,6 +1644,52 @@
/*----------------------------------------------------------------*/
+static enum pool_mode get_pool_mode(struct pool *pool)
+{
+ return pool->pf.mode;
+}
+
+static void set_pool_mode(struct pool *pool, enum pool_mode mode)
+{
+ int r;
+
+ pool->pf.mode = mode;
+
+ switch (mode) {
+ case PM_FAIL:
+ DMERR("switching pool to failure mode");
+ pool->process_bio = process_bio_fail;
+ pool->process_discard = process_bio_fail;
+ pool->process_prepared_mapping = process_prepared_mapping_fail;
+ pool->process_prepared_discard = process_prepared_discard_fail;
+ break;
+
+ case PM_READ_ONLY:
+ DMERR("switching pool to read-only mode");
+ r = dm_pool_abort_metadata(pool->pmd);
+ if (r) {
+ DMERR("aborting transaction failed");
+ set_pool_mode(pool, PM_FAIL);
+ } else {
+ dm_pool_metadata_read_only(pool->pmd);
+ pool->process_bio = process_bio_read_only;
+ pool->process_discard = process_discard;
+ pool->process_prepared_mapping = process_prepared_mapping_fail;
+ pool->process_prepared_discard = process_prepared_discard_passdown;
+ }
+ break;
+
+ case PM_WRITE:
+ pool->process_bio = process_bio;
+ pool->process_discard = process_discard;
+ pool->process_prepared_mapping = process_prepared_mapping;
+ pool->process_prepared_discard = process_prepared_discard;
+ break;
+ }
+}
+
+/*----------------------------------------------------------------*/
+
/*
* Mapping functions.
*/
@@ -1567,6 +1735,12 @@
struct dm_thin_lookup_result result;
map_context->ptr = thin_hook_bio(tc, bio);
+
+ if (get_pool_mode(tc->pool) == PM_FAIL) {
+ bio_io_error(bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+
if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
thin_defer_bio(tc, bio);
return DM_MAPIO_SUBMITTED;
@@ -1603,14 +1777,35 @@
break;
case -ENODATA:
+ if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
+ /*
+ * This block isn't provisioned, and we have no way
+ * of doing so. Just error it.
+ */
+ bio_io_error(bio);
+ r = DM_MAPIO_SUBMITTED;
+ break;
+ }
+ /* fall through */
+
+ case -EWOULDBLOCK:
/*
* In future, the failed dm_thin_find_block above could
* provide the hint to load the metadata into cache.
*/
- case -EWOULDBLOCK:
thin_defer_bio(tc, bio);
r = DM_MAPIO_SUBMITTED;
break;
+
+ default:
+ /*
+ * Must always call bio_io_error on failure.
+ * dm_thin_find_block can fail with -EINVAL if the
+ * pool is switched to fail-io mode.
+ */
+ bio_io_error(bio);
+ r = DM_MAPIO_SUBMITTED;
+ break;
}
return r;
@@ -1647,15 +1842,26 @@
{
struct pool_c *pt = ti->private;
+ /*
+ * We want to make sure that degraded pools are never upgraded.
+ */
+ enum pool_mode old_mode = pool->pf.mode;
+ enum pool_mode new_mode = pt->pf.mode;
+
+ if (old_mode > new_mode)
+ new_mode = old_mode;
+
pool->ti = ti;
pool->low_water_blocks = pt->low_water_blocks;
pool->pf = pt->pf;
+ set_pool_mode(pool, new_mode);
/*
* If discard_passdown was enabled verify that the data device
* supports discards. Disable discard_passdown if not; otherwise
* -EOPNOTSUPP will be returned.
*/
+ /* FIXME: pull this out into a sep fn. */
if (pt->pf.discard_passdown) {
struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
if (!q || !blk_queue_discard(q)) {
@@ -1681,6 +1887,7 @@
/* Initialize pool features. */
static void pool_features_init(struct pool_features *pf)
{
+ pf->mode = PM_WRITE;
pf->zero_new_blocks = 1;
pf->discard_enabled = 1;
pf->discard_passdown = 1;
@@ -1711,14 +1918,16 @@
static struct pool *pool_create(struct mapped_device *pool_md,
struct block_device *metadata_dev,
- unsigned long block_size, char **error)
+ unsigned long block_size,
+ int read_only, char **error)
{
int r;
void *err_p;
struct pool *pool;
struct dm_pool_metadata *pmd;
+ bool format_device = read_only ? false : true;
- pmd = dm_pool_metadata_open(metadata_dev, block_size, true);
+ pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
if (IS_ERR(pmd)) {
*error = "Error creating metadata object";
return (struct pool *)pmd;
@@ -1835,8 +2044,8 @@
static struct pool *__pool_find(struct mapped_device *pool_md,
struct block_device *metadata_dev,
- unsigned long block_size, char **error,
- int *created)
+ unsigned long block_size, int read_only,
+ char **error, int *created)
{
struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
@@ -1857,7 +2066,7 @@
__pool_inc(pool);
} else {
- pool = pool_create(pool_md, metadata_dev, block_size, error);
+ pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
*created = 1;
}
}
@@ -1908,19 +2117,23 @@
arg_name = dm_shift_arg(as);
argc--;
- if (!strcasecmp(arg_name, "skip_block_zeroing")) {
+ if (!strcasecmp(arg_name, "skip_block_zeroing"))
pf->zero_new_blocks = 0;
- continue;
- } else if (!strcasecmp(arg_name, "ignore_discard")) {
- pf->discard_enabled = 0;
- continue;
- } else if (!strcasecmp(arg_name, "no_discard_passdown")) {
- pf->discard_passdown = 0;
- continue;
- }
- ti->error = "Unrecognised pool feature requested";
- r = -EINVAL;
+ else if (!strcasecmp(arg_name, "ignore_discard"))
+ pf->discard_enabled = 0;
+
+ else if (!strcasecmp(arg_name, "no_discard_passdown"))
+ pf->discard_passdown = 0;
+
+ else if (!strcasecmp(arg_name, "read_only"))
+ pf->mode = PM_READ_ONLY;
+
+ else {
+ ti->error = "Unrecognised pool feature requested";
+ r = -EINVAL;
+ break;
+ }
}
return r;
@@ -2013,7 +2226,7 @@
}
pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
- block_size, &ti->error, &pool_created);
+ block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
if (IS_ERR(pool)) {
r = PTR_ERR(pool);
goto out_free_pt;
@@ -2146,15 +2359,12 @@
r = dm_pool_resize_data_dev(pool->pmd, data_size);
if (r) {
DMERR("failed to resize data device");
+ /* FIXME Stricter than necessary: Rollback transaction instead here */
+ set_pool_mode(pool, PM_READ_ONLY);
return r;
}
- r = dm_pool_commit_metadata(pool->pmd);
- if (r) {
- DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
- __func__, r);
- return r;
- }
+ (void) commit_or_fallback(pool);
}
return 0;
@@ -2177,19 +2387,12 @@
static void pool_postsuspend(struct dm_target *ti)
{
- int r;
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
cancel_delayed_work(&pool->waker);
flush_workqueue(pool->wq);
-
- r = dm_pool_commit_metadata(pool->pmd);
- if (r < 0) {
- DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
- __func__, r);
- /* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/
- }
+ (void) commit_or_fallback(pool);
}
static int check_arg_count(unsigned argc, unsigned args_required)
@@ -2323,12 +2526,7 @@
if (r)
return r;
- r = dm_pool_commit_metadata(pool->pmd);
- if (r) {
- DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
- __func__, r);
- return r;
- }
+ (void) commit_or_fallback(pool);
r = dm_pool_reserve_metadata_snap(pool->pmd);
if (r)
@@ -2389,16 +2587,32 @@
else
DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
- if (!r) {
- r = dm_pool_commit_metadata(pool->pmd);
- if (r)
- DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
- argv[0], r);
- }
+ if (!r)
+ (void) commit_or_fallback(pool);
return r;
}
+static void emit_flags(struct pool_features *pf, char *result,
+ unsigned sz, unsigned maxlen)
+{
+ unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
+ !pf->discard_passdown + (pf->mode == PM_READ_ONLY);
+ DMEMIT("%u ", count);
+
+ if (!pf->zero_new_blocks)
+ DMEMIT("skip_block_zeroing ");
+
+ if (!pf->discard_enabled)
+ DMEMIT("ignore_discard ");
+
+ if (!pf->discard_passdown)
+ DMEMIT("no_discard_passdown ");
+
+ if (pf->mode == PM_READ_ONLY)
+ DMEMIT("read_only ");
+}
+
/*
* Status line is:
* <transaction id> <used metadata sectors>/<total metadata sectors>
@@ -2407,7 +2621,7 @@
static int pool_status(struct dm_target *ti, status_type_t type,
char *result, unsigned maxlen)
{
- int r, count;
+ int r;
unsigned sz = 0;
uint64_t transaction_id;
dm_block_t nr_free_blocks_data;
@@ -2422,6 +2636,11 @@
switch (type) {
case STATUSTYPE_INFO:
+ if (get_pool_mode(pool) == PM_FAIL) {
+ DMEMIT("Fail");
+ break;
+ }
+
r = dm_pool_get_metadata_transaction_id(pool->pmd,
&transaction_id);
if (r)
@@ -2457,9 +2676,19 @@
(unsigned long long)nr_blocks_data);
if (held_root)
- DMEMIT("%llu", held_root);
+ DMEMIT("%llu ", held_root);
else
- DMEMIT("-");
+ DMEMIT("- ");
+
+ if (pool->pf.mode == PM_READ_ONLY)
+ DMEMIT("ro ");
+ else
+ DMEMIT("rw ");
+
+ if (pool->pf.discard_enabled && pool->pf.discard_passdown)
+ DMEMIT("discard_passdown");
+ else
+ DMEMIT("no_discard_passdown");
break;
@@ -2469,20 +2698,7 @@
format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
(unsigned long)pool->sectors_per_block,
(unsigned long long)pt->low_water_blocks);
-
- count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled +
- !pt->pf.discard_passdown;
- DMEMIT("%u ", count);
-
- if (!pool->pf.zero_new_blocks)
- DMEMIT("skip_block_zeroing ");
-
- if (!pool->pf.discard_enabled)
- DMEMIT("ignore_discard ");
-
- if (!pt->pf.discard_passdown)
- DMEMIT("no_discard_passdown ");
-
+ emit_flags(&pt->pf, result, sz, maxlen);
break;
}
@@ -2542,7 +2758,7 @@
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 2, 0},
+ .version = {1, 3, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@@ -2647,6 +2863,11 @@
}
__pool_inc(tc->pool);
+ if (get_pool_mode(tc->pool) == PM_FAIL) {
+ ti->error = "Couldn't open thin device, Pool is in fail mode";
+ goto bad_thin_open;
+ }
+
r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
if (r) {
ti->error = "Couldn't open thin internal device";
@@ -2755,6 +2976,11 @@
char buf[BDEVNAME_SIZE];
struct thin_c *tc = ti->private;
+ if (get_pool_mode(tc->pool) == PM_FAIL) {
+ DMEMIT("Fail");
+ return 0;
+ }
+
if (!tc->td)
DMEMIT("-");
else {
@@ -2823,7 +3049,7 @@
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 2, 0},
+ .version = {1, 3, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,