md/r5cache: handle FLUSH and FUA
With raid5 cache, we committing data from journal device. When
there is flush request, we need to flush journal device's cache.
This was not needed in raid5 journal, because we will flush the
journal before committing data to raid disks.
This is similar to FUA, except that we also need flush journal for
FUA. Otherwise, corruptions in earlier meta data will stop recovery
from reaching FUA data.
slightly changed the code by Shaohua
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 6b99570..8cb79fc 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -159,6 +159,9 @@ struct r5l_log {
spinlock_t stripe_in_journal_lock;
atomic_t stripe_in_journal_count;
+
+ /* to submit async io_units, to fulfill ordering of flush */
+ struct work_struct deferred_io_work;
};
/*
@@ -185,6 +188,18 @@ struct r5l_io_unit {
int state;
bool need_split_bio;
+ struct bio *split_bio;
+
+ unsigned int has_flush:1; /* include flush request */
+ unsigned int has_fua:1; /* include fua request */
+ unsigned int has_null_flush:1; /* include empty flush request */
+ /*
+ * io isn't sent yet, flush/fua request can only be submitted till it's
+ * the first IO in running_ios list
+ */
+ unsigned int io_deferred:1;
+
+ struct bio_list flush_barriers; /* size == 0 flush bios */
};
/* r5l_io_unit state */
@@ -494,9 +509,11 @@ static void r5l_move_to_end_ios(struct r5l_log *log)
}
}
+static void __r5l_stripe_write_finished(struct r5l_io_unit *io);
static void r5l_log_endio(struct bio *bio)
{
struct r5l_io_unit *io = bio->bi_private;
+ struct r5l_io_unit *io_deferred;
struct r5l_log *log = io->log;
unsigned long flags;
@@ -512,18 +529,89 @@ static void r5l_log_endio(struct bio *bio)
r5l_move_to_end_ios(log);
else
r5l_log_run_stripes(log);
+ if (!list_empty(&log->running_ios)) {
+ /*
+ * FLUSH/FUA io_unit is deferred because of ordering, now we
+ * can dispatch it
+ */
+ io_deferred = list_first_entry(&log->running_ios,
+ struct r5l_io_unit, log_sibling);
+ if (io_deferred->io_deferred)
+ schedule_work(&log->deferred_io_work);
+ }
+
spin_unlock_irqrestore(&log->io_list_lock, flags);
if (log->need_cache_flush)
md_wakeup_thread(log->rdev->mddev->thread);
+
+ if (io->has_null_flush) {
+ struct bio *bi;
+
+ WARN_ON(bio_list_empty(&io->flush_barriers));
+ while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
+ bio_endio(bi);
+ atomic_dec(&io->pending_stripe);
+ }
+ if (atomic_read(&io->pending_stripe) == 0)
+ __r5l_stripe_write_finished(io);
+ }
+}
+
+static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&log->io_list_lock, flags);
+ __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+
+ if (io->has_flush)
+ bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FLUSH);
+ if (io->has_fua)
+ bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FUA);
+ submit_bio(io->current_bio);
+
+ if (!io->split_bio)
+ return;
+
+ if (io->has_flush)
+ bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FLUSH);
+ if (io->has_fua)
+ bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FUA);
+ submit_bio(io->split_bio);
+}
+
+/* deferred io_unit will be dispatched here */
+static void r5l_submit_io_async(struct work_struct *work)
+{
+ struct r5l_log *log = container_of(work, struct r5l_log,
+ deferred_io_work);
+ struct r5l_io_unit *io = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&log->io_list_lock, flags);
+ if (!list_empty(&log->running_ios)) {
+ io = list_first_entry(&log->running_ios, struct r5l_io_unit,
+ log_sibling);
+ if (!io->io_deferred)
+ io = NULL;
+ else
+ io->io_deferred = 0;
+ }
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+ if (io)
+ r5l_do_submit_io(log, io);
}
static void r5l_submit_current_io(struct r5l_log *log)
{
struct r5l_io_unit *io = log->current_io;
+ struct bio *bio;
struct r5l_meta_block *block;
unsigned long flags;
u32 crc;
+ bool do_submit = true;
if (!io)
return;
@@ -532,13 +620,20 @@ static void r5l_submit_current_io(struct r5l_log *log)
block->meta_size = cpu_to_le32(io->meta_offset);
crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
block->checksum = cpu_to_le32(crc);
+ bio = io->current_bio;
log->current_io = NULL;
spin_lock_irqsave(&log->io_list_lock, flags);
- __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
+ if (io->has_flush || io->has_fua) {
+ if (io != list_first_entry(&log->running_ios,
+ struct r5l_io_unit, log_sibling)) {
+ io->io_deferred = 1;
+ do_submit = false;
+ }
+ }
spin_unlock_irqrestore(&log->io_list_lock, flags);
-
- submit_bio(io->current_bio);
+ if (do_submit)
+ r5l_do_submit_io(log, io);
}
static struct bio *r5l_bio_alloc(struct r5l_log *log)
@@ -583,6 +678,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
io->log = log;
INIT_LIST_HEAD(&io->log_sibling);
INIT_LIST_HEAD(&io->stripe_list);
+ bio_list_init(&io->flush_barriers);
io->state = IO_UNIT_RUNNING;
io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
@@ -653,12 +749,11 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
struct r5l_io_unit *io = log->current_io;
if (io->need_split_bio) {
- struct bio *prev = io->current_bio;
-
+ BUG_ON(io->split_bio);
+ io->split_bio = io->current_bio;
io->current_bio = r5l_bio_alloc(log);
- bio_chain(io->current_bio, prev);
-
- submit_bio(prev);
+ bio_chain(io->current_bio, io->split_bio);
+ io->need_split_bio = false;
}
if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
@@ -687,12 +782,24 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
io = log->current_io;
+ if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state))
+ io->has_flush = 1;
+
for (i = 0; i < sh->disks; i++) {
if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
test_bit(R5_InJournal, &sh->dev[i].flags))
continue;
if (i == sh->pd_idx || i == sh->qd_idx)
continue;
+ if (test_bit(R5_WantFUA, &sh->dev[i].flags) &&
+ log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) {
+ io->has_fua = 1;
+ /*
+ * we need to flush journal to make sure recovery can
+ * reach the data with fua flag
+ */
+ io->has_flush = 1;
+ }
r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
raid5_compute_blocknr(sh, i, 0),
sh->dev[i].log_checksum, 0, false);
@@ -856,17 +963,34 @@ int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
{
if (!log)
return -ENODEV;
- /*
- * we flush log disk cache first, then write stripe data to raid disks.
- * So if bio is finished, the log disk cache is flushed already. The
- * recovery guarantees we can recovery the bio from log disk, so we
- * don't need to flush again
- */
- if (bio->bi_iter.bi_size == 0) {
- bio_endio(bio);
- return 0;
+
+ if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
+ /*
+ * in write through (journal only)
+ * we flush log disk cache first, then write stripe data to
+ * raid disks. So if bio is finished, the log disk cache is
+ * flushed already. The recovery guarantees we can recovery
+ * the bio from log disk, so we don't need to flush again
+ */
+ if (bio->bi_iter.bi_size == 0) {
+ bio_endio(bio);
+ return 0;
+ }
+ bio->bi_opf &= ~REQ_PREFLUSH;
+ } else {
+ /* write back (with cache) */
+ if (bio->bi_iter.bi_size == 0) {
+ mutex_lock(&log->io_mutex);
+ r5l_get_meta(log, 0);
+ bio_list_add(&log->current_io->flush_barriers, bio);
+ log->current_io->has_flush = 1;
+ log->current_io->has_null_flush = 1;
+ atomic_inc(&log->current_io->pending_stripe);
+ r5l_submit_current_io(log);
+ mutex_unlock(&log->io_mutex);
+ return 0;
+ }
}
- bio->bi_opf &= ~REQ_PREFLUSH;
return -EAGAIN;
}
@@ -2470,6 +2594,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
INIT_LIST_HEAD(&log->no_space_stripes);
spin_lock_init(&log->no_space_stripes_lock);
+ INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
+
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
INIT_LIST_HEAD(&log->stripe_in_journal_list);
spin_lock_init(&log->stripe_in_journal_lock);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index aa4968c..dbab8c7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5248,6 +5248,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
int remaining;
DEFINE_WAIT(w);
bool do_prepare;
+ bool do_flush = false;
if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
int ret = r5l_handle_flush_request(conf->log, bi);
@@ -5259,6 +5260,11 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
return;
}
/* ret == -EAGAIN, fallback */
+ /*
+ * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
+ * we need to flush journal device
+ */
+ do_flush = bi->bi_opf & REQ_PREFLUSH;
}
md_write_start(mddev, bi);
@@ -5398,6 +5404,12 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
do_prepare = true;
goto retry;
}
+ if (do_flush) {
+ set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
+ /* we only need flush for one stripe */
+ do_flush = false;
+ }
+
set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
if ((!sh->batch_head || sh == sh->batch_head) &&
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index a698113..d13fe45 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -376,6 +376,7 @@ enum {
STRIPE_R5C_FULL_STRIPE, /* in r5c cache (to-be/being handled or
* in conf->r5c_full_stripe_list)
*/
+ STRIPE_R5C_PREFLUSH, /* need to flush journal device */
};
#define STRIPE_EXPAND_SYNC_FLAGS \