drbd: Defer new writes when detecting conflicting writes
Before submitting a new local write request, wait for any conflicting
local or remote requests to complete.
We could assume that the new request occurred first and that the
conflicting requests overwrote it (and therefore discard the new
reques), but we know for sure that the new request occurred after the
conflicting requests and so this behavior would we weird. We would also
end up with the wrong result if the new request is not fully contained
within the conflicting requests.
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index e11ea47..6bcf417 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -300,48 +300,6 @@
_req_may_be_done(req, m);
}
-/*
- * checks whether there was an overlapping request
- * or ee already registered.
- *
- * if so, return 1, in which case this request is completed on the spot,
- * without ever being submitted or send.
- *
- * return 0 if it is ok to submit this request.
- *
- * NOTE:
- * paranoia: assume something above us is broken, and issues different write
- * requests for the same block simultaneously...
- *
- * To ensure these won't be reordered differently on both nodes, resulting in
- * diverging data sets, we discard the later one(s). Not that this is supposed
- * to happen, but this is the rationale why we also have to check for
- * conflicting requests with local origin, and why we have to do so regardless
- * of whether we allowed multiple primaries.
- */
-static int _req_conflicts(struct drbd_request *req)
-{
- struct drbd_conf *mdev = req->mdev;
- const sector_t sector = req->i.sector;
- const int size = req->i.size;
- struct drbd_interval *i;
-
- D_ASSERT(drbd_interval_empty(&req->i));
-
- i = drbd_find_overlap(&mdev->write_requests, sector, size);
- if (i) {
- dev_alert(DEV, "%s[%u] Concurrent %s write detected! "
- "[DISCARD L] new: %llus +%u; "
- "pending: %llus +%u\n",
- current->comm, current->pid,
- i->local ? "local" : "remote",
- (unsigned long long)sector, size,
- (unsigned long long)i->sector, i->size);
- return 1;
- }
- return 0;
-}
-
/* obviously this could be coded as many single functions
* instead of one huge switch,
* or by putting the code directly in the respective locations
@@ -721,6 +679,34 @@
return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr);
}
+/*
+ * complete_conflicting_writes - wait for any conflicting write requests
+ *
+ * The write_requests tree contains all active write requests which we
+ * currently know about. Wait for any requests to complete which conflict with
+ * the new one.
+ */
+static int complete_conflicting_writes(struct drbd_conf *mdev,
+ sector_t sector, int size)
+{
+ for(;;) {
+ DEFINE_WAIT(wait);
+ struct drbd_interval *i;
+
+ i = drbd_find_overlap(&mdev->write_requests, sector, size);
+ if (!i)
+ return 0;
+ i->waiting = true;
+ prepare_to_wait(&mdev->misc_wait, &wait, TASK_INTERRUPTIBLE);
+ spin_unlock_irq(&mdev->tconn->req_lock);
+ schedule();
+ finish_wait(&mdev->misc_wait, &wait);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+ }
+}
+
static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
{
const int rw = bio_rw(bio);
@@ -729,7 +715,7 @@
struct drbd_tl_epoch *b = NULL;
struct drbd_request *req;
int local, remote, send_oos = 0;
- int err = -EIO;
+ int err;
int ret = 0;
/* allocate outside of all locks; */
@@ -799,6 +785,7 @@
if (!(local || remote) && !is_susp(mdev->state)) {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
+ err = -EIO;
goto fail_free_complete;
}
@@ -823,6 +810,14 @@
/* GOOD, everything prepared, grab the spin_lock */
spin_lock_irq(&mdev->tconn->req_lock);
+ if (rw == WRITE) {
+ err = complete_conflicting_writes(mdev, sector, size);
+ if (err) {
+ spin_unlock_irq(&mdev->tconn->req_lock);
+ goto fail_free_complete;
+ }
+ }
+
if (is_susp(mdev->state)) {
/* If we got suspended, use the retry mechanism of
generic_make_request() to restart processing of this
@@ -843,6 +838,7 @@
if (!(local || remote)) {
dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
spin_unlock_irq(&mdev->tconn->req_lock);
+ err = -EIO;
goto fail_free_complete;
}
}
@@ -903,12 +899,6 @@
if (local)
_req_mod(req, TO_BE_SUBMITTED);
- /* check this request on the collision detection hash tables.
- * if we have a conflict, just complete it here.
- * THINK do we want to check reads, too? (I don't think so...) */
- if (rw == WRITE && _req_conflicts(req))
- goto fail_conflicting;
-
list_add_tail(&req->tl_requests, &mdev->tconn->newest_tle->requests);
/* NOTE remote first: to get the concurrent write detection right,
@@ -975,21 +965,6 @@
return 0;
-fail_conflicting:
- /* this is a conflicting request.
- * even though it may have been only _partially_
- * overlapping with one of the currently pending requests,
- * without even submitting or sending it, we will
- * pretend that it was successfully served right now.
- */
- _drbd_end_io_acct(mdev, req);
- spin_unlock_irq(&mdev->tconn->req_lock);
- if (remote)
- dec_ap_pending(mdev);
- /* THINK: do we want to fail it (-EIO), or pretend success?
- * this pretends success. */
- err = 0;
-
fail_free_complete:
if (req->rq_state & RQ_IN_ACT_LOG)
drbd_al_complete_io(mdev, sector);