drbd: fix for spurious fullsync (uuids rotated too fast) If it was an "empty" resync, the SyncSource may have already "finished" the resync and rotated the UUIDs, before noticing the connection loss (and generating a new uuid, if Primary, rotating again), while the SyncTarget did not change its uuids at all, or only got to the previous sync-uuid. This would then again lead to a full sync on next handshake (see also Bug #251). Fix: Use explicit resync finished notification even for empty resyncs, do not finish an empty resync implicitly on the SyncSource. Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com> Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>

commit: af85e8e83d160f72a10e4467852646ac08614260 [log] [tgz]
author: Lars Ellenberg <lars.ellenberg@linbit.com> Thu Oct 07 16:07:55 2010 +0200
committer: Philipp Reisner <philipp.reisner@linbit.com> Thu Oct 14 18:38:48 2010 +0200
tree: b2c842d6129065bbdd787a810038752e6239b0ef
parent: e9ef7bb6f9696471ddddf0065afac8b435e5d051 [diff]
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index accb37d..63f45d7 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c

@@ -1426,6 +1426,11 @@
 	    (os.user_isp && !ns.user_isp))
 		resume_next_sg(mdev);
 
+	/* sync target done with resync.  Explicitly notify peer, even though
+	 * it should (at least for non-empty resyncs) already know itself. */
+	if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
+		drbd_send_state(mdev);
+
 	/* free tl_hash if we Got thawed and are C_STANDALONE */
 	if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
 		drbd_free_tl_hash(mdev);

diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 166b51e..88be45a 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c

@@ -522,6 +522,12 @@
 		dev_err(DEV, "%s in w_make_resync_request\n",
 			drbd_conn_str(mdev->state.conn));
 
+	if (mdev->rs_total == 0) {
+		/* empty resync? */
+		drbd_resync_finished(mdev);
+		return 1;
+	}
+
 	if (!get_ldev(mdev)) {
 		/* Since we only need to access mdev->rsync a
 		   get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
@@ -768,6 +774,14 @@
 	return 1;
 }
 
+static void ping_peer(struct drbd_conf *mdev)
+{
+	clear_bit(GOT_PING_ACK, &mdev->flags);
+	request_ping(mdev);
+	wait_event(mdev->misc_wait,
+		   test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
+}
+
 int drbd_resync_finished(struct drbd_conf *mdev)
 {
 	unsigned long db, dt, dbdt;
@@ -807,6 +821,8 @@
 	if (!get_ldev(mdev))
 		goto out;
 
+	ping_peer(mdev);
+
 	spin_lock_irq(&mdev->req_lock);
 	os = mdev->state;
 
@@ -1420,14 +1436,6 @@
 	return retcode;
 }
 
-static void ping_peer(struct drbd_conf *mdev)
-{
-	clear_bit(GOT_PING_ACK, &mdev->flags);
-	request_ping(mdev);
-	wait_event(mdev->misc_wait,
-		   test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
-}
-
 /**
  * drbd_start_resync() - Start the resync process
  * @mdev:	DRBD device.
@@ -1527,9 +1535,21 @@
 		     (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
 		     (unsigned long) mdev->rs_total);
 
-		if (mdev->rs_total == 0) {
-			/* Peer still reachable? Beware of failing before-resync-target handlers! */
-			ping_peer(mdev);
+		if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) {
+			/* This still has a race (about when exactly the peers
+			 * detect connection loss) that can lead to a full sync
+			 * on next handshake. In 8.3.9 we fixed this with explicit
+			 * resync-finished notifications, but the fix
+			 * introduces a protocol change.  Sleeping for some
+			 * time longer than the ping interval + timeout on the
+			 * SyncSource, to give the SyncTarget the chance to
+			 * detect connection loss, then waiting for a ping
+			 * response (implicit in drbd_resync_finished) reduces
+			 * the race considerably, but does not solve it. */
+			if (side == C_SYNC_SOURCE)
+				schedule_timeout_interruptible(
+					mdev->net_conf->ping_int * HZ +
+					mdev->net_conf->ping_timeo*HZ/9);
 			drbd_resync_finished(mdev);
 		}
commit	af85e8e83d160f72a10e4467852646ac08614260	[log] [tgz]
author	Lars Ellenberg <lars.ellenberg@linbit.com>	Thu Oct 07 16:07:55 2010 +0200
committer	Philipp Reisner <philipp.reisner@linbit.com>	Thu Oct 14 18:38:48 2010 +0200
tree	b2c842d6129065bbdd787a810038752e6239b0ef
parent	e9ef7bb6f9696471ddddf0065afac8b435e5d051 [diff]