RDS: Rewrite connection cleanup, fixing oops on rmmod

This fixes a bug where a connection was unexpectedly
not on *any* list while being destroyed. It also
cleans up some code duplication and regularizes some
function names.

* Grab appropriate lock in conn_free() and explain in comment
* Ensure via locking that a conn is never not on either
  a dev's list or the nodev list
* Add rds_xx_remove_conn() to match rds_xx_add_conn()
* Make rds_xx_add_conn() return void
* Rename remove_{,nodev_}conns() to
  destroy_{,nodev_}conns() and unify their implementation
  in a helper function
* Document lock ordering as nodev conn_lock before
  dev_conn_lock

Reported-by: Yosef Etigin <yosefe@voltaire.com>
Signed-off-by: Andy Grover <andy.grover@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 06a7b79..4933b38 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -51,6 +51,7 @@
 
 struct list_head rds_ib_devices;
 
+/* NOTE: if also grabbing ibdev lock, grab this first */
 DEFINE_SPINLOCK(ib_nodev_conns_lock);
 LIST_HEAD(ib_nodev_conns);
 
@@ -137,7 +138,7 @@
 		kfree(i_ipaddr);
 	}
 
-	rds_ib_remove_conns(rds_ibdev);
+	rds_ib_destroy_conns(rds_ibdev);
 
 	if (rds_ibdev->mr_pool)
 		rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
@@ -249,7 +250,7 @@
 void rds_ib_exit(void)
 {
 	rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
-	rds_ib_remove_nodev_conns();
+	rds_ib_destroy_nodev_conns();
 	ib_unregister_client(&rds_ib_client);
 	rds_ib_sysctl_exit();
 	rds_ib_recv_exit();
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 8be563a..c08ffff 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -267,9 +267,17 @@
 
 /* ib_rdma.c */
 int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
-int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
-void rds_ib_remove_nodev_conns(void);
-void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev);
+void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
+void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
+void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock);
+static inline void rds_ib_destroy_nodev_conns(void)
+{
+	__rds_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock);
+}
+static inline void rds_ib_destroy_conns(struct rds_ib_device *rds_ibdev)
+{
+	__rds_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock);
+}
 struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
 void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 0532237..889ab04 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -126,9 +126,7 @@
 	err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
 	if (err)
 		printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err);
-	err = rds_ib_add_conn(rds_ibdev, conn);
-	if (err)
-		printk(KERN_ERR "rds_ib_add_conn failed (%d)\n", err);
+	rds_ib_add_conn(rds_ibdev, conn);
 
 	/* If the peer gave us the last packet it saw, process this as if
 	 * we had received a regular ACK. */
@@ -616,18 +614,8 @@
 		/*
 		 * Move connection back to the nodev list.
 		 */
-		if (ic->rds_ibdev) {
-
-			spin_lock_irq(&ic->rds_ibdev->spinlock);
-			BUG_ON(list_empty(&ic->ib_node));
-			list_del(&ic->ib_node);
-			spin_unlock_irq(&ic->rds_ibdev->spinlock);
-
-			spin_lock_irq(&ib_nodev_conns_lock);
-			list_add_tail(&ic->ib_node, &ib_nodev_conns);
-			spin_unlock_irq(&ib_nodev_conns_lock);
-			ic->rds_ibdev = NULL;
-		}
+		if (ic->rds_ibdev)
+			rds_ib_remove_conn(ic->rds_ibdev, conn);
 
 		ic->i_cm_id = NULL;
 		ic->i_pd = NULL;
@@ -701,11 +689,27 @@
 	return 0;
 }
 
+/*
+ * Free a connection. Connection must be shut down and not set for reconnect.
+ */
 void rds_ib_conn_free(void *arg)
 {
 	struct rds_ib_connection *ic = arg;
+	spinlock_t	*lock_ptr;
+
 	rdsdebug("ic %p\n", ic);
+
+	/*
+	 * Conn is either on a dev's list or on the nodev list.
+	 * A race with shutdown() or connect() would cause problems
+	 * (since rds_ibdev would change) but that should never happen.
+	 */
+	lock_ptr = ic->rds_ibdev ? &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock;
+
+	spin_lock_irq(lock_ptr);
 	list_del(&ic->ib_node);
+	spin_unlock_irq(lock_ptr);
+
 	kfree(ic);
 }
 
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 69a6289..81033af 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -139,7 +139,7 @@
 	return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
 }
 
-int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
+void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
 
@@ -148,45 +148,44 @@
 	BUG_ON(list_empty(&ib_nodev_conns));
 	BUG_ON(list_empty(&ic->ib_node));
 	list_del(&ic->ib_node);
-	spin_unlock_irq(&ib_nodev_conns_lock);
 
 	spin_lock_irq(&rds_ibdev->spinlock);
 	list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
 	spin_unlock_irq(&rds_ibdev->spinlock);
-
-	ic->rds_ibdev = rds_ibdev;
-
-	return 0;
-}
-
-void rds_ib_remove_nodev_conns(void)
-{
-	struct rds_ib_connection *ic, *_ic;
-	LIST_HEAD(tmp_list);
-
-	/* avoid calling conn_destroy with irqs off */
-	spin_lock_irq(&ib_nodev_conns_lock);
-	list_splice(&ib_nodev_conns, &tmp_list);
-	INIT_LIST_HEAD(&ib_nodev_conns);
 	spin_unlock_irq(&ib_nodev_conns_lock);
 
-	list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) {
-		if (ic->conn->c_passive)
-			rds_conn_destroy(ic->conn->c_passive);
-		rds_conn_destroy(ic->conn);
-	}
+	ic->rds_ibdev = rds_ibdev;
 }
 
-void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev)
+void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+
+	/* place conn on nodev_conns_list */
+	spin_lock(&ib_nodev_conns_lock);
+
+	spin_lock_irq(&rds_ibdev->spinlock);
+	BUG_ON(list_empty(&ic->ib_node));
+	list_del(&ic->ib_node);
+	spin_unlock_irq(&rds_ibdev->spinlock);
+
+	list_add_tail(&ic->ib_node, &ib_nodev_conns);
+
+	spin_unlock(&ib_nodev_conns_lock);
+
+	ic->rds_ibdev = NULL;
+}
+
+void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock)
 {
 	struct rds_ib_connection *ic, *_ic;
 	LIST_HEAD(tmp_list);
 
 	/* avoid calling conn_destroy with irqs off */
-	spin_lock_irq(&rds_ibdev->spinlock);
-	list_splice(&rds_ibdev->conn_list, &tmp_list);
-	INIT_LIST_HEAD(&rds_ibdev->conn_list);
-	spin_unlock_irq(&rds_ibdev->spinlock);
+	spin_lock_irq(list_lock);
+	list_splice(list, &tmp_list);
+	INIT_LIST_HEAD(list);
+	spin_unlock_irq(list_lock);
 
 	list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) {
 		if (ic->conn->c_passive)
diff --git a/net/rds/iw.c b/net/rds/iw.c
index 1b56905..b732efb 100644
--- a/net/rds/iw.c
+++ b/net/rds/iw.c
@@ -51,6 +51,7 @@
 
 struct list_head rds_iw_devices;
 
+/* NOTE: if also grabbing iwdev lock, grab this first */
 DEFINE_SPINLOCK(iw_nodev_conns_lock);
 LIST_HEAD(iw_nodev_conns);
 
@@ -145,7 +146,7 @@
 	}
 	spin_unlock_irq(&rds_iwdev->spinlock);
 
-	rds_iw_remove_conns(rds_iwdev);
+	rds_iw_destroy_conns(rds_iwdev);
 
 	if (rds_iwdev->mr_pool)
 		rds_iw_destroy_mr_pool(rds_iwdev->mr_pool);
@@ -258,7 +259,7 @@
 void rds_iw_exit(void)
 {
 	rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
-	rds_iw_remove_nodev_conns();
+	rds_iw_destroy_nodev_conns();
 	ib_unregister_client(&rds_iw_client);
 	rds_iw_sysctl_exit();
 	rds_iw_recv_exit();
diff --git a/net/rds/iw.h b/net/rds/iw.h
index 0ddda34..70eb948 100644
--- a/net/rds/iw.h
+++ b/net/rds/iw.h
@@ -294,9 +294,17 @@
 
 /* ib_rdma.c */
 int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
-int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
-void rds_iw_remove_nodev_conns(void);
-void rds_iw_remove_conns(struct rds_iw_device *rds_iwdev);
+void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
+void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
+void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock);
+static inline void rds_iw_destroy_nodev_conns(void)
+{
+	__rds_iw_destroy_conns(&iw_nodev_conns, &iw_nodev_conns_lock);
+}
+static inline void rds_iw_destroy_conns(struct rds_iw_device *rds_iwdev)
+{
+	__rds_iw_destroy_conns(&rds_iwdev->conn_list, &rds_iwdev->spinlock);
+}
 struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *);
 void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo);
 void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *);
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
index 57ecb3d..0ffaa3e 100644
--- a/net/rds/iw_cm.c
+++ b/net/rds/iw_cm.c
@@ -86,9 +86,7 @@
 	err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id);
 	if (err)
 		printk(KERN_ERR "rds_iw_update_ipaddr failed (%d)\n", err);
-	err = rds_iw_add_conn(rds_iwdev, conn);
-	if (err)
-		printk(KERN_ERR "rds_iw_add_conn failed (%d)\n", err);
+	rds_iw_add_conn(rds_iwdev, conn);
 
 	/* If the peer gave us the last packet it saw, process this as if
 	 * we had received a regular ACK. */
@@ -637,19 +635,8 @@
 		 * 	Move connection back to the nodev list.
 		 * 	Remove cm_id from the device cm_id list.
 		 */
-		if (ic->rds_iwdev) {
-
-			spin_lock_irq(&ic->rds_iwdev->spinlock);
-			BUG_ON(list_empty(&ic->iw_node));
-			list_del(&ic->iw_node);
-			spin_unlock_irq(&ic->rds_iwdev->spinlock);
-
-			spin_lock_irq(&iw_nodev_conns_lock);
-			list_add_tail(&ic->iw_node, &iw_nodev_conns);
-			spin_unlock_irq(&iw_nodev_conns_lock);
-			rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id);
-			ic->rds_iwdev = NULL;
-		}
+		if (ic->rds_iwdev)
+			rds_iw_remove_conn(ic->rds_iwdev, conn);
 
 		rdma_destroy_id(ic->i_cm_id);
 
@@ -726,11 +713,27 @@
 	return 0;
 }
 
+/*
+ * Free a connection. Connection must be shut down and not set for reconnect.
+ */
 void rds_iw_conn_free(void *arg)
 {
 	struct rds_iw_connection *ic = arg;
+	spinlock_t	*lock_ptr;
+
 	rdsdebug("ic %p\n", ic);
+
+	/*
+	 * Conn is either on a dev's list or on the nodev list.
+	 * A race with shutdown() or connect() would cause problems
+	 * (since rds_iwdev would change) but that should never happen.
+	 */
+	lock_ptr = ic->rds_iwdev ? &ic->rds_iwdev->spinlock : &iw_nodev_conns_lock;
+
+	spin_lock_irq(lock_ptr);
 	list_del(&ic->iw_node);
+	spin_unlock_irq(lock_ptr);
+
 	kfree(ic);
 }
 
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
index 1c02a8f..dcdb37d 100644
--- a/net/rds/iw_rdma.c
+++ b/net/rds/iw_rdma.c
@@ -196,7 +196,7 @@
 	return rds_iw_add_cm_id(rds_iwdev, cm_id);
 }
 
-int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
+void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
 {
 	struct rds_iw_connection *ic = conn->c_transport_data;
 
@@ -205,45 +205,45 @@
 	BUG_ON(list_empty(&iw_nodev_conns));
 	BUG_ON(list_empty(&ic->iw_node));
 	list_del(&ic->iw_node);
-	spin_unlock_irq(&iw_nodev_conns_lock);
 
 	spin_lock_irq(&rds_iwdev->spinlock);
 	list_add_tail(&ic->iw_node, &rds_iwdev->conn_list);
 	spin_unlock_irq(&rds_iwdev->spinlock);
-
-	ic->rds_iwdev = rds_iwdev;
-
-	return 0;
-}
-
-void rds_iw_remove_nodev_conns(void)
-{
-	struct rds_iw_connection *ic, *_ic;
-	LIST_HEAD(tmp_list);
-
-	/* avoid calling conn_destroy with irqs off */
-	spin_lock_irq(&iw_nodev_conns_lock);
-	list_splice(&iw_nodev_conns, &tmp_list);
-	INIT_LIST_HEAD(&iw_nodev_conns);
 	spin_unlock_irq(&iw_nodev_conns_lock);
 
-	list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) {
-		if (ic->conn->c_passive)
-			rds_conn_destroy(ic->conn->c_passive);
-		rds_conn_destroy(ic->conn);
-	}
+	ic->rds_iwdev = rds_iwdev;
 }
 
-void rds_iw_remove_conns(struct rds_iw_device *rds_iwdev)
+void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+
+	/* place conn on nodev_conns_list */
+	spin_lock(&iw_nodev_conns_lock);
+
+	spin_lock_irq(&rds_iwdev->spinlock);
+	BUG_ON(list_empty(&ic->iw_node));
+	list_del(&ic->iw_node);
+	spin_unlock_irq(&rds_iwdev->spinlock);
+
+	list_add_tail(&ic->iw_node, &iw_nodev_conns);
+
+	spin_unlock(&iw_nodev_conns_lock);
+
+	rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id);
+	ic->rds_iwdev = NULL;
+}
+
+void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock)
 {
 	struct rds_iw_connection *ic, *_ic;
 	LIST_HEAD(tmp_list);
 
 	/* avoid calling conn_destroy with irqs off */
-	spin_lock_irq(&rds_iwdev->spinlock);
-	list_splice(&rds_iwdev->conn_list, &tmp_list);
-	INIT_LIST_HEAD(&rds_iwdev->conn_list);
-	spin_unlock_irq(&rds_iwdev->spinlock);
+	spin_lock_irq(list_lock);
+	list_splice(list, &tmp_list);
+	INIT_LIST_HEAD(list);
+	spin_unlock_irq(list_lock);
 
 	list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) {
 		if (ic->conn->c_passive)