xprtrdma: Move Receive posting to Receive handler
Receive completion and Reply handling are done by a BOUND
workqueue, meaning they run on only one CPU.
Posting receives is currently done in the send_request path, which
on large systems is typically done on a different CPU than the one
handling Receive completions. This results in movement of
Receive-related cachelines between the sending and receiving CPUs.
More importantly, it means that currently Receives are posted while
the transport's write lock is held, which is unnecessary and costly.
Finally, allocation of Receive buffers is performed on-demand in
the Receive completion handler. This helps guarantee that they are
allocated on the same NUMA node as the CPU that handles Receive
completions.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 4b18926..2ed2ee4 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -74,6 +74,7 @@
*/
static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf);
+static int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp);
static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb);
struct workqueue_struct *rpcrdma_receive_wq __read_mostly;
@@ -726,7 +727,6 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
rx_ia);
- unsigned int extras;
int rc;
retry:
@@ -770,9 +770,8 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
}
dprintk("RPC: %s: connected\n", __func__);
- extras = r_xprt->rx_buf.rb_bc_srv_max_requests;
- if (extras)
- rpcrdma_ep_post_extra_recv(r_xprt, extras);
+
+ rpcrdma_post_recvs(r_xprt, true);
out:
if (rc)
@@ -1082,14 +1081,8 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
return req;
}
-/**
- * rpcrdma_create_rep - Allocate an rpcrdma_rep object
- * @r_xprt: controlling transport
- *
- * Returns 0 on success or a negative errno on failure.
- */
-int
-rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
+static int
+rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp)
{
struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
@@ -1117,6 +1110,7 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
rep->rr_recv_wr.num_sge = 1;
+ rep->rr_temp = temp;
spin_lock(&buf->rb_lock);
list_add(&rep->rr_list, &buf->rb_recv_bufs);
@@ -1168,12 +1162,8 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
list_add(&req->rl_list, &buf->rb_send_bufs);
}
+ buf->rb_posted_receives = 0;
INIT_LIST_HEAD(&buf->rb_recv_bufs);
- for (i = 0; i <= buf->rb_max_requests; i++) {
- rc = rpcrdma_create_rep(r_xprt);
- if (rc)
- goto out;
- }
rc = rpcrdma_sendctxs_create(r_xprt);
if (rc)
@@ -1263,7 +1253,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
rep = rpcrdma_buffer_get_rep_locked(buf);
rpcrdma_destroy_rep(rep);
}
- buf->rb_send_count = 0;
spin_lock(&buf->rb_reqslock);
while (!list_empty(&buf->rb_allreqs)) {
@@ -1278,7 +1267,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
spin_lock(&buf->rb_reqslock);
}
spin_unlock(&buf->rb_reqslock);
- buf->rb_recv_count = 0;
rpcrdma_mrs_destroy(buf);
}
@@ -1351,27 +1339,11 @@ rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr)
__rpcrdma_mr_put(&r_xprt->rx_buf, mr);
}
-static struct rpcrdma_rep *
-rpcrdma_buffer_get_rep(struct rpcrdma_buffer *buffers)
-{
- /* If an RPC previously completed without a reply (say, a
- * credential problem or a soft timeout occurs) then hold off
- * on supplying more Receive buffers until the number of new
- * pending RPCs catches up to the number of posted Receives.
- */
- if (unlikely(buffers->rb_send_count < buffers->rb_recv_count))
- return NULL;
-
- if (unlikely(list_empty(&buffers->rb_recv_bufs)))
- return NULL;
- buffers->rb_recv_count++;
- return rpcrdma_buffer_get_rep_locked(buffers);
-}
-
-/*
- * Get a set of request/reply buffers.
+/**
+ * rpcrdma_buffer_get - Get a request buffer
+ * @buffers: Buffer pool from which to obtain a buffer
*
- * Reply buffer (if available) is attached to send buffer upon return.
+ * Returns a fresh rpcrdma_req, or NULL if none are available.
*/
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
@@ -1379,23 +1351,21 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
struct rpcrdma_req *req;
spin_lock(&buffers->rb_lock);
- if (list_empty(&buffers->rb_send_bufs))
- goto out_reqbuf;
- buffers->rb_send_count++;
+ if (unlikely(list_empty(&buffers->rb_send_bufs)))
+ goto out_noreqs;
req = rpcrdma_buffer_get_req_locked(buffers);
- req->rl_reply = rpcrdma_buffer_get_rep(buffers);
spin_unlock(&buffers->rb_lock);
-
return req;
-out_reqbuf:
+out_noreqs:
spin_unlock(&buffers->rb_lock);
return NULL;
}
-/*
- * Put request/reply buffers back into pool.
- * Pre-decrement counter/array index.
+/**
+ * rpcrdma_buffer_put - Put request/reply buffers back into pool
+ * @req: object to return
+ *
*/
void
rpcrdma_buffer_put(struct rpcrdma_req *req)
@@ -1406,27 +1376,16 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
req->rl_reply = NULL;
spin_lock(&buffers->rb_lock);
- buffers->rb_send_count--;
- list_add_tail(&req->rl_list, &buffers->rb_send_bufs);
+ list_add(&req->rl_list, &buffers->rb_send_bufs);
if (rep) {
- buffers->rb_recv_count--;
- list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
+ if (!rep->rr_temp) {
+ list_add(&rep->rr_list, &buffers->rb_recv_bufs);
+ rep = NULL;
+ }
}
spin_unlock(&buffers->rb_lock);
-}
-
-/*
- * Recover reply buffers from pool.
- * This happens when recovering from disconnect.
- */
-void
-rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
-{
- struct rpcrdma_buffer *buffers = req->rl_buffer;
-
- spin_lock(&buffers->rb_lock);
- req->rl_reply = rpcrdma_buffer_get_rep(buffers);
- spin_unlock(&buffers->rb_lock);
+ if (rep)
+ rpcrdma_destroy_rep(rep);
}
/*
@@ -1438,10 +1397,13 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;
- spin_lock(&buffers->rb_lock);
- buffers->rb_recv_count--;
- list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
- spin_unlock(&buffers->rb_lock);
+ if (!rep->rr_temp) {
+ spin_lock(&buffers->rb_lock);
+ list_add(&rep->rr_list, &buffers->rb_recv_bufs);
+ spin_unlock(&buffers->rb_lock);
+ } else {
+ rpcrdma_destroy_rep(rep);
+ }
}
/**
@@ -1537,13 +1499,6 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
struct ib_send_wr *send_wr = &req->rl_sendctx->sc_wr;
int rc;
- if (req->rl_reply) {
- rc = rpcrdma_ep_post_recv(ia, req->rl_reply);
- if (rc)
- return rc;
- req->rl_reply = NULL;
- }
-
if (!ep->rep_send_count ||
test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) {
send_wr->send_flags |= IB_SEND_SIGNALED;
@@ -1618,3 +1573,70 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
rpcrdma_recv_buffer_put(rep);
return rc;
}
+
+/**
+ * rpcrdma_post_recvs - Maybe post some Receive buffers
+ * @r_xprt: controlling transport
+ * @temp: when true, allocate temp rpcrdma_rep objects
+ *
+ */
+void
+rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
+{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct ib_recv_wr *wr, *bad_wr;
+ int needed, count, rc;
+
+ needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1);
+ if (buf->rb_posted_receives > needed)
+ return;
+ needed -= buf->rb_posted_receives;
+
+ count = 0;
+ wr = NULL;
+ while (needed) {
+ struct rpcrdma_regbuf *rb;
+ struct rpcrdma_rep *rep;
+
+ spin_lock(&buf->rb_lock);
+ rep = list_first_entry_or_null(&buf->rb_recv_bufs,
+ struct rpcrdma_rep, rr_list);
+ if (likely(rep))
+ list_del(&rep->rr_list);
+ spin_unlock(&buf->rb_lock);
+ if (!rep) {
+ if (rpcrdma_create_rep(r_xprt, temp))
+ break;
+ continue;
+ }
+
+ rb = rep->rr_rdmabuf;
+ if (!rpcrdma_regbuf_is_mapped(rb)) {
+ if (!__rpcrdma_dma_map_regbuf(&r_xprt->rx_ia, rb)) {
+ rpcrdma_recv_buffer_put(rep);
+ break;
+ }
+ }
+
+ trace_xprtrdma_post_recv(rep->rr_recv_wr.wr_cqe);
+ rep->rr_recv_wr.next = wr;
+ wr = &rep->rr_recv_wr;
+ ++count;
+ --needed;
+ }
+ if (!count)
+ return;
+
+ rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr, &bad_wr);
+ if (rc) {
+ for (wr = bad_wr; wr; wr = wr->next) {
+ struct rpcrdma_rep *rep;
+
+ rep = container_of(wr, struct rpcrdma_rep, rr_recv_wr);
+ rpcrdma_recv_buffer_put(rep);
+ --count;
+ }
+ }
+ buf->rb_posted_receives += count;
+ trace_xprtrdma_post_recvs(r_xprt, count, rc);
+}