\"Talpey, Thomas\f58851e2007-09-10 13:50:12 -04001/*
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04002 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\"Talpey, Thomas\f58851e2007-09-10 13:50:12 -040038 */
39
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -040040/*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
Alexey Dobriyana6b7a402011-06-06 10:43:46 +000050#include <linux/interrupt.h>
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -040051#include <linux/pci.h> /* for Tavor hack below */
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090052#include <linux/slab.h>
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -040053
\"Talpey, Thomas\f58851e2007-09-10 13:50:12 -040054#include "xprt_rdma.h"
55
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -040056/*
57 * Globals/Macros
58 */
59
60#ifdef RPC_DEBUG
61# define RPCDBG_FACILITY RPCDBG_TRANS
62#endif
63
64/*
65 * internal functions
66 */
67
68/*
69 * handle replies in tasklet context, using a single, global list
70 * rdma tasklet function -- just turn around and call the func
71 * for all replies on the list
72 */
73
74static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
75static LIST_HEAD(rpcrdma_tasklets_g);
76
77static void
78rpcrdma_run_tasklet(unsigned long data)
79{
80 struct rpcrdma_rep *rep;
81 void (*func)(struct rpcrdma_rep *);
82 unsigned long flags;
83
	data = data;	/* tasklet argument is unused */
	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	while (!list_empty(&rpcrdma_tasklets_g)) {
		rep = list_entry(rpcrdma_tasklets_g.next,
				 struct rpcrdma_rep, rr_list);
		list_del(&rep->rr_list);
		func = rep->rr_func;
		rep->rr_func = NULL;
		spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);

		if (func)
			func(rep);
		else
			rpcrdma_recv_buffer_put(rep);

		spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	}
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
}

static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);

static inline void
rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
{
	unsigned long flags;

	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
	tasklet_schedule(&rpcrdma_tasklet_g);
}

static void
rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	dprintk("RPC: %s: QP error %X on device %s ep %p\n",
		__func__, event->event, event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

static void
rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
		__func__, event->event, event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

static inline
void rpcrdma_event_process(struct ib_wc *wc)
{
	struct rpcrdma_mw *frmr;
	struct rpcrdma_rep *rep =
			(struct rpcrdma_rep *)(unsigned long) wc->wr_id;

	dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
		__func__, rep, wc->status, wc->opcode, wc->byte_len);

	if (!rep) /* send or bind completion that we don't care about */
		return;

	if (IB_WC_SUCCESS != wc->status) {
		dprintk("RPC: %s: WC opcode %d status %X, connection lost\n",
			__func__, wc->opcode, wc->status);
		rep->rr_len = ~0U;
		if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
			rpcrdma_schedule_tasklet(rep);
		return;
	}

	switch (wc->opcode) {
	case IB_WC_FAST_REG_MR:
		frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
		frmr->r.frmr.state = FRMR_IS_VALID;
		break;
	case IB_WC_LOCAL_INV:
		frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
		frmr->r.frmr.state = FRMR_IS_INVALID;
		break;
	case IB_WC_RECV:
		rep->rr_len = wc->byte_len;
		ib_dma_sync_single_for_cpu(
			rdmab_to_ia(rep->rr_buffer)->ri_id->device,
			rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
		/* Keep (only) the most recent credits, after checking validity */
		if (rep->rr_len >= 16) {
			struct rpcrdma_msg *p =
					(struct rpcrdma_msg *) rep->rr_base;
			unsigned int credits = ntohl(p->rm_credit);
			if (credits == 0) {
				dprintk("RPC: %s: server"
					" dropped credits to 0!\n", __func__);
				/* don't deadlock */
				credits = 1;
			} else if (credits > rep->rr_buffer->rb_max_requests) {
				dprintk("RPC: %s: server"
					" over-crediting: %d (%d)\n",
					__func__, credits,
					rep->rr_buffer->rb_max_requests);
				credits = rep->rr_buffer->rb_max_requests;
			}
			atomic_set(&rep->rr_buffer->rb_credits, credits);
		}
		/* fall through */
	case IB_WC_BIND_MW:
		rpcrdma_schedule_tasklet(rep);
		break;
	default:
		dprintk("RPC: %s: unexpected WC event %X\n",
			__func__, wc->opcode);
		break;
	}
}

static inline int
rpcrdma_cq_poll(struct ib_cq *cq)
{
	struct ib_wc wc;
	int rc;

	for (;;) {
		rc = ib_poll_cq(cq, 1, &wc);
		if (rc < 0) {
			dprintk("RPC: %s: ib_poll_cq failed %i\n",
				__func__, rc);
			return rc;
		}
		if (rc == 0)
			break;

		rpcrdma_event_process(&wc);
	}

	return 0;
}

/*
 * rpcrdma_cq_event_upcall
 *
 * This upcall handles recv, send, bind and unbind events.
 * It is reentrant but processes single events in order to maintain
 * ordering of receives to keep server credits.
 *
 * It is the responsibility of the scheduled tasklet to return
 * recv buffers to the pool. NOTE: this affects synchronization of
 * connection shutdown. That is, the structures required for
 * the completion of the reply handler must remain intact until
 * all memory has been reclaimed.
 *
 * Note that send events are suppressed and do not result in an upcall.
 */
static void
rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
{
	int rc;

	rc = rpcrdma_cq_poll(cq);
	if (rc)
		return;

	rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
			__func__, rc);
		return;
	}

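	/*
	 * Poll once more after re-arming the CQ: completions that arrived
	 * between draining the queue above and the ib_req_notify_cq() call
	 * would otherwise be missed until the next interrupt.
	 */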
	rpcrdma_cq_poll(cq);
}

#ifdef RPC_DEBUG
static const char * const conn[] = {
	"address resolved",
	"address error",
	"route resolved",
	"route error",
	"connect request",
	"connect response",
	"connect error",
	"unreachable",
	"rejected",
	"established",
	"disconnected",
	"device removal"
};
#endif

static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
#ifdef RPC_DEBUG
	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
#endif
	struct ib_qp_attr attr;
	struct ib_qp_init_attr iattr;
	int connstate = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		ia->ri_async_rc = 0;
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		ib_query_qp(ia->ri_id->qp, &attr,
			IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			&iattr);
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
			__func__,
			(event->event <= 11) ? conn[event->event] :
				"unknown connection error",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ep, event->event);
		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		ep->rep_connected = connstate;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		break;
	default:
		dprintk("RPC: %s: unexpected CM event %d\n",
			__func__, event->event);
		break;
	}

#ifdef RPC_DEBUG
	if (connstate == 1) {
		int ird = attr.max_dest_rd_atomic;
		int tird = ep->rep_remote_cma.responder_resources;
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
			"on %s, memreg %d slots %d ird %d%s\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ia->ri_id->device->name,
			ia->ri_memreg_strategy,
			xprt->rx_buf.rb_max_requests,
			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
	} else if (connstate < 0) {
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			connstate);
	}
#endif

	return 0;
}

static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
			struct rpcrdma_ia *ia, struct sockaddr *addr)
{
	struct rdma_cm_id *id;
	int rc;

	init_completion(&ia->ri_done);

	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(id)) {
		rc = PTR_ERR(id);
		dprintk("RPC: %s: rdma_create_id() failed %i\n",
			__func__, rc);
		return id;
	}

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	return id;

out:
	rdma_destroy_id(id);
	return ERR_PTR(rc);
}

/*
 * Drain any cq, prior to teardown.
 */
static void
rpcrdma_clean_cq(struct ib_cq *cq)
{
	struct ib_wc wc;
	int count = 0;

	while (1 == ib_poll_cq(cq, 1, &wc))
		++count;

	if (count)
		dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
			__func__, count, wc.opcode);
}

/*
 * Exported functions.
 */

/*
 * Open and initialize an Interface Adapter.
 *  o initializes fields of struct rpcrdma_ia, including
 *    interface and provider attributes and protection domain (PD).
 */
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
	int rc, mem_priv;
	struct ib_device_attr devattr;
	struct rpcrdma_ia *ia = &xprt->rx_ia;

	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
	if (IS_ERR(ia->ri_id)) {
		rc = PTR_ERR(ia->ri_id);
		goto out1;
	}

	ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
	if (IS_ERR(ia->ri_pd)) {
		rc = PTR_ERR(ia->ri_pd);
		dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
			__func__, rc);
		goto out2;
	}

	/*
	 * Query the device to determine if the requested memory
	 * registration strategy is supported. If it isn't, set the
	 * strategy to a globally supported model.
	 */
	rc = ib_query_device(ia->ri_id->device, &devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		goto out2;
	}

	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
		ia->ri_have_dma_lkey = 1;
		ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
	}

	switch (memreg) {
	case RPCRDMA_MEMWINDOWS:
	case RPCRDMA_MEMWINDOWS_ASYNC:
		if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
			dprintk("RPC: %s: MEMWINDOWS registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
		}
		break;
	case RPCRDMA_MTHCAFMR:
		if (!ia->ri_id->device->alloc_fmr) {
#if RPCRDMA_PERSISTENT_REGISTRATION
			dprintk("RPC: %s: MTHCAFMR registration "
				"specified but not supported by adapter, "
				"using riskier RPCRDMA_ALLPHYSICAL\n",
				__func__);
			memreg = RPCRDMA_ALLPHYSICAL;
#else
			dprintk("RPC: %s: MTHCAFMR registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
#endif
		}
		break;
	case RPCRDMA_FRMR:
		/* Requires both frmr reg and local dma lkey */
		if ((devattr.device_cap_flags &
		     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
		    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
#if RPCRDMA_PERSISTENT_REGISTRATION
			dprintk("RPC: %s: FRMR registration "
				"specified but not supported by adapter, "
				"using riskier RPCRDMA_ALLPHYSICAL\n",
				__func__);
			memreg = RPCRDMA_ALLPHYSICAL;
#else
			dprintk("RPC: %s: FRMR registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
#endif
		} else {
			/* Mind the ia limit on FRMR page list depth */
			ia->ri_max_frmr_depth = min_t(unsigned int,
				RPCRDMA_MAX_DATA_SEGS,
				devattr.max_fast_reg_page_list_len);
		}
		break;
	}

	/*
	 * Optionally obtain an underlying physical identity mapping in
	 * order to do a memory window-based bind. This base registration
	 * is protected from remote access - that is enabled only by binding
	 * for the specific bytes targeted during each RPC operation, and
	 * revoked after the corresponding completion similar to a storage
	 * adapter.
	 */
	switch (memreg) {
	case RPCRDMA_BOUNCEBUFFERS:
	case RPCRDMA_REGISTER:
	case RPCRDMA_FRMR:
		break;
#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		mem_priv = IB_ACCESS_LOCAL_WRITE |
				IB_ACCESS_REMOTE_WRITE |
				IB_ACCESS_REMOTE_READ;
		goto register_setup;
#endif
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		mem_priv = IB_ACCESS_LOCAL_WRITE |
				IB_ACCESS_MW_BIND;
		goto register_setup;
	case RPCRDMA_MTHCAFMR:
		if (ia->ri_have_dma_lkey)
			break;
		mem_priv = IB_ACCESS_LOCAL_WRITE;
	register_setup:
		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
		if (IS_ERR(ia->ri_bind_mem)) {
			printk(KERN_ALERT "%s: ib_get_dma_mr for "
				"phys register failed with %lX\n\t"
				"Will continue with degraded performance\n",
				__func__, PTR_ERR(ia->ri_bind_mem));
			memreg = RPCRDMA_REGISTER;
			ia->ri_bind_mem = NULL;
		}
		break;
	default:
		printk(KERN_ERR "%s: invalid memory registration mode %d\n",
				__func__, memreg);
		rc = -EINVAL;
		goto out2;
	}
	dprintk("RPC: %s: memory registration strategy is %d\n",
		__func__, memreg);

	/* Else will do memory reg/dereg for each chunk */
	ia->ri_memreg_strategy = memreg;

	return 0;
out2:
	rdma_destroy_id(ia->ri_id);
	ia->ri_id = NULL;
out1:
	return rc;
}

/*
 * Clean up/close an IA.
 *  o if event handles and PD have been initialized, free them.
 *  o close the IA
 */
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering\n", __func__);
	if (ia->ri_bind_mem != NULL) {
		rc = ib_dereg_mr(ia->ri_bind_mem);
		dprintk("RPC: %s: ib_dereg_mr returned %i\n",
			__func__, rc);
	}
	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
		if (ia->ri_id->qp)
			rdma_destroy_qp(ia->ri_id);
		rdma_destroy_id(ia->ri_id);
		ia->ri_id = NULL;
	}
	if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
		rc = ib_dealloc_pd(ia->ri_pd);
		dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
			__func__, rc);
	}
}

/*
 * Create unconnected endpoint.
 */
int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
				struct rpcrdma_create_data_internal *cdata)
{
	struct ib_device_attr devattr;
	int rc, err;

	rc = ib_query_device(ia->ri_id->device, &devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		return rc;
	}

	/* check provider's send/recv wr limits */
	if (cdata->max_requests > devattr.max_qp_wr)
		cdata->max_requests = devattr.max_qp_wr;

	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
	ep->rep_attr.qp_context = ep;
	/* send_cq and recv_cq initialized below */
	ep->rep_attr.srq = NULL;
	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR: {
		int depth = 7;

		/* Add room for frmr register and invalidate WRs.
		 * 1. FRMR reg WR for head
		 * 2. FRMR invalidate WR for head
		 * 3. N FRMR reg WRs for pagelist
		 * 4. N FRMR invalidate WRs for pagelist
		 * 5. FRMR reg WR for tail
		 * 6. FRMR invalidate WR for tail
		 * 7. The RDMA_SEND WR
		 */

		/* Calculate N if the device max FRMR depth is smaller than
		 * RPCRDMA_MAX_DATA_SEGS.
		 */
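		/* For illustration (values are hypothetical): if
		 * RPCRDMA_MAX_DATA_SEGS were 64 and the device reported a
		 * max FRMR depth of 16, delta would start at 48 and the
		 * loop below would run three times, adding 2 WRs per extra
		 * FRMR, growing depth from 7 to 13 send WRs per RPC.
		 */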
		if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
			int delta = RPCRDMA_MAX_DATA_SEGS -
				    ia->ri_max_frmr_depth;

			do {
				depth += 2; /* FRMR reg + invalidate */
				delta -= ia->ri_max_frmr_depth;
			} while (delta > 0);

		}
		ep->rep_attr.cap.max_send_wr *= depth;
		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
			cdata->max_requests = devattr.max_qp_wr / depth;
			if (!cdata->max_requests)
				return -EINVAL;
			ep->rep_attr.cap.max_send_wr = cdata->max_requests *
						       depth;
		}
		break;
	}
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* Add room for mw_binds+unbinds - overkill! */
		ep->rep_attr.cap.max_send_wr++;
		ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
			return -EINVAL;
		break;
	default:
		break;
	}
	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
	ep->rep_attr.cap.max_recv_sge = 1;
	ep->rep_attr.cap.max_inline_data = 0;
	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	ep->rep_attr.qp_type = IB_QPT_RC;
	ep->rep_attr.port_num = ~0;

	dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
		"iovs: send %d recv %d\n",
		__func__,
		ep->rep_attr.cap.max_send_wr,
		ep->rep_attr.cap.max_recv_wr,
		ep->rep_attr.cap.max_send_sge,
		ep->rep_attr.cap.max_recv_sge);

	/* set trigger for requesting send completion */
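	/* Most sends are posted unsignaled (IB_SIGNAL_REQ_WR above); the
	 * CQCOUNT macros count down from rep_cqinit and request a signaled
	 * send completion roughly every max_send_wr/2 sends, so the send
	 * queue is reaped before it can fill.
	 */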
	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
		break;
	default:
		break;
	}
	if (ep->rep_cqinit <= 2)
		ep->rep_cqinit = 0;
	INIT_CQCOUNT(ep);
	ep->rep_ia = ia;
	init_waitqueue_head(&ep->rep_connect_wait);

	/*
	 * Create a single cq for receive dto and mw_bind (only ever
	 * care about unbind, really). Send completions are suppressed.
	 * Use single threaded tasklet upcalls to maintain ordering.
	 */
	ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
				  rpcrdma_cq_async_error_upcall, NULL,
				  ep->rep_attr.cap.max_recv_wr +
				  ep->rep_attr.cap.max_send_wr + 1, 0);
	if (IS_ERR(ep->rep_cq)) {
		rc = PTR_ERR(ep->rep_cq);
		dprintk("RPC: %s: ib_create_cq failed: %i\n",
			__func__, rc);
		goto out1;
	}

	rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		goto out2;
	}

	ep->rep_attr.send_cq = ep->rep_cq;
	ep->rep_attr.recv_cq = ep->rep_cq;

	/* Initialize cma parameters */

	/* RPC/RDMA does not use private data */
	ep->rep_remote_cma.private_data = NULL;
	ep->rep_remote_cma.private_data_len = 0;

	/* Client offers RDMA Read but does not initiate */
	ep->rep_remote_cma.initiator_depth = 0;
	if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
		ep->rep_remote_cma.responder_resources = 0;
	else if (devattr.max_qp_rd_atom > 32)	/* arbitrary but <= 255 */
		ep->rep_remote_cma.responder_resources = 32;
	else
		ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;

	ep->rep_remote_cma.retry_count = 7;
	ep->rep_remote_cma.flow_control = 0;
	ep->rep_remote_cma.rnr_retry_count = 0;

	return 0;

out2:
	err = ib_destroy_cq(ep->rep_cq);
	if (err)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, err);
out1:
	return rc;
}

/*
 * rpcrdma_ep_destroy
 *
 * Disconnect and destroy endpoint. After this, the only
 * valid operations on the ep are to free it (if dynamically
 * allocated) or re-create it.
 *
 * The caller's error handling must be sure to not leak the endpoint
 * if this function fails.
 */
int
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering, connected is %d\n",
		__func__, ep->rep_connected);

	if (ia->ri_id->qp) {
		rc = rpcrdma_ep_disconnect(ep, ia);
		if (rc)
			dprintk("RPC: %s: rpcrdma_ep_disconnect"
				" returned %i\n", __func__, rc);
		rdma_destroy_qp(ia->ri_id);
		ia->ri_id->qp = NULL;
	}

	/* padding - could be done in rpcrdma_buffer_destroy... */
	if (ep->rep_pad_mr) {
		rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
		ep->rep_pad_mr = NULL;
	}

	rpcrdma_clean_cq(ep->rep_cq);
	rc = ib_destroy_cq(ep->rep_cq);
	if (rc)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, rc);

	return rc;
}

/*
 * Connect unconnected endpoint.
 */
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	struct rdma_cm_id *id;
	int rc = 0;
	int retry_count = 0;

	if (ep->rep_connected != 0) {
		struct rpcrdma_xprt *xprt;
retry:
		rc = rpcrdma_ep_disconnect(ep, ia);
		if (rc && rc != -ENOTCONN)
			dprintk("RPC: %s: rpcrdma_ep_disconnect"
				" status %i\n", __func__, rc);
		rpcrdma_clean_cq(ep->rep_cq);

		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
		id = rpcrdma_create_id(xprt, ia,
				(struct sockaddr *)&xprt->rx_data.addr);
		if (IS_ERR(id)) {
			rc = PTR_ERR(id);
			goto out;
		}
		/* TEMP TEMP TEMP - fail if new device:
		 * Deregister/remarshal *all* requests!
		 * Close and recreate adapter, pd, etc!
		 * Re-determine all attributes still sane!
		 * More stuff I haven't thought of!
		 * Rrrgh!
		 */
		if (ia->ri_id->device != id->device) {
			printk("RPC: %s: can't reconnect on "
				"different device!\n", __func__);
			rdma_destroy_id(id);
			rc = -ENETDOWN;
			goto out;
		}
		/* END TEMP */
		rdma_destroy_qp(ia->ri_id);
		rdma_destroy_id(ia->ri_id);
		ia->ri_id = id;
	}

	rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
	if (rc) {
		dprintk("RPC: %s: rdma_create_qp failed %i\n",
			__func__, rc);
		goto out;
	}

/* XXX Tavor device performs badly with 2K MTU! */
if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
	struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
	if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
	    (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
	     pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
		struct ib_qp_attr attr = {
			.path_mtu = IB_MTU_1024
		};
		rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
	}
}

	ep->rep_connected = 0;

	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
	if (rc) {
		dprintk("RPC: %s: rdma_connect() failed with %i\n",
				__func__, rc);
		goto out;
	}

	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);

	/*
	 * Check state. A non-peer reject indicates no listener
	 * (ECONNREFUSED), which may be a transient state. All
	 * others indicate a transport condition which has already
	 * undergone a best-effort attempt.
	 */
	if (ep->rep_connected == -ECONNREFUSED &&
	    ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
		dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
		goto retry;
	}
	if (ep->rep_connected <= 0) {
		/* Sometimes, the only way to reliably connect to remote
		 * CMs is to use same nonzero values for ORD and IRD. */
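		/* ORD/IRD here are the outbound and inbound RDMA Read
		 * depths advertised to the CM: initiator_depth and
		 * responder_resources in the conn_param used below.
		 */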
		if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
		    (ep->rep_remote_cma.responder_resources == 0 ||
		     ep->rep_remote_cma.initiator_depth !=
				ep->rep_remote_cma.responder_resources)) {
			if (ep->rep_remote_cma.responder_resources == 0)
				ep->rep_remote_cma.responder_resources = 1;
			ep->rep_remote_cma.initiator_depth =
				ep->rep_remote_cma.responder_resources;
			goto retry;
		}
		rc = ep->rep_connected;
	} else {
		dprintk("RPC: %s: connected\n", __func__);
	}

out:
	if (rc)
		ep->rep_connected = rc;
	return rc;
}

/*
 * rpcrdma_ep_disconnect
 *
 * This is separate from destroy to facilitate the ability
 * to reconnect without recreating the endpoint.
 *
 * This call is not reentrant, and must not be made in parallel
 * on the same endpoint.
 */
int
rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	rpcrdma_clean_cq(ep->rep_cq);
	rc = rdma_disconnect(ia->ri_id);
	if (!rc) {
		/* returns without wait if not connected */
		wait_event_interruptible(ep->rep_connect_wait,
					 ep->rep_connected != 1);
		dprintk("RPC: %s: after wait, %sconnected\n", __func__,
			(ep->rep_connected == 1) ? "still " : "dis");
	} else {
		dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
		ep->rep_connected = rc;
	}
	return rc;
}

/*
 * Initialize buffer memory
 */
int
rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
	struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
{
	char *p;
	size_t len;
	int i, rc;
	struct rpcrdma_mw *r;

	buf->rb_max_requests = cdata->max_requests;
	spin_lock_init(&buf->rb_lock);
	atomic_set(&buf->rb_credits, 1);

	/* Need to allocate:
	 * 1. arrays for send and recv pointers
	 * 2. arrays of struct rpcrdma_req to fill in pointers
	 * 3. array of struct rpcrdma_rep for replies
	 * 4. padding, if any
	 * 5. mw's, fmr's or frmr's, if any
	 * Send/recv buffers in req/rep need to be registered
	 */
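	/* A sketch of how the single rb_pool allocation is carved up below:
	 *
	 *   [ req ptrs ][ rep ptrs ][ pad buffer ][ rpcrdma_mw array ]
	 *
	 * The req/rep structures themselves are kmalloc'ed separately in
	 * the loop further down.
	 */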

	len = buf->rb_max_requests *
		(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
	len += cdata->padding;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	case RPCRDMA_MTHCAFMR:
		/* TBD we are perhaps overallocating here */
		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	default:
		break;
	}

	/* allocate 1, 4 and 5 in one shot */
	p = kzalloc(len, GFP_KERNEL);
	if (p == NULL) {
		dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
			__func__, len);
		rc = -ENOMEM;
		goto out;
	}
	buf->rb_pool = p;	/* for freeing it later */

	buf->rb_send_bufs = (struct rpcrdma_req **) p;
	p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
	buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
	p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];

	/*
	 * Register the zeroed pad buffer, if any.
	 */
	if (cdata->padding) {
		rc = rpcrdma_register_internal(ia, p, cdata->padding,
					    &ep->rep_pad_mr, &ep->rep_pad);
		if (rc)
			goto out;
	}
	p += cdata->padding;

	/*
	 * Allocate the fmr's, or mw's for mw_bind chunk registration.
	 * We "cycle" the mw's in order to minimize rkey reuse,
	 * and also reduce unbind-to-bind collision.
	 */
	INIT_LIST_HEAD(&buf->rb_mws);
	r = (struct rpcrdma_mw *)p;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
			r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
						ia->ri_max_frmr_depth);
			if (IS_ERR(r->r.frmr.fr_mr)) {
				rc = PTR_ERR(r->r.frmr.fr_mr);
				dprintk("RPC: %s: ib_alloc_fast_reg_mr"
					" failed %i\n", __func__, rc);
				goto out;
			}
			r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
						ia->ri_id->device,
						ia->ri_max_frmr_depth);
			if (IS_ERR(r->r.frmr.fr_pgl)) {
				rc = PTR_ERR(r->r.frmr.fr_pgl);
				dprintk("RPC: %s: "
					"ib_alloc_fast_reg_page_list "
					"failed %i\n", __func__, rc);

				ib_dereg_mr(r->r.frmr.fr_mr);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	case RPCRDMA_MTHCAFMR:
		/* TBD we are perhaps overallocating here */
		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
			static struct ib_fmr_attr fa =
				{ RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
			r->r.fmr = ib_alloc_fmr(ia->ri_pd,
				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
				&fa);
			if (IS_ERR(r->r.fmr)) {
				rc = PTR_ERR(r->r.fmr);
				dprintk("RPC: %s: ib_alloc_fmr"
					" failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* Allocate one extra request's worth, for full cycling */
		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
			r->r.mw = ib_alloc_mw(ia->ri_pd, IB_MW_TYPE_1);
			if (IS_ERR(r->r.mw)) {
				rc = PTR_ERR(r->r.mw);
				dprintk("RPC: %s: ib_alloc_mw"
					" failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	default:
		break;
	}

	/*
	 * Allocate/init the request/reply buffers. Doing this
	 * using kmalloc for now -- one for each buf.
	 */
	for (i = 0; i < buf->rb_max_requests; i++) {
		struct rpcrdma_req *req;
		struct rpcrdma_rep *rep;

		len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
		/* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
		/* Typical ~2400b, so rounding up saves work later */
		if (len < 4096)
			len = 4096;
		req = kmalloc(len, GFP_KERNEL);
		if (req == NULL) {
			dprintk("RPC: %s: request buffer %d alloc"
				" failed\n", __func__, i);
			rc = -ENOMEM;
			goto out;
		}
		memset(req, 0, sizeof(struct rpcrdma_req));
		buf->rb_send_bufs[i] = req;
		buf->rb_send_bufs[i]->rl_buffer = buf;

		rc = rpcrdma_register_internal(ia, req->rl_base,
				len - offsetof(struct rpcrdma_req, rl_base),
				&buf->rb_send_bufs[i]->rl_handle,
				&buf->rb_send_bufs[i]->rl_iov);
		if (rc)
			goto out;

		buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);

		len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
		rep = kmalloc(len, GFP_KERNEL);
		if (rep == NULL) {
			dprintk("RPC: %s: reply buffer %d alloc failed\n",
				__func__, i);
			rc = -ENOMEM;
			goto out;
		}
		memset(rep, 0, sizeof(struct rpcrdma_rep));
		buf->rb_recv_bufs[i] = rep;
		buf->rb_recv_bufs[i]->rr_buffer = buf;
		init_waitqueue_head(&rep->rr_unbind);

		rc = rpcrdma_register_internal(ia, rep->rr_base,
				len - offsetof(struct rpcrdma_rep, rr_base),
				&buf->rb_recv_bufs[i]->rr_handle,
				&buf->rb_recv_bufs[i]->rr_iov);
		if (rc)
			goto out;

	}
	dprintk("RPC: %s: max_requests %d\n",
		__func__, buf->rb_max_requests);
	/* done */
	return 0;
out:
	rpcrdma_buffer_destroy(buf);
	return rc;
}

/*
 * Unregister and destroy buffer memory. Need to deal with
 * partial initialization, so it's callable from failed create.
 * Must be called before destroying endpoint, as registrations
 * reference it.
 */
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
	int rc, i;
	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
	struct rpcrdma_mw *r;

	/* clean up in reverse order from create
	 *   1. recv mr memory (mr free, then kfree)
	 *   1a. bind mw memory
	 *   2. send mr memory (mr free, then kfree)
	 *   3. padding (if any) [moved to rpcrdma_ep_destroy]
	 *   4. arrays
	 */
	dprintk("RPC: %s: entering\n", __func__);

	for (i = 0; i < buf->rb_max_requests; i++) {
		if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
			rpcrdma_deregister_internal(ia,
					buf->rb_recv_bufs[i]->rr_handle,
					&buf->rb_recv_bufs[i]->rr_iov);
			kfree(buf->rb_recv_bufs[i]);
		}
		if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
			rpcrdma_deregister_internal(ia,
					buf->rb_send_bufs[i]->rl_handle,
					&buf->rb_send_bufs[i]->rl_iov);
			kfree(buf->rb_send_bufs[i]);
		}
	}

	while (!list_empty(&buf->rb_mws)) {
		r = list_entry(buf->rb_mws.next,
			struct rpcrdma_mw, mw_list);
		list_del(&r->mw_list);
		switch (ia->ri_memreg_strategy) {
		case RPCRDMA_FRMR:
			rc = ib_dereg_mr(r->r.frmr.fr_mr);
			if (rc)
				dprintk("RPC: %s:"
					" ib_dereg_mr"
					" failed %i\n",
					__func__, rc);
			ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
			break;
		case RPCRDMA_MTHCAFMR:
			rc = ib_dealloc_fmr(r->r.fmr);
			if (rc)
				dprintk("RPC: %s:"
					" ib_dealloc_fmr"
					" failed %i\n",
					__func__, rc);
			break;
		case RPCRDMA_MEMWINDOWS_ASYNC:
		case RPCRDMA_MEMWINDOWS:
			rc = ib_dealloc_mw(r->r.mw);
			if (rc)
				dprintk("RPC: %s:"
					" ib_dealloc_mw"
					" failed %i\n",
					__func__, rc);
			break;
		default:
			break;
		}
	}

	kfree(buf->rb_pool);
}

/*
 * Get a set of request/reply buffers.
 *
 * Reply buffer (if needed) is attached to send buffer upon return.
 * Rule:
 *    rb_send_index and rb_recv_index MUST always be pointing to the
 *    *next* available buffer (non-NULL). They are incremented after
 *    removing buffers, and decremented *before* returning them.
 */
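/*
 * In other words, each array behaves like a stack: an index of 0 means
 * every buffer is available, and an index equal to rb_max_requests means
 * that pool is exhausted.
 */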
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
	struct rpcrdma_req *req;
	unsigned long flags;
	int i;
	struct rpcrdma_mw *r;

	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_send_index == buffers->rb_max_requests) {
		spin_unlock_irqrestore(&buffers->rb_lock, flags);
		dprintk("RPC: %s: out of request buffers\n", __func__);
		return ((struct rpcrdma_req *)NULL);
	}

	req = buffers->rb_send_bufs[buffers->rb_send_index];
	if (buffers->rb_send_index < buffers->rb_recv_index) {
		dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
			__func__,
			buffers->rb_recv_index - buffers->rb_send_index);
		req->rl_reply = NULL;
	} else {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
	if (!list_empty(&buffers->rb_mws)) {
		i = RPCRDMA_MAX_SEGS - 1;
		do {
			r = list_entry(buffers->rb_mws.next,
					struct rpcrdma_mw, mw_list);
			list_del(&r->mw_list);
			req->rl_segments[i].mr_chunk.rl_mw = r;
		} while (--i >= 0);
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
	return req;
}

/*
 * Put request/reply buffers back into pool.
 * Pre-decrement counter/array index.
 */
void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
	int i;
	unsigned long flags;

	BUG_ON(req->rl_nchunks != 0);
	spin_lock_irqsave(&buffers->rb_lock, flags);
	buffers->rb_send_bufs[--buffers->rb_send_index] = req;
	req->rl_niovs = 0;
	if (req->rl_reply) {
		buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
		init_waitqueue_head(&req->rl_reply->rr_unbind);
		req->rl_reply->rr_func = NULL;
		req->rl_reply = NULL;
	}
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
	case RPCRDMA_MTHCAFMR:
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/*
		 * Cycle mw's back in reverse order, and "spin" them.
		 * This delays and scrambles reuse as much as possible.
		 */
		i = 1;
		do {
			struct rpcrdma_mw **mw;
			mw = &req->rl_segments[i].mr_chunk.rl_mw;
			list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
			*mw = NULL;
		} while (++i < RPCRDMA_MAX_SEGS);
		list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
			      &buffers->rb_mws);
		req->rl_segments[0].mr_chunk.rl_mw = NULL;
		break;
	default:
		break;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Recover reply buffers from pool.
 * This happens when recovering from error conditions.
 * Post-increment counter/array index.
 */
void
rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	unsigned long flags;

	if (req->rl_iov.length == 0)	/* special case xprt_rdma_allocate() */
		buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_recv_index < buffers->rb_max_requests) {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Put reply buffers back into pool when not attached to
 * request. This happens in error conditions, and when
 * aborting unbinds. Pre-decrement counter/array index.
 */
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
	struct rpcrdma_buffer *buffers = rep->rr_buffer;
	unsigned long flags;

	rep->rr_func = NULL;
	spin_lock_irqsave(&buffers->rb_lock, flags);
	buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
 */
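/*
 * The lkey for these buffers is taken, in order of preference, from the
 * device's global DMA lkey, from the persistent "bind" MR set up at IA
 * open time, or from a one-off ib_reg_phys_mr() registration.
 */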
1405
1406int
1407rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1408 struct ib_mr **mrp, struct ib_sge *iov)
1409{
1410 struct ib_phys_buf ipb;
1411 struct ib_mr *mr;
1412 int rc;
1413
1414 /*
1415 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1416 */
1417 iov->addr = ib_dma_map_single(ia->ri_id->device,
1418 va, len, DMA_BIDIRECTIONAL);
1419 iov->length = len;
1420
Tom Talpeybd7ed1d2008-10-09 15:00:09 -04001421 if (ia->ri_have_dma_lkey) {
1422 *mrp = NULL;
1423 iov->lkey = ia->ri_dma_lkey;
1424 return 0;
1425 } else if (ia->ri_bind_mem != NULL) {
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001426 *mrp = NULL;
1427 iov->lkey = ia->ri_bind_mem->lkey;
1428 return 0;
1429 }
1430
1431 ipb.addr = iov->addr;
1432 ipb.size = iov->length;
1433 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1434 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1435
1436 dprintk("RPC: %s: phys convert: 0x%llx "
1437 "registered 0x%llx length %d\n",
Andrew Mortona56daeb2007-10-16 01:29:57 -07001438 __func__, (unsigned long long)ipb.addr,
1439 (unsigned long long)iov->addr, len);
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001440
1441 if (IS_ERR(mr)) {
1442 *mrp = NULL;
1443 rc = PTR_ERR(mr);
1444 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1445 } else {
1446 *mrp = mr;
1447 iov->lkey = mr->lkey;
1448 rc = 0;
1449 }
1450
1451 return rc;
1452}
1453
1454int
1455rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1456 struct ib_mr *mr, struct ib_sge *iov)
1457{
1458 int rc;
1459
1460 ib_dma_unmap_single(ia->ri_id->device,
1461 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1462
1463 if (NULL == mr)
1464 return 0;
1465
1466 rc = ib_dereg_mr(mr);
1467 if (rc)
1468 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1469 return rc;
1470}
1471
1472/*
1473 * Wrappers for chunk registration, shared by read/write chunk code.
1474 */
1475
1476static void
1477rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1478{
1479 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1480 seg->mr_dmalen = seg->mr_len;
1481 if (seg->mr_page)
1482 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1483 seg->mr_page, offset_in_page(seg->mr_offset),
1484 seg->mr_dmalen, seg->mr_dir);
1485 else
1486 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1487 seg->mr_offset,
1488 seg->mr_dmalen, seg->mr_dir);
Tom Tucker5c635e02011-02-09 19:45:34 +00001489 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1490 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1491 __func__,
Randy Dunlap986d4ab2011-03-15 17:11:59 -07001492 (unsigned long long)seg->mr_dma,
1493 seg->mr_offset, seg->mr_dmalen);
Tom Tucker5c635e02011-02-09 19:45:34 +00001494 }
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001495}
1496
1497static void
1498rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1499{
1500 if (seg->mr_page)
1501 ib_dma_unmap_page(ia->ri_id->device,
1502 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1503 else
1504 ib_dma_unmap_single(ia->ri_id->device,
1505 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1506}
1507
Tom Talpey8d4ba032008-10-09 14:59:49 -04001508static int
Tom Talpey3197d3092008-10-09 15:00:20 -04001509rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1510 int *nsegs, int writing, struct rpcrdma_ia *ia,
1511 struct rpcrdma_xprt *r_xprt)
1512{
1513 struct rpcrdma_mr_seg *seg1 = seg;
Tom Tucker5c635e02011-02-09 19:45:34 +00001514 struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;
1515
Tom Talpey3197d3092008-10-09 15:00:20 -04001516 u8 key;
1517 int len, pageoff;
1518 int i, rc;
Tom Tucker9b781452012-02-20 13:07:57 -06001519 int seg_len;
1520 u64 pa;
1521 int page_no;
Tom Talpey3197d3092008-10-09 15:00:20 -04001522
1523 pageoff = offset_in_page(seg1->mr_offset);
1524 seg1->mr_offset -= pageoff; /* start of page */
1525 seg1->mr_len += pageoff;
1526 len = -pageoff;
Steve Wise0fc6c4e2014-05-28 10:32:00 -04001527 if (*nsegs > ia->ri_max_frmr_depth)
1528 *nsegs = ia->ri_max_frmr_depth;
Tom Tucker9b781452012-02-20 13:07:57 -06001529 for (page_no = i = 0; i < *nsegs;) {
Tom Talpey3197d3092008-10-09 15:00:20 -04001530 rpcrdma_map_one(ia, seg, writing);
Tom Tucker9b781452012-02-20 13:07:57 -06001531 pa = seg->mr_dma;
1532 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
1533 seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->
1534 page_list[page_no++] = pa;
1535 pa += PAGE_SIZE;
1536 }
Tom Talpey3197d3092008-10-09 15:00:20 -04001537 len += seg->mr_len;
1538 ++seg;
1539 ++i;
1540 /* Check for holes */
1541 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1542 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1543 break;
1544 }
1545 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
1546 __func__, seg1->mr_chunk.rl_mw, i);
1547
Tom Tucker5c635e02011-02-09 19:45:34 +00001548 if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
1549 dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n",
1550 __func__,
1551 seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
1552 /* Invalidate before using. */
1553 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1554 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1555 invalidate_wr.next = &frmr_wr;
1556 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1557 invalidate_wr.send_flags = IB_SEND_SIGNALED;
1558 invalidate_wr.ex.invalidate_rkey =
1559 seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1560 DECR_CQCOUNT(&r_xprt->rx_ep);
1561 post_wr = &invalidate_wr;
1562 } else
1563 post_wr = &frmr_wr;
1564
Tom Talpey3197d3092008-10-09 15:00:20 -04001565 /* Bump the key */
1566 key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
1567 ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
1568
1569 /* Prepare FRMR WR */
1570 memset(&frmr_wr, 0, sizeof frmr_wr);
Tom Tucker5c635e02011-02-09 19:45:34 +00001571 frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
Tom Talpey3197d3092008-10-09 15:00:20 -04001572 frmr_wr.opcode = IB_WR_FAST_REG_MR;
Tom Tucker5c635e02011-02-09 19:45:34 +00001573 frmr_wr.send_flags = IB_SEND_SIGNALED;
Steve Wise7a8b80eb2010-08-11 12:47:08 -04001574 frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
Tom Talpey3197d3092008-10-09 15:00:20 -04001575 frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
Tom Tucker9b781452012-02-20 13:07:57 -06001576 frmr_wr.wr.fast_reg.page_list_len = page_no;
Tom Talpey3197d3092008-10-09 15:00:20 -04001577 frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
Tom Tucker9b781452012-02-20 13:07:57 -06001578 frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
Tom Tucker5c635e02011-02-09 19:45:34 +00001579 BUG_ON(frmr_wr.wr.fast_reg.length < len);
Tom Talpey3197d3092008-10-09 15:00:20 -04001580 frmr_wr.wr.fast_reg.access_flags = (writing ?
Vu Pham68743082009-05-26 14:51:00 -04001581 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1582 IB_ACCESS_REMOTE_READ);
Tom Talpey3197d3092008-10-09 15:00:20 -04001583 frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1584 DECR_CQCOUNT(&r_xprt->rx_ep);
1585
Tom Tucker5c635e02011-02-09 19:45:34 +00001586 rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
Tom Talpey3197d3092008-10-09 15:00:20 -04001587
1588 if (rc) {
1589 dprintk("RPC: %s: failed ib_post_send for register,"
1590 " status %i\n", __func__, rc);
1591 while (i--)
1592 rpcrdma_unmap_one(ia, --seg);
1593 } else {
1594 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1595 seg1->mr_base = seg1->mr_dma + pageoff;
1596 seg1->mr_nsegs = i;
1597 seg1->mr_len = len;
1598 }
1599 *nsegs = i;
1600 return rc;
1601}
1602
1603static int
1604rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1605 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1606{
1607 struct rpcrdma_mr_seg *seg1 = seg;
1608 struct ib_send_wr invalidate_wr, *bad_wr;
1609 int rc;
1610
1611 while (seg1->mr_nsegs--)
1612 rpcrdma_unmap_one(ia, seg++);
1613
1614 memset(&invalidate_wr, 0, sizeof invalidate_wr);
Tom Tucker5c635e02011-02-09 19:45:34 +00001615 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
Tom Talpey3197d3092008-10-09 15:00:20 -04001616 invalidate_wr.opcode = IB_WR_LOCAL_INV;
Tom Tucker5c635e02011-02-09 19:45:34 +00001617 invalidate_wr.send_flags = IB_SEND_SIGNALED;
Tom Talpey3197d3092008-10-09 15:00:20 -04001618 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1619 DECR_CQCOUNT(&r_xprt->rx_ep);
1620
1621 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1622 if (rc)
1623 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1624 " status %i\n", __func__, rc);
1625 return rc;
1626}
1627
static int
rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
	int len, pageoff, i, rc;

	pageoff = offset_in_page(seg1->mr_offset);
	seg1->mr_offset -= pageoff;	/* start of page */
	seg1->mr_len += pageoff;
	len = -pageoff;
	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		physaddrs[i] = seg->mr_dma;
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
				physaddrs, i, seg1->mr_dma);
	if (rc) {
		dprintk("RPC: %s: failed ib_map_phys_fmr "
			"%u@0x%llx+%i (%d)... status %i\n", __func__,
			len, (unsigned long long)seg1->mr_dma,
			pageoff, i, rc);
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
		seg1->mr_base = seg1->mr_dma + pageoff;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}

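/*
 * Note: ib_unmap_fmr() takes a list head so several FMRs could be
 * unmapped in one call; here a single FMR is placed on a local list.
 * The DMA unmapping of the individual segments follows unconditionally,
 * whether or not the unmap itself succeeded.
 */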
static int
rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	LIST_HEAD(l);
	int rc;

	list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
	rc = ib_unmap_fmr(&l);
	while (seg1->mr_nsegs--)
		rpcrdma_unmap_one(ia, seg++);
	if (rc)
		dprintk("RPC: %s: failed ib_unmap_fmr,"
			" status %i\n", __func__, rc);
	return rc;
}

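/*
 * Note: the memory-window strategy binds a pre-allocated window
 * (seg->mr_chunk.rl_mw->r.mw) onto exactly one DMA-mapped segment via
 * ib_bind_mw(), so *nsegs is forced to 1.  The bind is posted on the
 * send queue and therefore consumes a CQ slot like any other send WR.
 */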
static int
rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt)
{
	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
				  IB_ACCESS_REMOTE_READ);
	struct ib_mw_bind param;
	int rc;

	*nsegs = 1;
	rpcrdma_map_one(ia, seg, writing);
	param.bind_info.mr = ia->ri_bind_mem;
	param.wr_id = 0ULL;	/* no send cookie */
	param.bind_info.addr = seg->mr_dma;
	param.bind_info.length = seg->mr_len;
	param.send_flags = 0;
	param.bind_info.mw_access_flags = mem_priv;

	DECR_CQCOUNT(&r_xprt->rx_ep);
	rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
	if (rc) {
		dprintk("RPC: %s: failed ib_bind_mw "
			"%u@0x%llx status %i\n",
			__func__, seg->mr_len,
			(unsigned long long)seg->mr_dma, rc);
		rpcrdma_unmap_one(ia, seg);
	} else {
		seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
		seg->mr_base = param.bind_info.addr;
		seg->mr_nsegs = 1;
	}
	return rc;
}

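/*
 * Note: unbinding is done by re-posting the bind with a zero-length
 * range.  When a reply (*r) is supplied, the unbind is posted signaled
 * and *r is cleared so the reply callback runs from the send completion
 * instead; otherwise the unbind is unsignaled and the caller completes
 * the reply directly.
 */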
static int
rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt, void **r)
{
	struct ib_mw_bind param;
	LIST_HEAD(l);
	int rc;

	BUG_ON(seg->mr_nsegs != 1);
	param.bind_info.mr = ia->ri_bind_mem;
	param.bind_info.addr = 0ULL;	/* unbind */
	param.bind_info.length = 0;
	param.bind_info.mw_access_flags = 0;
	if (*r) {
		param.wr_id = (u64) (unsigned long) *r;
		param.send_flags = IB_SEND_SIGNALED;
		INIT_CQCOUNT(&r_xprt->rx_ep);
	} else {
		param.wr_id = 0ULL;
		param.send_flags = 0;
		DECR_CQCOUNT(&r_xprt->rx_ep);
	}
	rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
	rpcrdma_unmap_one(ia, seg);
	if (rc)
		dprintk("RPC: %s: failed ib_(un)bind_mw,"
			" status %i\n", __func__, rc);
	else
		*r = NULL;	/* will upcall on completion */
	return rc;
}

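/*
 * Note: the default strategy registers the physical pages of each request
 * with ib_reg_phys_mr().  As in the FMR path, contiguous segments are
 * gathered into one physical buffer list until a page "hole" is found,
 * and the resulting MR is torn down again when the RPC completes.
 */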
static int
rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia)
{
	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
				  IB_ACCESS_REMOTE_READ);
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
	int len, i, rc = 0;

	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (len = 0, i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		ipb[i].addr = seg->mr_dma;
		ipb[i].size = seg->mr_len;
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	seg1->mr_base = seg1->mr_dma;
	seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
				ipb, i, mem_priv, &seg1->mr_base);
	if (IS_ERR(seg1->mr_chunk.rl_mr)) {
		rc = PTR_ERR(seg1->mr_chunk.rl_mr);
		dprintk("RPC: %s: failed ib_reg_phys_mr "
			"%u@0x%llx (%d)... status %i\n",
			__func__, len,
			(unsigned long long)seg1->mr_dma, i, rc);
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}

static int
rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	int rc;

	rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
	seg1->mr_chunk.rl_mr = NULL;
	while (seg1->mr_nsegs--)
		rpcrdma_unmap_one(ia, seg++);
	if (rc)
		dprintk("RPC: %s: failed ib_dereg_mr,"
			" status %i\n", __func__, rc);
	return rc;
}

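/*
 * rpcrdma_register_external() dispatches on the memory registration
 * strategy chosen for this adapter (ia->ri_memreg_strategy):
 * ALLPHYSICAL reuses the rkey of the pre-registered region
 * (ia->ri_bind_mem), FRMR and MTHCAFMR post fast-register and FMR
 * mappings, MEMWINDOWS binds a memory window, and the default path
 * registers physical pages per request.  On success it returns the
 * number of segments actually registered; on failure it returns -1.
 */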
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001819int
1820rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1821 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1822{
1823 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001824 int rc = 0;
1825
1826 switch (ia->ri_memreg_strategy) {
1827
1828#if RPCRDMA_PERSISTENT_REGISTRATION
1829 case RPCRDMA_ALLPHYSICAL:
1830 rpcrdma_map_one(ia, seg, writing);
1831 seg->mr_rkey = ia->ri_bind_mem->rkey;
1832 seg->mr_base = seg->mr_dma;
1833 seg->mr_nsegs = 1;
1834 nsegs = 1;
1835 break;
1836#endif
1837
Tom Talpey3197d3092008-10-09 15:00:20 -04001838 /* Registration using frmr registration */
1839 case RPCRDMA_FRMR:
1840 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1841 break;
1842
Tom Talpey8d4ba032008-10-09 14:59:49 -04001843 /* Registration using fmr memory registration */
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001844 case RPCRDMA_MTHCAFMR:
Tom Talpey8d4ba032008-10-09 14:59:49 -04001845 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001846 break;
1847
1848 /* Registration using memory windows */
1849 case RPCRDMA_MEMWINDOWS_ASYNC:
1850 case RPCRDMA_MEMWINDOWS:
Tom Talpey8d4ba032008-10-09 14:59:49 -04001851 rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001852 break;
1853
1854 /* Default registration each time */
1855 default:
Tom Talpey8d4ba032008-10-09 14:59:49 -04001856 rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001857 break;
1858 }
1859 if (rc)
1860 return -1;
1861
1862 return nsegs;
1863}
1864
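/*
 * rpcrdma_deregister_external() reverses the registration done above,
 * again dispatching on ia->ri_memreg_strategy.  If a reply "r" is still
 * pending after deregistration (i.e. the strategy did not defer it to a
 * completion upcall), its rr_func callback is invoked here so the reply
 * is handed back to the RPC layer.
 */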
int
rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
		struct rpcrdma_xprt *r_xprt, void *r)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int nsegs = seg->mr_nsegs, rc;

	switch (ia->ri_memreg_strategy) {

#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		BUG_ON(nsegs != 1);
		rpcrdma_unmap_one(ia, seg);
		rc = 0;
		break;
#endif

	case RPCRDMA_FRMR:
		rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
		break;

	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_deregister_fmr_external(seg, ia);
		break;

	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
		break;

	default:
		rc = rpcrdma_deregister_default_external(seg, ia);
		break;
	}
	if (r) {
		struct rpcrdma_rep *rep = r;
		void (*func)(struct rpcrdma_rep *) = rep->rr_func;
		rep->rr_func = NULL;
		func(rep);	/* dereg done, callback now */
	}
	return nsegs;
}

/*
 * Prepost any receive buffer, then post send.
 *
 * Receive buffer is donated to hardware, reclaimed upon recv completion.
 */
int
rpcrdma_ep_post(struct rpcrdma_ia *ia,
		struct rpcrdma_ep *ep,
		struct rpcrdma_req *req)
{
	struct ib_send_wr send_wr, *send_wr_fail;
	struct rpcrdma_rep *rep = req->rl_reply;
	int rc;

	if (rep) {
		rc = rpcrdma_ep_post_recv(ia, ep, rep);
		if (rc)
			goto out;
		req->rl_reply = NULL;
	}

	send_wr.next = NULL;
	send_wr.wr_id = 0ULL;	/* no send cookie */
	send_wr.sg_list = req->rl_send_iov;
	send_wr.num_sge = req->rl_niovs;
	send_wr.opcode = IB_WR_SEND;
	if (send_wr.num_sge == 4)	/* no need to sync any pad (constant) */
		ib_dma_sync_single_for_device(ia->ri_id->device,
			req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
			DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
		DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
		DMA_TO_DEVICE);

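	/*
	 * Send completions are normally suppressed (unsignaled sends).
	 * Once the per-endpoint CQ credit count is exhausted, one send is
	 * posted signaled and the count is reset, which bounds the number
	 * of outstanding work requests the provider must track.
	 */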
	if (DECR_CQCOUNT(ep) > 0)
		send_wr.send_flags = 0;
	else { /* Provider must take a send completion every now and then */
		INIT_CQCOUNT(ep);
		send_wr.send_flags = IB_SEND_SIGNALED;
	}

	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
	if (rc)
		dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
			rc);
out:
	return rc;
}

/*
 * (Re)post a receive buffer.
 */
int
rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
		     struct rpcrdma_ep *ep,
		     struct rpcrdma_rep *rep)
{
	struct ib_recv_wr recv_wr, *recv_wr_fail;
	int rc;

	recv_wr.next = NULL;
	recv_wr.wr_id = (u64) (unsigned long) rep;
	recv_wr.sg_list = &rep->rr_iov;
	recv_wr.num_sge = 1;

	ib_dma_sync_single_for_cpu(ia->ri_id->device,
		rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);

	DECR_CQCOUNT(ep);
	rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);

	if (rc)
		dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
			rc);
	return rc;
}