// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
 * Copyright (c) 2016-2018 Oracle. All rights reserved.
 * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following
 * disclaimer in the documentation and/or other materials provided
 * with the distribution.
 *
 * Neither the name of the Network Appliance, Inc. nor the names of
 * its contributors may be used to endorse or promote products
 * derived from this software without specific prior written
 * permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Tom Tucker <tom@opengridcomputing.com>
 */

/* Operation
 *
 * The main entry point is svc_rdma_sendto. This is called by the
 * RPC server when an RPC Reply is ready to be transmitted to a client.
 *
 * The passed-in svc_rqst contains a struct xdr_buf which holds an
 * XDR-encoded RPC Reply message. sendto must construct the RPC-over-RDMA
 * transport header, post all Write WRs needed for this Reply, then post
 * a Send WR conveying the transport header and the RPC message itself to
 * the client.
 *
 * svc_rdma_sendto must fully transmit the Reply before returning, as
 * the svc_rqst will be recycled as soon as sendto returns. Remaining
 * resources referred to by the svc_rqst are also recycled at that time.
 * Therefore any resources that must remain longer must be detached
 * from the svc_rqst and released later.
 *
 * Page Management
 *
 * The I/O that performs Reply transmission is asynchronous, and may
 * complete well after sendto returns. Thus pages under I/O must be
 * removed from the svc_rqst before sendto returns.
 *
 * The logic here depends on Send Queue and completion ordering. Since
 * the Send WR is always posted last, it will always complete last. Thus
 * when it completes, it is guaranteed that all previous Write WRs have
 * also completed.
 *
 * Write WRs are constructed and posted. Each Write segment gets its own
 * svc_rdma_rw_ctxt, allowing the Write completion handler to find and
 * DMA-unmap the pages under I/O for that Write segment. The Write
 * completion handler does not release any pages.
 *
 * When the Send WR is constructed, it also gets its own svc_rdma_send_ctxt.
 * The ownership of all of the Reply's pages is transferred into that
 * ctxt, the Send WR is posted, and sendto returns.
 *
 * The svc_rdma_send_ctxt is presented when the Send WR completes. The
 * Send completion handler finally releases the Reply's pages.
 *
 * This mechanism also assumes that completions on the transport's Send
 * Completion Queue do not run in parallel. Otherwise a Write completion
 * and a Send completion running at the same time could release pages that
 * are still DMA-mapped.
 *
 * Error Handling
 *
 * - If the Send WR is posted successfully, it will either complete
 *   successfully, or get flushed. Either way, the Send completion
 *   handler releases the Reply's pages.
 * - If the Send WR cannot be posted, the forward path releases
 *   the Reply's pages.
 *
 * This handles the case, without the use of page reference counting,
 * where two different Write segments send portions of the same page.
 */

#include <linux/spinlock.h>
#include <asm/unaligned.h>

#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>

#include <linux/sunrpc/debug.h>
#include <linux/sunrpc/rpc_rdma.h>
#include <linux/sunrpc/svc_rdma.h>

#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>

#define RPCDBG_FACILITY	RPCDBG_SVCXPRT

static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc);

static inline struct svc_rdma_send_ctxt *
svc_rdma_next_send_ctxt(struct list_head *list)
{
	return list_first_entry_or_null(list, struct svc_rdma_send_ctxt,
					sc_list);
}

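/* Allocate one send_ctxt. The ctxt and its array of sc_max_send_sges
 * ib_sge entries are carved out of a single allocation, and each SGE's
 * lkey is preset to the PD's local DMA lkey so that the send path only
 * has to fill in each SGE's address and length.
 */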
static struct svc_rdma_send_ctxt *
svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
{
	struct svc_rdma_send_ctxt *ctxt;
	size_t size;
	int i;

	size = sizeof(*ctxt);
	size += rdma->sc_max_send_sges * sizeof(struct ib_sge);
	ctxt = kmalloc(size, GFP_KERNEL);
	if (!ctxt)
		return NULL;

	ctxt->sc_cqe.done = svc_rdma_wc_send;
	ctxt->sc_send_wr.next = NULL;
	ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe;
	ctxt->sc_send_wr.sg_list = ctxt->sc_sges;
	ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED;
	for (i = 0; i < rdma->sc_max_send_sges; i++)
		ctxt->sc_sges[i].lkey = rdma->sc_pd->local_dma_lkey;
	return ctxt;
}

/**
 * svc_rdma_send_ctxts_destroy - Release all send_ctxt's for an xprt
 * @rdma: svcxprt_rdma being torn down
 *
 */
void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma)
{
	struct svc_rdma_send_ctxt *ctxt;

	while ((ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts))) {
		list_del(&ctxt->sc_list);
		kfree(ctxt);
	}
}

/**
 * svc_rdma_send_ctxt_get - Get a free send_ctxt
 * @rdma: controlling svcxprt_rdma
 *
 * Returns a ready-to-use send_ctxt, or NULL if none are
 * available and a fresh one cannot be allocated.
 */
struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma)
{
	struct svc_rdma_send_ctxt *ctxt;

	spin_lock(&rdma->sc_send_lock);
	ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts);
	if (!ctxt)
		goto out_empty;
	list_del(&ctxt->sc_list);
	spin_unlock(&rdma->sc_send_lock);

out:
	ctxt->sc_send_wr.num_sge = 0;
	ctxt->sc_page_count = 0;
	return ctxt;

out_empty:
	spin_unlock(&rdma->sc_send_lock);
	ctxt = svc_rdma_send_ctxt_alloc(rdma);
	if (!ctxt)
		return NULL;
	goto out;
}

/**
 * svc_rdma_send_ctxt_put - Return send_ctxt to free list
 * @rdma: controlling svcxprt_rdma
 * @ctxt: object to return to the free list
 *
 * Pages left in sc_pages are DMA unmapped and released.
 */
void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
			    struct svc_rdma_send_ctxt *ctxt)
{
	struct ib_device *device = rdma->sc_cm_id->device;
	unsigned int i;

	for (i = 0; i < ctxt->sc_send_wr.num_sge; i++)
		ib_dma_unmap_page(device,
				  ctxt->sc_sges[i].addr,
				  ctxt->sc_sges[i].length,
				  DMA_TO_DEVICE);

	for (i = 0; i < ctxt->sc_page_count; ++i)
		put_page(ctxt->sc_pages[i]);

	spin_lock(&rdma->sc_send_lock);
	list_add(&ctxt->sc_list, &rdma->sc_send_ctxts);
	spin_unlock(&rdma->sc_send_lock);
}

/**
 * svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC
 * @cq: Completion Queue context
 * @wc: Work Completion object
 *
 * NB: The svc_xprt/svcxprt_rdma is pinned whenever it's possible that
 * the Send completion handler could be running.
 */
static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
{
	struct svcxprt_rdma *rdma = cq->cq_context;
	struct ib_cqe *cqe = wc->wr_cqe;
	struct svc_rdma_send_ctxt *ctxt;

	trace_svcrdma_wc_send(wc);

	atomic_inc(&rdma->sc_sq_avail);
	wake_up(&rdma->sc_send_wait);

	ctxt = container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe);
	svc_rdma_send_ctxt_put(rdma, ctxt);

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
		svc_xprt_enqueue(&rdma->sc_xprt);
		if (wc->status != IB_WC_WR_FLUSH_ERR)
			pr_err("svcrdma: Send: %s (%u/0x%x)\n",
			       ib_wc_status_msg(wc->status),
			       wc->status, wc->vendor_err);
	}

	svc_xprt_put(&rdma->sc_xprt);
}

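/* Post a chain of Send WRs. Send Queue space is reserved by decrementing
 * sc_sq_avail; if the SQ is full, the caller sleeps until completions
 * make room. A transport reference is taken for each WR before posting,
 * and dropped again if the post fails.
 *
 * Returns zero on success, -ENOTCONN if the transport was closed while
 * waiting for SQ space, or the ib_post_send() result if posting failed
 * (in which case the transport is marked for close).
 */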
int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr)
{
	struct ib_send_wr *bad_wr, *n_wr;
	int wr_count;
	int i;
	int ret;

	wr_count = 1;
	for (n_wr = wr->next; n_wr; n_wr = n_wr->next)
		wr_count++;

	/* If the SQ is full, wait until an SQ entry is available */
	while (1) {
		if ((atomic_sub_return(wr_count, &rdma->sc_sq_avail) < 0)) {
			atomic_inc(&rdma_stat_sq_starve);
			trace_svcrdma_sq_full(rdma);
			atomic_add(wr_count, &rdma->sc_sq_avail);
			wait_event(rdma->sc_send_wait,
				   atomic_read(&rdma->sc_sq_avail) > wr_count);
			if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags))
				return -ENOTCONN;
			trace_svcrdma_sq_retry(rdma);
			continue;
		}
		/* Take a transport ref for each WR posted */
		for (i = 0; i < wr_count; i++)
			svc_xprt_get(&rdma->sc_xprt);

		/* Bump used SQ WR count and post */
		ret = ib_post_send(rdma->sc_qp, wr, &bad_wr);
		trace_svcrdma_post_send(wr, ret);
		if (ret) {
			set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
			for (i = 0; i < wr_count; i++)
				svc_xprt_put(&rdma->sc_xprt);
			wake_up(&rdma->sc_send_wait);
		}
		break;
	}
	return ret;
}

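/* Returns the number of pad bytes needed to round @len up to the next
 * XDR (4-byte) boundary.
 */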
static u32 xdr_padsize(u32 len)
{
	return (len & 3) ? (4 - (len & 3)) : 0;
}

/* Returns length of transport header, in bytes.
 */
static unsigned int svc_rdma_reply_hdr_len(__be32 *rdma_resp)
{
	unsigned int nsegs;
	__be32 *p;

	p = rdma_resp;

	/* RPC-over-RDMA V1 replies never have a Read list. */
	p += rpcrdma_fixed_maxsz + 1;

	/* Skip Write list. */
	while (*p++ != xdr_zero) {
		nsegs = be32_to_cpup(p++);
		p += nsegs * rpcrdma_segment_maxsz;
	}

	/* Skip Reply chunk. */
	if (*p++ != xdr_zero) {
		nsegs = be32_to_cpup(p++);
		p += nsegs * rpcrdma_segment_maxsz;
	}

	return (unsigned long)p - (unsigned long)rdma_resp;
}

/* One Write chunk is copied from Call transport header to Reply
 * transport header. Each segment's length field is updated to
 * reflect the number of bytes consumed in the segment.
 *
 * Returns number of segments in this chunk.
 */
static unsigned int xdr_encode_write_chunk(__be32 *dst, __be32 *src,
					   unsigned int remaining)
{
	unsigned int i, nsegs;
	u32 seg_len;

	/* Write list discriminator */
	*dst++ = *src++;

	/* number of segments in this chunk */
	nsegs = be32_to_cpup(src);
	*dst++ = *src++;

	for (i = nsegs; i; i--) {
		/* segment's RDMA handle */
		*dst++ = *src++;

		/* bytes returned in this segment */
		seg_len = be32_to_cpu(*src);
		if (remaining >= seg_len) {
			/* entire segment was consumed */
			*dst = *src;
			remaining -= seg_len;
		} else {
			/* segment only partly filled */
			*dst = cpu_to_be32(remaining);
			remaining = 0;
		}
		dst++; src++;

		/* segment's RDMA offset */
		*dst++ = *src++;
		*dst++ = *src++;
	}

	return nsegs;
}

/* The client provided a Write list in the Call message. Fill in
 * the segments in the first Write chunk in the Reply's transport
 * header with the number of bytes consumed in each segment.
 * Remaining chunks are returned unused.
 *
 * Assumptions:
 *  - Client has provided only one Write chunk
 */
static void svc_rdma_xdr_encode_write_list(__be32 *rdma_resp, __be32 *wr_ch,
					   unsigned int consumed)
{
	unsigned int nsegs;
	__be32 *p, *q;

	/* RPC-over-RDMA V1 replies never have a Read list. */
	p = rdma_resp + rpcrdma_fixed_maxsz + 1;

	q = wr_ch;
	while (*q != xdr_zero) {
		nsegs = xdr_encode_write_chunk(p, q, consumed);
		q += 2 + nsegs * rpcrdma_segment_maxsz;
		p += 2 + nsegs * rpcrdma_segment_maxsz;
		consumed = 0;
	}

	/* Terminate Write list */
	*p++ = xdr_zero;

	/* Reply chunk discriminator; may be replaced later */
	*p = xdr_zero;
}

/* The client provided a Reply chunk in the Call message. Fill in
 * the segments in the Reply chunk in the Reply message with the
 * number of bytes consumed in each segment.
 *
 * Assumptions:
 * - Reply can always fit in the provided Reply chunk
 */
static void svc_rdma_xdr_encode_reply_chunk(__be32 *rdma_resp, __be32 *rp_ch,
					    unsigned int consumed)
{
	__be32 *p;

	/* Find the Reply chunk in the Reply's xprt header.
	 * RPC-over-RDMA V1 replies never have a Read list.
	 */
	p = rdma_resp + rpcrdma_fixed_maxsz + 1;

	/* Skip past Write list */
	while (*p++ != xdr_zero)
		p += 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;

	xdr_encode_write_chunk(p, rp_ch, consumed);
}

/* Parse the RPC Call's transport header.
 */
static void svc_rdma_get_write_arrays(__be32 *rdma_argp,
				      __be32 **write, __be32 **reply)
{
	__be32 *p;

	p = rdma_argp + rpcrdma_fixed_maxsz;

	/* Read list */
	while (*p++ != xdr_zero)
		p += 5;

	/* Write list */
	if (*p != xdr_zero) {
		*write = p;
		while (*p++ != xdr_zero)
			p += 1 + be32_to_cpu(*p) * 4;
	} else {
		*write = NULL;
		p++;
	}

	/* Reply chunk */
	if (*p != xdr_zero)
		*reply = p;
	else
		*reply = NULL;
}

/* RPC-over-RDMA Version One private extension: Remote Invalidation.
 * Responder's choice: requester signals it can handle Send With
 * Invalidate, and responder chooses one rkey to invalidate.
 *
 * Find a candidate rkey to invalidate when sending a reply. Picks the
 * first R_key it finds in the chunk lists.
 *
 * Returns zero if RPC's chunk lists are empty.
 */
static u32 svc_rdma_get_inv_rkey(__be32 *rdma_argp,
				 __be32 *wr_lst, __be32 *rp_ch)
{
	__be32 *p;

	p = rdma_argp + rpcrdma_fixed_maxsz;
	if (*p != xdr_zero)
		p += 2;
	else if (wr_lst && be32_to_cpup(wr_lst + 1))
		p = wr_lst + 2;
	else if (rp_ch && be32_to_cpup(rp_ch + 1))
		p = rp_ch + 2;
	else
		return 0;
	return be32_to_cpup(p);
}

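/* DMA-map part of a page for sending, and record the mapping in the
 * Send WR's next free SGE. Returns zero on success, or -EIO if the
 * mapping attempt failed.
 */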
static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma,
				 struct svc_rdma_send_ctxt *ctxt,
				 struct page *page,
				 unsigned long offset,
				 unsigned int len)
{
	struct ib_device *dev = rdma->sc_cm_id->device;
	dma_addr_t dma_addr;

	dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE);
	if (ib_dma_mapping_error(dev, dma_addr))
		goto out_maperr;

	ctxt->sc_sges[ctxt->sc_cur_sge_no].addr = dma_addr;
	ctxt->sc_sges[ctxt->sc_cur_sge_no].length = len;
	ctxt->sc_send_wr.num_sge++;
	return 0;

out_maperr:
	trace_svcrdma_dma_map_page(rdma, page);
	return -EIO;
}

/* ib_dma_map_page() is used here because svc_rdma_send_ctxt_put()
 * handles DMA-unmap and it uses ib_dma_unmap_page() exclusively.
 */
static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma,
				struct svc_rdma_send_ctxt *ctxt,
				unsigned char *base,
				unsigned int len)
{
	return svc_rdma_dma_map_page(rdma, ctxt, virt_to_page(base),
				     offset_in_page(base), len);
}

/**
 * svc_rdma_map_reply_hdr - DMA map the transport header buffer
 * @rdma: controlling transport
 * @ctxt: send_ctxt for the Send WR
 * @rdma_resp: buffer containing transport header
 * @len: length of transport header
 *
 * Returns:
 *	%0 if the header is DMA mapped,
 *	%-EIO if DMA mapping failed.
 */
int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma,
			   struct svc_rdma_send_ctxt *ctxt,
			   __be32 *rdma_resp,
			   unsigned int len)
{
	ctxt->sc_pages[0] = virt_to_page(rdma_resp);
	ctxt->sc_page_count++;
	ctxt->sc_cur_sge_no = 0;
	return svc_rdma_dma_map_page(rdma, ctxt, ctxt->sc_pages[0], 0, len);
}

/* Load the xdr_buf into the ctxt's sge array, and DMA map each
 * element as it is added.
 *
 * Returns zero on success, or a negative errno on failure.
 */
static int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
				  struct svc_rdma_send_ctxt *ctxt,
				  struct xdr_buf *xdr, __be32 *wr_lst)
{
	unsigned int len, remaining;
	unsigned long page_off;
	struct page **ppages;
	unsigned char *base;
	u32 xdr_pad;
	int ret;

	if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges)
		return -EIO;
	ret = svc_rdma_dma_map_buf(rdma, ctxt,
				   xdr->head[0].iov_base,
				   xdr->head[0].iov_len);
	if (ret < 0)
		return ret;

	/* If a Write chunk is present, the xdr_buf's page list
	 * is not included inline. However the Upper Layer may
	 * have added XDR padding in the tail buffer, and that
	 * should not be included inline.
	 */
	if (wr_lst) {
		base = xdr->tail[0].iov_base;
		len = xdr->tail[0].iov_len;
		xdr_pad = xdr_padsize(xdr->page_len);

		if (len && xdr_pad) {
			base += xdr_pad;
			len -= xdr_pad;
		}

		goto tail;
	}

	ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
	page_off = xdr->page_base & ~PAGE_MASK;
	remaining = xdr->page_len;
	while (remaining) {
		len = min_t(u32, PAGE_SIZE - page_off, remaining);

		if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges)
			return -EIO;
		ret = svc_rdma_dma_map_page(rdma, ctxt, *ppages++,
					    page_off, len);
		if (ret < 0)
			return ret;

		remaining -= len;
		page_off = 0;
	}

	base = xdr->tail[0].iov_base;
	len = xdr->tail[0].iov_len;
tail:
	if (len) {
		if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges)
			return -EIO;
		ret = svc_rdma_dma_map_buf(rdma, ctxt, base, len);
		if (ret < 0)
			return ret;
	}

	return 0;
}

/* The svc_rqst and all resources it owns are released as soon as
 * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt
 * so they are released by the Send completion handler.
 */
static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
				   struct svc_rdma_send_ctxt *ctxt)
{
	int i, pages = rqstp->rq_next_page - rqstp->rq_respages;

	ctxt->sc_page_count += pages;
	for (i = 0; i < pages; i++) {
		ctxt->sc_pages[i + 1] = rqstp->rq_respages[i];
		rqstp->rq_respages[i] = NULL;
	}
	rqstp->rq_next_page = rqstp->rq_respages + 1;
}

/* Prepare the portion of the RPC Reply that will be transmitted
 * via RDMA Send. The RPC-over-RDMA transport header is prepared
 * in sc_sges[0], and the RPC xdr_buf is prepared in following sges.
 *
 * Depending on whether a Write list or Reply chunk is present,
 * the server may send all, a portion of, or none of the xdr_buf.
 * In the latter case, only the transport header (sc_sges[0]) is
 * transmitted.
 *
 * RDMA Send is the last step of transmitting an RPC reply. Pages
 * involved in the earlier RDMA Writes are here transferred out
 * of the rqstp and into the ctxt's page array. These pages are
 * DMA unmapped by each Write completion, but the subsequent Send
 * completion finally releases these pages.
 *
 * Assumptions:
 * - The Reply's transport header will never be larger than a page.
 */
static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
				   __be32 *rdma_argp, __be32 *rdma_resp,
				   struct svc_rqst *rqstp,
				   __be32 *wr_lst, __be32 *rp_ch)
{
	struct svc_rdma_send_ctxt *ctxt;
	int ret;

	ctxt = svc_rdma_send_ctxt_get(rdma);
	if (!ctxt)
		return -ENOMEM;

	ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp,
				     svc_rdma_reply_hdr_len(rdma_resp));
	if (ret < 0)
		goto err;

	if (!rp_ch) {
		ret = svc_rdma_map_reply_msg(rdma, ctxt,
					     &rqstp->rq_res, wr_lst);
		if (ret < 0)
			goto err;
	}

	svc_rdma_save_io_pages(rqstp, ctxt);

	ctxt->sc_send_wr.opcode = IB_WR_SEND;
	if (rdma->sc_snd_w_inv) {
		ctxt->sc_send_wr.ex.invalidate_rkey =
			svc_rdma_get_inv_rkey(rdma_argp, wr_lst, rp_ch);
		if (ctxt->sc_send_wr.ex.invalidate_rkey)
			ctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV;
	}
	dprintk("svcrdma: posting Send WR with %u sge(s)\n",
		ctxt->sc_send_wr.num_sge);
	ret = svc_rdma_send(rdma, &ctxt->sc_send_wr);
	if (ret)
		goto err;

	return 0;

err:
	svc_rdma_send_ctxt_put(rdma, ctxt);
	return ret;
}

/* Given the client-provided Write and Reply chunks, the server was not
 * able to form a complete reply. Return an RDMA_ERROR message so the
 * client can retire this RPC transaction. As above, the Send completion
 * routine releases payload pages that were part of a previous RDMA Write.
 *
 * Remote Invalidation is skipped for simplicity.
 */
static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
				   __be32 *rdma_resp, struct svc_rqst *rqstp)
{
	struct svc_rdma_send_ctxt *ctxt;
	__be32 *p;
	int ret;

	ctxt = svc_rdma_send_ctxt_get(rdma);
	if (!ctxt)
		return -ENOMEM;

	/* Replace the original transport header with an
	 * RDMA_ERROR response. XID etc are preserved.
	 */
	trace_svcrdma_err_chunk(*rdma_resp);
	p = rdma_resp + 3;
	*p++ = rdma_error;
	*p   = err_chunk;

	ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp, 20);
	if (ret < 0)
		goto err;

	svc_rdma_save_io_pages(rqstp, ctxt);

	ctxt->sc_send_wr.opcode = IB_WR_SEND;
	ret = svc_rdma_send(rdma, &ctxt->sc_send_wr);
	if (ret)
		goto err;

	return 0;

err:
	svc_rdma_send_ctxt_put(rdma, ctxt);
	return ret;
}

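/* The RPC-over-RDMA transport header is constructed in svc_rdma_sendto()
 * itself, so this transport callback has nothing to do.
 */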
void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
{
}

/**
 * svc_rdma_sendto - Transmit an RPC reply
 * @rqstp: processed RPC request, reply XDR already in ::rq_res
 *
 * Any resources still associated with @rqstp are released upon return.
 * If no reply message was possible, the connection is closed.
 *
 * Returns:
 *	%0 if an RPC reply has been successfully posted,
 *	%-ENOMEM if a resource shortage occurred (connection is lost),
 *	%-ENOTCONN if posting failed (connection is lost).
 */
int svc_rdma_sendto(struct svc_rqst *rqstp)
{
	struct svc_xprt *xprt = rqstp->rq_xprt;
	struct svcxprt_rdma *rdma =
		container_of(xprt, struct svcxprt_rdma, sc_xprt);
	struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt;
	__be32 *p, *rdma_argp, *rdma_resp, *wr_lst, *rp_ch;
	struct xdr_buf *xdr = &rqstp->rq_res;
	struct page *res_page;
	int ret;

	rdma_argp = rctxt->rc_recv_buf;
	svc_rdma_get_write_arrays(rdma_argp, &wr_lst, &rp_ch);

	/* Create the RDMA response header. xprt->xpt_mutex,
	 * acquired in svc_send(), serializes RPC replies. The
	 * code path below that inserts the credit grant value
	 * into each transport header runs only inside this
	 * critical section.
	 */
	ret = -ENOMEM;
	res_page = alloc_page(GFP_KERNEL);
	if (!res_page)
		goto err0;
	rdma_resp = page_address(res_page);

	p = rdma_resp;
	*p++ = *rdma_argp;
	*p++ = *(rdma_argp + 1);
	*p++ = rdma->sc_fc_credits;
	*p++ = rp_ch ? rdma_nomsg : rdma_msg;

	/* Start with empty chunks */
	*p++ = xdr_zero;
	*p++ = xdr_zero;
	*p = xdr_zero;

	if (wr_lst) {
		/* XXX: Presume the client sent only one Write chunk */
		ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr);
		if (ret < 0)
			goto err2;
		svc_rdma_xdr_encode_write_list(rdma_resp, wr_lst, ret);
	}
	if (rp_ch) {
		ret = svc_rdma_send_reply_chunk(rdma, rp_ch, wr_lst, xdr);
		if (ret < 0)
			goto err2;
		svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret);
	}

	ret = svc_rdma_send_reply_msg(rdma, rdma_argp, rdma_resp, rqstp,
				      wr_lst, rp_ch);
	if (ret < 0)
		goto err0;
	ret = 0;

out:
	rqstp->rq_xprt_ctxt = NULL;
	svc_rdma_recv_ctxt_put(rdma, rctxt);
	return ret;

 err2:
	if (ret != -E2BIG && ret != -EINVAL)
		goto err1;

	ret = svc_rdma_send_error_msg(rdma, rdma_resp, rqstp);
	if (ret < 0)
		goto err0;
	ret = 0;
	goto out;

 err1:
	put_page(res_page);
 err0:
	trace_svcrdma_send_failed(rqstp, ret);
	set_bit(XPT_CLOSE, &xprt->xpt_flags);
	ret = -ENOTCONN;
	goto out;
}