blob: a2bc6b612d03332ba7607a784f0e14efbc886135 [file] [log] [blame]
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * Work Requests exploiting Infiniband API
 *
 * Work requests (WR) of type ib_post_send or ib_post_recv respectively
 * are submitted to either RC SQ or RC RQ respectively
 * (reliably connected send/receive queue)
 * and become work queue entries (WQEs).
 * While an SQ WR/WQE is pending, we track it until transmission completion.
 * Through a send or receive completion queue (CQ) respectively,
 * we get completion queue entries (CQEs) [aka work completions (WCs)].
 * Since the CQ callback is called from IRQ context, we split work by using
 * bottom halves implemented by tasklets.
 *
 * SMC uses this to exchange LLC (link layer control)
 * and CDC (connection data control) messages.
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
 */
23
24#include <linux/atomic.h>
25#include <linux/hashtable.h>
26#include <linux/wait.h>
27#include <rdma/ib_verbs.h>
28#include <asm/div64.h>
29
30#include "smc.h"
31#include "smc_wr.h"
32
33#define SMC_WR_MAX_POLL_CQE 10 /* max. # of compl. queue elements in 1 poll */
34
35#define SMC_WR_RX_HASH_BITS 4
36static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
37static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);
38
39struct smc_wr_tx_pend { /* control data for a pending send request */
40 u64 wr_id; /* work request id sent */
41 smc_wr_tx_handler handler;
42 enum ib_wc_status wc_status; /* CQE status */
43 struct smc_link *link;
44 u32 idx;
45 struct smc_wr_tx_pend_priv priv;
46};
47
48/******************************** send queue *********************************/
49
50/*------------------------------- completion --------------------------------*/
51
52static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
53{
54 u32 i;
55
56 for (i = 0; i < link->wr_tx_cnt; i++) {
57 if (link->wr_tx_pends[i].wr_id == wr_id)
58 return i;
59 }
60 return link->wr_tx_cnt;
61}
62
63static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
64{
65 struct smc_wr_tx_pend pnd_snd;
66 struct smc_link *link;
67 u32 pnd_snd_idx;
68 int i;
69
70 link = wc->qp->qp_context;
71 pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
72 if (pnd_snd_idx == link->wr_tx_cnt)
73 return;
74 link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
75 memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], sizeof(pnd_snd));
76 /* clear the full struct smc_wr_tx_pend including .priv */
77 memset(&link->wr_tx_pends[pnd_snd_idx], 0,
78 sizeof(link->wr_tx_pends[pnd_snd_idx]));
79 memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
80 sizeof(link->wr_tx_bufs[pnd_snd_idx]));
81 if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
82 return;
83 if (wc->status) {
84 for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
85 /* clear full struct smc_wr_tx_pend including .priv */
86 memset(&link->wr_tx_pends[i], 0,
87 sizeof(link->wr_tx_pends[i]));
88 memset(&link->wr_tx_bufs[i], 0,
89 sizeof(link->wr_tx_bufs[i]));
90 clear_bit(i, link->wr_tx_mask);
91 }
92 /* tbd in future patch: terminate connections of this link
93 * group abnormally
94 */
95 }
96 if (pnd_snd.handler)
97 pnd_snd.handler(&pnd_snd.priv, link, wc->status);
98 wake_up(&link->wr_tx_wait);
99}
100
101static void smc_wr_tx_tasklet_fn(unsigned long data)
102{
103 struct smc_ib_device *dev = (struct smc_ib_device *)data;
104 struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
105 int i = 0, rc;
106 int polled = 0;
107
108again:
109 polled++;
110 do {
111 rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
112 if (polled == 1) {
113 ib_req_notify_cq(dev->roce_cq_send,
114 IB_CQ_NEXT_COMP |
115 IB_CQ_REPORT_MISSED_EVENTS);
116 }
117 if (!rc)
118 break;
119 for (i = 0; i < rc; i++)
120 smc_wr_tx_process_cqe(&wc[i]);
121 } while (rc > 0);
122 if (polled == 1)
123 goto again;
124}
125
126void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
127{
128 struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
129
130 tasklet_schedule(&dev->send_tasklet);
131}
132
133/*---------------------------- request submission ---------------------------*/
134
135static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
136{
137 *idx = link->wr_tx_cnt;
138 for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
139 if (!test_and_set_bit(*idx, link->wr_tx_mask))
140 return 0;
141 }
142 *idx = link->wr_tx_cnt;
143 return -EBUSY;
144}
145
146/**
147 * smc_wr_tx_get_free_slot() - returns buffer for message assembly,
148 * and sets info for pending transmit tracking
149 * @link: Pointer to smc_link used to later send the message.
150 * @handler: Send completion handler function pointer.
151 * @wr_buf: Out value returns pointer to message buffer.
152 * @wr_pend_priv: Out value returns pointer serving as handler context.
153 *
154 * Return: 0 on success, or -errno on error.
155 */
156int smc_wr_tx_get_free_slot(struct smc_link *link,
157 smc_wr_tx_handler handler,
158 struct smc_wr_buf **wr_buf,
159 struct smc_wr_tx_pend_priv **wr_pend_priv)
160{
161 struct smc_wr_tx_pend *wr_pend;
162 struct ib_send_wr *wr_ib;
163 u64 wr_id;
164 u32 idx;
165 int rc;
166
167 *wr_buf = NULL;
168 *wr_pend_priv = NULL;
169 if (in_softirq()) {
170 rc = smc_wr_tx_get_free_slot_index(link, &idx);
171 if (rc)
172 return rc;
173 } else {
174 rc = wait_event_interruptible_timeout(
175 link->wr_tx_wait,
176 (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
177 SMC_WR_TX_WAIT_FREE_SLOT_TIME);
178 if (!rc) {
179 /* tbd in future patch: timeout - terminate connections
180 * of this link group abnormally
181 */
182 return -EPIPE;
183 }
184 if (rc == -ERESTARTSYS)
185 return -EINTR;
186 if (idx == link->wr_tx_cnt)
187 return -EPIPE;
188 }
189 wr_id = smc_wr_tx_get_next_wr_id(link);
190 wr_pend = &link->wr_tx_pends[idx];
191 wr_pend->wr_id = wr_id;
192 wr_pend->handler = handler;
193 wr_pend->link = link;
194 wr_pend->idx = idx;
195 wr_ib = &link->wr_tx_ibs[idx];
196 wr_ib->wr_id = wr_id;
197 *wr_buf = &link->wr_tx_bufs[idx];
198 *wr_pend_priv = &wr_pend->priv;
199 return 0;
200}
201
202int smc_wr_tx_put_slot(struct smc_link *link,
203 struct smc_wr_tx_pend_priv *wr_pend_priv)
204{
205 struct smc_wr_tx_pend *pend;
206
207 pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv);
208 if (pend->idx < link->wr_tx_cnt) {
209 /* clear the full struct smc_wr_tx_pend including .priv */
210 memset(&link->wr_tx_pends[pend->idx], 0,
211 sizeof(link->wr_tx_pends[pend->idx]));
212 memset(&link->wr_tx_bufs[pend->idx], 0,
213 sizeof(link->wr_tx_bufs[pend->idx]));
214 test_and_clear_bit(pend->idx, link->wr_tx_mask);
215 return 1;
216 }
217
218 return 0;
219}
220
221/* Send prepared WR slot via ib_post_send.
222 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
223 */
224int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
225{
226 struct ib_send_wr *failed_wr = NULL;
227 struct smc_wr_tx_pend *pend;
228 int rc;
229
230 ib_req_notify_cq(link->smcibdev->roce_cq_send,
231 IB_CQ_SOLICITED_MASK | IB_CQ_REPORT_MISSED_EVENTS);
232 pend = container_of(priv, struct smc_wr_tx_pend, priv);
233 rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx],
234 &failed_wr);
235 if (rc)
236 smc_wr_tx_put_slot(link, priv);
237 return rc;
238}
239
240/****************************** receive queue ********************************/
241
242int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
243{
244 struct smc_wr_rx_handler *h_iter;
245 int rc = 0;
246
247 spin_lock(&smc_wr_rx_hash_lock);
248 hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) {
249 if (h_iter->type == handler->type) {
250 rc = -EEXIST;
251 goto out_unlock;
252 }
253 }
254 hash_add(smc_wr_rx_hash, &handler->list, handler->type);
255out_unlock:
256 spin_unlock(&smc_wr_rx_hash_lock);
257 return rc;
258}
259
260/* Demultiplex a received work request based on the message type to its handler.
261 * Relies on smc_wr_rx_hash having been completely filled before any IB WRs,
262 * and not being modified any more afterwards so we don't need to lock it.
263 */
264static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
265{
266 struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
267 struct smc_wr_rx_handler *handler;
268 struct smc_wr_rx_hdr *wr_rx;
269 u64 temp_wr_id;
270 u32 index;
271
272 if (wc->byte_len < sizeof(*wr_rx))
273 return; /* short message */
274 temp_wr_id = wc->wr_id;
275 index = do_div(temp_wr_id, link->wr_rx_cnt);
276 wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index];
277 hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
278 if (handler->type == wr_rx->type)
279 handler->handler(wc, wr_rx);
280 }
281}
282
283static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
284{
285 struct smc_link *link;
286 int i;
287
288 for (i = 0; i < num; i++) {
289 link = wc[i].qp->qp_context;
290 if (wc[i].status == IB_WC_SUCCESS) {
291 smc_wr_rx_demultiplex(&wc[i]);
292 smc_wr_rx_post(link); /* refill WR RX */
293 } else {
294 /* handle status errors */
295 switch (wc[i].status) {
296 case IB_WC_RETRY_EXC_ERR:
297 case IB_WC_RNR_RETRY_EXC_ERR:
298 case IB_WC_WR_FLUSH_ERR:
299 /* tbd in future patch: terminate connections of this
300 * link group abnormally
301 */
302 break;
303 default:
304 smc_wr_rx_post(link); /* refill WR RX */
305 break;
306 }
307 }
308 }
309}
310
311static void smc_wr_rx_tasklet_fn(unsigned long data)
312{
313 struct smc_ib_device *dev = (struct smc_ib_device *)data;
314 struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
315 int polled = 0;
316 int rc;
317
318again:
319 polled++;
320 do {
321 memset(&wc, 0, sizeof(wc));
322 rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
323 if (polled == 1) {
324 ib_req_notify_cq(dev->roce_cq_recv,
325 IB_CQ_SOLICITED_MASK
326 | IB_CQ_REPORT_MISSED_EVENTS);
327 }
328 if (!rc)
329 break;
330 smc_wr_rx_process_cqes(&wc[0], rc);
331 } while (rc > 0);
332 if (polled == 1)
333 goto again;
334}
335
336void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
337{
338 struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
339
340 tasklet_schedule(&dev->recv_tasklet);
341}
342
343int smc_wr_rx_post_init(struct smc_link *link)
344{
345 u32 i;
346 int rc = 0;
347
348 for (i = 0; i < link->wr_rx_cnt; i++)
349 rc = smc_wr_rx_post(link);
350 return rc;
351}
352
353/***************************** init, exit, misc ******************************/
354
355void smc_wr_remember_qp_attr(struct smc_link *lnk)
356{
357 struct ib_qp_attr *attr = &lnk->qp_attr;
358 struct ib_qp_init_attr init_attr;
359
360 memset(attr, 0, sizeof(*attr));
361 memset(&init_attr, 0, sizeof(init_attr));
362 ib_query_qp(lnk->roce_qp, attr,
363 IB_QP_STATE |
364 IB_QP_CUR_STATE |
365 IB_QP_PKEY_INDEX |
366 IB_QP_PORT |
367 IB_QP_QKEY |
368 IB_QP_AV |
369 IB_QP_PATH_MTU |
370 IB_QP_TIMEOUT |
371 IB_QP_RETRY_CNT |
372 IB_QP_RNR_RETRY |
373 IB_QP_RQ_PSN |
374 IB_QP_ALT_PATH |
375 IB_QP_MIN_RNR_TIMER |
376 IB_QP_SQ_PSN |
377 IB_QP_PATH_MIG_STATE |
378 IB_QP_CAP |
379 IB_QP_DEST_QPN,
380 &init_attr);
381
382 lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
383 lnk->qp_attr.cap.max_send_wr);
384 lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
385 lnk->qp_attr.cap.max_recv_wr);
386}
387
388static void smc_wr_init_sge(struct smc_link *lnk)
389{
390 u32 i;
391
392 for (i = 0; i < lnk->wr_tx_cnt; i++) {
393 lnk->wr_tx_sges[i].addr =
394 lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
395 lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
396 lnk->wr_tx_ibs[i].next = NULL;
397 lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
398 lnk->wr_tx_ibs[i].num_sge = 1;
399 lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
400 lnk->wr_tx_ibs[i].send_flags =
401 IB_SEND_SIGNALED | IB_SEND_SOLICITED | IB_SEND_INLINE;
402 }
403 for (i = 0; i < lnk->wr_rx_cnt; i++) {
404 lnk->wr_rx_sges[i].addr =
405 lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
406 lnk->wr_rx_sges[i].length = SMC_WR_BUF_SIZE;
407 lnk->wr_rx_ibs[i].next = NULL;
408 lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i];
409 lnk->wr_rx_ibs[i].num_sge = 1;
410 }
411}
412
413void smc_wr_free_link(struct smc_link *lnk)
414{
415 struct ib_device *ibdev;
416
417 memset(lnk->wr_tx_mask, 0,
418 BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
419
420 if (!lnk->smcibdev)
421 return;
422 ibdev = lnk->smcibdev->ibdev;
423
424 if (lnk->wr_rx_dma_addr) {
425 ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
426 SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
427 DMA_FROM_DEVICE);
428 lnk->wr_rx_dma_addr = 0;
429 }
430 if (lnk->wr_tx_dma_addr) {
431 ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
432 SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
433 DMA_TO_DEVICE);
434 lnk->wr_tx_dma_addr = 0;
435 }
436}
437
438void smc_wr_free_link_mem(struct smc_link *lnk)
439{
440 kfree(lnk->wr_tx_pends);
441 lnk->wr_tx_pends = NULL;
442 kfree(lnk->wr_tx_mask);
443 lnk->wr_tx_mask = NULL;
444 kfree(lnk->wr_tx_sges);
445 lnk->wr_tx_sges = NULL;
446 kfree(lnk->wr_rx_sges);
447 lnk->wr_rx_sges = NULL;
448 kfree(lnk->wr_rx_ibs);
449 lnk->wr_rx_ibs = NULL;
450 kfree(lnk->wr_tx_ibs);
451 lnk->wr_tx_ibs = NULL;
452 kfree(lnk->wr_tx_bufs);
453 lnk->wr_tx_bufs = NULL;
454 kfree(lnk->wr_rx_bufs);
455 lnk->wr_rx_bufs = NULL;
456}
457
458int smc_wr_alloc_link_mem(struct smc_link *link)
459{
460 /* allocate link related memory */
461 link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
462 if (!link->wr_tx_bufs)
463 goto no_mem;
464 link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE,
465 GFP_KERNEL);
466 if (!link->wr_rx_bufs)
467 goto no_mem_wr_tx_bufs;
468 link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
469 GFP_KERNEL);
470 if (!link->wr_tx_ibs)
471 goto no_mem_wr_rx_bufs;
472 link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3,
473 sizeof(link->wr_rx_ibs[0]),
474 GFP_KERNEL);
475 if (!link->wr_rx_ibs)
476 goto no_mem_wr_tx_ibs;
477 link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
478 GFP_KERNEL);
479 if (!link->wr_tx_sges)
480 goto no_mem_wr_rx_ibs;
481 link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
482 sizeof(link->wr_rx_sges[0]),
483 GFP_KERNEL);
484 if (!link->wr_rx_sges)
485 goto no_mem_wr_tx_sges;
486 link->wr_tx_mask = kzalloc(
487 BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*link->wr_tx_mask),
488 GFP_KERNEL);
489 if (!link->wr_tx_mask)
490 goto no_mem_wr_rx_sges;
491 link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
492 sizeof(link->wr_tx_pends[0]),
493 GFP_KERNEL);
494 if (!link->wr_tx_pends)
495 goto no_mem_wr_tx_mask;
496 return 0;
497
498no_mem_wr_tx_mask:
499 kfree(link->wr_tx_mask);
500no_mem_wr_rx_sges:
501 kfree(link->wr_rx_sges);
502no_mem_wr_tx_sges:
503 kfree(link->wr_tx_sges);
504no_mem_wr_rx_ibs:
505 kfree(link->wr_rx_ibs);
506no_mem_wr_tx_ibs:
507 kfree(link->wr_tx_ibs);
508no_mem_wr_rx_bufs:
509 kfree(link->wr_rx_bufs);
510no_mem_wr_tx_bufs:
511 kfree(link->wr_tx_bufs);
512no_mem:
513 return -ENOMEM;
514}
515
516void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
517{
518 tasklet_kill(&smcibdev->recv_tasklet);
519 tasklet_kill(&smcibdev->send_tasklet);
520}
521
522void smc_wr_add_dev(struct smc_ib_device *smcibdev)
523{
524 tasklet_init(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn,
525 (unsigned long)smcibdev);
526 tasklet_init(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn,
527 (unsigned long)smcibdev);
528}
529
530int smc_wr_create_link(struct smc_link *lnk)
531{
532 struct ib_device *ibdev = lnk->smcibdev->ibdev;
533 int rc = 0;
534
535 smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
536 lnk->wr_rx_id = 0;
537 lnk->wr_rx_dma_addr = ib_dma_map_single(
538 ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
539 DMA_FROM_DEVICE);
540 if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
541 lnk->wr_rx_dma_addr = 0;
542 rc = -EIO;
543 goto out;
544 }
545 lnk->wr_tx_dma_addr = ib_dma_map_single(
546 ibdev, lnk->wr_tx_bufs, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
547 DMA_TO_DEVICE);
548 if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) {
549 rc = -EIO;
550 goto dma_unmap;
551 }
552 smc_wr_init_sge(lnk);
553 memset(lnk->wr_tx_mask, 0,
554 BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
555 return rc;
556
557dma_unmap:
558 ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
559 SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
560 DMA_FROM_DEVICE);
561 lnk->wr_rx_dma_addr = 0;
562out:
563 return rc;
564}