blob: 69a0013dd25cecbee0658168da577d15ed8913d8 [file] [log] [blame]
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * Manage send buffer.
 * Producer:
 * Copy user space data into send buffer, if send buffer space available.
 * Consumer:
 * Trigger RDMA write into RMBE of peer and send CDC, if RMBE space available.
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
 */
14
15#include <linux/net.h>
16#include <linux/rcupdate.h>
17#include <linux/workqueue.h>
Ingo Molnarc3edc402017-02-02 08:35:14 +010018#include <linux/sched/signal.h>
19
Ursula Braune6727f32017-01-09 16:55:23 +010020#include <net/sock.h>
21
22#include "smc.h"
23#include "smc_wr.h"
24#include "smc_cdc.h"
25#include "smc_tx.h"
26
/***************************** sndbuf producer *******************************/
28
29/* callback implementation for sk.sk_write_space()
30 * to wakeup sndbuf producers that blocked with smc_tx_wait_memory().
31 * called under sk_socket lock.
32 */
33static void smc_tx_write_space(struct sock *sk)
34{
35 struct socket *sock = sk->sk_socket;
36 struct smc_sock *smc = smc_sk(sk);
37 struct socket_wq *wq;
38
39 /* similar to sk_stream_write_space */
40 if (atomic_read(&smc->conn.sndbuf_space) && sock) {
41 clear_bit(SOCK_NOSPACE, &sock->flags);
42 rcu_read_lock();
43 wq = rcu_dereference(sk->sk_wq);
44 if (skwq_has_sleeper(wq))
45 wake_up_interruptible_poll(&wq->wait,
46 POLLOUT | POLLWRNORM |
47 POLLWRBAND);
48 if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
49 sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
50 rcu_read_unlock();
51 }
52}
53
54/* Wakeup sndbuf producers that blocked with smc_tx_wait_memory().
55 * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space().
56 */
57void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
58{
59 if (smc->sk.sk_socket &&
60 test_bit(SOCK_NOSPACE, &smc->sk.sk_socket->flags))
61 smc->sk.sk_write_space(&smc->sk);
62}
63
64/* blocks sndbuf producer until at least one byte of free space available */
65static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
66{
67 DEFINE_WAIT_FUNC(wait, woken_wake_function);
68 struct smc_connection *conn = &smc->conn;
69 struct sock *sk = &smc->sk;
70 bool noblock;
71 long timeo;
72 int rc = 0;
73
74 /* similar to sk_stream_wait_memory */
75 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
76 noblock = timeo ? false : true;
77 add_wait_queue(sk_sleep(sk), &wait);
78 while (1) {
79 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
80 if (sk->sk_err ||
81 (sk->sk_shutdown & SEND_SHUTDOWN) ||
82 conn->local_tx_ctrl.conn_state_flags.peer_done_writing) {
83 rc = -EPIPE;
84 break;
85 }
86 if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
87 rc = -ECONNRESET;
88 break;
89 }
90 if (!timeo) {
91 if (noblock)
92 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
93 rc = -EAGAIN;
94 break;
95 }
96 if (signal_pending(current)) {
97 rc = sock_intr_errno(timeo);
98 break;
99 }
100 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
101 if (atomic_read(&conn->sndbuf_space))
102 break; /* at least 1 byte of free space available */
103 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
104 sk->sk_write_pending++;
105 sk_wait_event(sk, &timeo,
106 sk->sk_err ||
107 (sk->sk_shutdown & SEND_SHUTDOWN) ||
108 smc_cdc_rxed_any_close_or_senddone(conn) ||
109 atomic_read(&conn->sndbuf_space),
110 &wait);
111 sk->sk_write_pending--;
112 }
113 remove_wait_queue(sk_sleep(sk), &wait);
114 return rc;
115}
116
117/* sndbuf producer: main API called by socket layer.
118 * called under sock lock.
119 */
120int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
121{
122 size_t copylen, send_done = 0, send_remaining = len;
123 size_t chunk_len, chunk_off, chunk_len_sum;
124 struct smc_connection *conn = &smc->conn;
125 union smc_host_cursor prep;
126 struct sock *sk = &smc->sk;
127 char *sndbuf_base;
128 int tx_cnt_prep;
129 int writespace;
130 int rc, chunk;
131
132 /* This should be in poll */
133 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
134
135 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
136 rc = -EPIPE;
137 goto out_err;
138 }
139
140 while (msg_data_left(msg)) {
141 if (sk->sk_state == SMC_INIT)
142 return -ENOTCONN;
143 if (smc->sk.sk_shutdown & SEND_SHUTDOWN ||
Ursula Braunb38d7322017-01-09 16:55:25 +0100144 (smc->sk.sk_err == ECONNABORTED) ||
Ursula Braune6727f32017-01-09 16:55:23 +0100145 conn->local_tx_ctrl.conn_state_flags.peer_conn_abort)
146 return -EPIPE;
147 if (smc_cdc_rxed_any_close(conn))
148 return send_done ?: -ECONNRESET;
149
150 if (!atomic_read(&conn->sndbuf_space)) {
151 rc = smc_tx_wait_memory(smc, msg->msg_flags);
152 if (rc) {
153 if (send_done)
154 return send_done;
155 goto out_err;
156 }
157 continue;
158 }
159
160 /* initialize variables for 1st iteration of subsequent loop */
161 /* could be just 1 byte, even after smc_tx_wait_memory above */
162 writespace = atomic_read(&conn->sndbuf_space);
163 /* not more than what user space asked for */
164 copylen = min_t(size_t, send_remaining, writespace);
165 /* determine start of sndbuf */
166 sndbuf_base = conn->sndbuf_desc->cpu_addr;
167 smc_curs_write(&prep,
168 smc_curs_read(&conn->tx_curs_prep, conn),
169 conn);
170 tx_cnt_prep = prep.count;
171 /* determine chunks where to write into sndbuf */
172 /* either unwrapped case, or 1st chunk of wrapped case */
173 chunk_len = min_t(size_t,
174 copylen, conn->sndbuf_size - tx_cnt_prep);
175 chunk_len_sum = chunk_len;
176 chunk_off = tx_cnt_prep;
177 for (chunk = 0; chunk < 2; chunk++) {
178 rc = memcpy_from_msg(sndbuf_base + chunk_off,
179 msg, chunk_len);
180 if (rc) {
181 if (send_done)
182 return send_done;
183 goto out_err;
184 }
185 send_done += chunk_len;
186 send_remaining -= chunk_len;
187
188 if (chunk_len_sum == copylen)
189 break; /* either on 1st or 2nd iteration */
190 /* prepare next (== 2nd) iteration */
191 chunk_len = copylen - chunk_len; /* remainder */
192 chunk_len_sum += chunk_len;
193 chunk_off = 0; /* modulo offset in send ring buffer */
194 }
195 /* update cursors */
196 smc_curs_add(conn->sndbuf_size, &prep, copylen);
197 smc_curs_write(&conn->tx_curs_prep,
198 smc_curs_read(&prep, conn),
199 conn);
200 /* increased in send tasklet smc_cdc_tx_handler() */
201 smp_mb__before_atomic();
202 atomic_sub(copylen, &conn->sndbuf_space);
203 /* guarantee 0 <= sndbuf_space <= sndbuf_size */
204 smp_mb__after_atomic();
205 /* since we just produced more new data into sndbuf,
206 * trigger sndbuf consumer: RDMA write into peer RMBE and CDC
207 */
208 smc_tx_sndbuf_nonempty(conn);
209 } /* while (msg_data_left(msg)) */
210
211 return send_done;
212
213out_err:
214 rc = sk_stream_error(sk, msg->msg_flags, rc);
215 /* make sure we wake any epoll edge trigger waiter */
216 if (unlikely(rc == -EAGAIN))
217 sk->sk_write_space(sk);
218 return rc;
219}
220
/***************************** sndbuf consumer *******************************/
222
223/* sndbuf consumer: actual data transfer of one target chunk with RDMA write */
224static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
225 int num_sges, struct ib_sge sges[])
226{
227 struct smc_link_group *lgr = conn->lgr;
228 struct ib_send_wr *failed_wr = NULL;
229 struct ib_rdma_wr rdma_wr;
230 struct smc_link *link;
231 int rc;
232
233 memset(&rdma_wr, 0, sizeof(rdma_wr));
234 link = &lgr->lnk[SMC_SINGLE_LINK];
235 rdma_wr.wr.wr_id = smc_wr_tx_get_next_wr_id(link);
236 rdma_wr.wr.sg_list = sges;
237 rdma_wr.wr.num_sge = num_sges;
238 rdma_wr.wr.opcode = IB_WR_RDMA_WRITE;
239 rdma_wr.remote_addr =
240 lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr +
241 /* RMBE within RMB */
242 ((conn->peer_conn_idx - 1) * conn->peer_rmbe_size) +
243 /* offset within RMBE */
244 peer_rmbe_offset;
245 rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey;
246 rc = ib_post_send(link->roce_qp, &rdma_wr.wr, &failed_wr);
247 if (rc)
248 conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
249 return rc;
250}
251
252/* sndbuf consumer */
253static inline void smc_tx_advance_cursors(struct smc_connection *conn,
254 union smc_host_cursor *prod,
255 union smc_host_cursor *sent,
256 size_t len)
257{
258 smc_curs_add(conn->peer_rmbe_size, prod, len);
259 /* increased in recv tasklet smc_cdc_msg_rcv() */
260 smp_mb__before_atomic();
261 /* data in flight reduces usable snd_wnd */
262 atomic_sub(len, &conn->peer_rmbe_space);
263 /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
264 smp_mb__after_atomic();
265 smc_curs_add(conn->sndbuf_size, sent, len);
266}
267
268/* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit;
269 * usable snd_wnd as max transmit
270 */
271static int smc_tx_rdma_writes(struct smc_connection *conn)
272{
273 size_t src_off, src_len, dst_off, dst_len; /* current chunk values */
274 size_t len, dst_len_sum, src_len_sum, dstchunk, srcchunk;
275 union smc_host_cursor sent, prep, prod, cons;
276 struct ib_sge sges[SMC_IB_MAX_SEND_SGE];
277 struct smc_link_group *lgr = conn->lgr;
278 int to_send, rmbespace;
279 struct smc_link *link;
280 int num_sges;
281 int rc;
282
283 /* source: sndbuf */
284 smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
285 smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
286 /* cf. wmem_alloc - (snd_max - snd_una) */
287 to_send = smc_curs_diff(conn->sndbuf_size, &sent, &prep);
288 if (to_send <= 0)
289 return 0;
290
291 /* destination: RMBE */
292 /* cf. snd_wnd */
293 rmbespace = atomic_read(&conn->peer_rmbe_space);
294 if (rmbespace <= 0)
295 return 0;
296 smc_curs_write(&prod,
297 smc_curs_read(&conn->local_tx_ctrl.prod, conn),
298 conn);
299 smc_curs_write(&cons,
300 smc_curs_read(&conn->local_rx_ctrl.cons, conn),
301 conn);
302
303 /* if usable snd_wnd closes ask peer to advertise once it opens again */
304 conn->local_tx_ctrl.prod_flags.write_blocked = (to_send >= rmbespace);
305 /* cf. usable snd_wnd */
306 len = min(to_send, rmbespace);
307
308 /* initialize variables for first iteration of subsequent nested loop */
309 link = &lgr->lnk[SMC_SINGLE_LINK];
310 dst_off = prod.count;
311 if (prod.wrap == cons.wrap) {
312 /* the filled destination area is unwrapped,
313 * hence the available free destination space is wrapped
314 * and we need 2 destination chunks of sum len; start with 1st
315 * which is limited by what's available in sndbuf
316 */
317 dst_len = min_t(size_t,
318 conn->peer_rmbe_size - prod.count, len);
319 } else {
320 /* the filled destination area is wrapped,
321 * hence the available free destination space is unwrapped
322 * and we need a single destination chunk of entire len
323 */
324 dst_len = len;
325 }
326 dst_len_sum = dst_len;
327 src_off = sent.count;
328 /* dst_len determines the maximum src_len */
329 if (sent.count + dst_len <= conn->sndbuf_size) {
330 /* unwrapped src case: single chunk of entire dst_len */
331 src_len = dst_len;
332 } else {
333 /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */
334 src_len = conn->sndbuf_size - sent.count;
335 }
336 src_len_sum = src_len;
337 for (dstchunk = 0; dstchunk < 2; dstchunk++) {
338 num_sges = 0;
339 for (srcchunk = 0; srcchunk < 2; srcchunk++) {
340 sges[srcchunk].addr =
341 conn->sndbuf_desc->dma_addr[SMC_SINGLE_LINK] +
342 src_off;
343 sges[srcchunk].length = src_len;
344 sges[srcchunk].lkey = link->roce_pd->local_dma_lkey;
345 num_sges++;
346 src_off += src_len;
347 if (src_off >= conn->sndbuf_size)
348 src_off -= conn->sndbuf_size;
349 /* modulo in send ring */
350 if (src_len_sum == dst_len)
351 break; /* either on 1st or 2nd iteration */
352 /* prepare next (== 2nd) iteration */
353 src_len = dst_len - src_len; /* remainder */
354 src_len_sum += src_len;
355 }
356 rc = smc_tx_rdma_write(conn, dst_off, num_sges, sges);
357 if (rc)
358 return rc;
359 if (dst_len_sum == len)
360 break; /* either on 1st or 2nd iteration */
361 /* prepare next (== 2nd) iteration */
362 dst_off = 0; /* modulo offset in RMBE ring buffer */
363 dst_len = len - dst_len; /* remainder */
364 dst_len_sum += dst_len;
365 src_len = min_t(int,
366 dst_len, conn->sndbuf_size - sent.count);
367 src_len_sum = src_len;
368 }
369
370 smc_tx_advance_cursors(conn, &prod, &sent, len);
371 /* update connection's cursors with advanced local cursors */
372 smc_curs_write(&conn->local_tx_ctrl.prod,
373 smc_curs_read(&prod, conn),
374 conn);
375 /* dst: peer RMBE */
376 smc_curs_write(&conn->tx_curs_sent,
377 smc_curs_read(&sent, conn),
378 conn);
379 /* src: local sndbuf */
380
381 return 0;
382}
383
384/* Wakeup sndbuf consumers from any context (IRQ or process)
385 * since there is more data to transmit; usable snd_wnd as max transmit
386 */
387int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
388{
389 struct smc_cdc_tx_pend *pend;
390 struct smc_wr_buf *wr_buf;
391 int rc;
392
393 spin_lock_bh(&conn->send_lock);
394 rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf,
395 &pend);
396 if (rc < 0) {
397 if (rc == -EBUSY) {
Ursula Braunb38d7322017-01-09 16:55:25 +0100398 struct smc_sock *smc =
399 container_of(conn, struct smc_sock, conn);
400
401 if (smc->sk.sk_err == ECONNABORTED) {
402 rc = sock_error(&smc->sk);
403 goto out_unlock;
404 }
Ursula Braune6727f32017-01-09 16:55:23 +0100405 rc = 0;
406 schedule_work(&conn->tx_work);
407 }
408 goto out_unlock;
409 }
410
411 rc = smc_tx_rdma_writes(conn);
412 if (rc) {
413 smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
414 (struct smc_wr_tx_pend_priv *)pend);
415 goto out_unlock;
416 }
417
418 rc = smc_cdc_msg_send(conn, wr_buf, pend);
419
420out_unlock:
421 spin_unlock_bh(&conn->send_lock);
422 return rc;
423}
424
425/* Wakeup sndbuf consumers from process context
426 * since there is more data to transmit
427 */
428static void smc_tx_work(struct work_struct *work)
429{
430 struct smc_connection *conn = container_of(work,
431 struct smc_connection,
432 tx_work);
433 struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
434
435 lock_sock(&smc->sk);
436 smc_tx_sndbuf_nonempty(conn);
437 release_sock(&smc->sk);
438}
439
Ursula Braun952310c2017-01-09 16:55:24 +0100440void smc_tx_consumer_update(struct smc_connection *conn)
441{
442 union smc_host_cursor cfed, cons;
443 struct smc_cdc_tx_pend *pend;
444 struct smc_wr_buf *wr_buf;
445 int to_confirm, rc;
446
447 smc_curs_write(&cons,
448 smc_curs_read(&conn->local_tx_ctrl.cons, conn),
449 conn);
450 smc_curs_write(&cfed,
451 smc_curs_read(&conn->rx_curs_confirmed, conn),
452 conn);
453 to_confirm = smc_curs_diff(conn->rmbe_size, &cfed, &cons);
454
455 if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
456 ((to_confirm > conn->rmbe_update_limit) &&
457 ((to_confirm > (conn->rmbe_size / 2)) ||
458 conn->local_rx_ctrl.prod_flags.write_blocked))) {
459 rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
460 &wr_buf, &pend);
461 if (!rc)
462 rc = smc_cdc_msg_send(conn, wr_buf, pend);
463 if (rc < 0) {
464 schedule_work(&conn->tx_work);
465 return;
466 }
467 smc_curs_write(&conn->rx_curs_confirmed,
468 smc_curs_read(&conn->local_tx_ctrl.cons, conn),
469 conn);
470 conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
471 }
472 if (conn->local_rx_ctrl.prod_flags.write_blocked &&
473 !atomic_read(&conn->bytes_to_rcv))
474 conn->local_rx_ctrl.prod_flags.write_blocked = 0;
475}
476
/***************************** send initialize *******************************/
478
479/* Initialize send properties on connection establishment. NB: not __init! */
480void smc_tx_init(struct smc_sock *smc)
481{
482 smc->sk.sk_write_space = smc_tx_write_space;
483 INIT_WORK(&smc->conn.tx_work, smc_tx_work);
484 spin_lock_init(&smc->conn.send_lock);
485}