/*
 * Copyright (C) 2017, Microsoft Corporation.
 *
 * Author(s): Long Li <longli@microsoft.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
 * the GNU General Public License for more details.
 */
#include <linux/module.h>
#include <linux/highmem.h>
#include "smbdirect.h"
#include "cifs_debug.h"
#include "cifsproto.h"
#include "smb2proto.h"

static struct smbd_response *get_empty_queue_buffer(
		struct smbd_connection *info);
static struct smbd_response *get_receive_buffer(
		struct smbd_connection *info);
static void put_receive_buffer(
		struct smbd_connection *info,
		struct smbd_response *response);
static int allocate_receive_buffers(struct smbd_connection *info, int num_buf);
static void destroy_receive_buffers(struct smbd_connection *info);

static void put_empty_packet(
		struct smbd_connection *info, struct smbd_response *response);
static void enqueue_reassembly(
		struct smbd_connection *info,
		struct smbd_response *response, int data_length);
static struct smbd_response *_get_first_reassembly(
		struct smbd_connection *info);

static int smbd_post_recv(
		struct smbd_connection *info,
		struct smbd_response *response);

static int smbd_post_send_empty(struct smbd_connection *info);
static int smbd_post_send_data(
		struct smbd_connection *info,
		struct kvec *iov, int n_vec, int remaining_data_length);
static int smbd_post_send_page(struct smbd_connection *info,
		struct page *page, unsigned long offset,
		size_t size, int remaining_data_length);

static void destroy_mr_list(struct smbd_connection *info);
static int allocate_mr_list(struct smbd_connection *info);

/* SMBD version number */
#define SMBD_V1	0x0100

/* Port numbers for SMBD transport */
#define SMB_PORT	445
#define SMBD_PORT	5445

/* Address lookup and resolve timeout in ms */
#define RDMA_RESOLVE_TIMEOUT	5000

/* SMBD negotiation timeout in seconds */
#define SMBD_NEGOTIATE_TIMEOUT	120

/* SMBD minimum receive size and fragmented size defined in [MS-SMBD] */
#define SMBD_MIN_RECEIVE_SIZE		128
#define SMBD_MIN_FRAGMENTED_SIZE	131072

/*
 * Default maximum number of RDMA read/write outstanding on this connection
 * This value may be decreased during QP creation, based on hardware limits
 */
#define SMBD_CM_RESPONDER_RESOURCES	32

/* Maximum number of retries on data transfer operations */
#define SMBD_CM_RETRY			6
/* No need to retry on Receiver Not Ready since SMBD manages credits */
#define SMBD_CM_RNR_RETRY		0

/*
 * User configurable initial values per SMBD transport connection
 * as defined in [MS-SMBD] 3.1.1.1
 * These may change after SMBD negotiation
 */
/* The local peer's maximum number of credits to grant to the peer */
int smbd_receive_credit_max = 255;

/* The remote peer's credit request of the local peer */
int smbd_send_credit_target = 255;

/* The maximum single message size that can be sent to the remote peer */
int smbd_max_send_size = 1364;

/* The maximum fragmented upper-layer payload receive size supported */
int smbd_max_fragmented_recv_size = 1024 * 1024;

/* The maximum single-message size which can be received */
int smbd_max_receive_size = 8192;

/* The timeout to initiate send of a keepalive message on idle */
int smbd_keep_alive_interval = 120;

/*
 * User configurable initial values for RDMA transport
 * The actual values used may be lower and are limited to hardware capabilities
 */
/* Default maximum number of SGEs in a RDMA write/read */
int smbd_max_frmr_depth = 2048;

/* If payload is less than this many bytes, use RDMA send/recv, not read/write */
int rdma_readwrite_threshold = 4096;
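
/*
 * Illustrative arithmetic (an assumption for illustration, not code): with
 * the default smbd_max_frmr_depth of 2048 and a 4KiB PAGE_SIZE, a single
 * registered region can span up to 2048 * 4096 bytes = 8MiB. The value
 * actually used is further clamped against the peer's advertised
 * max_readwrite_size in process_negotiation_response() below.
 */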

/* Transport logging functions
 * Logging is defined as classes. They can be OR'ed to define the actual
 * logging level via module parameter smbd_logging_class
 * e.g. cifs.smbd_logging_class=0xa0 will log all log_rdma_recv() and
 * log_rdma_event()
 */
#define LOG_OUTGOING			0x1
#define LOG_INCOMING			0x2
#define LOG_READ			0x4
#define LOG_WRITE			0x8
#define LOG_RDMA_SEND			0x10
#define LOG_RDMA_RECV			0x20
#define LOG_KEEP_ALIVE			0x40
#define LOG_RDMA_EVENT			0x80
#define LOG_RDMA_MR			0x100
static unsigned int smbd_logging_class;
module_param(smbd_logging_class, uint, 0644);
MODULE_PARM_DESC(smbd_logging_class,
	"Logging class for SMBD transport 0x0 to 0x100");

#define ERR		0x0
#define INFO		0x1
static unsigned int smbd_logging_level = ERR;
module_param(smbd_logging_level, uint, 0644);
MODULE_PARM_DESC(smbd_logging_level,
	"Logging level for SMBD transport, 0 (default): error, 1: info");

#define log_rdma(level, class, fmt, args...)				\
do {									\
	if (level <= smbd_logging_level || class & smbd_logging_class)	\
		cifs_dbg(VFS, "%s:%d " fmt, __func__, __LINE__, ##args);\
} while (0)

#define log_outgoing(level, fmt, args...) \
		log_rdma(level, LOG_OUTGOING, fmt, ##args)
#define log_incoming(level, fmt, args...) \
		log_rdma(level, LOG_INCOMING, fmt, ##args)
#define log_read(level, fmt, args...)	log_rdma(level, LOG_READ, fmt, ##args)
#define log_write(level, fmt, args...)	log_rdma(level, LOG_WRITE, fmt, ##args)
#define log_rdma_send(level, fmt, args...) \
		log_rdma(level, LOG_RDMA_SEND, fmt, ##args)
#define log_rdma_recv(level, fmt, args...) \
		log_rdma(level, LOG_RDMA_RECV, fmt, ##args)
#define log_keep_alive(level, fmt, args...) \
		log_rdma(level, LOG_KEEP_ALIVE, fmt, ##args)
#define log_rdma_event(level, fmt, args...) \
		log_rdma(level, LOG_RDMA_EVENT, fmt, ##args)
#define log_rdma_mr(level, fmt, args...) \
		log_rdma(level, LOG_RDMA_MR, fmt, ##args)

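/*
 * Illustrative example derived from the macros above (no new behavior):
 * with smbd_logging_level=0 (ERR) and smbd_logging_class=0x80
 * (LOG_RDMA_EVENT), a call such as
 *
 *	log_rdma_event(INFO, "event=%d status=%d\n", event, status);
 *
 * still reaches cifs_dbg() because its class bit is selected, while
 * log_rdma_send(INFO, ...) prints nothing since neither its level nor its
 * class bit (LOG_RDMA_SEND, 0x10) is enabled.
 */
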
/*
 * Destroy the transport and related RDMA and memory resources
 * Need to go through all the pending counters and make sure no one is using
 * the transport while it is destroyed
 */
static void smbd_destroy_rdma_work(struct work_struct *work)
{
	struct smbd_response *response;
	struct smbd_connection *info =
		container_of(work, struct smbd_connection, destroy_work);
	unsigned long flags;

	log_rdma_event(INFO, "destroying qp\n");
	ib_drain_qp(info->id->qp);
	rdma_destroy_qp(info->id);

	/* Unblock all I/O waiting on the send queue */
	wake_up_interruptible_all(&info->wait_send_queue);

	log_rdma_event(INFO, "cancelling idle timer\n");
	cancel_delayed_work_sync(&info->idle_timer_work);
	log_rdma_event(INFO, "cancelling send immediate work\n");
	cancel_delayed_work_sync(&info->send_immediate_work);

	log_rdma_event(INFO, "wait for all send to finish\n");
	wait_event(info->wait_smbd_send_pending,
		info->smbd_send_pending == 0);

	log_rdma_event(INFO, "wait for all recv to finish\n");
	wake_up_interruptible(&info->wait_reassembly_queue);
	wait_event(info->wait_smbd_recv_pending,
		info->smbd_recv_pending == 0);

	log_rdma_event(INFO, "wait for all send posted to IB to finish\n");
	wait_event(info->wait_send_pending,
		atomic_read(&info->send_pending) == 0);
	wait_event(info->wait_send_payload_pending,
		atomic_read(&info->send_payload_pending) == 0);

	log_rdma_event(INFO, "freeing mr list\n");
	wake_up_interruptible_all(&info->wait_mr);
	wait_event(info->wait_for_mr_cleanup,
		atomic_read(&info->mr_used_count) == 0);
	destroy_mr_list(info);

	/* It's not possible for upper layer to get to reassembly */
	log_rdma_event(INFO, "drain the reassembly queue\n");
	do {
		spin_lock_irqsave(&info->reassembly_queue_lock, flags);
		response = _get_first_reassembly(info);
		if (response) {
			list_del(&response->list);
			spin_unlock_irqrestore(
				&info->reassembly_queue_lock, flags);
			put_receive_buffer(info, response);
		} else
			spin_unlock_irqrestore(&info->reassembly_queue_lock, flags);
	} while (response);

	info->reassembly_data_length = 0;

	log_rdma_event(INFO, "free receive buffers\n");
	wait_event(info->wait_receive_queues,
		info->count_receive_queue + info->count_empty_packet_queue
			== info->receive_credit_max);
	destroy_receive_buffers(info);

	ib_free_cq(info->send_cq);
	ib_free_cq(info->recv_cq);
	ib_dealloc_pd(info->pd);
	rdma_destroy_id(info->id);

	/* free mempools */
	mempool_destroy(info->request_mempool);
	kmem_cache_destroy(info->request_cache);

	mempool_destroy(info->response_mempool);
	kmem_cache_destroy(info->response_cache);

	info->transport_status = SMBD_DESTROYED;
	wake_up_all(&info->wait_destroy);
}

static int smbd_process_disconnected(struct smbd_connection *info)
{
	schedule_work(&info->destroy_work);
	return 0;
}

static void smbd_disconnect_rdma_work(struct work_struct *work)
{
	struct smbd_connection *info =
		container_of(work, struct smbd_connection, disconnect_work);

	if (info->transport_status == SMBD_CONNECTED) {
		info->transport_status = SMBD_DISCONNECTING;
		rdma_disconnect(info->id);
	}
}

static void smbd_disconnect_rdma_connection(struct smbd_connection *info)
{
	queue_work(info->workqueue, &info->disconnect_work);
}

/* Upcall from RDMA CM */
static int smbd_conn_upcall(
		struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct smbd_connection *info = id->context;

	log_rdma_event(INFO, "event=%d status=%d\n",
		event->event, event->status);

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		info->ri_rc = 0;
		complete(&info->ri_done);
		break;

	case RDMA_CM_EVENT_ADDR_ERROR:
		info->ri_rc = -EHOSTUNREACH;
		complete(&info->ri_done);
		break;

	case RDMA_CM_EVENT_ROUTE_ERROR:
		info->ri_rc = -ENETUNREACH;
		complete(&info->ri_done);
		break;

	case RDMA_CM_EVENT_ESTABLISHED:
		log_rdma_event(INFO, "connected event=%d\n", event->event);
		info->transport_status = SMBD_CONNECTED;
		wake_up_interruptible(&info->conn_wait);
		break;

	case RDMA_CM_EVENT_CONNECT_ERROR:
	case RDMA_CM_EVENT_UNREACHABLE:
	case RDMA_CM_EVENT_REJECTED:
		log_rdma_event(INFO, "connecting failed event=%d\n", event->event);
		info->transport_status = SMBD_DISCONNECTED;
		wake_up_interruptible(&info->conn_wait);
		break;

	case RDMA_CM_EVENT_DEVICE_REMOVAL:
	case RDMA_CM_EVENT_DISCONNECTED:
		/* This happens when we fail the negotiation */
		if (info->transport_status == SMBD_NEGOTIATE_FAILED) {
			info->transport_status = SMBD_DISCONNECTED;
			wake_up(&info->conn_wait);
			break;
		}

		info->transport_status = SMBD_DISCONNECTED;
		smbd_process_disconnected(info);
		wake_up(&info->disconn_wait);
		wake_up_interruptible(&info->wait_reassembly_queue);
		wake_up_interruptible_all(&info->wait_send_queue);
		break;

	default:
		break;
	}

	return 0;
}

/* Upcall from RDMA QP */
static void
smbd_qp_async_error_upcall(struct ib_event *event, void *context)
{
	struct smbd_connection *info = context;

	log_rdma_event(ERR, "%s on device %s info %p\n",
		ib_event_msg(event->event), event->device->name, info);

	switch (event->event) {
	case IB_EVENT_CQ_ERR:
	case IB_EVENT_QP_FATAL:
		smbd_disconnect_rdma_connection(info);

	default:
		break;
	}
}

static inline void *smbd_request_payload(struct smbd_request *request)
{
	return (void *)request->packet;
}

static inline void *smbd_response_payload(struct smbd_response *response)
{
	return (void *)response->packet;
}

/* Called when a RDMA send is done */
static void send_done(struct ib_cq *cq, struct ib_wc *wc)
{
	int i;
	struct smbd_request *request =
		container_of(wc->wr_cqe, struct smbd_request, cqe);

	log_rdma_send(INFO, "smbd_request %p completed wc->status=%d\n",
		request, wc->status);

	if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
		log_rdma_send(ERR, "wc->status=%d wc->opcode=%d\n",
			wc->status, wc->opcode);
		smbd_disconnect_rdma_connection(request->info);
	}

	for (i = 0; i < request->num_sge; i++)
		ib_dma_unmap_single(request->info->id->device,
			request->sge[i].addr,
			request->sge[i].length,
			DMA_TO_DEVICE);

	if (request->has_payload) {
		if (atomic_dec_and_test(&request->info->send_payload_pending))
			wake_up(&request->info->wait_send_payload_pending);
	} else {
		if (atomic_dec_and_test(&request->info->send_pending))
			wake_up(&request->info->wait_send_pending);
	}

	mempool_free(request, request->info->request_mempool);
}

static void dump_smbd_negotiate_resp(struct smbd_negotiate_resp *resp)
{
	log_rdma_event(INFO, "resp message min_version %u max_version %u "
		"negotiated_version %u credits_requested %u "
		"credits_granted %u status %u max_readwrite_size %u "
		"preferred_send_size %u max_receive_size %u "
		"max_fragmented_size %u\n",
		resp->min_version, resp->max_version, resp->negotiated_version,
		resp->credits_requested, resp->credits_granted, resp->status,
		resp->max_readwrite_size, resp->preferred_send_size,
		resp->max_receive_size, resp->max_fragmented_size);
}

/*
 * Process a negotiation response message, according to [MS-SMBD] 3.1.5.7
 * response, packet_length: the negotiation response message
 * return value: true if negotiation is a success, false if failed
 */
static bool process_negotiation_response(
		struct smbd_response *response, int packet_length)
{
	struct smbd_connection *info = response->info;
	struct smbd_negotiate_resp *packet = smbd_response_payload(response);

	if (packet_length < sizeof(struct smbd_negotiate_resp)) {
		log_rdma_event(ERR,
			"error: packet_length=%d\n", packet_length);
		return false;
	}

	if (le16_to_cpu(packet->negotiated_version) != SMBD_V1) {
		log_rdma_event(ERR, "error: negotiated_version=%x\n",
			le16_to_cpu(packet->negotiated_version));
		return false;
	}
	info->protocol = le16_to_cpu(packet->negotiated_version);

	if (packet->credits_requested == 0) {
		log_rdma_event(ERR, "error: credits_requested==0\n");
		return false;
	}
	info->receive_credit_target = le16_to_cpu(packet->credits_requested);

	if (packet->credits_granted == 0) {
		log_rdma_event(ERR, "error: credits_granted==0\n");
		return false;
	}
	atomic_set(&info->send_credits, le16_to_cpu(packet->credits_granted));

	atomic_set(&info->receive_credits, 0);

	if (le32_to_cpu(packet->preferred_send_size) > info->max_receive_size) {
		log_rdma_event(ERR, "error: preferred_send_size=%d\n",
			le32_to_cpu(packet->preferred_send_size));
		return false;
	}
	info->max_receive_size = le32_to_cpu(packet->preferred_send_size);

	if (le32_to_cpu(packet->max_receive_size) < SMBD_MIN_RECEIVE_SIZE) {
		log_rdma_event(ERR, "error: max_receive_size=%d\n",
			le32_to_cpu(packet->max_receive_size));
		return false;
	}
	info->max_send_size = min_t(int, info->max_send_size,
		le32_to_cpu(packet->max_receive_size));

	if (le32_to_cpu(packet->max_fragmented_size) <
			SMBD_MIN_FRAGMENTED_SIZE) {
		log_rdma_event(ERR, "error: max_fragmented_size=%d\n",
			le32_to_cpu(packet->max_fragmented_size));
		return false;
	}
	info->max_fragmented_send_size =
		le32_to_cpu(packet->max_fragmented_size);
	info->rdma_readwrite_threshold =
		rdma_readwrite_threshold > info->max_fragmented_send_size ?
		info->max_fragmented_send_size :
		rdma_readwrite_threshold;

	info->max_readwrite_size = min_t(u32,
			le32_to_cpu(packet->max_readwrite_size),
			info->max_frmr_depth * PAGE_SIZE);
	info->max_frmr_depth = info->max_readwrite_size / PAGE_SIZE;

	return true;
}

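/*
 * Worked example (illustrative only, using the defaults defined earlier in
 * this file): if the peer responds with preferred_send_size=1364,
 * max_receive_size=8192 and max_fragmented_size=1048576, the checks above
 * leave info->max_receive_size=1364, info->max_send_size=min(1364, 8192)=1364
 * and info->max_fragmented_send_size=1048576, and rdma_readwrite_threshold
 * stays at 4096 because it is below max_fragmented_send_size.
 */
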
/*
 * Check and schedule to send an immediate packet
 * This is used to extend credits to remote peer to keep the transport busy
 */
static void check_and_send_immediate(struct smbd_connection *info)
{
	if (info->transport_status != SMBD_CONNECTED)
		return;

	info->send_immediate = true;

	/*
	 * Promptly send a packet if our peer is running low on receive
	 * credits
	 */
	if (atomic_read(&info->receive_credits) <
		info->receive_credit_target - 1)
		queue_delayed_work(
			info->workqueue, &info->send_immediate_work, 0);
}

static void smbd_post_send_credits(struct work_struct *work)
{
	int ret = 0;
	int use_receive_queue = 1;
	int rc;
	struct smbd_response *response;
	struct smbd_connection *info =
		container_of(work, struct smbd_connection,
			post_send_credits_work);

	if (info->transport_status != SMBD_CONNECTED) {
		wake_up(&info->wait_receive_queues);
		return;
	}

	if (info->receive_credit_target >
		atomic_read(&info->receive_credits)) {
		while (true) {
			if (use_receive_queue)
				response = get_receive_buffer(info);
			else
				response = get_empty_queue_buffer(info);
			if (!response) {
				/* now switch to the empty packet queue */
				if (use_receive_queue) {
					use_receive_queue = 0;
					continue;
				} else
					break;
			}

			response->type = SMBD_TRANSFER_DATA;
			response->first_segment = false;
			rc = smbd_post_recv(info, response);
			if (rc) {
				log_rdma_recv(ERR,
					"post_recv failed rc=%d\n", rc);
				put_receive_buffer(info, response);
				break;
			}

			ret++;
		}
	}

	spin_lock(&info->lock_new_credits_offered);
	info->new_credits_offered += ret;
	spin_unlock(&info->lock_new_credits_offered);

	atomic_add(ret, &info->receive_credits);

	/* Check if we can post new receive and grant credits to peer */
	check_and_send_immediate(info);
}

static void smbd_recv_done_work(struct work_struct *work)
{
	struct smbd_connection *info =
		container_of(work, struct smbd_connection, recv_done_work);

	/*
	 * We may have new send credits granted from remote peer
	 * If any sender is blocked on lack of credits, unblock it
	 */
	if (atomic_read(&info->send_credits))
		wake_up_interruptible(&info->wait_send_queue);

	/*
	 * Check if we need to send something to remote peer to
	 * grant more credits or respond to KEEP_ALIVE packet
	 */
	check_and_send_immediate(info);
}

/* Called from softirq, when recv is done */
static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct smbd_data_transfer *data_transfer;
	struct smbd_response *response =
		container_of(wc->wr_cqe, struct smbd_response, cqe);
	struct smbd_connection *info = response->info;
	int data_length = 0;

	log_rdma_recv(INFO, "response=%p type=%d wc status=%d wc opcode %d "
		"byte_len=%d pkey_index=%x\n",
		response, response->type, wc->status, wc->opcode,
		wc->byte_len, wc->pkey_index);

	if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
		log_rdma_recv(INFO, "wc->status=%d opcode=%d\n",
			wc->status, wc->opcode);
		smbd_disconnect_rdma_connection(info);
		goto error;
	}

	ib_dma_sync_single_for_cpu(
		wc->qp->device,
		response->sge.addr,
		response->sge.length,
		DMA_FROM_DEVICE);

	switch (response->type) {
	/* SMBD negotiation response */
	case SMBD_NEGOTIATE_RESP:
		dump_smbd_negotiate_resp(smbd_response_payload(response));
		info->full_packet_received = true;
		info->negotiate_done =
			process_negotiation_response(response, wc->byte_len);
		complete(&info->negotiate_completion);
		break;

	/* SMBD data transfer packet */
	case SMBD_TRANSFER_DATA:
		data_transfer = smbd_response_payload(response);
		data_length = le32_to_cpu(data_transfer->data_length);

		/*
		 * If this is a packet with data payload place the data in
		 * reassembly queue and wake up the reading thread
		 */
		if (data_length) {
			if (info->full_packet_received)
				response->first_segment = true;

			if (le32_to_cpu(data_transfer->remaining_data_length))
				info->full_packet_received = false;
			else
				info->full_packet_received = true;

			enqueue_reassembly(
				info,
				response,
				data_length);
		} else
			put_empty_packet(info, response);

		if (data_length)
			wake_up_interruptible(&info->wait_reassembly_queue);

		atomic_dec(&info->receive_credits);
		info->receive_credit_target =
			le16_to_cpu(data_transfer->credits_requested);
		atomic_add(le16_to_cpu(data_transfer->credits_granted),
			&info->send_credits);

		log_incoming(INFO, "data flags %d data_offset %d "
			"data_length %d remaining_data_length %d\n",
			le16_to_cpu(data_transfer->flags),
			le32_to_cpu(data_transfer->data_offset),
			le32_to_cpu(data_transfer->data_length),
			le32_to_cpu(data_transfer->remaining_data_length));

		/* Send a KEEP_ALIVE response right away if requested */
		info->keep_alive_requested = KEEP_ALIVE_NONE;
		if (le16_to_cpu(data_transfer->flags) &
				SMB_DIRECT_RESPONSE_REQUESTED) {
			info->keep_alive_requested = KEEP_ALIVE_PENDING;
		}

		queue_work(info->workqueue, &info->recv_done_work);
		return;

	default:
		log_rdma_recv(ERR,
			"unexpected response type=%d\n", response->type);
	}

error:
	put_receive_buffer(info, response);
}

static struct rdma_cm_id *smbd_create_id(
		struct smbd_connection *info,
		struct sockaddr *dstaddr, int port)
{
	struct rdma_cm_id *id;
	int rc;
	__be16 *sport;

	id = rdma_create_id(&init_net, smbd_conn_upcall, info,
		RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(id)) {
		rc = PTR_ERR(id);
		log_rdma_event(ERR, "rdma_create_id() failed %i\n", rc);
		return id;
	}

	if (dstaddr->sa_family == AF_INET6)
		sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port;
	else
		sport = &((struct sockaddr_in *)dstaddr)->sin_port;

	*sport = htons(port);

	init_completion(&info->ri_done);
	info->ri_rc = -ETIMEDOUT;

	rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr,
		RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(
		&info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
	rc = info->ri_rc;
	if (rc) {
		log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
		goto out;
	}

	info->ri_rc = -ETIMEDOUT;
	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(
		&info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
	rc = info->ri_rc;
	if (rc) {
		log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
		goto out;
	}

	return id;

out:
	rdma_destroy_id(id);
	return ERR_PTR(rc);
}

/*
 * Test if FRWR (Fast Registration Work Requests) is supported on the device
 * This implementation requires FRWR on RDMA read/write
 * return value: true if it is supported
 */
static bool frwr_is_supported(struct ib_device_attr *attrs)
{
	if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
		return false;
	if (attrs->max_fast_reg_page_list_len == 0)
		return false;
	return true;
}

static int smbd_ia_open(
		struct smbd_connection *info,
		struct sockaddr *dstaddr, int port)
{
	int rc;

	info->id = smbd_create_id(info, dstaddr, port);
	if (IS_ERR(info->id)) {
		rc = PTR_ERR(info->id);
		goto out1;
	}

	if (!frwr_is_supported(&info->id->device->attrs)) {
		log_rdma_event(ERR,
			"Fast Registration Work Requests "
			"(FRWR) is not supported\n");
		log_rdma_event(ERR,
			"Device capability flags = %llx "
			"max_fast_reg_page_list_len = %u\n",
			info->id->device->attrs.device_cap_flags,
			info->id->device->attrs.max_fast_reg_page_list_len);
		rc = -EPROTONOSUPPORT;
		goto out2;
	}
	info->max_frmr_depth = min_t(int,
		smbd_max_frmr_depth,
		info->id->device->attrs.max_fast_reg_page_list_len);
	info->mr_type = IB_MR_TYPE_MEM_REG;
	if (info->id->device->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)
		info->mr_type = IB_MR_TYPE_SG_GAPS;

	info->pd = ib_alloc_pd(info->id->device, 0);
	if (IS_ERR(info->pd)) {
		rc = PTR_ERR(info->pd);
		log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc);
		goto out2;
	}

	return 0;

out2:
	rdma_destroy_id(info->id);
	info->id = NULL;

out1:
	return rc;
}

/*
 * Send a negotiation request message to the peer
 * The negotiation procedure is in [MS-SMBD] 3.1.5.2 and 3.1.5.3
 * After negotiation, the transport is connected and ready for
 * carrying upper layer SMB payload
 */
static int smbd_post_send_negotiate_req(struct smbd_connection *info)
{
	struct ib_send_wr send_wr;
	int rc = -ENOMEM;
	struct smbd_request *request;
	struct smbd_negotiate_req *packet;

	request = mempool_alloc(info->request_mempool, GFP_KERNEL);
	if (!request)
		return rc;

	request->info = info;

	packet = smbd_request_payload(request);
	packet->min_version = cpu_to_le16(SMBD_V1);
	packet->max_version = cpu_to_le16(SMBD_V1);
	packet->reserved = 0;
	packet->credits_requested = cpu_to_le16(info->send_credit_target);
	packet->preferred_send_size = cpu_to_le32(info->max_send_size);
	packet->max_receive_size = cpu_to_le32(info->max_receive_size);
	packet->max_fragmented_size =
		cpu_to_le32(info->max_fragmented_recv_size);

	request->num_sge = 1;
	request->sge[0].addr = ib_dma_map_single(
				info->id->device, (void *)packet,
				sizeof(*packet), DMA_TO_DEVICE);
	if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
		rc = -EIO;
		goto dma_mapping_failed;
	}

	request->sge[0].length = sizeof(*packet);
	request->sge[0].lkey = info->pd->local_dma_lkey;

	ib_dma_sync_single_for_device(
		info->id->device, request->sge[0].addr,
		request->sge[0].length, DMA_TO_DEVICE);

	request->cqe.done = send_done;

	send_wr.next = NULL;
	send_wr.wr_cqe = &request->cqe;
	send_wr.sg_list = request->sge;
	send_wr.num_sge = request->num_sge;
	send_wr.opcode = IB_WR_SEND;
	send_wr.send_flags = IB_SEND_SIGNALED;

	log_rdma_send(INFO, "sge addr=%llx length=%x lkey=%x\n",
		request->sge[0].addr,
		request->sge[0].length, request->sge[0].lkey);

	request->has_payload = false;
	atomic_inc(&info->send_pending);
	rc = ib_post_send(info->id->qp, &send_wr, NULL);
	if (!rc)
		return 0;

	/* if we reach here, post send failed */
	log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
	atomic_dec(&info->send_pending);
	ib_dma_unmap_single(info->id->device, request->sge[0].addr,
		request->sge[0].length, DMA_TO_DEVICE);

	smbd_disconnect_rdma_connection(info);

dma_mapping_failed:
	mempool_free(request, info->request_mempool);
	return rc;
}

/*
 * Extend the credits to remote peer
 * This implements [MS-SMBD] 3.1.5.9
 * The idea is that we should extend credits to remote peer as quickly as
 * it's allowed, to maintain data flow. We allocate as much receive
 * buffer as possible, and extend the receive credits to remote peer
 * return value: the new credits being granted.
 */
static int manage_credits_prior_sending(struct smbd_connection *info)
{
	int new_credits;

	spin_lock(&info->lock_new_credits_offered);
	new_credits = info->new_credits_offered;
	info->new_credits_offered = 0;
	spin_unlock(&info->lock_new_credits_offered);

	return new_credits;
}

/*
 * Check if we need to send a KEEP_ALIVE message
 * The idle connection timer triggers a KEEP_ALIVE message when it expires
 * SMB_DIRECT_RESPONSE_REQUESTED is set in the message flag to have peer send
 * back a response.
 * return value:
 * 1 if SMB_DIRECT_RESPONSE_REQUESTED needs to be set
 * 0: otherwise
 */
static int manage_keep_alive_before_sending(struct smbd_connection *info)
{
	if (info->keep_alive_requested == KEEP_ALIVE_PENDING) {
		info->keep_alive_requested = KEEP_ALIVE_SENT;
		return 1;
	}
	return 0;
}

/*
 * Build and prepare the SMBD packet header
 * This function waits for available send credits and builds a SMBD packet
 * header. The caller can then optionally append payload to the packet after
 * the header
 * input values
 * size: the size of the payload
 * remaining_data_length: remaining data to send if this is part of a
 * fragmented packet
 * output values
 * request_out: the request allocated from this function
 * return values: 0 on success, otherwise actual error code returned
 */
static int smbd_create_header(struct smbd_connection *info,
		int size, int remaining_data_length,
		struct smbd_request **request_out)
{
	struct smbd_request *request;
	struct smbd_data_transfer *packet;
	int header_length;
	int rc;

	/* Wait for send credits. A SMBD packet needs one credit */
	rc = wait_event_interruptible(info->wait_send_queue,
		atomic_read(&info->send_credits) > 0 ||
		info->transport_status != SMBD_CONNECTED);
	if (rc)
		return rc;

	if (info->transport_status != SMBD_CONNECTED) {
		log_outgoing(ERR, "disconnected not sending\n");
		return -ENOENT;
	}
	atomic_dec(&info->send_credits);

	request = mempool_alloc(info->request_mempool, GFP_KERNEL);
	if (!request) {
		rc = -ENOMEM;
		goto err;
	}

	request->info = info;

	/* Fill in the packet header */
	packet = smbd_request_payload(request);
	packet->credits_requested = cpu_to_le16(info->send_credit_target);
	packet->credits_granted =
		cpu_to_le16(manage_credits_prior_sending(info));
	info->send_immediate = false;

	packet->flags = 0;
	if (manage_keep_alive_before_sending(info))
		packet->flags |= cpu_to_le16(SMB_DIRECT_RESPONSE_REQUESTED);

	packet->reserved = 0;
	if (!size)
		packet->data_offset = 0;
	else
		packet->data_offset = cpu_to_le32(24);
	packet->data_length = cpu_to_le32(size);
	packet->remaining_data_length = cpu_to_le32(remaining_data_length);
	packet->padding = 0;

	log_outgoing(INFO, "credits_requested=%d credits_granted=%d "
		"data_offset=%d data_length=%d remaining_data_length=%d\n",
		le16_to_cpu(packet->credits_requested),
		le16_to_cpu(packet->credits_granted),
		le32_to_cpu(packet->data_offset),
		le32_to_cpu(packet->data_length),
		le32_to_cpu(packet->remaining_data_length));

	/* Map the packet to DMA */
	header_length = sizeof(struct smbd_data_transfer);
	/* If this is a packet without payload, don't send padding */
	if (!size)
		header_length = offsetof(struct smbd_data_transfer, padding);

	request->num_sge = 1;
	request->sge[0].addr = ib_dma_map_single(info->id->device,
						 (void *)packet,
						 header_length,
						 DMA_BIDIRECTIONAL);
	if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
		mempool_free(request, info->request_mempool);
		rc = -EIO;
		goto err;
	}

	request->sge[0].length = header_length;
	request->sge[0].lkey = info->pd->local_dma_lkey;

	*request_out = request;
	return 0;

err:
	atomic_inc(&info->send_credits);
	return rc;
}

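/*
 * Sketch of the header produced above for a hypothetical 512-byte payload
 * that is the last fragment of a send (values for illustration only):
 * data_offset=24, data_length=512, remaining_data_length=0, plus whatever
 * credits are requested/granted at that moment. For a zero-length
 * credit/keep-alive packet, data_offset and data_length are 0 and the
 * padding field is not included in the mapped length.
 */
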
static void smbd_destroy_header(struct smbd_connection *info,
		struct smbd_request *request)
{

	ib_dma_unmap_single(info->id->device,
			    request->sge[0].addr,
			    request->sge[0].length,
			    DMA_TO_DEVICE);
	mempool_free(request, info->request_mempool);
	atomic_inc(&info->send_credits);
}

/* Post the send request */
static int smbd_post_send(struct smbd_connection *info,
		struct smbd_request *request, bool has_payload)
{
	struct ib_send_wr send_wr;
	int rc, i;

	for (i = 0; i < request->num_sge; i++) {
		log_rdma_send(INFO,
			"rdma_request sge[%d] addr=%llu length=%u\n",
			i, request->sge[i].addr, request->sge[i].length);
		ib_dma_sync_single_for_device(
			info->id->device,
			request->sge[i].addr,
			request->sge[i].length,
			DMA_TO_DEVICE);
	}

	request->cqe.done = send_done;

	send_wr.next = NULL;
	send_wr.wr_cqe = &request->cqe;
	send_wr.sg_list = request->sge;
	send_wr.num_sge = request->num_sge;
	send_wr.opcode = IB_WR_SEND;
	send_wr.send_flags = IB_SEND_SIGNALED;

	if (has_payload) {
		request->has_payload = true;
		atomic_inc(&info->send_payload_pending);
	} else {
		request->has_payload = false;
		atomic_inc(&info->send_pending);
	}

	rc = ib_post_send(info->id->qp, &send_wr, NULL);
	if (rc) {
		log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
		if (has_payload) {
			if (atomic_dec_and_test(&info->send_payload_pending))
				wake_up(&info->wait_send_payload_pending);
		} else {
			if (atomic_dec_and_test(&info->send_pending))
				wake_up(&info->wait_send_pending);
		}
		smbd_disconnect_rdma_connection(info);
	} else
		/* Reset timer for idle connection after packet is sent */
		mod_delayed_work(info->workqueue, &info->idle_timer_work,
			info->keep_alive_interval*HZ);

	return rc;
}

static int smbd_post_send_sgl(struct smbd_connection *info,
	struct scatterlist *sgl, int data_length, int remaining_data_length)
{
	int num_sgs;
	int i, rc;
	struct smbd_request *request;
	struct scatterlist *sg;

	rc = smbd_create_header(
		info, data_length, remaining_data_length, &request);
	if (rc)
		return rc;

	num_sgs = sgl ? sg_nents(sgl) : 0;
	for_each_sg(sgl, sg, num_sgs, i) {
		request->sge[i+1].addr =
			ib_dma_map_page(info->id->device, sg_page(sg),
				sg->offset, sg->length, DMA_BIDIRECTIONAL);
		if (ib_dma_mapping_error(
				info->id->device, request->sge[i+1].addr)) {
			rc = -EIO;
			request->sge[i+1].addr = 0;
			goto dma_mapping_failure;
		}
		request->sge[i+1].length = sg->length;
		request->sge[i+1].lkey = info->pd->local_dma_lkey;
		request->num_sge++;
	}

	rc = smbd_post_send(info, request, data_length);
	if (!rc)
		return 0;

dma_mapping_failure:
	for (i = 1; i < request->num_sge; i++)
		if (request->sge[i].addr)
			ib_dma_unmap_single(info->id->device,
					    request->sge[i].addr,
					    request->sge[i].length,
					    DMA_TO_DEVICE);
	smbd_destroy_header(info, request);
	return rc;
}

/*
 * Send a page
 * page: the page to send
 * offset: offset in the page to send
 * size: length in the page to send
 * remaining_data_length: remaining data to send in this payload
 */
static int smbd_post_send_page(struct smbd_connection *info, struct page *page,
		unsigned long offset, size_t size, int remaining_data_length)
{
	struct scatterlist sgl;

	sg_init_table(&sgl, 1);
	sg_set_page(&sgl, page, size, offset);

	return smbd_post_send_sgl(info, &sgl, size, remaining_data_length);
}

/*
 * Send an empty message
 * An empty message is used to extend credits to the peer and keep the
 * connection alive while there is no upper layer payload to send at the time
 */
static int smbd_post_send_empty(struct smbd_connection *info)
{
	info->count_send_empty++;
	return smbd_post_send_sgl(info, NULL, 0, 0);
}

/*
 * Send a data buffer
 * iov: the iov array describing the data buffers
 * n_vec: number of entries in the iov array
 * remaining_data_length: remaining data to send following this packet
 * in segmented SMBD packet
 */
static int smbd_post_send_data(
	struct smbd_connection *info, struct kvec *iov, int n_vec,
	int remaining_data_length)
{
	int i;
	u32 data_length = 0;
	struct scatterlist sgl[SMBDIRECT_MAX_SGE];

	if (n_vec > SMBDIRECT_MAX_SGE) {
		cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec);
		return -ENOMEM;
	}

	sg_init_table(sgl, n_vec);
	for (i = 0; i < n_vec; i++) {
		data_length += iov[i].iov_len;
		sg_set_buf(&sgl[i], iov[i].iov_base, iov[i].iov_len);
	}

	return smbd_post_send_sgl(info, sgl, data_length, remaining_data_length);
}

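/*
 * Usage sketch (hypothetical caller, for illustration only): sending an
 * SMB2 header and its body as a single SMBD message could look like
 *
 *	struct kvec iov[2] = {
 *		{ .iov_base = hdr,  .iov_len = hdr_len  },
 *		{ .iov_base = body, .iov_len = body_len },
 *	};
 *	rc = smbd_post_send_data(info, iov, 2, 0);
 *
 * where a non-zero remaining_data_length would indicate that more fragments
 * of the same upper-layer payload follow in this SMBD send.
 */
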
/*
 * Post a receive request to the transport
 * The remote peer can only send data when a receive request is posted
 * The interaction is controlled by send/receive credit system
 */
static int smbd_post_recv(
		struct smbd_connection *info, struct smbd_response *response)
{
	struct ib_recv_wr recv_wr;
	int rc = -EIO;

	response->sge.addr = ib_dma_map_single(
				info->id->device, response->packet,
				info->max_receive_size, DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(info->id->device, response->sge.addr))
		return rc;

	response->sge.length = info->max_receive_size;
	response->sge.lkey = info->pd->local_dma_lkey;

	response->cqe.done = recv_done;

	recv_wr.wr_cqe = &response->cqe;
	recv_wr.next = NULL;
	recv_wr.sg_list = &response->sge;
	recv_wr.num_sge = 1;

	rc = ib_post_recv(info->id->qp, &recv_wr, NULL);
	if (rc) {
		ib_dma_unmap_single(info->id->device, response->sge.addr,
				    response->sge.length, DMA_FROM_DEVICE);
		smbd_disconnect_rdma_connection(info);
		log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc);
	}

	return rc;
}

/* Perform SMBD negotiate according to [MS-SMBD] 3.1.5.2 */
static int smbd_negotiate(struct smbd_connection *info)
{
	int rc;
	struct smbd_response *response = get_receive_buffer(info);

	response->type = SMBD_NEGOTIATE_RESP;
	rc = smbd_post_recv(info, response);
	log_rdma_event(INFO,
		"smbd_post_recv rc=%d iov.addr=%llx iov.length=%x "
		"iov.lkey=%x\n",
		rc, response->sge.addr,
		response->sge.length, response->sge.lkey);
	if (rc)
		return rc;

	init_completion(&info->negotiate_completion);
	info->negotiate_done = false;
	rc = smbd_post_send_negotiate_req(info);
	if (rc)
		return rc;

	rc = wait_for_completion_interruptible_timeout(
		&info->negotiate_completion, SMBD_NEGOTIATE_TIMEOUT * HZ);
	log_rdma_event(INFO, "wait_for_completion_timeout rc=%d\n", rc);

	if (info->negotiate_done)
		return 0;

	if (rc == 0)
		rc = -ETIMEDOUT;
	else if (rc == -ERESTARTSYS)
		rc = -EINTR;
	else
		rc = -ENOTCONN;

	return rc;
}

static void put_empty_packet(
		struct smbd_connection *info, struct smbd_response *response)
{
	spin_lock(&info->empty_packet_queue_lock);
	list_add_tail(&response->list, &info->empty_packet_queue);
	info->count_empty_packet_queue++;
	spin_unlock(&info->empty_packet_queue_lock);

	queue_work(info->workqueue, &info->post_send_credits_work);
}

/*
 * Implement Connection.FragmentReassemblyBuffer defined in [MS-SMBD] 3.1.1.1
 * This is a queue for reassembling upper layer payload and presenting it to
 * the upper layer. All incoming payloads go to the reassembly queue,
 * regardless of whether reassembly is required. The upper layer code reads
 * from the queue for all incoming payloads.
 * Put a received packet to the reassembly queue
 * response: the packet received
 * data_length: the size of payload in this packet
 */
static void enqueue_reassembly(
	struct smbd_connection *info,
	struct smbd_response *response,
	int data_length)
{
	spin_lock(&info->reassembly_queue_lock);
	list_add_tail(&response->list, &info->reassembly_queue);
	info->reassembly_queue_length++;
	/*
	 * Make sure reassembly_data_length is updated after list and
	 * reassembly_queue_length are updated. On the dequeue side
	 * reassembly_data_length is checked without a lock to determine
	 * if reassembly_queue_length and list is up to date
	 */
	virt_wmb();
	info->reassembly_data_length += data_length;
	spin_unlock(&info->reassembly_queue_lock);
	info->count_reassembly_queue++;
	info->count_enqueue_reassembly_queue++;
}

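/*
 * Sketch of the dequeue-side pairing for the virt_wmb() above (illustrative;
 * the actual consumer is in the receive path outside this excerpt): the
 * reader may check reassembly_data_length without taking
 * reassembly_queue_lock, then issue a read barrier before trusting the
 * queue contents, e.g.
 *
 *	if (info->reassembly_data_length >= size) {
 *		virt_rmb();
 *		response = _get_first_reassembly(info);
 *		...
 *	}
 */
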
/*
 * Get the first entry at the front of reassembly queue
 * Caller is responsible for locking
 * return value: the first entry if any, NULL if queue is empty
 */
static struct smbd_response *_get_first_reassembly(struct smbd_connection *info)
{
	struct smbd_response *ret = NULL;

	if (!list_empty(&info->reassembly_queue)) {
		ret = list_first_entry(
			&info->reassembly_queue,
			struct smbd_response, list);
	}
	return ret;
}

static struct smbd_response *get_empty_queue_buffer(
		struct smbd_connection *info)
{
	struct smbd_response *ret = NULL;
	unsigned long flags;

	spin_lock_irqsave(&info->empty_packet_queue_lock, flags);
	if (!list_empty(&info->empty_packet_queue)) {
		ret = list_first_entry(
			&info->empty_packet_queue,
			struct smbd_response, list);
		list_del(&ret->list);
		info->count_empty_packet_queue--;
	}
	spin_unlock_irqrestore(&info->empty_packet_queue_lock, flags);

	return ret;
}

/*
 * Get a receive buffer
 * For each remote send, we need to post a receive. The receive buffers are
 * pre-allocated in advance.
 * return value: the receive buffer, NULL if none is available
 */
static struct smbd_response *get_receive_buffer(struct smbd_connection *info)
{
	struct smbd_response *ret = NULL;
	unsigned long flags;

	spin_lock_irqsave(&info->receive_queue_lock, flags);
	if (!list_empty(&info->receive_queue)) {
		ret = list_first_entry(
			&info->receive_queue,
			struct smbd_response, list);
		list_del(&ret->list);
		info->count_receive_queue--;
		info->count_get_receive_buffer++;
	}
	spin_unlock_irqrestore(&info->receive_queue_lock, flags);

	return ret;
}

/*
 * Return a receive buffer
 * Upon returning of a receive buffer, we can post new receive and extend
 * more receive credits to remote peer. This is done immediately after a
 * receive buffer is returned.
 */
static void put_receive_buffer(
	struct smbd_connection *info, struct smbd_response *response)
{
	unsigned long flags;

	ib_dma_unmap_single(info->id->device, response->sge.addr,
		response->sge.length, DMA_FROM_DEVICE);

	spin_lock_irqsave(&info->receive_queue_lock, flags);
	list_add_tail(&response->list, &info->receive_queue);
	info->count_receive_queue++;
	info->count_put_receive_buffer++;
	spin_unlock_irqrestore(&info->receive_queue_lock, flags);

	queue_work(info->workqueue, &info->post_send_credits_work);
}

/* Preallocate all receive buffers on transport establishment */
static int allocate_receive_buffers(struct smbd_connection *info, int num_buf)
{
	int i;
	struct smbd_response *response;

	INIT_LIST_HEAD(&info->reassembly_queue);
	spin_lock_init(&info->reassembly_queue_lock);
	info->reassembly_data_length = 0;
	info->reassembly_queue_length = 0;

	INIT_LIST_HEAD(&info->receive_queue);
	spin_lock_init(&info->receive_queue_lock);
	info->count_receive_queue = 0;

	INIT_LIST_HEAD(&info->empty_packet_queue);
	spin_lock_init(&info->empty_packet_queue_lock);
	info->count_empty_packet_queue = 0;

	init_waitqueue_head(&info->wait_receive_queues);

	for (i = 0; i < num_buf; i++) {
		response = mempool_alloc(info->response_mempool, GFP_KERNEL);
		if (!response)
			goto allocate_failed;

		response->info = info;
		list_add_tail(&response->list, &info->receive_queue);
		info->count_receive_queue++;
	}

	return 0;

allocate_failed:
	while (!list_empty(&info->receive_queue)) {
		response = list_first_entry(
				&info->receive_queue,
				struct smbd_response, list);
		list_del(&response->list);
		info->count_receive_queue--;

		mempool_free(response, info->response_mempool);
	}
	return -ENOMEM;
}

static void destroy_receive_buffers(struct smbd_connection *info)
{
	struct smbd_response *response;

	while ((response = get_receive_buffer(info)))
		mempool_free(response, info->response_mempool);

	while ((response = get_empty_queue_buffer(info)))
		mempool_free(response, info->response_mempool);
}

/*
 * Check and send an immediate or keep alive packet
 * The conditions to send those packets are defined in [MS-SMBD] 3.1.1.1
 * Connection.KeepaliveRequested and Connection.SendImmediate
 * The idea is to extend credits to server as soon as they become available
 */
static void send_immediate_work(struct work_struct *work)
{
	struct smbd_connection *info = container_of(
					work, struct smbd_connection,
					send_immediate_work.work);

	if (info->keep_alive_requested == KEEP_ALIVE_PENDING ||
	    info->send_immediate) {
		log_keep_alive(INFO, "send an empty message\n");
		smbd_post_send_empty(info);
	}
}

/* Implement idle connection timer [MS-SMBD] 3.1.6.2 */
static void idle_connection_timer(struct work_struct *work)
{
	struct smbd_connection *info = container_of(
					work, struct smbd_connection,
					idle_timer_work.work);

	if (info->keep_alive_requested != KEEP_ALIVE_NONE) {
		log_keep_alive(ERR,
			"error status info->keep_alive_requested=%d\n",
			info->keep_alive_requested);
		smbd_disconnect_rdma_connection(info);
		return;
	}

	log_keep_alive(INFO, "about to send an empty idle message\n");
	smbd_post_send_empty(info);

	/* Setup the next idle timeout work */
	queue_delayed_work(info->workqueue, &info->idle_timer_work,
			info->keep_alive_interval*HZ);
}

/*
 * Destroy the transport and related RDMA and memory resources
 * Need to go through all the pending counters and make sure no one is using
 * the transport while it is destroyed
 */
void smbd_destroy(struct TCP_Server_Info *server)
{
	struct smbd_connection *info = server->smbd_conn;
	struct smbd_response *response;
	unsigned long flags;

	if (!info) {
		log_rdma_event(INFO, "rdma session already destroyed\n");
		return;
	}

	log_rdma_event(INFO, "destroying rdma session\n");
	if (info->transport_status != SMBD_DISCONNECTED) {
		rdma_disconnect(server->smbd_conn->id);
		log_rdma_event(INFO, "wait for transport being disconnected\n");
		wait_event(
			info->disconn_wait,
			info->transport_status == SMBD_DISCONNECTED);
	}

	log_rdma_event(INFO, "destroying qp\n");
	ib_drain_qp(info->id->qp);
	rdma_destroy_qp(info->id);

	log_rdma_event(INFO, "cancelling idle timer\n");
	cancel_delayed_work_sync(&info->idle_timer_work);
	log_rdma_event(INFO, "cancelling send immediate work\n");
	cancel_delayed_work_sync(&info->send_immediate_work);

	log_rdma_event(INFO, "wait for all send posted to IB to finish\n");
	wait_event(info->wait_send_pending,
		atomic_read(&info->send_pending) == 0);
	wait_event(info->wait_send_payload_pending,
		atomic_read(&info->send_payload_pending) == 0);

	/* It's not possible for upper layer to get to reassembly */
	log_rdma_event(INFO, "drain the reassembly queue\n");
	do {
		spin_lock_irqsave(&info->reassembly_queue_lock, flags);
		response = _get_first_reassembly(info);
		if (response) {
			list_del(&response->list);
			spin_unlock_irqrestore(
				&info->reassembly_queue_lock, flags);
			put_receive_buffer(info, response);
		} else
			spin_unlock_irqrestore(
				&info->reassembly_queue_lock, flags);
	} while (response);
	info->reassembly_data_length = 0;

	log_rdma_event(INFO, "free receive buffers\n");
	wait_event(info->wait_receive_queues,
		info->count_receive_queue + info->count_empty_packet_queue
			== info->receive_credit_max);
	destroy_receive_buffers(info);

	/*
	 * For performance reasons, memory registration and deregistration
	 * are not locked by srv_mutex. It is possible some processes are
	 * blocked on transport srv_mutex while holding memory registration.
	 * Release the transport srv_mutex to allow them to hit the failure
	 * path when sending data, and then release memory registrations.
	 */
	log_rdma_event(INFO, "freeing mr list\n");
	wake_up_interruptible_all(&info->wait_mr);
	while (atomic_read(&info->mr_used_count)) {
		mutex_unlock(&server->srv_mutex);
		msleep(1000);
		mutex_lock(&server->srv_mutex);
	}
	destroy_mr_list(info);

	ib_free_cq(info->send_cq);
	ib_free_cq(info->recv_cq);
	ib_dealloc_pd(info->pd);
	rdma_destroy_id(info->id);

	/* free mempools */
	mempool_destroy(info->request_mempool);
	kmem_cache_destroy(info->request_cache);

	mempool_destroy(info->response_mempool);
	kmem_cache_destroy(info->response_cache);

	info->transport_status = SMBD_DESTROYED;

	destroy_workqueue(info->workqueue);
	kfree(info);
}

/*
 * Reconnect this SMBD connection, called from upper layer
 * return value: 0 on success, or actual error code
 */
int smbd_reconnect(struct TCP_Server_Info *server)
{
	log_rdma_event(INFO, "reconnecting rdma session\n");

	if (!server->smbd_conn) {
		log_rdma_event(INFO, "rdma session already destroyed\n");
		goto create_conn;
	}

	/*
	 * This is possible if transport is disconnected and we haven't received
	 * notification from RDMA, but upper layer has detected timeout
	 */
	if (server->smbd_conn->transport_status == SMBD_CONNECTED) {
		log_rdma_event(INFO, "disconnecting transport\n");
		smbd_destroy(server);
	}

create_conn:
	log_rdma_event(INFO, "creating rdma session\n");
	server->smbd_conn = smbd_get_connection(
		server, (struct sockaddr *) &server->dstaddr);
	log_rdma_event(INFO, "created rdma session info=%p\n",
		server->smbd_conn);

	return server->smbd_conn ? 0 : -ENOENT;
}

static void destroy_caches_and_workqueue(struct smbd_connection *info)
{
	destroy_receive_buffers(info);
	destroy_workqueue(info->workqueue);
	mempool_destroy(info->response_mempool);
	kmem_cache_destroy(info->response_cache);
	mempool_destroy(info->request_mempool);
	kmem_cache_destroy(info->request_cache);
}

#define MAX_NAME_LEN	80
static int allocate_caches_and_workqueue(struct smbd_connection *info)
{
	char name[MAX_NAME_LEN];
	int rc;

	scnprintf(name, MAX_NAME_LEN, "smbd_request_%p", info);
	info->request_cache =
		kmem_cache_create(
			name,
			sizeof(struct smbd_request) +
				sizeof(struct smbd_data_transfer),
			0, SLAB_HWCACHE_ALIGN, NULL);
	if (!info->request_cache)
		return -ENOMEM;

	info->request_mempool =
		mempool_create(info->send_credit_target, mempool_alloc_slab,
			mempool_free_slab, info->request_cache);
	if (!info->request_mempool)
		goto out1;

	scnprintf(name, MAX_NAME_LEN, "smbd_response_%p", info);
	info->response_cache =
		kmem_cache_create(
			name,
			sizeof(struct smbd_response) +
				info->max_receive_size,
			0, SLAB_HWCACHE_ALIGN, NULL);
	if (!info->response_cache)
		goto out2;

	info->response_mempool =
		mempool_create(info->receive_credit_max, mempool_alloc_slab,
			mempool_free_slab, info->response_cache);
	if (!info->response_mempool)
		goto out3;

	scnprintf(name, MAX_NAME_LEN, "smbd_%p", info);
	info->workqueue = create_workqueue(name);
	if (!info->workqueue)
		goto out4;

	rc = allocate_receive_buffers(info, info->receive_credit_max);
	if (rc) {
		log_rdma_event(ERR, "failed to allocate receive buffers\n");
		goto out5;
	}

	return 0;

out5:
	destroy_workqueue(info->workqueue);
out4:
	mempool_destroy(info->response_mempool);
out3:
	kmem_cache_destroy(info->response_cache);
out2:
	mempool_destroy(info->request_mempool);
out1:
	kmem_cache_destroy(info->request_cache);
	return -ENOMEM;
}

1686/* Create a SMBD connection, called by upper layer */
kbuild test robot90844322017-12-18 21:30:06 +08001687static struct smbd_connection *_smbd_get_connection(
Long Lif1981862017-11-04 18:17:24 -07001688 struct TCP_Server_Info *server, struct sockaddr *dstaddr, int port)
1689{
1690 int rc;
1691 struct smbd_connection *info;
1692 struct rdma_conn_param conn_param;
1693 struct ib_qp_init_attr qp_attr;
1694 struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr;
Long Lic7398582017-11-22 17:38:44 -07001695 struct ib_port_immutable port_immutable;
1696 u32 ird_ord_hdr[2];
Long Lif1981862017-11-04 18:17:24 -07001697
1698 info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL);
1699 if (!info)
1700 return NULL;
1701
1702 info->transport_status = SMBD_CONNECTING;
1703 rc = smbd_ia_open(info, dstaddr, port);
1704 if (rc) {
1705 log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc);
1706 goto create_id_failed;
1707 }
1708
1709 if (smbd_send_credit_target > info->id->device->attrs.max_cqe ||
1710 smbd_send_credit_target > info->id->device->attrs.max_qp_wr) {
1711 log_rdma_event(ERR,
1712 "consider lowering send_credit_target = %d. "
1713 "Possible CQE overrun, device "
1714 "reporting max_cpe %d max_qp_wr %d\n",
1715 smbd_send_credit_target,
1716 info->id->device->attrs.max_cqe,
1717 info->id->device->attrs.max_qp_wr);
1718 goto config_failed;
1719 }
1720
1721 if (smbd_receive_credit_max > info->id->device->attrs.max_cqe ||
1722 smbd_receive_credit_max > info->id->device->attrs.max_qp_wr) {
1723 log_rdma_event(ERR,
1724 "consider lowering receive_credit_max = %d. "
1725 "Possible CQE overrun, device "
1726 "reporting max_cpe %d max_qp_wr %d\n",
1727 smbd_receive_credit_max,
1728 info->id->device->attrs.max_cqe,
1729 info->id->device->attrs.max_qp_wr);
1730 goto config_failed;
1731 }
1732
1733 info->receive_credit_max = smbd_receive_credit_max;
1734 info->send_credit_target = smbd_send_credit_target;
1735 info->max_send_size = smbd_max_send_size;
1736 info->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
1737 info->max_receive_size = smbd_max_receive_size;
1738 info->keep_alive_interval = smbd_keep_alive_interval;
1739
Steve Wise33023fb2018-06-18 08:05:26 -07001740 if (info->id->device->attrs.max_send_sge < SMBDIRECT_MAX_SGE) {
1741 log_rdma_event(ERR,
1742 "warning: device max_send_sge = %d too small\n",
1743 info->id->device->attrs.max_send_sge);
1744 log_rdma_event(ERR, "Queue Pair creation may fail\n");
1745 }
1746 if (info->id->device->attrs.max_recv_sge < SMBDIRECT_MAX_SGE) {
1747 log_rdma_event(ERR,
1748 "warning: device max_recv_sge = %d too small\n",
1749 info->id->device->attrs.max_recv_sge);
Long Lif1981862017-11-04 18:17:24 -07001750 log_rdma_event(ERR, "Queue Pair creation may fail\n");
1751 }
1752
1753 info->send_cq = NULL;
1754 info->recv_cq = NULL;
1755 info->send_cq = ib_alloc_cq(info->id->device, info,
1756 info->send_credit_target, 0, IB_POLL_SOFTIRQ);
1757 if (IS_ERR(info->send_cq)) {
1758 info->send_cq = NULL;
1759 goto alloc_cq_failed;
1760 }
1761
1762 info->recv_cq = ib_alloc_cq(info->id->device, info,
1763 info->receive_credit_max, 0, IB_POLL_SOFTIRQ);
1764 if (IS_ERR(info->recv_cq)) {
1765 info->recv_cq = NULL;
1766 goto alloc_cq_failed;
1767 }
1768
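	/* Build a reliable connected (RC) QP whose queue depths match the credit limits */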
1769 memset(&qp_attr, 0, sizeof(qp_attr));
1770 qp_attr.event_handler = smbd_qp_async_error_upcall;
1771 qp_attr.qp_context = info;
1772 qp_attr.cap.max_send_wr = info->send_credit_target;
1773 qp_attr.cap.max_recv_wr = info->receive_credit_max;
1774 qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SGE;
1775 qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_SGE;
1776 qp_attr.cap.max_inline_data = 0;
1777 qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
1778 qp_attr.qp_type = IB_QPT_RC;
1779 qp_attr.send_cq = info->send_cq;
1780 qp_attr.recv_cq = info->recv_cq;
1781 qp_attr.port_num = ~0;
1782
1783 rc = rdma_create_qp(info->id, info->pd, &qp_attr);
1784 if (rc) {
1785 log_rdma_event(ERR, "rdma_create_qp failed %i\n", rc);
1786 goto create_qp_failed;
1787 }
1788
1789 memset(&conn_param, 0, sizeof(conn_param));
1790 conn_param.initiator_depth = 0;
1791
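	/* Cap responder resources at what the device can support */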
Long Lic7398582017-11-22 17:38:44 -07001792 conn_param.responder_resources =
1793 info->id->device->attrs.max_qp_rd_atom
1794 < SMBD_CM_RESPONDER_RESOURCES ?
1795 info->id->device->attrs.max_qp_rd_atom :
1796 SMBD_CM_RESPONDER_RESOURCES;
1797 info->responder_resources = conn_param.responder_resources;
1798 log_rdma_mr(INFO, "responder_resources=%d\n",
1799 info->responder_resources);
1800
1801 /* Need to send IRD/ORD in private data for iWARP */
Kamal Heib3023a1e2018-12-10 21:09:48 +02001802 info->id->device->ops.get_port_immutable(
Long Lic7398582017-11-22 17:38:44 -07001803 info->id->device, info->id->port_num, &port_immutable);
1804 if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
1805 ird_ord_hdr[0] = info->responder_resources;
1806 ird_ord_hdr[1] = 1;
1807 conn_param.private_data = ird_ord_hdr;
1808 conn_param.private_data_len = sizeof(ird_ord_hdr);
1809 } else {
1810 conn_param.private_data = NULL;
1811 conn_param.private_data_len = 0;
1812 }
1813
Long Lif1981862017-11-04 18:17:24 -07001814 conn_param.retry_count = SMBD_CM_RETRY;
1815 conn_param.rnr_retry_count = SMBD_CM_RNR_RETRY;
1816 conn_param.flow_control = 0;
Long Lif1981862017-11-04 18:17:24 -07001817
1818 log_rdma_event(INFO, "connecting to IP %pI4 port %d\n",
1819 &addr_in->sin_addr, port);
1820
1821 init_waitqueue_head(&info->conn_wait);
Long Li050b8c32019-04-04 11:35:42 -05001822 init_waitqueue_head(&info->disconn_wait);
1823 init_waitqueue_head(&info->wait_reassembly_queue);
Long Lif1981862017-11-04 18:17:24 -07001824 rc = rdma_connect(info->id, &conn_param);
1825 if (rc) {
1826 log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc);
1827 goto rdma_connect_failed;
1828 }
1829
1830 wait_event_interruptible(
1831 info->conn_wait, info->transport_status != SMBD_CONNECTING);
1832
1833 if (info->transport_status != SMBD_CONNECTED) {
1834 log_rdma_event(ERR, "rdma_connect failed port=%d\n", port);
1835 goto rdma_connect_failed;
1836 }
1837
1838 log_rdma_event(INFO, "rdma_connect connected\n");
1839
1840 rc = allocate_caches_and_workqueue(info);
1841 if (rc) {
1842 log_rdma_event(ERR, "cache allocation failed\n");
1843 goto allocate_cache_failed;
1844 }
1845
1846 init_waitqueue_head(&info->wait_send_queue);
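	/* Periodic idle timer used to drive SMBD keep-alive on this connection */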
Long Lif1981862017-11-04 18:17:24 -07001847 INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer);
1848 INIT_DELAYED_WORK(&info->send_immediate_work, send_immediate_work);
1849 queue_delayed_work(info->workqueue, &info->idle_timer_work,
1850 info->keep_alive_interval*HZ);
1851
Long Lid649e1b2017-11-22 17:38:42 -07001852 init_waitqueue_head(&info->wait_smbd_send_pending);
1853 info->smbd_send_pending = 0;
1854
Long Lif64b78f2017-11-22 17:38:40 -07001855 init_waitqueue_head(&info->wait_smbd_recv_pending);
1856 info->smbd_recv_pending = 0;
1857
Long Lif1981862017-11-04 18:17:24 -07001858 init_waitqueue_head(&info->wait_send_pending);
1859 atomic_set(&info->send_pending, 0);
1860
1861 init_waitqueue_head(&info->wait_send_payload_pending);
1862 atomic_set(&info->send_payload_pending, 0);
1863
1864 INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work);
1865 INIT_WORK(&info->destroy_work, smbd_destroy_rdma_work);
1866 INIT_WORK(&info->recv_done_work, smbd_recv_done_work);
1867 INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits);
1868 info->new_credits_offered = 0;
1869 spin_lock_init(&info->lock_new_credits_offered);
1870
1871 rc = smbd_negotiate(info);
1872 if (rc) {
1873 log_rdma_event(ERR, "smbd_negotiate rc=%d\n", rc);
1874 goto negotiation_failed;
1875 }
1876
Long Lic7398582017-11-22 17:38:44 -07001877 rc = allocate_mr_list(info);
1878 if (rc) {
1879 log_rdma_mr(ERR, "memory registration allocation failed\n");
1880 goto allocate_mr_failed;
1881 }
1882
Long Lif1981862017-11-04 18:17:24 -07001883 return info;
1884
Long Lic7398582017-11-22 17:38:44 -07001885allocate_mr_failed:
1886 /* At this point, we need to do a full transport shutdown */
Long Li050b8c32019-04-04 11:35:42 -05001887 smbd_destroy(server);
Long Lic7398582017-11-22 17:38:44 -07001888 return NULL;
1889
Long Lif1981862017-11-04 18:17:24 -07001890negotiation_failed:
1891 cancel_delayed_work_sync(&info->idle_timer_work);
1892 destroy_caches_and_workqueue(info);
1893 info->transport_status = SMBD_NEGOTIATE_FAILED;
1894 init_waitqueue_head(&info->conn_wait);
1895 rdma_disconnect(info->id);
1896 wait_event(info->conn_wait,
1897 info->transport_status == SMBD_DISCONNECTED);
1898
1899allocate_cache_failed:
1900rdma_connect_failed:
1901 rdma_destroy_qp(info->id);
1902
1903create_qp_failed:
1904alloc_cq_failed:
1905 if (info->send_cq)
1906 ib_free_cq(info->send_cq);
1907 if (info->recv_cq)
1908 ib_free_cq(info->recv_cq);
1909
1910config_failed:
1911 ib_dealloc_pd(info->pd);
1912 rdma_destroy_id(info->id);
1913
1914create_id_failed:
1915 kfree(info);
1916 return NULL;
1917}
Long Li399f9532017-11-17 17:26:52 -08001918
1919struct smbd_connection *smbd_get_connection(
1920 struct TCP_Server_Info *server, struct sockaddr *dstaddr)
1921{
1922 struct smbd_connection *ret;
1923 int port = SMBD_PORT;
1924
1925try_again:
1926 ret = _smbd_get_connection(server, dstaddr, port);
1927
1928 /* Try SMB_PORT if SMBD_PORT doesn't work */
1929 if (!ret && port == SMBD_PORT) {
1930 port = SMB_PORT;
1931 goto try_again;
1932 }
1933 return ret;
1934}
Long Lif64b78f2017-11-22 17:38:40 -07001935
1936/*
1937 * Receive data from receive reassembly queue
1938 * All the incoming data packets are placed in reassembly queue
1939 * buf: the buffer to read data into
1940 * size: the length of data to read
1941 * return value: actual data read
1942 * Note: this implementation copies the data from the reassembly queue to receive
1943 * buffers used by the upper layer. This is not the optimal code path. A better
1944 * way is to not have the upper layer allocate its receive buffers but rather
1945 * borrow the buffer from the reassembly queue, and return it after the data is
1946 * consumed. But this would require more changes to the upper layer code, and it
1947 * also needs to consider packet boundaries while they are still being reassembled.
1948 */
Steve French2026b062018-01-24 23:07:41 -06001949static int smbd_recv_buf(struct smbd_connection *info, char *buf,
1950 unsigned int size)
Long Lif64b78f2017-11-22 17:38:40 -07001951{
1952 struct smbd_response *response;
1953 struct smbd_data_transfer *data_transfer;
1954 int to_copy, to_read, data_read, offset;
1955 u32 data_length, remaining_data_length, data_offset;
1956 int rc;
Long Lif64b78f2017-11-22 17:38:40 -07001957
1958again:
1959 if (info->transport_status != SMBD_CONNECTED) {
1960 log_read(ERR, "disconnected\n");
1961 return -ENODEV;
1962 }
1963
1964 /*
1965 * No need to hold the reassembly queue lock all the time as we are
1966 * the only one reading from the front of the queue. The transport
1967 * may add more entries to the back of the queue at the same time
1968 */
1969 log_read(INFO, "size=%d info->reassembly_data_length=%d\n", size,
1970 info->reassembly_data_length);
1971 if (info->reassembly_data_length >= size) {
1972 int queue_length;
1973 int queue_removed = 0;
1974
1975 /*
1976 * Need to make sure reassembly_data_length is read before
1977 * reading reassembly_queue_length and calling
1978 * _get_first_reassembly. This call is lock-free
1979 * because we never read entries at the end of the queue, which are
1980 * being updated in SOFTIRQ context as more data is received
1981 */
1982 virt_rmb();
1983 queue_length = info->reassembly_queue_length;
1984 data_read = 0;
1985 to_read = size;
1986 offset = info->first_entry_offset;
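		/* Copy from as many queued packets as needed to satisfy the requested size */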
1987 while (data_read < size) {
1988 response = _get_first_reassembly(info);
1989 data_transfer = smbd_response_payload(response);
1990 data_length = le32_to_cpu(data_transfer->data_length);
1991 remaining_data_length =
1992 le32_to_cpu(
1993 data_transfer->remaining_data_length);
1994 data_offset = le32_to_cpu(data_transfer->data_offset);
1995
1996 /*
1997 * The upper layer expects the RFC1002 length at the
1998 * beginning of the payload. Return it to indicate
1999 * the total length of the packet. This minimizes the
2000 * change to the upper layer packet processing logic. This
2001 * will eventually be removed when an intermediate
2002 * transport layer is added
2003 */
2004 if (response->first_segment && size == 4) {
2005 unsigned int rfc1002_len =
2006 data_length + remaining_data_length;
2007 *((__be32 *)buf) = cpu_to_be32(rfc1002_len);
2008 data_read = 4;
2009 response->first_segment = false;
2010 log_read(INFO, "returning rfc1002 length %d\n",
2011 rfc1002_len);
2012 goto read_rfc1002_done;
2013 }
2014
2015 to_copy = min_t(int, data_length - offset, to_read);
2016 memcpy(
2017 buf + data_read,
2018 (char *)data_transfer + data_offset + offset,
2019 to_copy);
2020
2021 /* move on to the next buffer? */
2022 if (to_copy == data_length - offset) {
2023 queue_length--;
2024 /*
2025 * No need to lock if we are not at the
2026 * end of the queue
2027 */
Steve Frenchf9de1512018-02-03 19:45:07 -06002028 if (queue_length)
2029 list_del(&response->list);
2030 else {
Arnd Bergmanne36c0482018-01-10 21:51:05 +01002031 spin_lock_irq(
2032 &info->reassembly_queue_lock);
Steve Frenchf9de1512018-02-03 19:45:07 -06002033 list_del(&response->list);
Arnd Bergmanne36c0482018-01-10 21:51:05 +01002034 spin_unlock_irq(
2035 &info->reassembly_queue_lock);
Steve Frenchf9de1512018-02-03 19:45:07 -06002036 }
2037 queue_removed++;
Long Lif64b78f2017-11-22 17:38:40 -07002038 info->count_reassembly_queue--;
2039 info->count_dequeue_reassembly_queue++;
2040 put_receive_buffer(info, response);
2041 offset = 0;
2042 log_read(INFO, "put_receive_buffer offset=0\n");
2043 } else
2044 offset += to_copy;
2045
2046 to_read -= to_copy;
2047 data_read += to_copy;
2048
2049 log_read(INFO, "_get_first_reassembly memcpy %d bytes "
2050 "data_transfer_length-offset=%d after that "
2051 "to_read=%d data_read=%d offset=%d\n",
2052 to_copy, data_length - offset,
2053 to_read, data_read, offset);
2054 }
2055
Arnd Bergmanne36c0482018-01-10 21:51:05 +01002056 spin_lock_irq(&info->reassembly_queue_lock);
Long Lif64b78f2017-11-22 17:38:40 -07002057 info->reassembly_data_length -= data_read;
2058 info->reassembly_queue_length -= queue_removed;
Arnd Bergmanne36c0482018-01-10 21:51:05 +01002059 spin_unlock_irq(&info->reassembly_queue_lock);
Long Lif64b78f2017-11-22 17:38:40 -07002060
2061 info->first_entry_offset = offset;
2062 log_read(INFO, "returning to thread data_read=%d "
2063 "reassembly_data_length=%d first_entry_offset=%d\n",
2064 data_read, info->reassembly_data_length,
2065 info->first_entry_offset);
2066read_rfc1002_done:
2067 return data_read;
2068 }
2069
2070 log_read(INFO, "wait_event on more data\n");
2071 rc = wait_event_interruptible(
2072 info->wait_reassembly_queue,
2073 info->reassembly_data_length >= size ||
2074 info->transport_status != SMBD_CONNECTED);
2075 /* Don't return any data if interrupted */
2076 if (rc)
2077 return -ENODEV;
2078
2079 goto again;
2080}
2081
2082/*
2083 * Receive a page from receive reassembly queue
2084 * page: the page to read data into
2085 * to_read: the length of data to read
2086 * return value: actual data read
2087 */
Steve French2026b062018-01-24 23:07:41 -06002088static int smbd_recv_page(struct smbd_connection *info,
Long Li6509f502018-05-30 12:48:01 -07002089 struct page *page, unsigned int page_offset,
2090 unsigned int to_read)
Long Lif64b78f2017-11-22 17:38:40 -07002091{
2092 int ret;
2093 char *to_address;
Long Li6509f502018-05-30 12:48:01 -07002094 void *page_address;
Long Lif64b78f2017-11-22 17:38:40 -07002095
2096 /* make sure we have the page ready for read */
2097 ret = wait_event_interruptible(
2098 info->wait_reassembly_queue,
2099 info->reassembly_data_length >= to_read ||
2100 info->transport_status != SMBD_CONNECTED);
2101 if (ret)
Long Li6509f502018-05-30 12:48:01 -07002102 return ret;
Long Lif64b78f2017-11-22 17:38:40 -07002103
2104 /* now we can read from reassembly queue and not sleep */
Long Li6509f502018-05-30 12:48:01 -07002105 page_address = kmap_atomic(page);
2106 to_address = (char *) page_address + page_offset;
Long Lif64b78f2017-11-22 17:38:40 -07002107
2108 log_read(INFO, "reading from page=%p address=%p to_read=%d\n",
2109 page, to_address, to_read);
2110
2111 ret = smbd_recv_buf(info, to_address, to_read);
Long Li6509f502018-05-30 12:48:01 -07002112 kunmap_atomic(page_address);
Long Lif64b78f2017-11-22 17:38:40 -07002113
2114 return ret;
2115}
2116
2117/*
2118 * Receive data from transport
2119 * msg: a msghdr pointing to the buffer, can be ITER_KVEC or ITER_BVEC
2120 * return: total bytes read, or 0. SMB Direct will not do a partial read.
2121 */
2122int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
2123{
2124 char *buf;
2125 struct page *page;
Long Li6509f502018-05-30 12:48:01 -07002126 unsigned int to_read, page_offset;
Long Lif64b78f2017-11-22 17:38:40 -07002127 int rc;
2128
2129 info->smbd_recv_pending++;
2130
David Howells00e23702018-10-22 13:07:28 +01002131 if (iov_iter_rw(&msg->msg_iter) == WRITE) {
2132 /* It's a bug in the upper layer to get here */
2133 cifs_dbg(VFS, "CIFS: invalid msg iter dir %u\n",
2134 iov_iter_rw(&msg->msg_iter));
2135 rc = -EINVAL;
2136 goto out;
2137 }
2138
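	/* KVEC iterators read into a linear buffer; BVEC iterators read into a page */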
2139 switch (iov_iter_type(&msg->msg_iter)) {
2140 case ITER_KVEC:
Long Lif64b78f2017-11-22 17:38:40 -07002141 buf = msg->msg_iter.kvec->iov_base;
2142 to_read = msg->msg_iter.kvec->iov_len;
2143 rc = smbd_recv_buf(info, buf, to_read);
2144 break;
2145
David Howells00e23702018-10-22 13:07:28 +01002146 case ITER_BVEC:
Long Lif64b78f2017-11-22 17:38:40 -07002147 page = msg->msg_iter.bvec->bv_page;
Long Li6509f502018-05-30 12:48:01 -07002148 page_offset = msg->msg_iter.bvec->bv_offset;
Long Lif64b78f2017-11-22 17:38:40 -07002149 to_read = msg->msg_iter.bvec->bv_len;
Long Li6509f502018-05-30 12:48:01 -07002150 rc = smbd_recv_page(info, page, page_offset, to_read);
Long Lif64b78f2017-11-22 17:38:40 -07002151 break;
2152
2153 default:
2154 /* It's a bug in the upper layer to get here */
2155 cifs_dbg(VFS, "CIFS: invalid msg type %d\n",
David Howells00e23702018-10-22 13:07:28 +01002156 iov_iter_type(&msg->msg_iter));
Long Li6509f502018-05-30 12:48:01 -07002157 rc = -EINVAL;
Long Lif64b78f2017-11-22 17:38:40 -07002158 }
2159
David Howells00e23702018-10-22 13:07:28 +01002160out:
Long Lif64b78f2017-11-22 17:38:40 -07002161 info->smbd_recv_pending--;
2162 wake_up(&info->wait_smbd_recv_pending);
2163
2164 /* SMBDirect will read it all or nothing */
2165 if (rc > 0)
2166 msg->msg_iter.count = 0;
2167 return rc;
2168}
Long Lid649e1b2017-11-22 17:38:42 -07002169
2170/*
2171 * Send data to transport
2172 * Each rqst is transported as an SMBDirect payload
2173 * rqst: the data to write
2174 * return value: 0 if the write is successful, otherwise error code
2175 */
Ronnie Sahlberg81f39f92018-06-28 10:47:14 +10002176int smbd_send(struct TCP_Server_Info *server, struct smb_rqst *rqst)
Long Lid649e1b2017-11-22 17:38:42 -07002177{
Ronnie Sahlberg81f39f92018-06-28 10:47:14 +10002178 struct smbd_connection *info = server->smbd_conn;
Long Lid649e1b2017-11-22 17:38:42 -07002179 struct kvec vec;
2180 int nvecs;
2181 int size;
Paulo Alcantara35e2cc12018-06-15 10:22:44 -03002182 unsigned int buflen, remaining_data_length;
Long Lid649e1b2017-11-22 17:38:42 -07002183 int start, i, j;
2184 int max_iov_size =
2185 info->max_send_size - sizeof(struct smbd_data_transfer);
Long Li8bcda1d2018-04-17 12:17:07 -07002186 struct kvec *iov;
Long Lid649e1b2017-11-22 17:38:42 -07002187 int rc;
2188
2189 info->smbd_send_pending++;
2190 if (info->transport_status != SMBD_CONNECTED) {
2191 rc = -ENODEV;
2192 goto done;
2193 }
2194
2195 /*
Long Li8bcda1d2018-04-17 12:17:07 -07002196 * Skip the RFC1002 length defined in MS-SMB2 section 2.1
2197 * It is used only for the TCP transport in iov[0]
Long Lid649e1b2017-11-22 17:38:42 -07002198 * In the future we may want to add a transport layer under the protocol
2199 * layer so this will only be issued for the TCP transport
2200 */
Long Li8bcda1d2018-04-17 12:17:07 -07002201
2202 if (rqst->rq_iov[0].iov_len != 4) {
2203 log_write(ERR, "expected the pdu length in 1st iov, but got %zu\n", rqst->rq_iov[0].iov_len);
2204 return -EINVAL;
2205 }
Long Lid649e1b2017-11-22 17:38:42 -07002206
Long Lib6903bc2018-05-30 12:48:00 -07002207 /*
2208 * Add in the page array if there is one. The caller needs to set
2209 * rq_tailsz to PAGE_SIZE when the buffer has multiple pages and
2210 * ends at a page boundary
2211 */
Ronnie Sahlberg81f39f92018-06-28 10:47:14 +10002212 buflen = smb_rqst_len(server, rqst);
Long Lid649e1b2017-11-22 17:38:42 -07002213
2214 if (buflen + sizeof(struct smbd_data_transfer) >
2215 info->max_fragmented_send_size) {
2216 log_write(ERR, "payload size %d > max size %d\n",
2217 buflen, info->max_fragmented_send_size);
2218 rc = -EINVAL;
2219 goto done;
2220 }
2221
Paulo Alcantara35e2cc12018-06-15 10:22:44 -03002222 iov = &rqst->rq_iov[1];
2223
Long Liff30b892018-04-17 12:17:10 -07002224 cifs_dbg(FYI, "Sending smb (RDMA): smb_len=%u\n", buflen);
2225 for (i = 0; i < rqst->rq_nvec-1; i++)
2226 dump_smb(iov[i].iov_base, iov[i].iov_len);
2227
Long Lid649e1b2017-11-22 17:38:42 -07002228 remaining_data_length = buflen;
2229
2230 log_write(INFO, "rqst->rq_nvec=%d rqst->rq_npages=%d rq_pagesz=%d "
2231 "rq_tailsz=%d buflen=%d\n",
2232 rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz,
2233 rqst->rq_tailsz, buflen);
2234
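	/*
	 * Pack the iovs into sends of at most max_iov_size bytes each;
	 * a single iov larger than max_iov_size is split across several sends
	 */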
2235 start = i = iov[0].iov_len ? 0 : 1;
2236 buflen = 0;
2237 while (true) {
2238 buflen += iov[i].iov_len;
2239 if (buflen > max_iov_size) {
2240 if (i > start) {
2241 remaining_data_length -=
2242 (buflen-iov[i].iov_len);
2243 log_write(INFO, "sending iov[] from start=%d "
2244 "i=%d nvecs=%d "
2245 "remaining_data_length=%d\n",
2246 start, i, i-start,
2247 remaining_data_length);
2248 rc = smbd_post_send_data(
2249 info, &iov[start], i-start,
2250 remaining_data_length);
2251 if (rc)
2252 goto done;
2253 } else {
2254 /* iov[start] is too big, break it */
2255 nvecs = (buflen+max_iov_size-1)/max_iov_size;
2256 log_write(INFO, "iov[%d] iov_base=%p buflen=%d"
2257 " break to %d vectors\n",
2258 start, iov[start].iov_base,
2259 buflen, nvecs);
2260 for (j = 0; j < nvecs; j++) {
2261 vec.iov_base =
2262 (char *)iov[start].iov_base +
2263 j*max_iov_size;
2264 vec.iov_len = max_iov_size;
2265 if (j == nvecs-1)
2266 vec.iov_len =
2267 buflen -
2268 max_iov_size*(nvecs-1);
2269 remaining_data_length -= vec.iov_len;
2270 log_write(INFO,
2271 "sending vec j=%d iov_base=%p"
2272 " iov_len=%zu "
2273 "remaining_data_length=%d\n",
2274 j, vec.iov_base, vec.iov_len,
2275 remaining_data_length);
2276 rc = smbd_post_send_data(
2277 info, &vec, 1,
2278 remaining_data_length);
2279 if (rc)
2280 goto done;
2281 }
2282 i++;
Long Li8bcda1d2018-04-17 12:17:07 -07002283 if (i == rqst->rq_nvec-1)
Long Liab60ee72018-04-17 12:17:05 -07002284 break;
Long Lid649e1b2017-11-22 17:38:42 -07002285 }
2286 start = i;
2287 buflen = 0;
2288 } else {
2289 i++;
Long Li8bcda1d2018-04-17 12:17:07 -07002290 if (i == rqst->rq_nvec-1) {
Long Lid649e1b2017-11-22 17:38:42 -07002291 /* send out all remaining vecs */
2292 remaining_data_length -= buflen;
2293 log_write(INFO,
2294 "sending iov[] from start=%d i=%d "
2295 "nvecs=%d remaining_data_length=%d\n",
2296 start, i, i-start,
2297 remaining_data_length);
2298 rc = smbd_post_send_data(info, &iov[start],
2299 i-start, remaining_data_length);
2300 if (rc)
2301 goto done;
2302 break;
2303 }
2304 }
2305 log_write(INFO, "looping i=%d buflen=%d\n", i, buflen);
2306 }
2307
2308 /* now sending pages if there are any */
2309 for (i = 0; i < rqst->rq_npages; i++) {
Long Lib6903bc2018-05-30 12:48:00 -07002310 unsigned int offset;
2311
2312 rqst_page_get_length(rqst, i, &buflen, &offset);
Long Lid649e1b2017-11-22 17:38:42 -07002313 nvecs = (buflen + max_iov_size - 1) / max_iov_size;
2314 log_write(INFO, "sending pages buflen=%d nvecs=%d\n",
2315 buflen, nvecs);
2316 for (j = 0; j < nvecs; j++) {
2317 size = max_iov_size;
2318 if (j == nvecs-1)
2319 size = buflen - j*max_iov_size;
2320 remaining_data_length -= size;
2321 log_write(INFO, "sending pages i=%d offset=%d size=%d"
2322 " remaining_data_length=%d\n",
Long Lib6903bc2018-05-30 12:48:00 -07002323 i, j*max_iov_size+offset, size,
2324 remaining_data_length);
Long Lid649e1b2017-11-22 17:38:42 -07002325 rc = smbd_post_send_page(
Long Lib6903bc2018-05-30 12:48:00 -07002326 info, rqst->rq_pages[i],
2327 j*max_iov_size + offset,
Long Lid649e1b2017-11-22 17:38:42 -07002328 size, remaining_data_length);
2329 if (rc)
2330 goto done;
2331 }
2332 }
2333
2334done:
2335 /*
2336 * As an optimization, we don't wait for individual I/O to finish
2337 * before sending the next one.
2338 * Send them all and wait for the pending send count to get to 0,
2339 * which means all the I/Os have gone out and we are good to return
2340 */
2341
2342 wait_event(info->wait_send_payload_pending,
2343 atomic_read(&info->send_payload_pending) == 0);
2344
2345 info->smbd_send_pending--;
2346 wake_up(&info->wait_smbd_send_pending);
2347
2348 return rc;
2349}
Long Lic7398582017-11-22 17:38:44 -07002350
2351static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc)
2352{
2353 struct smbd_mr *mr;
2354 struct ib_cqe *cqe;
2355
2356 if (wc->status) {
2357 log_rdma_mr(ERR, "status=%d\n", wc->status);
2358 cqe = wc->wr_cqe;
2359 mr = container_of(cqe, struct smbd_mr, cqe);
2360 smbd_disconnect_rdma_connection(mr->conn);
2361 }
2362}
2363
2364/*
2365 * The work queue function that recovers MRs
2366 * We need to call ib_dereg_mr() and ib_alloc_mr() before this MR can be used
2367 * again. Both calls are slow, so finish them in a workqueue. This will not
2368 * block the I/O path.
2369 * There is one workqueue that recovers MRs, so there is no need to lock as the
2370 * I/O requests calling smbd_register_mr will never update the links in the
2371 * mr_list.
2372 */
2373static void smbd_mr_recovery_work(struct work_struct *work)
2374{
2375 struct smbd_connection *info =
2376 container_of(work, struct smbd_connection, mr_recovery_work);
2377 struct smbd_mr *smbdirect_mr;
2378 int rc;
2379
2380 list_for_each_entry(smbdirect_mr, &info->mr_list, list) {
Long Liff526d82018-09-20 21:18:39 +00002381 if (smbdirect_mr->state == MR_INVALIDATED)
2382 ib_dma_unmap_sg(
2383 info->id->device, smbdirect_mr->sgl,
2384 smbdirect_mr->sgl_count,
2385 smbdirect_mr->dir);
2386 else if (smbdirect_mr->state == MR_ERROR) {
Long Lic7398582017-11-22 17:38:44 -07002387
Long Li7cf20bc2018-05-30 12:48:02 -07002388 /* recover this MR entry */
2389 rc = ib_dereg_mr(smbdirect_mr->mr);
2390 if (rc) {
2391 log_rdma_mr(ERR,
2392 "ib_dereg_mr failed rc=%x\n",
2393 rc);
2394 smbd_disconnect_rdma_connection(info);
2395 continue;
2396 }
2397
2398 smbdirect_mr->mr = ib_alloc_mr(
2399 info->pd, info->mr_type,
2400 info->max_frmr_depth);
2401 if (IS_ERR(smbdirect_mr->mr)) {
2402 log_rdma_mr(ERR,
2403 "ib_alloc_mr failed mr_type=%x "
2404 "max_frmr_depth=%x\n",
2405 info->mr_type,
2406 info->max_frmr_depth);
2407 smbd_disconnect_rdma_connection(info);
2408 continue;
2409 }
Long Liff526d82018-09-20 21:18:39 +00002410 } else
2411 /* This MR is being used, don't recover it */
2412 continue;
Long Li7cf20bc2018-05-30 12:48:02 -07002413
Long Liff526d82018-09-20 21:18:39 +00002414 smbdirect_mr->state = MR_READY;
Long Lic7398582017-11-22 17:38:44 -07002415
Long Liff526d82018-09-20 21:18:39 +00002416 /* smbdirect_mr->state is updated by this function
2417 * and is read and updated by I/O issuing CPUs trying
2418 * to get an MR; the call to atomic_inc_return
2419 * implies a memory barrier and guarantees this
2420 * value is updated before waking up any calls to
2421 * get_mr() from the I/O issuing CPUs
2422 */
2423 if (atomic_inc_return(&info->mr_ready_count) == 1)
2424 wake_up_interruptible(&info->wait_mr);
Long Lic7398582017-11-22 17:38:44 -07002425 }
2426}
2427
2428static void destroy_mr_list(struct smbd_connection *info)
2429{
2430 struct smbd_mr *mr, *tmp;
2431
2432 cancel_work_sync(&info->mr_recovery_work);
2433 list_for_each_entry_safe(mr, tmp, &info->mr_list, list) {
2434 if (mr->state == MR_INVALIDATED)
2435 ib_dma_unmap_sg(info->id->device, mr->sgl,
2436 mr->sgl_count, mr->dir);
2437 ib_dereg_mr(mr->mr);
2438 kfree(mr->sgl);
2439 kfree(mr);
2440 }
2441}
2442
2443/*
2444 * Allocate MRs used for RDMA read/write
2445 * The number of MRs will not exceed hardware capability in responder_resources
2446 * All MRs are kept in mr_list. The MR can be recovered after it's used
2447 * Recovery is done in smbd_mr_recovery_work. The content of list entry changes
2448 * as MRs are used and recovered for I/O, but the list links will not change
2449 */
2450static int allocate_mr_list(struct smbd_connection *info)
2451{
2452 int i;
2453 struct smbd_mr *smbdirect_mr, *tmp;
2454
2455 INIT_LIST_HEAD(&info->mr_list);
2456 init_waitqueue_head(&info->wait_mr);
2457 spin_lock_init(&info->mr_list_lock);
2458 atomic_set(&info->mr_ready_count, 0);
2459 atomic_set(&info->mr_used_count, 0);
2460 init_waitqueue_head(&info->wait_for_mr_cleanup);
2461 /* Allocate more MRs (2x) than hardware responder_resources */
2462 for (i = 0; i < info->responder_resources * 2; i++) {
2463 smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL);
2464 if (!smbdirect_mr)
2465 goto out;
2466 smbdirect_mr->mr = ib_alloc_mr(info->pd, info->mr_type,
2467 info->max_frmr_depth);
2468 if (IS_ERR(smbdirect_mr->mr)) {
2469 log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x "
2470 "max_frmr_depth=%x\n",
2471 info->mr_type, info->max_frmr_depth);
2472 goto out;
2473 }
2474 smbdirect_mr->sgl = kcalloc(
2475 info->max_frmr_depth,
2476 sizeof(struct scatterlist),
2477 GFP_KERNEL);
2478 if (!smbdirect_mr->sgl) {
2479 log_rdma_mr(ERR, "failed to allocate sgl\n");
2480 ib_dereg_mr(smbdirect_mr->mr);
2481 goto out;
2482 }
2483 smbdirect_mr->state = MR_READY;
2484 smbdirect_mr->conn = info;
2485
2486 list_add_tail(&smbdirect_mr->list, &info->mr_list);
2487 atomic_inc(&info->mr_ready_count);
2488 }
2489 INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work);
2490 return 0;
2491
2492out:
2493 kfree(smbdirect_mr);
2494
2495 list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) {
2496 ib_dereg_mr(smbdirect_mr->mr);
2497 kfree(smbdirect_mr->sgl);
2498 kfree(smbdirect_mr);
2499 }
2500 return -ENOMEM;
2501}
2502
2503/*
2504 * Get an MR from mr_list. This function waits until there is at least one
2505 * MR available in the list. It may access the list while the
2506 * smbd_mr_recovery_work is recovering the MR list. This doesn't need a lock
2507 * as they never modify the same places. However, there may be several CPUs
2508 * issuing I/O trying to get an MR at the same time; mr_list_lock is used to
2509 * protect against this situation.
2510 */
2511static struct smbd_mr *get_mr(struct smbd_connection *info)
2512{
2513 struct smbd_mr *ret;
2514 int rc;
2515again:
2516 rc = wait_event_interruptible(info->wait_mr,
2517 atomic_read(&info->mr_ready_count) ||
2518 info->transport_status != SMBD_CONNECTED);
2519 if (rc) {
2520 log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc);
2521 return NULL;
2522 }
2523
2524 if (info->transport_status != SMBD_CONNECTED) {
2525 log_rdma_mr(ERR, "info->transport_status=%x\n",
2526 info->transport_status);
2527 return NULL;
2528 }
2529
2530 spin_lock(&info->mr_list_lock);
2531 list_for_each_entry(ret, &info->mr_list, list) {
2532 if (ret->state == MR_READY) {
2533 ret->state = MR_REGISTERED;
2534 spin_unlock(&info->mr_list_lock);
2535 atomic_dec(&info->mr_ready_count);
2536 atomic_inc(&info->mr_used_count);
2537 return ret;
2538 }
2539 }
2540
2541 spin_unlock(&info->mr_list_lock);
2542 /*
2543 * It is possible that we could fail to get an MR because other processes may
2544 * try to acquire an MR at the same time. If this is the case, retry it.
2545 */
2546 goto again;
2547}
2548
2549/*
2550 * Register memory for RDMA read/write
2551 * pages[]: the list of pages to register memory with
2552 * num_pages: the number of pages to register
2553 * tailsz: if non-zero, the bytes to register in the last page
2554 * writing: true if this is an RDMA write (SMB read), false for RDMA read
2555 * need_invalidate: true if this MR needs to be locally invalidated after I/O
2556 * return value: the MR registered, NULL if failed.
2557 */
2558struct smbd_mr *smbd_register_mr(
2559 struct smbd_connection *info, struct page *pages[], int num_pages,
Long Li7cf20bc2018-05-30 12:48:02 -07002560 int offset, int tailsz, bool writing, bool need_invalidate)
Long Lic7398582017-11-22 17:38:44 -07002561{
2562 struct smbd_mr *smbdirect_mr;
2563 int rc, i;
2564 enum dma_data_direction dir;
2565 struct ib_reg_wr *reg_wr;
Long Lic7398582017-11-22 17:38:44 -07002566
2567 if (num_pages > info->max_frmr_depth) {
2568 log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n",
2569 num_pages, info->max_frmr_depth);
2570 return NULL;
2571 }
2572
2573 smbdirect_mr = get_mr(info);
2574 if (!smbdirect_mr) {
2575 log_rdma_mr(ERR, "get_mr returning NULL\n");
2576 return NULL;
2577 }
2578 smbdirect_mr->need_invalidate = need_invalidate;
2579 smbdirect_mr->sgl_count = num_pages;
2580 sg_init_table(smbdirect_mr->sgl, num_pages);
2581
Long Li7cf20bc2018-05-30 12:48:02 -07002582 log_rdma_mr(INFO, "num_pages=0x%x offset=0x%x tailsz=0x%x\n",
2583 num_pages, offset, tailsz);
Long Lic7398582017-11-22 17:38:44 -07002584
Long Li7cf20bc2018-05-30 12:48:02 -07002585 if (num_pages == 1) {
2586 sg_set_page(&smbdirect_mr->sgl[0], pages[0], tailsz, offset);
2587 goto skip_multiple_pages;
2588 }
2589
2590 /* We have at least two pages to register */
2591 sg_set_page(
2592 &smbdirect_mr->sgl[0], pages[0], PAGE_SIZE - offset, offset);
2593 i = 1;
2594 while (i < num_pages - 1) {
2595 sg_set_page(&smbdirect_mr->sgl[i], pages[i], PAGE_SIZE, 0);
2596 i++;
2597 }
Long Lic7398582017-11-22 17:38:44 -07002598 sg_set_page(&smbdirect_mr->sgl[i], pages[i],
2599 tailsz ? tailsz : PAGE_SIZE, 0);
2600
Long Li7cf20bc2018-05-30 12:48:02 -07002601skip_multiple_pages:
Long Lic7398582017-11-22 17:38:44 -07002602 dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
2603 smbdirect_mr->dir = dir;
2604 rc = ib_dma_map_sg(info->id->device, smbdirect_mr->sgl, num_pages, dir);
2605 if (!rc) {
Long Li7cf20bc2018-05-30 12:48:02 -07002606 log_rdma_mr(ERR, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n",
Long Lic7398582017-11-22 17:38:44 -07002607 num_pages, dir, rc);
2608 goto dma_map_error;
2609 }
2610
2611 rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgl, num_pages,
2612 NULL, PAGE_SIZE);
2613 if (rc != num_pages) {
Long Li7cf20bc2018-05-30 12:48:02 -07002614 log_rdma_mr(ERR,
2615 "ib_map_mr_sg failed rc = %d num_pages = %x\n",
Long Lic7398582017-11-22 17:38:44 -07002616 rc, num_pages);
2617 goto map_mr_error;
2618 }
2619
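	/* Refresh the rkey and build an IB_WR_REG_MR work request to register this MR */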
2620 ib_update_fast_reg_key(smbdirect_mr->mr,
2621 ib_inc_rkey(smbdirect_mr->mr->rkey));
2622 reg_wr = &smbdirect_mr->wr;
2623 reg_wr->wr.opcode = IB_WR_REG_MR;
2624 smbdirect_mr->cqe.done = register_mr_done;
2625 reg_wr->wr.wr_cqe = &smbdirect_mr->cqe;
2626 reg_wr->wr.num_sge = 0;
2627 reg_wr->wr.send_flags = IB_SEND_SIGNALED;
2628 reg_wr->mr = smbdirect_mr->mr;
2629 reg_wr->key = smbdirect_mr->mr->rkey;
2630 reg_wr->access = writing ?
2631 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
2632 IB_ACCESS_REMOTE_READ;
2633
2634 /*
2635 * There is no need to wait for completion of ib_post_send
2636 * for IB_WR_REG_MR. Hardware enforces a barrier and order of execution
2637 * on the next ib_post_send when we actually send I/O to the remote peer
2638 */
Bart Van Assche73930592018-07-18 09:25:25 -07002639 rc = ib_post_send(info->id->qp, &reg_wr->wr, NULL);
Long Lic7398582017-11-22 17:38:44 -07002640 if (!rc)
2641 return smbdirect_mr;
2642
2643 log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n",
2644 rc, reg_wr->key);
2645
2646 /* If all failed, attempt to recover this MR by setting it to MR_ERROR */
2647map_mr_error:
2648 ib_dma_unmap_sg(info->id->device, smbdirect_mr->sgl,
2649 smbdirect_mr->sgl_count, smbdirect_mr->dir);
2650
2651dma_map_error:
2652 smbdirect_mr->state = MR_ERROR;
2653 if (atomic_dec_and_test(&info->mr_used_count))
2654 wake_up(&info->wait_for_mr_cleanup);
2655
Long Li21a4e142018-03-30 15:16:36 -07002656 smbd_disconnect_rdma_connection(info);
2657
Long Lic7398582017-11-22 17:38:44 -07002658 return NULL;
2659}
2660
2661static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
2662{
2663 struct smbd_mr *smbdirect_mr;
2664 struct ib_cqe *cqe;
2665
2666 cqe = wc->wr_cqe;
2667 smbdirect_mr = container_of(cqe, struct smbd_mr, cqe);
2668 smbdirect_mr->state = MR_INVALIDATED;
2669 if (wc->status != IB_WC_SUCCESS) {
2670 log_rdma_mr(ERR, "invalidate failed status=%x\n", wc->status);
2671 smbdirect_mr->state = MR_ERROR;
2672 }
2673 complete(&smbdirect_mr->invalidate_done);
2674}
2675
2676/*
2677 * Deregister an MR after I/O is done
2678 * This function may wait if remote invalidation is not used
2679 * and we have to locally invalidate the buffer to prevent data from being
2680 * modified by the remote peer after the upper layer consumes it
2681 */
2682int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
2683{
Bart Van Assche73930592018-07-18 09:25:25 -07002684 struct ib_send_wr *wr;
Long Lic7398582017-11-22 17:38:44 -07002685 struct smbd_connection *info = smbdirect_mr->conn;
2686 int rc = 0;
2687
2688 if (smbdirect_mr->need_invalidate) {
2689 /* Need to finish local invalidation before returning */
2690 wr = &smbdirect_mr->inv_wr;
2691 wr->opcode = IB_WR_LOCAL_INV;
2692 smbdirect_mr->cqe.done = local_inv_done;
2693 wr->wr_cqe = &smbdirect_mr->cqe;
2694 wr->num_sge = 0;
2695 wr->ex.invalidate_rkey = smbdirect_mr->mr->rkey;
2696 wr->send_flags = IB_SEND_SIGNALED;
2697
2698 init_completion(&smbdirect_mr->invalidate_done);
Bart Van Assche73930592018-07-18 09:25:25 -07002699 rc = ib_post_send(info->id->qp, wr, NULL);
Long Lic7398582017-11-22 17:38:44 -07002700 if (rc) {
2701 log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc);
2702 smbd_disconnect_rdma_connection(info);
2703 goto done;
2704 }
2705 wait_for_completion(&smbdirect_mr->invalidate_done);
2706 smbdirect_mr->need_invalidate = false;
2707 } else
2708 /*
2709 * For remote invalidation, just set it to MR_INVALIDATED
2710 * and defer to mr_recovery_work to recover the MR for next use
2711 */
2712 smbdirect_mr->state = MR_INVALIDATED;
2713
2714 /*
2715 * Schedule the work to do MR recovery for future I/Os
2716 * MR recovery is slow and we don't want it to block the current I/O
2717 */
2718 queue_work(info->workqueue, &info->mr_recovery_work);
2719
2720done:
2721 if (atomic_dec_and_test(&info->mr_used_count))
2722 wake_up(&info->wait_for_mr_cleanup);
2723
2724 return rc;
2725}