1// SPDX-License-Identifier: GPL-2.0
2/*
3 * NVMe over Fabrics TCP host.
4 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
5 */
6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7#include <linux/module.h>
8#include <linux/init.h>
9#include <linux/slab.h>
10#include <linux/err.h>
11#include <linux/nvme-tcp.h>
12#include <net/sock.h>
13#include <net/tcp.h>
14#include <linux/blk-mq.h>
15#include <crypto/hash.h>
16
17#include "nvme.h"
18#include "fabrics.h"
19
20struct nvme_tcp_queue;
21
22enum nvme_tcp_send_state {
23 NVME_TCP_SEND_CMD_PDU = 0,
24 NVME_TCP_SEND_H2C_PDU,
25 NVME_TCP_SEND_DATA,
26 NVME_TCP_SEND_DDGST,
27};
28
29struct nvme_tcp_request {
30 struct nvme_request req;
31 void *pdu;
32 struct nvme_tcp_queue *queue;
33 u32 data_len;
34 u32 pdu_len;
35 u32 pdu_sent;
36 u16 ttag;
37 struct list_head entry;
38 u32 ddgst;
39
40 struct bio *curr_bio;
41 struct iov_iter iter;
42
43 /* send state */
44 size_t offset;
45 size_t data_sent;
46 enum nvme_tcp_send_state state;
47};
48
49enum nvme_tcp_queue_flags {
50 NVME_TCP_Q_ALLOCATED = 0,
51 NVME_TCP_Q_LIVE = 1,
52};
53
54enum nvme_tcp_recv_state {
55 NVME_TCP_RECV_PDU = 0,
56 NVME_TCP_RECV_DATA,
57 NVME_TCP_RECV_DDGST,
58};
59
60struct nvme_tcp_ctrl;
61struct nvme_tcp_queue {
62 struct socket *sock;
63 struct work_struct io_work;
64 int io_cpu;
65
66 spinlock_t lock;
67 struct list_head send_list;
68
69 /* recv state */
70 void *pdu;
71 int pdu_remaining;
72 int pdu_offset;
73 size_t data_remaining;
74 size_t ddgst_remaining;
75
76 /* send state */
77 struct nvme_tcp_request *request;
78
79 int queue_size;
80 size_t cmnd_capsule_len;
81 struct nvme_tcp_ctrl *ctrl;
82 unsigned long flags;
83 bool rd_enabled;
84
85 bool hdr_digest;
86 bool data_digest;
87 struct ahash_request *rcv_hash;
88 struct ahash_request *snd_hash;
89 __le32 exp_ddgst;
90 __le32 recv_ddgst;
91
92 struct page_frag_cache pf_cache;
93
94 void (*state_change)(struct sock *);
95 void (*data_ready)(struct sock *);
96 void (*write_space)(struct sock *);
97};
98
99struct nvme_tcp_ctrl {
100 /* read only in the hot path */
101 struct nvme_tcp_queue *queues;
102 struct blk_mq_tag_set tag_set;
103
104 /* other member variables */
105 struct list_head list;
106 struct blk_mq_tag_set admin_tag_set;
107 struct sockaddr_storage addr;
108 struct sockaddr_storage src_addr;
109 struct nvme_ctrl ctrl;
110
111 struct work_struct err_work;
112 struct delayed_work connect_work;
113 struct nvme_tcp_request async_req;
114};
115
116static LIST_HEAD(nvme_tcp_ctrl_list);
117static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
118static struct workqueue_struct *nvme_tcp_wq;
119static struct blk_mq_ops nvme_tcp_mq_ops;
120static struct blk_mq_ops nvme_tcp_admin_mq_ops;
121
122static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
123{
124 return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
125}
126
127static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
128{
129 return queue - queue->ctrl->queues;
130}
131
132static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
133{
134 u32 queue_idx = nvme_tcp_queue_id(queue);
135
136 if (queue_idx == 0)
137 return queue->ctrl->admin_tag_set.tags[queue_idx];
138 return queue->ctrl->tag_set.tags[queue_idx - 1];
139}
140
141static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
142{
143 return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
144}
145
146static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
147{
148 return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
149}
150
151static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue)
152{
153 return queue->cmnd_capsule_len - sizeof(struct nvme_command);
154}
155
156static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
157{
158 return req == &req->queue->ctrl->async_req;
159}
160
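/*
 * Write commands whose payload fits in the command capsule (capsule size
 * minus the SQE) are sent inline with the command PDU instead of waiting
 * for the controller to solicit the data with an R2T.
 */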
161static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
162{
163 struct request *rq;
164 unsigned int bytes;
165
166 if (unlikely(nvme_tcp_async_req(req)))
167 return false; /* async events don't have a request */
168
169 rq = blk_mq_rq_from_pdu(req);
170 bytes = blk_rq_payload_bytes(rq);
171
172 return rq_data_dir(rq) == WRITE && bytes &&
173 bytes <= nvme_tcp_inline_data_size(req->queue);
174}
175
176static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
177{
178 return req->iter.bvec->bv_page;
179}
180
181static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
182{
183 return req->iter.bvec->bv_offset + req->iter.iov_offset;
184}
185
186static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
187{
188 return min_t(size_t, req->iter.bvec->bv_len - req->iter.iov_offset,
189 req->pdu_len - req->pdu_sent);
190}
191
192static inline size_t nvme_tcp_req_offset(struct nvme_tcp_request *req)
193{
194 return req->iter.iov_offset;
195}
196
197static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
198{
199 return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
200 req->pdu_len - req->pdu_sent : 0;
201}
202
203static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
204 int len)
205{
206 return nvme_tcp_pdu_data_left(req) <= len;
207}
208
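/*
 * Set up req->iter as a bvec iterator over either the request's special
 * payload (e.g. a discard) or the current bio's vector, starting from
 * whatever the bio iterator has already consumed.
 */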
209static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
210 unsigned int dir)
211{
212 struct request *rq = blk_mq_rq_from_pdu(req);
213 struct bio_vec *vec;
214 unsigned int size;
215 int nsegs;
216 size_t offset;
217
218 if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
219 vec = &rq->special_vec;
220 nsegs = 1;
221 size = blk_rq_payload_bytes(rq);
222 offset = 0;
223 } else {
224 struct bio *bio = req->curr_bio;
225
226 vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
227 nsegs = bio_segments(bio);
228 size = bio->bi_iter.bi_size;
229 offset = bio->bi_iter.bi_bvec_done;
230 }
231
232 iov_iter_bvec(&req->iter, dir, vec, nsegs, size);
233 req->iter.iov_offset = offset;
234}
235
236static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
237 int len)
238{
239 req->data_sent += len;
240 req->pdu_sent += len;
241 iov_iter_advance(&req->iter, len);
242 if (!iov_iter_count(&req->iter) &&
243 req->data_sent < req->data_len) {
244 req->curr_bio = req->curr_bio->bi_next;
245 nvme_tcp_init_iter(req, WRITE);
246 }
247}
248
249static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req)
250{
251 struct nvme_tcp_queue *queue = req->queue;
252
253 spin_lock(&queue->lock);
254 list_add_tail(&req->entry, &queue->send_list);
255 spin_unlock(&queue->lock);
256
257 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
258}
259
260static inline struct nvme_tcp_request *
261nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
262{
263 struct nvme_tcp_request *req;
264
265 spin_lock(&queue->lock);
266 req = list_first_entry_or_null(&queue->send_list,
267 struct nvme_tcp_request, entry);
268 if (req)
269 list_del(&req->entry);
270 spin_unlock(&queue->lock);
271
272 return req;
273}
274
275static inline void nvme_tcp_ddgst_final(struct ahash_request *hash, u32 *dgst)
276{
277 ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
278 crypto_ahash_final(hash);
279}
280
281static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
282 struct page *page, off_t off, size_t len)
283{
284 struct scatterlist sg;
285
286 sg_init_marker(&sg, 1);
287 sg_set_page(&sg, page, len, off);
288 ahash_request_set_crypt(hash, &sg, NULL, len);
289 crypto_ahash_update(hash);
290}
291
292static inline void nvme_tcp_hdgst(struct ahash_request *hash,
293 void *pdu, size_t len)
294{
295 struct scatterlist sg;
296
297 sg_init_one(&sg, pdu, len);
298 ahash_request_set_crypt(hash, &sg, pdu + len, len);
299 crypto_ahash_digest(hash);
300}
301
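/*
 * The wire digest sits right after the header; save it, recompute the
 * header digest in place over the first hlen bytes, and compare the two.
 */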
302static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
303 void *pdu, size_t pdu_len)
304{
305 struct nvme_tcp_hdr *hdr = pdu;
306 __le32 recv_digest;
307 __le32 exp_digest;
308
309 if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
310 dev_err(queue->ctrl->ctrl.device,
311 "queue %d: header digest flag is cleared\n",
312 nvme_tcp_queue_id(queue));
313 return -EPROTO;
314 }
315
316 recv_digest = *(__le32 *)(pdu + hdr->hlen);
317 nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
318 exp_digest = *(__le32 *)(pdu + hdr->hlen);
319 if (recv_digest != exp_digest) {
320 dev_err(queue->ctrl->ctrl.device,
321 "header digest error: recv %#x expected %#x\n",
322 le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
323 return -EIO;
324 }
325
326 return 0;
327}
328
329static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
330{
331 struct nvme_tcp_hdr *hdr = pdu;
332 u8 digest_len = nvme_tcp_hdgst_len(queue);
333 u32 len;
334
335 len = le32_to_cpu(hdr->plen) - hdr->hlen -
336 ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
337
338 if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
339 dev_err(queue->ctrl->ctrl.device,
340 "queue %d: data digest flag is cleared\n",
341 nvme_tcp_queue_id(queue));
342 return -EPROTO;
343 }
344 crypto_ahash_init(queue->rcv_hash);
345
346 return 0;
347}
348
349static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
350 struct request *rq, unsigned int hctx_idx)
351{
352 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
353
354 page_frag_free(req->pdu);
355}
356
357static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
358 struct request *rq, unsigned int hctx_idx,
359 unsigned int numa_node)
360{
361 struct nvme_tcp_ctrl *ctrl = set->driver_data;
362 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
363 int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
364 struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
365 u8 hdgst = nvme_tcp_hdgst_len(queue);
366
367 req->pdu = page_frag_alloc(&queue->pf_cache,
368 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
369 GFP_KERNEL | __GFP_ZERO);
370 if (!req->pdu)
371 return -ENOMEM;
372
373 req->queue = queue;
374 nvme_req(rq)->ctrl = &ctrl->ctrl;
375
376 return 0;
377}
378
379static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
380 unsigned int hctx_idx)
381{
382 struct nvme_tcp_ctrl *ctrl = data;
383 struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
384
385 hctx->driver_data = queue;
386 return 0;
387}
388
389static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
390 unsigned int hctx_idx)
391{
392 struct nvme_tcp_ctrl *ctrl = data;
393 struct nvme_tcp_queue *queue = &ctrl->queues[0];
394
395 hctx->driver_data = queue;
396 return 0;
397}
398
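/*
 * The receive state is derived from the residual counters: an incomplete
 * PDU header takes precedence, then a pending data digest, otherwise we
 * are in the middle of receiving C2H data.
 */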
399static enum nvme_tcp_recv_state
400nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
401{
402 return (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
403 (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
404 NVME_TCP_RECV_DATA;
405}
406
407static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
408{
409 queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
410 nvme_tcp_hdgst_len(queue);
411 queue->pdu_offset = 0;
412 queue->data_remaining = -1;
413 queue->ddgst_remaining = 0;
414}
415
416static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
417{
418 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
419 return;
420
421 queue_work(nvme_wq, &to_tcp_ctrl(ctrl)->err_work);
422}
423
424static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
425 struct nvme_completion *cqe)
426{
427 struct request *rq;
428
429 rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
430 if (!rq) {
431 dev_err(queue->ctrl->ctrl.device,
432 "queue %d tag 0x%x not found\n",
433 nvme_tcp_queue_id(queue), cqe->command_id);
434 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
435 return -EINVAL;
436 }
437
438 nvme_end_request(rq, cqe->status, cqe->result);
439
440 return 0;
441}
442
443static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
444 struct nvme_tcp_data_pdu *pdu)
445{
446 struct request *rq;
447
448 rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
449 if (!rq) {
450 dev_err(queue->ctrl->ctrl.device,
451 "queue %d tag %#x not found\n",
452 nvme_tcp_queue_id(queue), pdu->command_id);
453 return -ENOENT;
454 }
455
456 if (!blk_rq_payload_bytes(rq)) {
457 dev_err(queue->ctrl->ctrl.device,
458 "queue %d tag %#x unexpected data\n",
459 nvme_tcp_queue_id(queue), rq->tag);
460 return -EIO;
461 }
462
463 queue->data_remaining = le32_to_cpu(pdu->data_length);
464
465 return 0;
466
467}
468
469static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
470 struct nvme_tcp_rsp_pdu *pdu)
471{
472 struct nvme_completion *cqe = &pdu->cqe;
473 int ret = 0;
474
475 /*
476 * AEN requests are special as they don't time out and can
477 * survive any kind of queue freeze and often don't respond to
478 * aborts. We don't even bother to allocate a struct request
479 * for them but rather special case them here.
480 */
481 if (unlikely(nvme_tcp_queue_id(queue) == 0 &&
482 cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH))
483 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
484 &cqe->result);
485 else
486 ret = nvme_tcp_process_nvme_cqe(queue, cqe);
487
488 return ret;
489}
490
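/*
 * Build an H2C data PDU in response to an R2T, after checking that the
 * solicited range neither exceeds the request length nor rewinds past
 * data we have already sent.
 */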
491static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
492 struct nvme_tcp_r2t_pdu *pdu)
493{
494 struct nvme_tcp_data_pdu *data = req->pdu;
495 struct nvme_tcp_queue *queue = req->queue;
496 struct request *rq = blk_mq_rq_from_pdu(req);
497 u8 hdgst = nvme_tcp_hdgst_len(queue);
498 u8 ddgst = nvme_tcp_ddgst_len(queue);
499
500 req->pdu_len = le32_to_cpu(pdu->r2t_length);
501 req->pdu_sent = 0;
502
503 if (unlikely(req->data_sent + req->pdu_len > req->data_len)) {
504 dev_err(queue->ctrl->ctrl.device,
505 "req %d r2t len %u exceeded data len %u (%zu sent)\n",
506 rq->tag, req->pdu_len, req->data_len,
507 req->data_sent);
508 return -EPROTO;
509 }
510
511 if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) {
512 dev_err(queue->ctrl->ctrl.device,
513 "req %d unexpected r2t offset %u (expected %zu)\n",
514 rq->tag, le32_to_cpu(pdu->r2t_offset),
515 req->data_sent);
516 return -EPROTO;
517 }
518
519 memset(data, 0, sizeof(*data));
520 data->hdr.type = nvme_tcp_h2c_data;
521 data->hdr.flags = NVME_TCP_F_DATA_LAST;
522 if (queue->hdr_digest)
523 data->hdr.flags |= NVME_TCP_F_HDGST;
524 if (queue->data_digest)
525 data->hdr.flags |= NVME_TCP_F_DDGST;
526 data->hdr.hlen = sizeof(*data);
527 data->hdr.pdo = data->hdr.hlen + hdgst;
528 data->hdr.plen =
529 cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
530 data->ttag = pdu->ttag;
531 data->command_id = rq->tag;
532 data->data_offset = cpu_to_le32(req->data_sent);
533 data->data_length = cpu_to_le32(req->pdu_len);
534 return 0;
535}
536
537static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
538 struct nvme_tcp_r2t_pdu *pdu)
539{
540 struct nvme_tcp_request *req;
541 struct request *rq;
542 int ret;
543
544 rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
545 if (!rq) {
546 dev_err(queue->ctrl->ctrl.device,
547 "queue %d tag %#x not found\n",
548 nvme_tcp_queue_id(queue), pdu->command_id);
549 return -ENOENT;
550 }
551 req = blk_mq_rq_to_pdu(rq);
552
553 ret = nvme_tcp_setup_h2c_data_pdu(req, pdu);
554 if (unlikely(ret))
555 return ret;
556
557 req->state = NVME_TCP_SEND_H2C_PDU;
558 req->offset = 0;
559
560 nvme_tcp_queue_request(req);
561
562 return 0;
563}
564
565static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
566 unsigned int *offset, size_t *len)
567{
568 struct nvme_tcp_hdr *hdr;
569 char *pdu = queue->pdu;
570 size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
571 int ret;
572
573 ret = skb_copy_bits(skb, *offset,
574 &pdu[queue->pdu_offset], rcv_len);
575 if (unlikely(ret))
576 return ret;
577
578 queue->pdu_remaining -= rcv_len;
579 queue->pdu_offset += rcv_len;
580 *offset += rcv_len;
581 *len -= rcv_len;
582 if (queue->pdu_remaining)
583 return 0;
584
585 hdr = queue->pdu;
586 if (queue->hdr_digest) {
587 ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
588 if (unlikely(ret))
589 return ret;
590 }
591
592
593 if (queue->data_digest) {
594 ret = nvme_tcp_check_ddgst(queue, queue->pdu);
595 if (unlikely(ret))
596 return ret;
597 }
598
599 switch (hdr->type) {
600 case nvme_tcp_c2h_data:
601 ret = nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
602 break;
603 case nvme_tcp_rsp:
604 nvme_tcp_init_recv_ctx(queue);
605 ret = nvme_tcp_handle_comp(queue, (void *)queue->pdu);
606 break;
607 case nvme_tcp_r2t:
608 nvme_tcp_init_recv_ctx(queue);
609 ret = nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
610 break;
611 default:
612 dev_err(queue->ctrl->ctrl.device,
613 "unsupported pdu type (%d)\n", hdr->type);
614 return -EINVAL;
615 }
616
617 return ret;
618}
619
620static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
621 unsigned int *offset, size_t *len)
622{
623 struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
624 struct nvme_tcp_request *req;
625 struct request *rq;
626
627 rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
628 if (!rq) {
629 dev_err(queue->ctrl->ctrl.device,
630 "queue %d tag %#x not found\n",
631 nvme_tcp_queue_id(queue), pdu->command_id);
632 return -ENOENT;
633 }
634 req = blk_mq_rq_to_pdu(rq);
635
636 while (true) {
637 int recv_len, ret;
638
639 recv_len = min_t(size_t, *len, queue->data_remaining);
640 if (!recv_len)
641 break;
642
643 if (!iov_iter_count(&req->iter)) {
644 req->curr_bio = req->curr_bio->bi_next;
645
646 /*
 647 * If we don't have any bios it means that the controller
 648 * sent more data than we requested, hence error
649 */
650 if (!req->curr_bio) {
651 dev_err(queue->ctrl->ctrl.device,
652 "queue %d no space in request %#x",
653 nvme_tcp_queue_id(queue), rq->tag);
654 nvme_tcp_init_recv_ctx(queue);
655 return -EIO;
656 }
657 nvme_tcp_init_iter(req, READ);
658 }
659
660 /* we can read only from what is left in this bio */
661 recv_len = min_t(size_t, recv_len,
662 iov_iter_count(&req->iter));
663
664 if (queue->data_digest)
665 ret = skb_copy_and_hash_datagram_iter(skb, *offset,
666 &req->iter, recv_len, queue->rcv_hash);
667 else
668 ret = skb_copy_datagram_iter(skb, *offset,
669 &req->iter, recv_len);
670 if (ret) {
671 dev_err(queue->ctrl->ctrl.device,
672 "queue %d failed to copy request %#x data",
673 nvme_tcp_queue_id(queue), rq->tag);
674 return ret;
675 }
676
677 *len -= recv_len;
678 *offset += recv_len;
679 queue->data_remaining -= recv_len;
680 }
681
682 if (!queue->data_remaining) {
683 if (queue->data_digest) {
684 nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
685 queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
686 } else {
687 nvme_tcp_init_recv_ctx(queue);
688 }
689 }
690
691 return 0;
692}
693
694static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
695 struct sk_buff *skb, unsigned int *offset, size_t *len)
696{
697 char *ddgst = (char *)&queue->recv_ddgst;
698 size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
699 off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
700 int ret;
701
702 ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
703 if (unlikely(ret))
704 return ret;
705
706 queue->ddgst_remaining -= recv_len;
707 *offset += recv_len;
708 *len -= recv_len;
709 if (queue->ddgst_remaining)
710 return 0;
711
712 if (queue->recv_ddgst != queue->exp_ddgst) {
713 dev_err(queue->ctrl->ctrl.device,
714 "data digest error: recv %#x expected %#x\n",
715 le32_to_cpu(queue->recv_ddgst),
716 le32_to_cpu(queue->exp_ddgst));
717 return -EIO;
718 }
719
720 nvme_tcp_init_recv_ctx(queue);
721 return 0;
722}
723
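/*
 * tcp_read_sock() callback: consume the skb according to the current
 * receive state until it is drained. Any failure disables further reads
 * on this queue and triggers controller error recovery.
 */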
724static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
725 unsigned int offset, size_t len)
726{
727 struct nvme_tcp_queue *queue = desc->arg.data;
728 size_t consumed = len;
729 int result;
730
731 while (len) {
732 switch (nvme_tcp_recv_state(queue)) {
733 case NVME_TCP_RECV_PDU:
734 result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
735 break;
736 case NVME_TCP_RECV_DATA:
737 result = nvme_tcp_recv_data(queue, skb, &offset, &len);
738 break;
739 case NVME_TCP_RECV_DDGST:
740 result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
741 break;
742 default:
743 result = -EFAULT;
744 }
745 if (result) {
746 dev_err(queue->ctrl->ctrl.device,
747 "receive failed: %d\n", result);
748 queue->rd_enabled = false;
749 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
750 return result;
751 }
752 }
753
754 return consumed;
755}
756
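/*
 * Socket callbacks installed at connect time: data_ready and write_space
 * simply kick io_work on the queue's CPU, while state_change funnels
 * abnormal TCP states into error recovery. The original callbacks are
 * saved so they can be restored on teardown.
 */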
757static void nvme_tcp_data_ready(struct sock *sk)
758{
759 struct nvme_tcp_queue *queue;
760
761 read_lock(&sk->sk_callback_lock);
762 queue = sk->sk_user_data;
763 if (likely(queue && queue->rd_enabled))
764 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
765 read_unlock(&sk->sk_callback_lock);
766}
767
768static void nvme_tcp_write_space(struct sock *sk)
769{
770 struct nvme_tcp_queue *queue;
771
772 read_lock_bh(&sk->sk_callback_lock);
773 queue = sk->sk_user_data;
774 if (likely(queue && sk_stream_is_writeable(sk))) {
775 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
776 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
777 }
778 read_unlock_bh(&sk->sk_callback_lock);
779}
780
781static void nvme_tcp_state_change(struct sock *sk)
782{
783 struct nvme_tcp_queue *queue;
784
785 read_lock(&sk->sk_callback_lock);
786 queue = sk->sk_user_data;
787 if (!queue)
788 goto done;
789
790 switch (sk->sk_state) {
791 case TCP_CLOSE:
792 case TCP_CLOSE_WAIT:
793 case TCP_LAST_ACK:
794 case TCP_FIN_WAIT1:
795 case TCP_FIN_WAIT2:
796 /* fallthrough */
797 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
798 break;
799 default:
800 dev_info(queue->ctrl->ctrl.device,
801 "queue %d socket state %d\n",
802 nvme_tcp_queue_id(queue), sk->sk_state);
803 }
804
805 queue->state_change(sk);
806done:
807 read_unlock(&sk->sk_callback_lock);
808}
809
810static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
811{
812 queue->request = NULL;
813}
814
815static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
816{
817 union nvme_result res = {};
818
819 nvme_end_request(blk_mq_rq_from_pdu(req),
820 NVME_SC_DATA_XFER_ERROR, res);
821}
822
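/*
 * Push the data portion of the current PDU with sendpage. MSG_MORE is set
 * whenever more payload or a data digest still follows; only the final
 * send of the last PDU data goes out with MSG_EOR.
 */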
823static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
824{
825 struct nvme_tcp_queue *queue = req->queue;
826
827 while (true) {
828 struct page *page = nvme_tcp_req_cur_page(req);
829 size_t offset = nvme_tcp_req_cur_offset(req);
830 size_t len = nvme_tcp_req_cur_length(req);
831 bool last = nvme_tcp_pdu_last_send(req, len);
832 int ret, flags = MSG_DONTWAIT;
833
834 if (last && !queue->data_digest)
835 flags |= MSG_EOR;
836 else
837 flags |= MSG_MORE;
838
839 ret = kernel_sendpage(queue->sock, page, offset, len, flags);
840 if (ret <= 0)
841 return ret;
842
843 nvme_tcp_advance_req(req, ret);
844 if (queue->data_digest)
845 nvme_tcp_ddgst_update(queue->snd_hash, page,
846 offset, ret);
847
 848 /* fully successful last write */
849 if (last && ret == len) {
850 if (queue->data_digest) {
851 nvme_tcp_ddgst_final(queue->snd_hash,
852 &req->ddgst);
853 req->state = NVME_TCP_SEND_DDGST;
854 req->offset = 0;
855 } else {
856 nvme_tcp_done_send_req(queue);
857 }
858 return 1;
859 }
860 }
861 return -EAGAIN;
862}
863
864static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
865{
866 struct nvme_tcp_queue *queue = req->queue;
867 struct nvme_tcp_cmd_pdu *pdu = req->pdu;
868 bool inline_data = nvme_tcp_has_inline_data(req);
869 int flags = MSG_DONTWAIT | (inline_data ? MSG_MORE : MSG_EOR);
870 u8 hdgst = nvme_tcp_hdgst_len(queue);
871 int len = sizeof(*pdu) + hdgst - req->offset;
872 int ret;
873
874 if (queue->hdr_digest && !req->offset)
875 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
876
877 ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
878 offset_in_page(pdu) + req->offset, len, flags);
879 if (unlikely(ret <= 0))
880 return ret;
881
882 len -= ret;
883 if (!len) {
884 if (inline_data) {
885 req->state = NVME_TCP_SEND_DATA;
886 if (queue->data_digest)
887 crypto_ahash_init(queue->snd_hash);
888 nvme_tcp_init_iter(req, WRITE);
889 } else {
890 nvme_tcp_done_send_req(queue);
891 }
892 return 1;
893 }
894 req->offset += ret;
895
896 return -EAGAIN;
897}
898
899static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
900{
901 struct nvme_tcp_queue *queue = req->queue;
902 struct nvme_tcp_data_pdu *pdu = req->pdu;
903 u8 hdgst = nvme_tcp_hdgst_len(queue);
904 int len = sizeof(*pdu) - req->offset + hdgst;
905 int ret;
906
907 if (queue->hdr_digest && !req->offset)
908 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
909
910 ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
911 offset_in_page(pdu) + req->offset, len,
912 MSG_DONTWAIT | MSG_MORE);
913 if (unlikely(ret <= 0))
914 return ret;
915
916 len -= ret;
917 if (!len) {
918 req->state = NVME_TCP_SEND_DATA;
919 if (queue->data_digest)
920 crypto_ahash_init(queue->snd_hash);
921 if (!req->data_sent)
922 nvme_tcp_init_iter(req, WRITE);
923 return 1;
924 }
925 req->offset += ret;
926
927 return -EAGAIN;
928}
929
930static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
931{
932 struct nvme_tcp_queue *queue = req->queue;
933 int ret;
934 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
935 struct kvec iov = {
936 .iov_base = &req->ddgst + req->offset,
937 .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
938 };
939
940 ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
941 if (unlikely(ret <= 0))
942 return ret;
943
944 if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) {
945 nvme_tcp_done_send_req(queue);
946 return 1;
947 }
948
949 req->offset += ret;
950 return -EAGAIN;
951}
952
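/*
 * Advance the per-request send state machine: command PDU, optional H2C
 * data PDU, data, and finally the data digest. Returns 1 when progress
 * was made, 0 when there is nothing to send or the socket would block,
 * and a negative errno on a send failure.
 */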
953static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
954{
955 struct nvme_tcp_request *req;
956 int ret = 1;
957
958 if (!queue->request) {
959 queue->request = nvme_tcp_fetch_request(queue);
960 if (!queue->request)
961 return 0;
962 }
963 req = queue->request;
964
965 if (req->state == NVME_TCP_SEND_CMD_PDU) {
966 ret = nvme_tcp_try_send_cmd_pdu(req);
967 if (ret <= 0)
968 goto done;
969 if (!nvme_tcp_has_inline_data(req))
970 return ret;
971 }
972
973 if (req->state == NVME_TCP_SEND_H2C_PDU) {
974 ret = nvme_tcp_try_send_data_pdu(req);
975 if (ret <= 0)
976 goto done;
977 }
978
979 if (req->state == NVME_TCP_SEND_DATA) {
980 ret = nvme_tcp_try_send_data(req);
981 if (ret <= 0)
982 goto done;
983 }
984
985 if (req->state == NVME_TCP_SEND_DDGST)
986 ret = nvme_tcp_try_send_ddgst(req);
987done:
988 if (ret == -EAGAIN)
989 ret = 0;
990 return ret;
991}
992
993static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
994{
995 struct sock *sk = queue->sock->sk;
996 read_descriptor_t rd_desc;
997 int consumed;
998
999 rd_desc.arg.data = queue;
1000 rd_desc.count = 1;
1001 lock_sock(sk);
1002 consumed = tcp_read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
1003 release_sock(sk);
1004 return consumed;
1005}
1006
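/*
 * Per-queue I/O context: alternate between sending and receiving, stop
 * once neither direction makes progress, and requeue when the time budget
 * is consumed so other queues sharing the workqueue can run.
 */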
1007static void nvme_tcp_io_work(struct work_struct *w)
1008{
1009 struct nvme_tcp_queue *queue =
1010 container_of(w, struct nvme_tcp_queue, io_work);
1011 unsigned long start = jiffies + msecs_to_jiffies(1);
1012
1013 do {
1014 bool pending = false;
1015 int result;
1016
1017 result = nvme_tcp_try_send(queue);
1018 if (result > 0) {
1019 pending = true;
1020 } else if (unlikely(result < 0)) {
1021 dev_err(queue->ctrl->ctrl.device,
1022 "failed to send request %d\n", result);
1023 if (result != -EPIPE)
1024 nvme_tcp_fail_request(queue->request);
1025 nvme_tcp_done_send_req(queue);
1026 return;
1027 }
1028
1029 result = nvme_tcp_try_recv(queue);
1030 if (result > 0)
1031 pending = true;
1032
1033 if (!pending)
1034 return;
1035
1036 } while (time_after(jiffies, start)); /* quota is exhausted */
1037
1038 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
1039}
1040
1041static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
1042{
1043 struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
1044
1045 ahash_request_free(queue->rcv_hash);
1046 ahash_request_free(queue->snd_hash);
1047 crypto_free_ahash(tfm);
1048}
1049
1050static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
1051{
1052 struct crypto_ahash *tfm;
1053
1054 tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
1055 if (IS_ERR(tfm))
1056 return PTR_ERR(tfm);
1057
1058 queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1059 if (!queue->snd_hash)
1060 goto free_tfm;
1061 ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
1062
1063 queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1064 if (!queue->rcv_hash)
1065 goto free_snd_hash;
1066 ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
1067
1068 return 0;
1069free_snd_hash:
1070 ahash_request_free(queue->snd_hash);
1071free_tfm:
1072 crypto_free_ahash(tfm);
1073 return -ENOMEM;
1074}
1075
1076static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
1077{
1078 struct nvme_tcp_request *async = &ctrl->async_req;
1079
1080 page_frag_free(async->pdu);
1081}
1082
1083static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
1084{
1085 struct nvme_tcp_queue *queue = &ctrl->queues[0];
1086 struct nvme_tcp_request *async = &ctrl->async_req;
1087 u8 hdgst = nvme_tcp_hdgst_len(queue);
1088
1089 async->pdu = page_frag_alloc(&queue->pf_cache,
1090 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
1091 GFP_KERNEL | __GFP_ZERO);
1092 if (!async->pdu)
1093 return -ENOMEM;
1094
1095 async->queue = &ctrl->queues[0];
1096 return 0;
1097}
1098
1099static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
1100{
1101 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1102 struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1103
1104 if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1105 return;
1106
1107 if (queue->hdr_digest || queue->data_digest)
1108 nvme_tcp_free_crypto(queue);
1109
1110 sock_release(queue->sock);
1111 kfree(queue->pdu);
1112}
1113
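/*
 * Perform the synchronous ICReq/ICResp exchange on a freshly connected
 * socket and verify that the controller's PFV, CPDA and digest settings
 * match what the host requested.
 */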
1114static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
1115{
1116 struct nvme_tcp_icreq_pdu *icreq;
1117 struct nvme_tcp_icresp_pdu *icresp;
1118 struct msghdr msg = {};
1119 struct kvec iov;
1120 bool ctrl_hdgst, ctrl_ddgst;
1121 int ret;
1122
1123 icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
1124 if (!icreq)
1125 return -ENOMEM;
1126
1127 icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
1128 if (!icresp) {
1129 ret = -ENOMEM;
1130 goto free_icreq;
1131 }
1132
1133 icreq->hdr.type = nvme_tcp_icreq;
1134 icreq->hdr.hlen = sizeof(*icreq);
1135 icreq->hdr.pdo = 0;
1136 icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
1137 icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
1138 icreq->maxr2t = 0; /* single inflight r2t supported */
1139 icreq->hpda = 0; /* no alignment constraint */
1140 if (queue->hdr_digest)
1141 icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
1142 if (queue->data_digest)
1143 icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
1144
1145 iov.iov_base = icreq;
1146 iov.iov_len = sizeof(*icreq);
1147 ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1148 if (ret < 0)
1149 goto free_icresp;
1150
1151 memset(&msg, 0, sizeof(msg));
1152 iov.iov_base = icresp;
1153 iov.iov_len = sizeof(*icresp);
1154 ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1155 iov.iov_len, msg.msg_flags);
1156 if (ret < 0)
1157 goto free_icresp;
1158
1159 ret = -EINVAL;
1160 if (icresp->hdr.type != nvme_tcp_icresp) {
1161 pr_err("queue %d: bad type returned %d\n",
1162 nvme_tcp_queue_id(queue), icresp->hdr.type);
1163 goto free_icresp;
1164 }
1165
1166 if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
1167 pr_err("queue %d: bad pdu length returned %d\n",
1168 nvme_tcp_queue_id(queue), icresp->hdr.plen);
1169 goto free_icresp;
1170 }
1171
1172 if (icresp->pfv != NVME_TCP_PFV_1_0) {
1173 pr_err("queue %d: bad pfv returned %d\n",
1174 nvme_tcp_queue_id(queue), icresp->pfv);
1175 goto free_icresp;
1176 }
1177
1178 ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
1179 if ((queue->data_digest && !ctrl_ddgst) ||
1180 (!queue->data_digest && ctrl_ddgst)) {
1181 pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
1182 nvme_tcp_queue_id(queue),
1183 queue->data_digest ? "enabled" : "disabled",
1184 ctrl_ddgst ? "enabled" : "disabled");
1185 goto free_icresp;
1186 }
1187
1188 ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
1189 if ((queue->hdr_digest && !ctrl_hdgst) ||
1190 (!queue->hdr_digest && ctrl_hdgst)) {
1191 pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
1192 nvme_tcp_queue_id(queue),
1193 queue->hdr_digest ? "enabled" : "disabled",
1194 ctrl_hdgst ? "enabled" : "disabled");
1195 goto free_icresp;
1196 }
1197
1198 if (icresp->cpda != 0) {
1199 pr_err("queue %d: unsupported cpda returned %d\n",
1200 nvme_tcp_queue_id(queue), icresp->cpda);
1201 goto free_icresp;
1202 }
1203
1204 ret = 0;
1205free_icresp:
1206 kfree(icresp);
1207free_icreq:
1208 kfree(icreq);
1209 return ret;
1210}
1211
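/*
 * Allocate and connect a single queue: create the socket, tune it (single
 * SYN retry, TCP_NODELAY, zero linger), optionally bind a source address,
 * connect, run the ICReq/ICResp exchange and finally install the socket
 * callbacks.
 */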
1212static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
1213 int qid, size_t queue_size)
1214{
1215 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1216 struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1217 struct linger sol = { .l_onoff = 1, .l_linger = 0 };
Sagi Grimberg873946f2018-12-11 23:38:57 -08001218 int ret, opt, rcv_pdu_size, n;
Sagi Grimberg3f2304f2018-12-03 17:52:17 -08001219
1220 queue->ctrl = ctrl;
1221 INIT_LIST_HEAD(&queue->send_list);
1222 spin_lock_init(&queue->lock);
1223 INIT_WORK(&queue->io_work, nvme_tcp_io_work);
1224 queue->queue_size = queue_size;
1225
1226 if (qid > 0)
1227 queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
1228 else
1229 queue->cmnd_capsule_len = sizeof(struct nvme_command) +
1230 NVME_TCP_ADMIN_CCSZ;
1231
1232 ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
1233 IPPROTO_TCP, &queue->sock);
1234 if (ret) {
1235 dev_err(ctrl->ctrl.device,
1236 "failed to create socket: %d\n", ret);
1237 return ret;
1238 }
1239
 1240 /* Single SYN retry */
1241 opt = 1;
1242 ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT,
1243 (char *)&opt, sizeof(opt));
1244 if (ret) {
1245 dev_err(ctrl->ctrl.device,
1246 "failed to set TCP_SYNCNT sock opt %d\n", ret);
1247 goto err_sock;
1248 }
1249
1250 /* Set TCP no delay */
1251 opt = 1;
1252 ret = kernel_setsockopt(queue->sock, IPPROTO_TCP,
1253 TCP_NODELAY, (char *)&opt, sizeof(opt));
1254 if (ret) {
1255 dev_err(ctrl->ctrl.device,
1256 "failed to set TCP_NODELAY sock opt %d\n", ret);
1257 goto err_sock;
1258 }
1259
1260 /*
1261 * Cleanup whatever is sitting in the TCP transmit queue on socket
1262 * close. This is done to prevent stale data from being sent should
1263 * the network connection be restored before TCP times out.
1264 */
1265 ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER,
1266 (char *)&sol, sizeof(sol));
1267 if (ret) {
1268 dev_err(ctrl->ctrl.device,
1269 "failed to set SO_LINGER sock opt %d\n", ret);
1270 goto err_sock;
1271 }
1272
1273 queue->sock->sk->sk_allocation = GFP_ATOMIC;
 1274 if (!qid)
1275 n = 0;
1276 else
1277 n = (qid - 1) % num_online_cpus();
1278 queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
 1279 queue->request = NULL;
1280 queue->data_remaining = 0;
1281 queue->ddgst_remaining = 0;
1282 queue->pdu_remaining = 0;
1283 queue->pdu_offset = 0;
1284 sk_set_memalloc(queue->sock->sk);
1285
1286 if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) {
1287 ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
1288 sizeof(ctrl->src_addr));
1289 if (ret) {
1290 dev_err(ctrl->ctrl.device,
1291 "failed to bind queue %d socket %d\n",
1292 qid, ret);
1293 goto err_sock;
1294 }
1295 }
1296
1297 queue->hdr_digest = nctrl->opts->hdr_digest;
1298 queue->data_digest = nctrl->opts->data_digest;
1299 if (queue->hdr_digest || queue->data_digest) {
1300 ret = nvme_tcp_alloc_crypto(queue);
1301 if (ret) {
1302 dev_err(ctrl->ctrl.device,
1303 "failed to allocate queue %d crypto\n", qid);
1304 goto err_sock;
1305 }
1306 }
1307
1308 rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
1309 nvme_tcp_hdgst_len(queue);
1310 queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
1311 if (!queue->pdu) {
1312 ret = -ENOMEM;
1313 goto err_crypto;
1314 }
1315
1316 dev_dbg(ctrl->ctrl.device, "connecting queue %d\n",
1317 nvme_tcp_queue_id(queue));
1318
1319 ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
1320 sizeof(ctrl->addr), 0);
1321 if (ret) {
1322 dev_err(ctrl->ctrl.device,
1323 "failed to connect socket: %d\n", ret);
1324 goto err_rcv_pdu;
1325 }
1326
1327 ret = nvme_tcp_init_connection(queue);
1328 if (ret)
1329 goto err_init_connect;
1330
1331 queue->rd_enabled = true;
1332 set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
1333 nvme_tcp_init_recv_ctx(queue);
1334
1335 write_lock_bh(&queue->sock->sk->sk_callback_lock);
1336 queue->sock->sk->sk_user_data = queue;
1337 queue->state_change = queue->sock->sk->sk_state_change;
1338 queue->data_ready = queue->sock->sk->sk_data_ready;
1339 queue->write_space = queue->sock->sk->sk_write_space;
1340 queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
1341 queue->sock->sk->sk_state_change = nvme_tcp_state_change;
1342 queue->sock->sk->sk_write_space = nvme_tcp_write_space;
1343 write_unlock_bh(&queue->sock->sk->sk_callback_lock);
1344
1345 return 0;
1346
1347err_init_connect:
1348 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1349err_rcv_pdu:
1350 kfree(queue->pdu);
1351err_crypto:
1352 if (queue->hdr_digest || queue->data_digest)
1353 nvme_tcp_free_crypto(queue);
1354err_sock:
1355 sock_release(queue->sock);
1356 queue->sock = NULL;
1357 return ret;
1358}
1359
1360static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
1361{
1362 struct socket *sock = queue->sock;
1363
1364 write_lock_bh(&sock->sk->sk_callback_lock);
1365 sock->sk->sk_user_data = NULL;
1366 sock->sk->sk_data_ready = queue->data_ready;
1367 sock->sk->sk_state_change = queue->state_change;
1368 sock->sk->sk_write_space = queue->write_space;
1369 write_unlock_bh(&sock->sk->sk_callback_lock);
1370}
1371
1372static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
1373{
1374 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1375 nvme_tcp_restore_sock_calls(queue);
1376 cancel_work_sync(&queue->io_work);
1377}
1378
1379static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
1380{
1381 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1382 struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1383
1384 if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
1385 return;
1386
1387 __nvme_tcp_stop_queue(queue);
1388}
1389
1390static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
1391{
1392 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1393 int ret;
1394
1395 if (idx)
1396 ret = nvmf_connect_io_queue(nctrl, idx);
1397 else
1398 ret = nvmf_connect_admin_queue(nctrl);
1399
1400 if (!ret) {
1401 set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
1402 } else {
1403 __nvme_tcp_stop_queue(&ctrl->queues[idx]);
1404 dev_err(nctrl->device,
1405 "failed to connect queue: %d ret=%d\n", idx, ret);
1406 }
1407 return ret;
1408}
1409
1410static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
1411 bool admin)
1412{
1413 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1414 struct blk_mq_tag_set *set;
1415 int ret;
1416
1417 if (admin) {
1418 set = &ctrl->admin_tag_set;
1419 memset(set, 0, sizeof(*set));
1420 set->ops = &nvme_tcp_admin_mq_ops;
1421 set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
1422 set->reserved_tags = 2; /* connect + keep-alive */
1423 set->numa_node = NUMA_NO_NODE;
1424 set->cmd_size = sizeof(struct nvme_tcp_request);
1425 set->driver_data = ctrl;
1426 set->nr_hw_queues = 1;
1427 set->timeout = ADMIN_TIMEOUT;
1428 } else {
1429 set = &ctrl->tag_set;
1430 memset(set, 0, sizeof(*set));
1431 set->ops = &nvme_tcp_mq_ops;
1432 set->queue_depth = nctrl->sqsize + 1;
1433 set->reserved_tags = 1; /* fabric connect */
1434 set->numa_node = NUMA_NO_NODE;
1435 set->flags = BLK_MQ_F_SHOULD_MERGE;
1436 set->cmd_size = sizeof(struct nvme_tcp_request);
1437 set->driver_data = ctrl;
1438 set->nr_hw_queues = nctrl->queue_count - 1;
1439 set->timeout = NVME_IO_TIMEOUT;
 1440 set->nr_maps = 2 /* default + read */;
 1441 }
1442
1443 ret = blk_mq_alloc_tag_set(set);
1444 if (ret)
1445 return ERR_PTR(ret);
1446
1447 return set;
1448}
1449
1450static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
1451{
1452 if (to_tcp_ctrl(ctrl)->async_req.pdu) {
1453 nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
1454 to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
1455 }
1456
1457 nvme_tcp_free_queue(ctrl, 0);
1458}
1459
1460static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
1461{
1462 int i;
1463
1464 for (i = 1; i < ctrl->queue_count; i++)
1465 nvme_tcp_free_queue(ctrl, i);
1466}
1467
1468static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
1469{
1470 int i;
1471
1472 for (i = 1; i < ctrl->queue_count; i++)
1473 nvme_tcp_stop_queue(ctrl, i);
1474}
1475
1476static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl)
1477{
1478 int i, ret = 0;
1479
1480 for (i = 1; i < ctrl->queue_count; i++) {
1481 ret = nvme_tcp_start_queue(ctrl, i);
1482 if (ret)
1483 goto out_stop_queues;
1484 }
1485
1486 return 0;
1487
1488out_stop_queues:
1489 for (i--; i >= 1; i--)
1490 nvme_tcp_stop_queue(ctrl, i);
1491 return ret;
1492}
1493
1494static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
1495{
1496 int ret;
1497
1498 ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
1499 if (ret)
1500 return ret;
1501
1502 ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
1503 if (ret)
1504 goto out_free_queue;
1505
1506 return 0;
1507
1508out_free_queue:
1509 nvme_tcp_free_queue(ctrl, 0);
1510 return ret;
1511}
1512
1513static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1514{
1515 int i, ret;
1516
1517 for (i = 1; i < ctrl->queue_count; i++) {
1518 ret = nvme_tcp_alloc_queue(ctrl, i,
1519 ctrl->sqsize + 1);
1520 if (ret)
1521 goto out_free_queues;
1522 }
1523
1524 return 0;
1525
1526out_free_queues:
1527 for (i--; i >= 1; i--)
1528 nvme_tcp_free_queue(ctrl, i);
1529
1530 return ret;
1531}
1532
1533static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
1534{
 1535 unsigned int nr_io_queues;
1536
1537 nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
1538 nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
1539
1540 return nr_io_queues;
 1541}
1542
1543static int nvme_alloc_io_queues(struct nvme_ctrl *ctrl)
1544{
1545 unsigned int nr_io_queues;
1546 int ret;
1547
1548 nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
1549 ret = nvme_set_queue_count(ctrl, &nr_io_queues);
1550 if (ret)
1551 return ret;
1552
1553 ctrl->queue_count = nr_io_queues + 1;
1554 if (ctrl->queue_count < 2)
1555 return 0;
1556
1557 dev_info(ctrl->device,
1558 "creating %d I/O queues.\n", nr_io_queues);
1559
1560 return nvme_tcp_alloc_io_queues(ctrl);
1561}
1562
1563static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
1564{
1565 nvme_tcp_stop_io_queues(ctrl);
1566 if (remove) {
1567 if (ctrl->ops->flags & NVME_F_FABRICS)
1568 blk_cleanup_queue(ctrl->connect_q);
1569 blk_mq_free_tag_set(ctrl->tagset);
1570 }
1571 nvme_tcp_free_io_queues(ctrl);
1572}
1573
1574static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
1575{
1576 int ret;
1577
1578 ret = nvme_alloc_io_queues(ctrl);
1579 if (ret)
1580 return ret;
1581
1582 if (new) {
1583 ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false);
1584 if (IS_ERR(ctrl->tagset)) {
1585 ret = PTR_ERR(ctrl->tagset);
1586 goto out_free_io_queues;
1587 }
1588
1589 if (ctrl->ops->flags & NVME_F_FABRICS) {
1590 ctrl->connect_q = blk_mq_init_queue(ctrl->tagset);
1591 if (IS_ERR(ctrl->connect_q)) {
1592 ret = PTR_ERR(ctrl->connect_q);
1593 goto out_free_tag_set;
1594 }
1595 }
1596 } else {
1597 blk_mq_update_nr_hw_queues(ctrl->tagset,
1598 ctrl->queue_count - 1);
1599 }
1600
1601 ret = nvme_tcp_start_io_queues(ctrl);
1602 if (ret)
1603 goto out_cleanup_connect_q;
1604
1605 return 0;
1606
1607out_cleanup_connect_q:
1608 if (new && (ctrl->ops->flags & NVME_F_FABRICS))
1609 blk_cleanup_queue(ctrl->connect_q);
1610out_free_tag_set:
1611 if (new)
1612 blk_mq_free_tag_set(ctrl->tagset);
1613out_free_io_queues:
1614 nvme_tcp_free_io_queues(ctrl);
1615 return ret;
1616}
1617
1618static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
1619{
1620 nvme_tcp_stop_queue(ctrl, 0);
1621 if (remove) {
1622 free_opal_dev(ctrl->opal_dev);
1623 blk_cleanup_queue(ctrl->admin_q);
1624 blk_mq_free_tag_set(ctrl->admin_tagset);
1625 }
1626 nvme_tcp_free_admin_queue(ctrl);
1627}
1628
1629static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
1630{
1631 int error;
1632
1633 error = nvme_tcp_alloc_admin_queue(ctrl);
1634 if (error)
1635 return error;
1636
1637 if (new) {
1638 ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true);
1639 if (IS_ERR(ctrl->admin_tagset)) {
1640 error = PTR_ERR(ctrl->admin_tagset);
1641 goto out_free_queue;
1642 }
1643
1644 ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
1645 if (IS_ERR(ctrl->admin_q)) {
1646 error = PTR_ERR(ctrl->admin_q);
1647 goto out_free_tagset;
1648 }
1649 }
1650
1651 error = nvme_tcp_start_queue(ctrl, 0);
1652 if (error)
1653 goto out_cleanup_queue;
1654
1655 error = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
1656 if (error) {
1657 dev_err(ctrl->device,
1658 "prop_get NVME_REG_CAP failed\n");
1659 goto out_stop_queue;
1660 }
1661
1662 ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
1663
1664 error = nvme_enable_ctrl(ctrl, ctrl->cap);
1665 if (error)
1666 goto out_stop_queue;
1667
1668 error = nvme_init_identify(ctrl);
1669 if (error)
1670 goto out_stop_queue;
1671
1672 return 0;
1673
1674out_stop_queue:
1675 nvme_tcp_stop_queue(ctrl, 0);
1676out_cleanup_queue:
1677 if (new)
1678 blk_cleanup_queue(ctrl->admin_q);
1679out_free_tagset:
1680 if (new)
1681 blk_mq_free_tag_set(ctrl->admin_tagset);
1682out_free_queue:
1683 nvme_tcp_free_admin_queue(ctrl);
1684 return error;
1685}
1686
1687static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
1688 bool remove)
1689{
1690 blk_mq_quiesce_queue(ctrl->admin_q);
1691 nvme_tcp_stop_queue(ctrl, 0);
1692 blk_mq_tagset_busy_iter(ctrl->admin_tagset, nvme_cancel_request, ctrl);
1693 blk_mq_unquiesce_queue(ctrl->admin_q);
1694 nvme_tcp_destroy_admin_queue(ctrl, remove);
1695}
1696
1697static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
1698 bool remove)
1699{
1700 if (ctrl->queue_count <= 1)
1701 return;
1702 nvme_stop_queues(ctrl);
1703 nvme_tcp_stop_io_queues(ctrl);
1704 blk_mq_tagset_busy_iter(ctrl->tagset, nvme_cancel_request, ctrl);
1705 if (remove)
1706 nvme_start_queues(ctrl);
1707 nvme_tcp_destroy_io_queues(ctrl, remove);
1708}
1709
1710static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
1711{
1712 /* If we are resetting/deleting then do nothing */
1713 if (ctrl->state != NVME_CTRL_CONNECTING) {
1714 WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
1715 ctrl->state == NVME_CTRL_LIVE);
1716 return;
1717 }
1718
1719 if (nvmf_should_reconnect(ctrl)) {
1720 dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
1721 ctrl->opts->reconnect_delay);
1722 queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
1723 ctrl->opts->reconnect_delay * HZ);
1724 } else {
1725 dev_info(ctrl->device, "Removing controller...\n");
1726 nvme_delete_ctrl(ctrl);
1727 }
1728}
1729
1730static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
1731{
1732 struct nvmf_ctrl_options *opts = ctrl->opts;
1733 int ret = -EINVAL;
1734
1735 ret = nvme_tcp_configure_admin_queue(ctrl, new);
1736 if (ret)
1737 return ret;
1738
1739 if (ctrl->icdoff) {
1740 dev_err(ctrl->device, "icdoff is not supported!\n");
1741 goto destroy_admin;
1742 }
1743
1744 if (opts->queue_size > ctrl->sqsize + 1)
1745 dev_warn(ctrl->device,
1746 "queue_size %zu > ctrl sqsize %u, clamping down\n",
1747 opts->queue_size, ctrl->sqsize + 1);
1748
1749 if (ctrl->sqsize + 1 > ctrl->maxcmd) {
1750 dev_warn(ctrl->device,
1751 "sqsize %u > ctrl maxcmd %u, clamping down\n",
1752 ctrl->sqsize + 1, ctrl->maxcmd);
1753 ctrl->sqsize = ctrl->maxcmd - 1;
1754 }
1755
1756 if (ctrl->queue_count > 1) {
1757 ret = nvme_tcp_configure_io_queues(ctrl, new);
1758 if (ret)
1759 goto destroy_admin;
1760 }
1761
1762 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
1763 /* state change failure is ok if we're in DELETING state */
1764 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
1765 ret = -EINVAL;
1766 goto destroy_io;
1767 }
1768
1769 nvme_start_ctrl(ctrl);
1770 return 0;
1771
1772destroy_io:
1773 if (ctrl->queue_count > 1)
1774 nvme_tcp_destroy_io_queues(ctrl, new);
1775destroy_admin:
1776 nvme_tcp_stop_queue(ctrl, 0);
1777 nvme_tcp_destroy_admin_queue(ctrl, new);
1778 return ret;
1779}
1780
1781static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
1782{
1783 struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
1784 struct nvme_tcp_ctrl, connect_work);
1785 struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
1786
1787 ++ctrl->nr_reconnects;
1788
1789 if (nvme_tcp_setup_ctrl(ctrl, false))
1790 goto requeue;
1791
 1792 dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
1793 ctrl->nr_reconnects);
1794
1795 ctrl->nr_reconnects = 0;
1796
1797 return;
1798
1799requeue:
1800 dev_info(ctrl->device, "Failed reconnect attempt %d\n",
1801 ctrl->nr_reconnects);
1802 nvme_tcp_reconnect_or_remove(ctrl);
1803}
1804
1805static void nvme_tcp_error_recovery_work(struct work_struct *work)
1806{
1807 struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
1808 struct nvme_tcp_ctrl, err_work);
1809 struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
1810
1811 nvme_stop_keep_alive(ctrl);
1812 nvme_tcp_teardown_io_queues(ctrl, false);
1813 /* unquiesce to fail fast pending requests */
1814 nvme_start_queues(ctrl);
1815 nvme_tcp_teardown_admin_queue(ctrl, false);
1816
1817 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
1818 /* state change failure is ok if we're in DELETING state */
1819 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
1820 return;
1821 }
1822
1823 nvme_tcp_reconnect_or_remove(ctrl);
1824}
1825
1826static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
1827{
1828 nvme_tcp_teardown_io_queues(ctrl, shutdown);
1829 if (shutdown)
1830 nvme_shutdown_ctrl(ctrl);
1831 else
1832 nvme_disable_ctrl(ctrl, ctrl->cap);
1833 nvme_tcp_teardown_admin_queue(ctrl, shutdown);
1834}
1835
1836static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
1837{
1838 nvme_tcp_teardown_ctrl(ctrl, true);
1839}
1840
1841static void nvme_reset_ctrl_work(struct work_struct *work)
1842{
1843 struct nvme_ctrl *ctrl =
1844 container_of(work, struct nvme_ctrl, reset_work);
1845
1846 nvme_stop_ctrl(ctrl);
1847 nvme_tcp_teardown_ctrl(ctrl, false);
1848
1849 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
1850 /* state change failure is ok if we're in DELETING state */
1851 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
1852 return;
1853 }
1854
1855 if (nvme_tcp_setup_ctrl(ctrl, false))
1856 goto out_fail;
1857
1858 return;
1859
1860out_fail:
1861 ++ctrl->nr_reconnects;
1862 nvme_tcp_reconnect_or_remove(ctrl);
1863}
1864
1865static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
1866{
1867 cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
1868 cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
1869}
1870
1871static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
1872{
1873 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1874
1875 if (list_empty(&ctrl->list))
1876 goto free_ctrl;
1877
1878 mutex_lock(&nvme_tcp_ctrl_mutex);
1879 list_del(&ctrl->list);
1880 mutex_unlock(&nvme_tcp_ctrl_mutex);
1881
1882 nvmf_free_options(nctrl->opts);
1883free_ctrl:
1884 kfree(ctrl->queues);
1885 kfree(ctrl);
1886}
1887
1888static void nvme_tcp_set_sg_null(struct nvme_command *c)
1889{
1890 struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
1891
1892 sg->addr = 0;
1893 sg->length = 0;
1894 sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
1895 NVME_SGL_FMT_TRANSPORT_A;
1896}
1897
1898static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
1899 struct nvme_command *c, u32 data_len)
1900{
1901 struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
1902
1903 sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
1904 sg->length = cpu_to_le32(data_len);
1905 sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
1906}
1907
1908static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
1909 u32 data_len)
1910{
1911 struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
1912
1913 sg->addr = 0;
1914 sg->length = cpu_to_le32(data_len);
1915 sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
1916 NVME_SGL_FMT_TRANSPORT_A;
1917}
1918
1919static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
1920{
1921 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
1922 struct nvme_tcp_queue *queue = &ctrl->queues[0];
1923 struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
1924 struct nvme_command *cmd = &pdu->cmd;
1925 u8 hdgst = nvme_tcp_hdgst_len(queue);
1926
1927 memset(pdu, 0, sizeof(*pdu));
1928 pdu->hdr.type = nvme_tcp_cmd;
1929 if (queue->hdr_digest)
1930 pdu->hdr.flags |= NVME_TCP_F_HDGST;
1931 pdu->hdr.hlen = sizeof(*pdu);
1932 pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
1933
1934 cmd->common.opcode = nvme_admin_async_event;
1935 cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
1936 cmd->common.flags |= NVME_CMD_SGL_METABUF;
1937 nvme_tcp_set_sg_null(cmd);
1938
1939 ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
1940 ctrl->async_req.offset = 0;
1941 ctrl->async_req.curr_bio = NULL;
1942 ctrl->async_req.data_len = 0;
1943
1944 nvme_tcp_queue_request(&ctrl->async_req);
1945}
1946
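/*
 * If the controller is no longer live (e.g. during reset or teardown),
 * complete the timed-out request immediately as aborted; otherwise kick
 * error recovery and give the request another timeout period.
 */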
1947static enum blk_eh_timer_return
1948nvme_tcp_timeout(struct request *rq, bool reserved)
1949{
1950 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
1951 struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
1952 struct nvme_tcp_cmd_pdu *pdu = req->pdu;
1953
1954 dev_dbg(ctrl->ctrl.device,
1955 "queue %d: timeout request %#x type %d\n",
1956 nvme_tcp_queue_id(req->queue), rq->tag,
1957 pdu->hdr.type);
1958
1959 if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
1960 union nvme_result res = {};
1961
1962 nvme_req(rq)->flags |= NVME_REQ_CANCELLED;
1963 nvme_end_request(rq, NVME_SC_ABORT_REQ, res);
1964 return BLK_EH_DONE;
1965 }
1966
1967 /* queue error recovery */
1968 nvme_tcp_error_recovery(&ctrl->ctrl);
1969
1970 return BLK_EH_RESET_TIMER;
1971}
1972
1973static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
1974 struct request *rq)
1975{
1976 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
1977 struct nvme_tcp_cmd_pdu *pdu = req->pdu;
1978 struct nvme_command *c = &pdu->cmd;
1979
1980 c->common.flags |= NVME_CMD_SGL_METABUF;
1981
1982 if (rq_data_dir(rq) == WRITE && req->data_len &&
1983 req->data_len <= nvme_tcp_inline_data_size(queue))
1984 nvme_tcp_set_sg_inline(queue, c, req->data_len);
1985 else
1986 nvme_tcp_set_sg_host_data(c, req->data_len);
1987
1988 return 0;
1989}
1990
1991static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
1992 struct request *rq)
1993{
1994 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
1995 struct nvme_tcp_cmd_pdu *pdu = req->pdu;
1996 struct nvme_tcp_queue *queue = req->queue;
1997 u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
1998 blk_status_t ret;
1999
2000 ret = nvme_setup_cmd(ns, rq, &pdu->cmd);
2001 if (ret)
2002 return ret;
2003
2004 req->state = NVME_TCP_SEND_CMD_PDU;
2005 req->offset = 0;
2006 req->data_sent = 0;
2007 req->pdu_len = 0;
2008 req->pdu_sent = 0;
2009 req->data_len = blk_rq_payload_bytes(rq);
2010 req->curr_bio = rq->bio;
2011
2012 if (rq_data_dir(rq) == WRITE &&
2013 req->data_len <= nvme_tcp_inline_data_size(queue))
2014 req->pdu_len = req->data_len;
2015 else if (req->curr_bio)
2016 nvme_tcp_init_iter(req, READ);
2017
2018 pdu->hdr.type = nvme_tcp_cmd;
2019 pdu->hdr.flags = 0;
2020 if (queue->hdr_digest)
2021 pdu->hdr.flags |= NVME_TCP_F_HDGST;
2022 if (queue->data_digest && req->pdu_len) {
2023 pdu->hdr.flags |= NVME_TCP_F_DDGST;
2024 ddgst = nvme_tcp_ddgst_len(queue);
2025 }
2026 pdu->hdr.hlen = sizeof(*pdu);
2027 pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
2028 pdu->hdr.plen =
2029 cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
2030
2031 ret = nvme_tcp_map_data(queue, rq);
2032 if (unlikely(ret)) {
2033 dev_err(queue->ctrl->ctrl.device,
2034 "Failed to map data (%d)\n", ret);
2035 return ret;
2036 }
2037
2038 return 0;
2039}
2040
2041static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
2042 const struct blk_mq_queue_data *bd)
2043{
2044 struct nvme_ns *ns = hctx->queue->queuedata;
2045 struct nvme_tcp_queue *queue = hctx->driver_data;
2046 struct request *rq = bd->rq;
2047 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2048 bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
2049 blk_status_t ret;
2050
2051 if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
2052 return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
2053
2054 ret = nvme_tcp_setup_cmd_pdu(ns, rq);
2055 if (unlikely(ret))
2056 return ret;
2057
2058 blk_mq_start_request(rq);
2059
2060 nvme_tcp_queue_request(req);
2061
2062 return BLK_STS_OK;
2063}
2064
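/*
 * When dedicated write queues are requested, HCTX_TYPE_DEFAULT maps to the
 * write queues and HCTX_TYPE_READ to the remaining read queues; otherwise
 * both map types share the same set of mixed I/O queues.
 */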
 2065static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
2066{
2067 struct nvme_tcp_ctrl *ctrl = set->driver_data;
2068
2069 set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2070 set->map[HCTX_TYPE_READ].nr_queues = ctrl->ctrl.opts->nr_io_queues;
2071 if (ctrl->ctrl.opts->nr_write_queues) {
2072 /* separate read/write queues */
2073 set->map[HCTX_TYPE_DEFAULT].nr_queues =
2074 ctrl->ctrl.opts->nr_write_queues;
2075 set->map[HCTX_TYPE_READ].queue_offset =
2076 ctrl->ctrl.opts->nr_write_queues;
2077 } else {
2078 /* mixed read/write queues */
2079 set->map[HCTX_TYPE_DEFAULT].nr_queues =
2080 ctrl->ctrl.opts->nr_io_queues;
2081 set->map[HCTX_TYPE_READ].queue_offset = 0;
2082 }
2083 blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
2084 blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
2085 return 0;
2086}
2087
 2088static struct blk_mq_ops nvme_tcp_mq_ops = {
2089 .queue_rq = nvme_tcp_queue_rq,
2090 .complete = nvme_complete_rq,
2091 .init_request = nvme_tcp_init_request,
2092 .exit_request = nvme_tcp_exit_request,
2093 .init_hctx = nvme_tcp_init_hctx,
2094 .timeout = nvme_tcp_timeout,
 2095 .map_queues = nvme_tcp_map_queues,
 2096};
2097
2098static struct blk_mq_ops nvme_tcp_admin_mq_ops = {
2099 .queue_rq = nvme_tcp_queue_rq,
2100 .complete = nvme_complete_rq,
2101 .init_request = nvme_tcp_init_request,
2102 .exit_request = nvme_tcp_exit_request,
2103 .init_hctx = nvme_tcp_init_admin_hctx,
2104 .timeout = nvme_tcp_timeout,
2105};
2106
2107static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
2108 .name = "tcp",
2109 .module = THIS_MODULE,
2110 .flags = NVME_F_FABRICS,
2111 .reg_read32 = nvmf_reg_read32,
2112 .reg_read64 = nvmf_reg_read64,
2113 .reg_write32 = nvmf_reg_write32,
2114 .free_ctrl = nvme_tcp_free_ctrl,
2115 .submit_async_event = nvme_tcp_submit_async_event,
2116 .delete_ctrl = nvme_tcp_delete_ctrl,
2117 .get_address = nvmf_get_address,
2118 .stop_ctrl = nvme_tcp_stop_ctrl,
2119};
2120
2121static bool
2122nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
2123{
2124 struct nvme_tcp_ctrl *ctrl;
2125 bool found = false;
2126
2127 mutex_lock(&nvme_tcp_ctrl_mutex);
2128 list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
2129 found = nvmf_ip_options_match(&ctrl->ctrl, opts);
2130 if (found)
2131 break;
2132 }
2133 mutex_unlock(&nvme_tcp_ctrl_mutex);
2134
2135 return found;
2136}
2137
2138static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
2139 struct nvmf_ctrl_options *opts)
2140{
2141 struct nvme_tcp_ctrl *ctrl;
2142 int ret;
2143
2144 ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
2145 if (!ctrl)
2146 return ERR_PTR(-ENOMEM);
2147
2148 INIT_LIST_HEAD(&ctrl->list);
2149 ctrl->ctrl.opts = opts;
 2150 ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1;
 2151 ctrl->ctrl.sqsize = opts->queue_size - 1;
2152 ctrl->ctrl.kato = opts->kato;
2153
2154 INIT_DELAYED_WORK(&ctrl->connect_work,
2155 nvme_tcp_reconnect_ctrl_work);
2156 INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
2157 INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
2158
2159 if (!(opts->mask & NVMF_OPT_TRSVCID)) {
2160 opts->trsvcid =
2161 kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
2162 if (!opts->trsvcid) {
2163 ret = -ENOMEM;
2164 goto out_free_ctrl;
2165 }
2166 opts->mask |= NVMF_OPT_TRSVCID;
2167 }
2168
2169 ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2170 opts->traddr, opts->trsvcid, &ctrl->addr);
2171 if (ret) {
2172 pr_err("malformed address passed: %s:%s\n",
2173 opts->traddr, opts->trsvcid);
2174 goto out_free_ctrl;
2175 }
2176
2177 if (opts->mask & NVMF_OPT_HOST_TRADDR) {
2178 ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2179 opts->host_traddr, NULL, &ctrl->src_addr);
2180 if (ret) {
2181 pr_err("malformed src address passed: %s\n",
2182 opts->host_traddr);
2183 goto out_free_ctrl;
2184 }
2185 }
2186
2187 if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
2188 ret = -EALREADY;
2189 goto out_free_ctrl;
2190 }
2191
 2192 ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
 2193 GFP_KERNEL);
2194 if (!ctrl->queues) {
2195 ret = -ENOMEM;
2196 goto out_free_ctrl;
2197 }
2198
2199 ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
2200 if (ret)
2201 goto out_kfree_queues;
2202
2203 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
2204 WARN_ON_ONCE(1);
2205 ret = -EINTR;
2206 goto out_uninit_ctrl;
2207 }
2208
2209 ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
2210 if (ret)
2211 goto out_uninit_ctrl;
2212
2213 dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
2214 ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
2215
2216 nvme_get_ctrl(&ctrl->ctrl);
2217
2218 mutex_lock(&nvme_tcp_ctrl_mutex);
2219 list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
2220 mutex_unlock(&nvme_tcp_ctrl_mutex);
2221
2222 return &ctrl->ctrl;
2223
2224out_uninit_ctrl:
2225 nvme_uninit_ctrl(&ctrl->ctrl);
2226 nvme_put_ctrl(&ctrl->ctrl);
2227 if (ret > 0)
2228 ret = -EIO;
2229 return ERR_PTR(ret);
2230out_kfree_queues:
2231 kfree(ctrl->queues);
2232out_free_ctrl:
2233 kfree(ctrl);
2234 return ERR_PTR(ret);
2235}
2236
2237static struct nvmf_transport_ops nvme_tcp_transport = {
2238 .name = "tcp",
2239 .module = THIS_MODULE,
2240 .required_opts = NVMF_OPT_TRADDR,
2241 .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2242 NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
 2243 NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
2244 NVMF_OPT_NR_WRITE_QUEUES,
 2245 .create_ctrl = nvme_tcp_create_ctrl,
2246};
2247
2248static int __init nvme_tcp_init_module(void)
2249{
2250 nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
2251 WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
2252 if (!nvme_tcp_wq)
2253 return -ENOMEM;
2254
2255 nvmf_register_transport(&nvme_tcp_transport);
2256 return 0;
2257}
2258
2259static void __exit nvme_tcp_cleanup_module(void)
2260{
2261 struct nvme_tcp_ctrl *ctrl;
2262
2263 nvmf_unregister_transport(&nvme_tcp_transport);
2264
2265 mutex_lock(&nvme_tcp_ctrl_mutex);
2266 list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
2267 nvme_delete_ctrl(&ctrl->ctrl);
2268 mutex_unlock(&nvme_tcp_ctrl_mutex);
2269 flush_workqueue(nvme_delete_wq);
2270
2271 destroy_workqueue(nvme_tcp_wq);
2272}
2273
2274module_init(nvme_tcp_init_module);
2275module_exit(nvme_tcp_cleanup_module);
2276
2277MODULE_LICENSE("GPL v2");