blob: 60c3786bc787b1fc21009fc51f92dd1b99fe8a60 [file] [log] [blame]
Matthew Wilcoxb60503b2011-01-20 12:50:14 -05001/*
2 * NVM Express device driver
3 * Copyright (c) 2011, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc.,
16 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
17 */
18
19#include <linux/nvme.h>
20#include <linux/bio.h>
21#include <linux/blkdev.h>
22#include <linux/errno.h>
23#include <linux/fs.h>
24#include <linux/genhd.h>
25#include <linux/init.h>
26#include <linux/interrupt.h>
27#include <linux/io.h>
28#include <linux/kdev_t.h>
29#include <linux/kernel.h>
30#include <linux/mm.h>
31#include <linux/module.h>
32#include <linux/moduleparam.h>
33#include <linux/pci.h>
34#include <linux/sched.h>
35#include <linux/slab.h>
36#include <linux/types.h>
37#include <linux/version.h>
38
/* Number of entries in each I/O submission/completion queue */
#define NVME_Q_DEPTH 1024
/*
 * Size in bytes of a submission/completion ring holding @depth entries.
 * The argument is parenthesized so that expressions such as
 * SQ_SIZE(a + b) expand correctly.
 */
#define SQ_SIZE(depth)		((depth) * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		((depth) * sizeof(struct nvme_completion))
/* Minor numbers reserved for each namespace's gendisk */
#define NVME_MINORS 64
43
/* Major number assigned to every namespace disk; settable at load time. */
static int nvme_major;
module_param(nvme_major, int, 0);

/* When non-zero, queue interrupts are requested as threaded IRQs. */
static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);
49
/*
 * Represents an NVM Express device.  Each nvme_dev is a PCI function.
 */
struct nvme_dev {
	struct nvme_queue **queues;	/* indexed by queue ID; [0] is the admin queue */
	u32 __iomem *dbs;		/* doorbell registers, 4096 bytes into the BAR */
	struct pci_dev *pci_dev;
	int instance;			/* used in the "nvme%dn%d" disk name */
	int queue_count;
	u32 ctrl_config;		/* value written to the controller's CC register */
	struct msix_entry *entry;	/* indexed by a queue's cq_vector */
	struct nvme_bar __iomem *bar;	/* mapped controller registers */
	struct list_head namespaces;
	char serial[20];
	char model[40];
	char firmware_rev[8];
};
67
/*
 * An NVM Express namespace is equivalent to a SCSI LUN
 */
struct nvme_ns {
	struct list_head list;

	struct nvme_dev *dev;		/* controller this namespace belongs to */
	struct request_queue *queue;	/* block queue served by nvme_make_request */
	struct gendisk *disk;

	int ns_id;			/* namespace ID placed in each command's nsid */
	int lba_shift;			/* log2 of the LBA size in bytes */
};
81
/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct device *q_dmadev;	/* device used for DMA mapping/allocation */
	spinlock_t q_lock;		/* protects the submission ring and tail */
	struct nvme_command *sq_cmds;	/* submission ring (coherent DMA memory) */
	volatile struct nvme_completion *cqes;	/* completion ring, written by the device */
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	wait_queue_head_t sq_full;	/* woken whenever a command ID is freed */
	struct bio_list sq_cong;	/* bios that could not be submitted */
	u32 __iomem *q_db;		/* SQ tail doorbell; q_db + 1 is the CQ head doorbell */
	u16 q_depth;			/* number of entries in each ring */
	u16 cq_vector;			/* index into nvme_dev->entry[] */
	u16 sq_head;			/* last SQ head reported by a completion */
	u16 sq_tail;
	u16 cq_head;
	u16 cq_phase;			/* phase bit value that marks a new completion */
	/*
	 * BITS_TO_LONGS(q_depth) words of command-ID bitmap, followed by
	 * q_depth words holding each command's (ctx | handler) value.
	 */
	unsigned long cmdid_data[];
};
104
/*
 * Check we didn't inadvertently grow the command struct.  Every command
 * layout must be exactly 64 bytes and the identify structures and LBA
 * range descriptor must keep their fixed sizes; a mismatch breaks the build.
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
}
120
121/**
122 * alloc_cmdid - Allocate a Command ID
123 * @param nvmeq The queue that will be used for this command
124 * @param ctx A pointer that will be passed to the handler
125 * @param handler The ID of the handler to call
126 *
127 * Allocate a Command ID for a queue. The data passed in will
128 * be passed to the completion handler. This is implemented by using
129 * the bottom two bits of the ctx pointer to store the handler ID.
130 * Passing in a pointer that's not 4-byte aligned will cause a BUG.
131 * We can change this if it becomes a problem.
132 */
133static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, int handler)
134{
135 int depth = nvmeq->q_depth;
136 unsigned long data = (unsigned long)ctx | handler;
137 int cmdid;
138
139 BUG_ON((unsigned long)ctx & 3);
140
141 do {
142 cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
143 if (cmdid >= depth)
144 return -EBUSY;
145 } while (test_and_set_bit(cmdid, nvmeq->cmdid_data));
146
147 nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(depth)] = data;
148 return cmdid;
149}
150
151static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
152 int handler)
153{
154 int cmdid;
155 wait_event_killable(nvmeq->sq_full,
156 (cmdid = alloc_cmdid(nvmeq, ctx, handler)) >= 0);
157 return (cmdid < 0) ? -EINTR : cmdid;
158}
159
/* If you need more than four handlers, you'll need to change how
 * alloc_cmdid and nvme_process_cq work.  Also, aborted commands take
 * the sync_completion path (if they complete), so don't put anything
 * else in slot zero.
 *
 * These values are stored in the low two bits of the per-command data
 * word and index the completions[] table in nvme_process_cq.
 */
enum {
	sync_completion_id = 0,
	bio_completion_id,
};
169
170static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid)
171{
172 unsigned long data;
173
174 data = nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(nvmeq->q_depth)];
175 clear_bit(cmdid, nvmeq->cmdid_data);
176 wake_up(&nvmeq->sq_full);
177 return data;
178}
179
Matthew Wilcox3c0cf132011-02-04 16:03:56 -0500180static void clear_cmdid_data(struct nvme_queue *nvmeq, int cmdid)
181{
182 nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(nvmeq->q_depth)] = 0;
183}
184
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500185static struct nvme_queue *get_nvmeq(struct nvme_ns *ns)
186{
Matthew Wilcox1b234842011-01-20 13:01:49 -0500187 int qid, cpu = get_cpu();
188 if (cpu < ns->dev->queue_count)
189 qid = cpu + 1;
190 else
191 qid = (cpu % rounddown_pow_of_two(ns->dev->queue_count)) + 1;
192 return ns->dev->queues[qid];
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500193}
194
/* Counterpart of get_nvmeq(): drops the CPU reference taken by get_cpu(). */
static void put_nvmeq(struct nvme_queue *nvmeq)
{
	put_cpu();
}
199
200/**
201 * nvme_submit_cmd: Copy a command into a queue and ring the doorbell
202 * @nvmeq: The queue to use
203 * @cmd: The command to send
204 *
205 * Safe to use from interrupt context
206 */
207static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
208{
209 unsigned long flags;
210 u16 tail;
211 /* XXX: Need to check tail isn't going to overrun head */
212 spin_lock_irqsave(&nvmeq->q_lock, flags);
213 tail = nvmeq->sq_tail;
214 memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
215 writel(tail, nvmeq->q_db);
216 if (++tail == nvmeq->q_depth)
217 tail = 0;
218 nvmeq->sq_tail = tail;
219 spin_unlock_irqrestore(&nvmeq->q_lock, flags);
220
221 return 0;
222}
223
224struct nvme_req_info {
225 struct bio *bio;
226 int nents;
227 struct scatterlist sg[0];
228};
229
230/* XXX: use a mempool */
231static struct nvme_req_info *alloc_info(unsigned nseg, gfp_t gfp)
232{
233 return kmalloc(sizeof(struct nvme_req_info) +
234 sizeof(struct scatterlist) * nseg, gfp);
235}
236
/* Release a request-info structure obtained from alloc_info(). */
static void free_info(struct nvme_req_info *info)
{
	kfree(info);
}
241
242static void bio_completion(struct nvme_queue *nvmeq, void *ctx,
243 struct nvme_completion *cqe)
244{
245 struct nvme_req_info *info = ctx;
246 struct bio *bio = info->bio;
247 u16 status = le16_to_cpup(&cqe->status) >> 1;
248
249 dma_unmap_sg(nvmeq->q_dmadev, info->sg, info->nents,
250 bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
251 free_info(info);
252 bio_endio(bio, status ? -EIO : 0);
253}
254
Matthew Wilcoxff22b542011-01-26 10:02:29 -0500255/* length is in bytes */
256static void nvme_setup_prps(struct nvme_common_command *cmd,
257 struct scatterlist *sg, int length)
258{
259 int dma_len = sg_dma_len(sg);
260 u64 dma_addr = sg_dma_address(sg);
261 int offset = offset_in_page(dma_addr);
262
263 cmd->prp1 = cpu_to_le64(dma_addr);
264 length -= (PAGE_SIZE - offset);
265 if (length <= 0)
266 return;
267
268 dma_len -= (PAGE_SIZE - offset);
269 if (dma_len) {
270 dma_addr += (PAGE_SIZE - offset);
271 } else {
272 sg = sg_next(sg);
273 dma_addr = sg_dma_address(sg);
274 dma_len = sg_dma_len(sg);
275 }
276
277 if (length <= PAGE_SIZE) {
278 cmd->prp2 = cpu_to_le64(dma_addr);
279 return;
280 }
281
282 /* XXX: support PRP lists */
283}
284
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500285static int nvme_map_bio(struct device *dev, struct nvme_req_info *info,
286 struct bio *bio, enum dma_data_direction dma_dir, int psegs)
287{
288 struct bio_vec *bvec;
289 struct scatterlist *sg = info->sg;
290 int i, nsegs;
291
292 sg_init_table(sg, psegs);
293 bio_for_each_segment(bvec, bio, i) {
294 sg_set_page(sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
295 /* XXX: handle non-mergable here */
296 nsegs++;
297 }
298 info->nents = nsegs;
299
300 return dma_map_sg(dev, info->sg, info->nents, dma_dir);
301}
302
303static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
304 struct bio *bio)
305{
Matthew Wilcoxff22b542011-01-26 10:02:29 -0500306 struct nvme_command *cmnd;
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500307 struct nvme_req_info *info;
308 enum dma_data_direction dma_dir;
309 int cmdid;
310 u16 control;
311 u32 dsmgmt;
312 unsigned long flags;
313 int psegs = bio_phys_segments(ns->queue, bio);
314
315 info = alloc_info(psegs, GFP_NOIO);
316 if (!info)
317 goto congestion;
318 info->bio = bio;
319
320 cmdid = alloc_cmdid(nvmeq, info, bio_completion_id);
321 if (unlikely(cmdid < 0))
322 goto free_info;
323
324 control = 0;
325 if (bio->bi_rw & REQ_FUA)
326 control |= NVME_RW_FUA;
327 if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD))
328 control |= NVME_RW_LR;
329
330 dsmgmt = 0;
331 if (bio->bi_rw & REQ_RAHEAD)
332 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
333
334 spin_lock_irqsave(&nvmeq->q_lock, flags);
Matthew Wilcoxff22b542011-01-26 10:02:29 -0500335 cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500336
Matthew Wilcoxb8deb622011-01-26 10:08:25 -0500337 memset(cmnd, 0, sizeof(*cmnd));
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500338 if (bio_data_dir(bio)) {
Matthew Wilcoxff22b542011-01-26 10:02:29 -0500339 cmnd->rw.opcode = nvme_cmd_write;
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500340 dma_dir = DMA_TO_DEVICE;
341 } else {
Matthew Wilcoxff22b542011-01-26 10:02:29 -0500342 cmnd->rw.opcode = nvme_cmd_read;
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500343 dma_dir = DMA_FROM_DEVICE;
344 }
345
346 nvme_map_bio(nvmeq->q_dmadev, info, bio, dma_dir, psegs);
347
Matthew Wilcoxff22b542011-01-26 10:02:29 -0500348 cmnd->rw.flags = 1;
349 cmnd->rw.command_id = cmdid;
350 cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
351 nvme_setup_prps(&cmnd->common, info->sg, bio->bi_size);
352 cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9));
353 cmnd->rw.length = cpu_to_le16((bio->bi_size >> ns->lba_shift) - 1);
354 cmnd->rw.control = cpu_to_le16(control);
355 cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500356
357 writel(nvmeq->sq_tail, nvmeq->q_db);
358 if (++nvmeq->sq_tail == nvmeq->q_depth)
359 nvmeq->sq_tail = 0;
360
361 spin_unlock_irqrestore(&nvmeq->q_lock, flags);
362
363 return 0;
364
365 free_info:
366 free_info(info);
367 congestion:
368 return -EBUSY;
369}
370
371/*
372 * NB: return value of non-zero would mean that we were a stacking driver.
373 * make_request must always succeed.
374 */
375static int nvme_make_request(struct request_queue *q, struct bio *bio)
376{
377 struct nvme_ns *ns = q->queuedata;
378 struct nvme_queue *nvmeq = get_nvmeq(ns);
379
380 if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
381 blk_set_queue_congested(q, rw_is_sync(bio->bi_rw));
382 bio_list_add(&nvmeq->sq_cong, bio);
383 }
384 put_nvmeq(nvmeq);
385
386 return 0;
387}
388
/* Context shared between nvme_submit_sync_cmd() and sync_completion(). */
struct sync_cmd_info {
	struct task_struct *task;	/* submitter, woken on completion */
	u32 result;			/* result field of the completion entry */
	int status;			/* completion status with the phase bit shifted out */
};
394
395static void sync_completion(struct nvme_queue *nvmeq, void *ctx,
396 struct nvme_completion *cqe)
397{
398 struct sync_cmd_info *cmdinfo = ctx;
Matthew Wilcox3c0cf132011-02-04 16:03:56 -0500399 if (!cmdinfo)
400 return; /* Command aborted */
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500401 cmdinfo->result = le32_to_cpup(&cqe->result);
402 cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
403 wake_up_process(cmdinfo->task);
404}
405
/*
 * Signature of a completion handler: the queue the entry arrived on, the
 * context pointer given to alloc_cmdid(), and the completion entry.
 */
typedef void (*completion_fn)(struct nvme_queue *, void *,
						struct nvme_completion *);
408
/*
 * Consume every new entry in the completion ring of @nvmeq, dispatch each
 * to its completion handler, then tell the controller how far we got.
 *
 * Returns IRQ_NONE if no new entry was found, IRQ_HANDLED otherwise.
 */
static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq)
{
	u16 head, phase;

	/* Indexed by the handler ID kept in the low two bits of the data word */
	static const completion_fn completions[4] = {
		[sync_completion_id] = sync_completion,
		[bio_completion_id] = bio_completion,
	};

	head = nvmeq->cq_head;
	phase = nvmeq->cq_phase;

	for (;;) {
		unsigned long data;
		void *ptr;
		unsigned char handler;
		/* Work on a snapshot of the entry */
		struct nvme_completion cqe = nvmeq->cqes[head];
		/* An entry is new only if its phase bit matches ours */
		if ((le16_to_cpu(cqe.status) & 1) != phase)
			break;
		nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
		/* The expected phase flips every time the ring wraps */
		if (++head == nvmeq->q_depth) {
			head = 0;
			phase = !phase;
		}

		/* Split the stored word back into handler ID and context */
		data = free_cmdid(nvmeq, cqe.command_id);
		handler = data & 3;
		ptr = (void *)(data & ~3UL);
		completions[handler](nvmeq, ptr, &cqe);
	}

	/* If the controller ignores the cq head doorbell and continuously
	 * writes to the queue, it is theoretically possible to wrap around
	 * the queue twice and mistakenly return IRQ_NONE.  Linux only
	 * requires that 0.1% of your interrupts are handled, so this isn't
	 * a big problem.
	 */
	if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
		return IRQ_NONE;

	/* q_db + 1 is this queue's completion head doorbell */
	writel(head, nvmeq->q_db + 1);
	nvmeq->cq_head = head;
	nvmeq->cq_phase = phase;

	return IRQ_HANDLED;
}
455
456static irqreturn_t nvme_irq(int irq, void *data)
457{
458 return nvme_process_cq(data);
459}
460
Matthew Wilcox58ffacb2011-02-06 07:28:06 -0500461static irqreturn_t nvme_irq_thread(int irq, void *data)
462{
463 irqreturn_t result;
464 struct nvme_queue *nvmeq = data;
465 spin_lock(&nvmeq->q_lock);
466 result = nvme_process_cq(nvmeq);
467 spin_unlock(&nvmeq->q_lock);
468 return result;
469}
470
471static irqreturn_t nvme_irq_check(int irq, void *data)
472{
473 struct nvme_queue *nvmeq = data;
474 struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
475 if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
476 return IRQ_NONE;
477 return IRQ_WAKE_THREAD;
478}
479
/*
 * Forget the context of an outstanding command.  The command ID itself
 * stays allocated; only its data word is cleared, under q_lock with
 * interrupts disabled, so that a later completion is delivered to
 * sync_completion() with a NULL context.
 */
static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid)
{
	spin_lock_irq(&nvmeq->q_lock);
	clear_cmdid_data(nvmeq, cmdid);
	spin_unlock_irq(&nvmeq->q_lock);
}
486
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500487/*
488 * Returns 0 on success. If the result is negative, it's a Linux error code;
489 * if the result is positive, it's an NVM Express status code
490 */
Matthew Wilcox3c0cf132011-02-04 16:03:56 -0500491static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq,
492 struct nvme_command *cmd, u32 *result)
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500493{
494 int cmdid;
495 struct sync_cmd_info cmdinfo;
496
497 cmdinfo.task = current;
498 cmdinfo.status = -EINTR;
499
Matthew Wilcox3c0cf132011-02-04 16:03:56 -0500500 cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion_id);
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500501 if (cmdid < 0)
502 return cmdid;
503 cmd->common.command_id = cmdid;
504
Matthew Wilcox3c0cf132011-02-04 16:03:56 -0500505 set_current_state(TASK_KILLABLE);
506 nvme_submit_cmd(nvmeq, cmd);
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500507 schedule();
508
Matthew Wilcox3c0cf132011-02-04 16:03:56 -0500509 if (cmdinfo.status == -EINTR) {
510 nvme_abort_command(nvmeq, cmdid);
511 return -EINTR;
512 }
513
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500514 if (result)
515 *result = cmdinfo.result;
516
517 return cmdinfo.status;
518}
519
520static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
521 u32 *result)
522{
523 return nvme_submit_sync_cmd(dev->queues[0], cmd, result);
524}
525
526static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
527{
528 int status;
529 struct nvme_command c;
530
531 memset(&c, 0, sizeof(c));
532 c.delete_queue.opcode = opcode;
533 c.delete_queue.qid = cpu_to_le16(id);
534
535 status = nvme_submit_admin_cmd(dev, &c, NULL);
536 if (status)
537 return -EIO;
538 return 0;
539}
540
541static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
542 struct nvme_queue *nvmeq)
543{
544 int status;
545 struct nvme_command c;
546 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
547
548 memset(&c, 0, sizeof(c));
549 c.create_cq.opcode = nvme_admin_create_cq;
550 c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
551 c.create_cq.cqid = cpu_to_le16(qid);
552 c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
553 c.create_cq.cq_flags = cpu_to_le16(flags);
554 c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
555
556 status = nvme_submit_admin_cmd(dev, &c, NULL);
557 if (status)
558 return -EIO;
559 return 0;
560}
561
562static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
563 struct nvme_queue *nvmeq)
564{
565 int status;
566 struct nvme_command c;
567 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
568
569 memset(&c, 0, sizeof(c));
570 c.create_sq.opcode = nvme_admin_create_sq;
571 c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
572 c.create_sq.sqid = cpu_to_le16(qid);
573 c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
574 c.create_sq.sq_flags = cpu_to_le16(flags);
575 c.create_sq.cqid = cpu_to_le16(qid);
576
577 status = nvme_submit_admin_cmd(dev, &c, NULL);
578 if (status)
579 return -EIO;
580 return 0;
581}
582
/* Delete completion queue @cqid on the controller; 0 or -EIO. */
static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}
587
/* Delete submission queue @sqid on the controller; 0 or -EIO. */
static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}
592
593static void nvme_free_queue(struct nvme_dev *dev, int qid)
594{
595 struct nvme_queue *nvmeq = dev->queues[qid];
596
597 free_irq(dev->entry[nvmeq->cq_vector].vector, nvmeq);
598
599 /* Don't tell the adapter to delete the admin queue */
600 if (qid) {
601 adapter_delete_sq(dev, qid);
602 adapter_delete_cq(dev, qid);
603 }
604
605 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
606 (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
607 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
608 nvmeq->sq_cmds, nvmeq->sq_dma_addr);
609 kfree(nvmeq);
610}
611
612static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
613 int depth, int vector)
614{
615 struct device *dmadev = &dev->pci_dev->dev;
616 unsigned extra = (depth + BITS_TO_LONGS(depth)) * sizeof(long);
617 struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
618 if (!nvmeq)
619 return NULL;
620
621 nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
622 &nvmeq->cq_dma_addr, GFP_KERNEL);
623 if (!nvmeq->cqes)
624 goto free_nvmeq;
625 memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));
626
627 nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
628 &nvmeq->sq_dma_addr, GFP_KERNEL);
629 if (!nvmeq->sq_cmds)
630 goto free_cqdma;
631
632 nvmeq->q_dmadev = dmadev;
633 spin_lock_init(&nvmeq->q_lock);
634 nvmeq->cq_head = 0;
Matthew Wilcox82123462011-01-20 13:24:06 -0500635 nvmeq->cq_phase = 1;
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500636 init_waitqueue_head(&nvmeq->sq_full);
637 bio_list_init(&nvmeq->sq_cong);
638 nvmeq->q_db = &dev->dbs[qid * 2];
639 nvmeq->q_depth = depth;
640 nvmeq->cq_vector = vector;
641
642 return nvmeq;
643
644 free_cqdma:
645 dma_free_coherent(dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes,
646 nvmeq->cq_dma_addr);
647 free_nvmeq:
648 kfree(nvmeq);
649 return NULL;
650}
651
Matthew Wilcox30010822011-01-20 09:10:15 -0500652static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
653 const char *name)
654{
Matthew Wilcox58ffacb2011-02-06 07:28:06 -0500655 if (use_threaded_interrupts)
656 return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
657 nvme_irq_check, nvme_irq_thread,
658 IRQF_DISABLED | IRQF_SHARED,
659 name, nvmeq);
Matthew Wilcox30010822011-01-20 09:10:15 -0500660 return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
661 IRQF_DISABLED | IRQF_SHARED, name, nvmeq);
662}
663
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500664static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev,
665 int qid, int cq_size, int vector)
666{
667 int result;
668 struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector);
669
Matthew Wilcox3f85d502011-02-01 08:39:04 -0500670 if (!nvmeq)
671 return NULL;
672
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500673 result = adapter_alloc_cq(dev, qid, nvmeq);
674 if (result < 0)
675 goto free_nvmeq;
676
677 result = adapter_alloc_sq(dev, qid, nvmeq);
678 if (result < 0)
679 goto release_cq;
680
Matthew Wilcox30010822011-01-20 09:10:15 -0500681 result = queue_request_irq(dev, nvmeq, "nvme");
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500682 if (result < 0)
683 goto release_sq;
684
685 return nvmeq;
686
687 release_sq:
688 adapter_delete_sq(dev, qid);
689 release_cq:
690 adapter_delete_cq(dev, qid);
691 free_nvmeq:
692 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
693 (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
694 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
695 nvmeq->sq_cmds, nvmeq->sq_dma_addr);
696 kfree(nvmeq);
697 return NULL;
698}
699
700static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
701{
702 int result;
703 u32 aqa;
704 struct nvme_queue *nvmeq;
705
706 dev->dbs = ((void __iomem *)dev->bar) + 4096;
707
708 nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
Matthew Wilcox3f85d502011-02-01 08:39:04 -0500709 if (!nvmeq)
710 return -ENOMEM;
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500711
712 aqa = nvmeq->q_depth - 1;
713 aqa |= aqa << 16;
714
715 dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM;
716 dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
717 dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
718
Shane Michael Matthews5911f202011-02-01 11:31:55 -0500719 writel(0, &dev->bar->cc);
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500720 writel(aqa, &dev->bar->aqa);
721 writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
722 writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
723 writel(dev->ctrl_config, &dev->bar->cc);
724
725 while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) {
726 msleep(100);
727 if (fatal_signal_pending(current))
728 return -EINTR;
729 }
730
Matthew Wilcox30010822011-01-20 09:10:15 -0500731 result = queue_request_irq(dev, nvmeq, "nvme admin");
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500732 dev->queues[0] = nvmeq;
733 return result;
734}
735
Matthew Wilcox7fc3cda2011-01-26 17:05:50 -0500736static int nvme_map_user_pages(struct nvme_dev *dev, int write,
737 unsigned long addr, unsigned length,
738 struct scatterlist **sgp)
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500739{
Matthew Wilcox36c14ed2011-01-24 07:52:07 -0500740 int i, err, count, nents, offset;
Matthew Wilcox7fc3cda2011-01-26 17:05:50 -0500741 struct scatterlist *sg;
742 struct page **pages;
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500743
Matthew Wilcox36c14ed2011-01-24 07:52:07 -0500744 if (addr & 3)
745 return -EINVAL;
Matthew Wilcox7fc3cda2011-01-26 17:05:50 -0500746 if (!length)
747 return -EINVAL;
748
Matthew Wilcox36c14ed2011-01-24 07:52:07 -0500749 offset = offset_in_page(addr);
Matthew Wilcox7fc3cda2011-01-26 17:05:50 -0500750 count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
751 pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
Matthew Wilcox36c14ed2011-01-24 07:52:07 -0500752
753 err = get_user_pages_fast(addr, count, 1, pages);
754 if (err < count) {
755 count = err;
756 err = -EFAULT;
757 goto put_pages;
758 }
Matthew Wilcox7fc3cda2011-01-26 17:05:50 -0500759
760 sg = kcalloc(count, sizeof(*sg), GFP_KERNEL);
Matthew Wilcox36c14ed2011-01-24 07:52:07 -0500761 sg_init_table(sg, count);
Matthew Wilcoxff22b542011-01-26 10:02:29 -0500762 sg_set_page(&sg[0], pages[0], PAGE_SIZE - offset, offset);
Matthew Wilcox7fc3cda2011-01-26 17:05:50 -0500763 length -= (PAGE_SIZE - offset);
764 for (i = 1; i < count; i++) {
765 sg_set_page(&sg[i], pages[i], min_t(int, length, PAGE_SIZE), 0);
766 length -= PAGE_SIZE;
767 }
768
769 err = -ENOMEM;
770 nents = dma_map_sg(&dev->pci_dev->dev, sg, count,
771 write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
Matthew Wilcox36c14ed2011-01-24 07:52:07 -0500772 if (!nents)
773 goto put_pages;
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500774
Matthew Wilcox7fc3cda2011-01-26 17:05:50 -0500775 kfree(pages);
776 *sgp = sg;
777 return nents;
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500778
Matthew Wilcox36c14ed2011-01-24 07:52:07 -0500779 put_pages:
780 for (i = 0; i < count; i++)
781 put_page(pages[i]);
Matthew Wilcox7fc3cda2011-01-26 17:05:50 -0500782 kfree(pages);
Matthew Wilcox36c14ed2011-01-24 07:52:07 -0500783 return err;
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500784}
785
Matthew Wilcox7fc3cda2011-01-26 17:05:50 -0500786static void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
787 unsigned long addr, int length,
788 struct scatterlist *sg, int nents)
789{
790 int i, count;
791
792 count = DIV_ROUND_UP(offset_in_page(addr) + length, PAGE_SIZE);
793 dma_unmap_sg(&dev->pci_dev->dev, sg, nents, DMA_FROM_DEVICE);
794
795 for (i = 0; i < count; i++)
796 put_page(sg_page(&sg[i]));
797}
798
799static int nvme_submit_user_admin_command(struct nvme_dev *dev,
800 unsigned long addr, unsigned length,
801 struct nvme_command *cmd)
802{
803 int err, nents;
804 struct scatterlist *sg;
805
806 nents = nvme_map_user_pages(dev, 0, addr, length, &sg);
807 if (nents < 0)
808 return nents;
809 nvme_setup_prps(&cmd->common, sg, length);
810 err = nvme_submit_admin_cmd(dev, cmd, NULL);
811 nvme_unmap_user_pages(dev, 0, addr, length, sg, nents);
812 return err ? -EIO : 0;
813}
814
Matthew Wilcoxbd38c552011-01-26 14:34:32 -0500815static int nvme_identify(struct nvme_ns *ns, unsigned long addr, int cns)
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500816{
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500817 struct nvme_command c;
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500818
Matthew Wilcoxbd38c552011-01-26 14:34:32 -0500819 memset(&c, 0, sizeof(c));
820 c.identify.opcode = nvme_admin_identify;
821 c.identify.nsid = cns ? 0 : cpu_to_le32(ns->ns_id);
822 c.identify.cns = cpu_to_le32(cns);
823
824 return nvme_submit_user_admin_command(ns->dev, addr, 4096, &c);
825}
826
827static int nvme_get_range_type(struct nvme_ns *ns, unsigned long addr)
828{
829 struct nvme_command c;
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500830
831 memset(&c, 0, sizeof(c));
832 c.features.opcode = nvme_admin_get_features;
833 c.features.nsid = cpu_to_le32(ns->ns_id);
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500834 c.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE);
835
Matthew Wilcoxbd38c552011-01-26 14:34:32 -0500836 return nvme_submit_user_admin_command(ns->dev, addr, 4096, &c);
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500837}
838
Matthew Wilcoxa53295b2011-02-01 16:13:29 -0500839static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
840{
841 struct nvme_dev *dev = ns->dev;
842 struct nvme_queue *nvmeq;
843 struct nvme_user_io io;
844 struct nvme_command c;
845 unsigned length;
846 u32 result;
847 int nents, status;
848 struct scatterlist *sg;
849
850 if (copy_from_user(&io, uio, sizeof(io)))
851 return -EFAULT;
852 length = io.nblocks << io.block_shift;
853 nents = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length, &sg);
854 if (nents < 0)
855 return nents;
856
857 memset(&c, 0, sizeof(c));
858 c.rw.opcode = io.opcode;
859 c.rw.flags = io.flags;
860 c.rw.nsid = cpu_to_le32(io.nsid);
861 c.rw.slba = cpu_to_le64(io.slba);
862 c.rw.length = cpu_to_le16(io.nblocks - 1);
863 c.rw.control = cpu_to_le16(io.control);
864 c.rw.dsmgmt = cpu_to_le16(io.dsmgmt);
865 c.rw.reftag = cpu_to_le32(io.reftag); /* XXX: endian? */
866 c.rw.apptag = cpu_to_le16(io.apptag);
867 c.rw.appmask = cpu_to_le16(io.appmask);
868 /* XXX: metadata */
869 nvme_setup_prps(&c.common, sg, length);
870
871 nvmeq = get_nvmeq(ns);
Matthew Wilcoxb1ad37e2011-02-04 16:14:30 -0500872 /* Since nvme_submit_sync_cmd sleeps, we can't keep preemption
873 * disabled. We may be preempted at any point, and be rescheduled
874 * to a different CPU. That will cause cacheline bouncing, but no
875 * additional races since q_lock already protects against other CPUs.
876 */
Matthew Wilcoxa53295b2011-02-01 16:13:29 -0500877 put_nvmeq(nvmeq);
Matthew Wilcoxb1ad37e2011-02-04 16:14:30 -0500878 status = nvme_submit_sync_cmd(nvmeq, &c, &result);
Matthew Wilcoxa53295b2011-02-01 16:13:29 -0500879
880 nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg, nents);
881 put_user(result, &uio->result);
882 return status;
883}
884
Matthew Wilcox6ee44cd2011-02-03 10:58:26 -0500885static int nvme_download_firmware(struct nvme_ns *ns,
886 struct nvme_dlfw __user *udlfw)
887{
888 struct nvme_dev *dev = ns->dev;
889 struct nvme_dlfw dlfw;
890 struct nvme_command c;
891 int nents, status;
892 struct scatterlist *sg;
893
894 if (copy_from_user(&dlfw, udlfw, sizeof(dlfw)))
895 return -EFAULT;
896 if (dlfw.length >= (1 << 30))
897 return -EINVAL;
898
899 nents = nvme_map_user_pages(dev, 1, dlfw.addr, dlfw.length * 4, &sg);
900 if (nents < 0)
901 return nents;
902
903 memset(&c, 0, sizeof(c));
904 c.dlfw.opcode = nvme_admin_download_fw;
905 c.dlfw.numd = cpu_to_le32(dlfw.length);
906 c.dlfw.offset = cpu_to_le32(dlfw.offset);
907 nvme_setup_prps(&c.common, sg, dlfw.length * 4);
908
909 status = nvme_submit_admin_cmd(dev, &c, NULL);
910 nvme_unmap_user_pages(dev, 0, dlfw.addr, dlfw.length * 4, sg, nents);
911 return status;
912}
913
914static int nvme_activate_firmware(struct nvme_ns *ns, unsigned long arg)
915{
916 struct nvme_dev *dev = ns->dev;
917 struct nvme_command c;
918
919 memset(&c, 0, sizeof(c));
920 c.common.opcode = nvme_admin_activate_fw;
921 c.common.rsvd10[0] = cpu_to_le32(arg);
922
923 return nvme_submit_admin_cmd(dev, &c, NULL);
924}
925
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500926static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
927 unsigned long arg)
928{
929 struct nvme_ns *ns = bdev->bd_disk->private_data;
930
931 switch (cmd) {
932 case NVME_IOCTL_IDENTIFY_NS:
Matthew Wilcox36c14ed2011-01-24 07:52:07 -0500933 return nvme_identify(ns, arg, 0);
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500934 case NVME_IOCTL_IDENTIFY_CTRL:
Matthew Wilcox36c14ed2011-01-24 07:52:07 -0500935 return nvme_identify(ns, arg, 1);
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500936 case NVME_IOCTL_GET_RANGE_TYPE:
Matthew Wilcoxbd38c552011-01-26 14:34:32 -0500937 return nvme_get_range_type(ns, arg);
Matthew Wilcoxa53295b2011-02-01 16:13:29 -0500938 case NVME_IOCTL_SUBMIT_IO:
939 return nvme_submit_io(ns, (void __user *)arg);
Matthew Wilcox6ee44cd2011-02-03 10:58:26 -0500940 case NVME_IOCTL_DOWNLOAD_FW:
941 return nvme_download_firmware(ns, (void __user *)arg);
942 case NVME_IOCTL_ACTIVATE_FW:
943 return nvme_activate_firmware(ns, arg);
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500944 default:
945 return -ENOTTY;
946 }
947}
948
949static const struct block_device_operations nvme_fops = {
950 .owner = THIS_MODULE,
951 .ioctl = nvme_ioctl,
952};
953
954static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int index,
955 struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
956{
957 struct nvme_ns *ns;
958 struct gendisk *disk;
959 int lbaf;
960
961 if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
962 return NULL;
963
964 ns = kzalloc(sizeof(*ns), GFP_KERNEL);
965 if (!ns)
966 return NULL;
967 ns->queue = blk_alloc_queue(GFP_KERNEL);
968 if (!ns->queue)
969 goto out_free_ns;
970 ns->queue->queue_flags = QUEUE_FLAG_DEFAULT | QUEUE_FLAG_NOMERGES |
971 QUEUE_FLAG_NONROT | QUEUE_FLAG_DISCARD;
972 blk_queue_make_request(ns->queue, nvme_make_request);
973 ns->dev = dev;
974 ns->queue->queuedata = ns;
975
976 disk = alloc_disk(NVME_MINORS);
977 if (!disk)
978 goto out_free_queue;
979 ns->ns_id = index;
980 ns->disk = disk;
981 lbaf = id->flbas & 0xf;
982 ns->lba_shift = id->lbaf[lbaf].ds;
983
984 disk->major = nvme_major;
985 disk->minors = NVME_MINORS;
986 disk->first_minor = NVME_MINORS * index;
987 disk->fops = &nvme_fops;
988 disk->private_data = ns;
989 disk->queue = ns->queue;
Matthew Wilcox388f0372011-02-01 12:49:38 -0500990 disk->driverfs_dev = &dev->pci_dev->dev;
Matthew Wilcoxb60503b2011-01-20 12:50:14 -0500991 sprintf(disk->disk_name, "nvme%dn%d", dev->instance, index);
992 set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
993
994 return ns;
995
996 out_free_queue:
997 blk_cleanup_queue(ns->queue);
998 out_free_ns:
999 kfree(ns);
1000 return NULL;
1001}
1002
1003static void nvme_ns_free(struct nvme_ns *ns)
1004{
1005 put_disk(ns->disk);
1006 blk_cleanup_queue(ns->queue);
1007 kfree(ns);
1008}
1009
Matthew Wilcoxb3b06812011-01-20 09:14:34 -05001010static int set_queue_count(struct nvme_dev *dev, int count)
Matthew Wilcoxb60503b2011-01-20 12:50:14 -05001011{
1012 int status;
1013 u32 result;
1014 struct nvme_command c;
Matthew Wilcoxb3b06812011-01-20 09:14:34 -05001015 u32 q_count = (count - 1) | ((count - 1) << 16);
Matthew Wilcoxb60503b2011-01-20 12:50:14 -05001016
1017 memset(&c, 0, sizeof(c));
1018 c.features.opcode = nvme_admin_get_features;
1019 c.features.fid = cpu_to_le32(NVME_FEAT_NUM_QUEUES);
1020 c.features.dword11 = cpu_to_le32(q_count);
1021
1022 status = nvme_submit_admin_cmd(dev, &c, &result);
1023 if (status)
1024 return -EIO;
1025 return min(result & 0xffff, result >> 16) + 1;
1026}
1027
Matthew Wilcoxb60503b2011-01-20 12:50:14 -05001028static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
1029{
Matthew Wilcox1b234842011-01-20 13:01:49 -05001030 int result, cpu, i, nr_queues;
Matthew Wilcoxb60503b2011-01-20 12:50:14 -05001031
Matthew Wilcox1b234842011-01-20 13:01:49 -05001032 nr_queues = num_online_cpus();
1033 result = set_queue_count(dev, nr_queues);
1034 if (result < 0)
1035 return result;
1036 if (result < nr_queues)
1037 nr_queues = result;
Matthew Wilcoxb60503b2011-01-20 12:50:14 -05001038
Matthew Wilcox1b234842011-01-20 13:01:49 -05001039 /* Deregister the admin queue's interrupt */
1040 free_irq(dev->entry[0].vector, dev->queues[0]);
1041
1042 for (i = 0; i < nr_queues; i++)
1043 dev->entry[i].entry = i;
1044 for (;;) {
1045 result = pci_enable_msix(dev->pci_dev, dev->entry, nr_queues);
1046 if (result == 0) {
1047 break;
1048 } else if (result > 0) {
1049 nr_queues = result;
1050 continue;
1051 } else {
1052 nr_queues = 1;
1053 break;
1054 }
1055 }
1056
1057 result = queue_request_irq(dev, dev->queues[0], "nvme admin");
1058 /* XXX: handle failure here */
1059
1060 cpu = cpumask_first(cpu_online_mask);
1061 for (i = 0; i < nr_queues; i++) {
1062 irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
1063 cpu = cpumask_next(cpu, cpu_online_mask);
1064 }
1065
1066 for (i = 0; i < nr_queues; i++) {
1067 dev->queues[i + 1] = nvme_create_queue(dev, i + 1,
1068 NVME_Q_DEPTH, i);
1069 if (!dev->queues[i + 1])
1070 return -ENOMEM;
1071 dev->queue_count++;
1072 }
Matthew Wilcoxb60503b2011-01-20 12:50:14 -05001073
1074 return 0;
1075}
1076
1077static void nvme_free_queues(struct nvme_dev *dev)
1078{
1079 int i;
1080
1081 for (i = dev->queue_count - 1; i >= 0; i--)
1082 nvme_free_queue(dev, i);
1083}
1084
1085static int __devinit nvme_dev_add(struct nvme_dev *dev)
1086{
1087 int res, nn, i;
1088 struct nvme_ns *ns, *next;
Matthew Wilcox51814232011-02-01 16:18:08 -05001089 struct nvme_id_ctrl *ctrl;
Matthew Wilcoxb60503b2011-01-20 12:50:14 -05001090 void *id;
1091 dma_addr_t dma_addr;
1092 struct nvme_command cid, crt;
1093
1094 res = nvme_setup_io_queues(dev);
1095 if (res)
1096 return res;
1097
1098 /* XXX: Switch to a SG list once prp2 works */
1099 id = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr,
1100 GFP_KERNEL);
1101
1102 memset(&cid, 0, sizeof(cid));
1103 cid.identify.opcode = nvme_admin_identify;
1104 cid.identify.nsid = 0;
1105 cid.identify.prp1 = cpu_to_le64(dma_addr);
1106 cid.identify.cns = cpu_to_le32(1);
1107
1108 res = nvme_submit_admin_cmd(dev, &cid, NULL);
1109 if (res) {
1110 res = -EIO;
1111 goto out_free;
1112 }
1113
Matthew Wilcox51814232011-02-01 16:18:08 -05001114 ctrl = id;
1115 nn = le32_to_cpup(&ctrl->nn);
1116 memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
1117 memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
1118 memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
Matthew Wilcoxb60503b2011-01-20 12:50:14 -05001119
1120 cid.identify.cns = 0;
1121 memset(&crt, 0, sizeof(crt));
1122 crt.features.opcode = nvme_admin_get_features;
1123 crt.features.prp1 = cpu_to_le64(dma_addr + 4096);
1124 crt.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE);
1125
1126 for (i = 0; i < nn; i++) {
1127 cid.identify.nsid = cpu_to_le32(i);
1128 res = nvme_submit_admin_cmd(dev, &cid, NULL);
1129 if (res)
1130 continue;
1131
1132 if (((struct nvme_id_ns *)id)->ncap == 0)
1133 continue;
1134
1135 crt.features.nsid = cpu_to_le32(i);
1136 res = nvme_submit_admin_cmd(dev, &crt, NULL);
1137 if (res)
1138 continue;
1139
1140 ns = nvme_alloc_ns(dev, i, id, id + 4096);
1141 if (ns)
1142 list_add_tail(&ns->list, &dev->namespaces);
1143 }
1144 list_for_each_entry(ns, &dev->namespaces, list)
1145 add_disk(ns->disk);
1146
1147 dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr);
1148 return 0;
1149
1150 out_free:
1151 list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
1152 list_del(&ns->list);
1153 nvme_ns_free(ns);
1154 }
1155
1156 dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr);
1157 return res;
1158}
1159
1160static int nvme_dev_remove(struct nvme_dev *dev)
1161{
1162 struct nvme_ns *ns, *next;
1163
1164 /* TODO: wait all I/O finished or cancel them */
1165
1166 list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
1167 list_del(&ns->list);
1168 del_gendisk(ns->disk);
1169 nvme_ns_free(ns);
1170 }
1171
1172 nvme_free_queues(dev);
1173
1174 return 0;
1175}
1176
1177/* XXX: Use an ida or something to let remove / add work correctly */
1178static void nvme_set_instance(struct nvme_dev *dev)
1179{
1180 static int instance;
1181 dev->instance = instance++;
1182}
1183
/*
 * Counterpart of nvme_set_instance().  Currently a no-op: instance numbers
 * are never handed back.
 */
static void nvme_release_instance(struct nvme_dev *dev)
{
	/* nothing to release */
}
1187
1188static int __devinit nvme_probe(struct pci_dev *pdev,
1189 const struct pci_device_id *id)
1190{
Matthew Wilcox574e8b92011-02-01 16:24:35 -05001191 int bars, result = -ENOMEM;
Matthew Wilcoxb60503b2011-01-20 12:50:14 -05001192 struct nvme_dev *dev;
1193
1194 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
1195 if (!dev)
1196 return -ENOMEM;
1197 dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry),
1198 GFP_KERNEL);
1199 if (!dev->entry)
1200 goto free;
Matthew Wilcox1b234842011-01-20 13:01:49 -05001201 dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *),
1202 GFP_KERNEL);
Matthew Wilcoxb60503b2011-01-20 12:50:14 -05001203 if (!dev->queues)
1204 goto free;
1205
Shane Michael Matthews0ee5a7d2011-02-01 08:49:30 -05001206 if (pci_enable_device_mem(pdev))
1207 goto free;
Matthew Wilcoxf64d3362011-02-01 09:01:59 -05001208 pci_set_master(pdev);
Matthew Wilcox574e8b92011-02-01 16:24:35 -05001209 bars = pci_select_bars(pdev, IORESOURCE_MEM);
1210 if (pci_request_selected_regions(pdev, bars, "nvme"))
1211 goto disable;
Shane Michael Matthews0ee5a7d2011-02-01 08:49:30 -05001212
Matthew Wilcoxb60503b2011-01-20 12:50:14 -05001213 INIT_LIST_HEAD(&dev->namespaces);
1214 dev->pci_dev = pdev;
1215 pci_set_drvdata(pdev, dev);
Matthew Wilcox29303532011-02-01 16:23:39 -05001216 dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
1217 dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
Matthew Wilcoxb60503b2011-01-20 12:50:14 -05001218 nvme_set_instance(dev);
Matthew Wilcox53c95772011-01-20 13:42:34 -05001219 dev->entry[0].vector = pdev->irq;
Matthew Wilcoxb60503b2011-01-20 12:50:14 -05001220
1221 dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
1222 if (!dev->bar) {
1223 result = -ENOMEM;
Matthew Wilcox574e8b92011-02-01 16:24:35 -05001224 goto disable_msix;
Matthew Wilcoxb60503b2011-01-20 12:50:14 -05001225 }
1226
1227 result = nvme_configure_admin_queue(dev);
1228 if (result)
1229 goto unmap;
1230 dev->queue_count++;
1231
1232 result = nvme_dev_add(dev);
1233 if (result)
1234 goto delete;
1235 return 0;
1236
1237 delete:
1238 nvme_free_queues(dev);
1239 unmap:
1240 iounmap(dev->bar);
Matthew Wilcox574e8b92011-02-01 16:24:35 -05001241 disable_msix:
Matthew Wilcoxb60503b2011-01-20 12:50:14 -05001242 pci_disable_msix(pdev);
1243 nvme_release_instance(dev);
Matthew Wilcox574e8b92011-02-01 16:24:35 -05001244 disable:
Shane Michael Matthews0ee5a7d2011-02-01 08:49:30 -05001245 pci_disable_device(pdev);
Matthew Wilcox574e8b92011-02-01 16:24:35 -05001246 pci_release_regions(pdev);
Matthew Wilcoxb60503b2011-01-20 12:50:14 -05001247 free:
1248 kfree(dev->queues);
1249 kfree(dev->entry);
1250 kfree(dev);
1251 return result;
1252}
1253
1254static void __devexit nvme_remove(struct pci_dev *pdev)
1255{
1256 struct nvme_dev *dev = pci_get_drvdata(pdev);
1257 nvme_dev_remove(dev);
1258 pci_disable_msix(pdev);
1259 iounmap(dev->bar);
1260 nvme_release_instance(dev);
Shane Michael Matthews0ee5a7d2011-02-01 08:49:30 -05001261 pci_disable_device(pdev);
Matthew Wilcox574e8b92011-02-01 16:24:35 -05001262 pci_release_regions(pdev);
Matthew Wilcoxb60503b2011-01-20 12:50:14 -05001263 kfree(dev->queues);
1264 kfree(dev->entry);
1265 kfree(dev);
1266}
1267
1268/* These functions are yet to be implemented */
1269#define nvme_error_detected NULL
1270#define nvme_dump_registers NULL
1271#define nvme_link_reset NULL
1272#define nvme_slot_reset NULL
1273#define nvme_error_resume NULL
1274#define nvme_suspend NULL
1275#define nvme_resume NULL
1276
1277static struct pci_error_handlers nvme_err_handler = {
1278 .error_detected = nvme_error_detected,
1279 .mmio_enabled = nvme_dump_registers,
1280 .link_reset = nvme_link_reset,
1281 .slot_reset = nvme_slot_reset,
1282 .resume = nvme_error_resume,
1283};
1284
1285/* Move to pci_ids.h later */
1286#define PCI_CLASS_STORAGE_EXPRESS 0x010802
1287
1288static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = {
1289 { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
1290 { 0, }
1291};
1292MODULE_DEVICE_TABLE(pci, nvme_id_table);
1293
1294static struct pci_driver nvme_driver = {
1295 .name = "nvme",
1296 .id_table = nvme_id_table,
1297 .probe = nvme_probe,
1298 .remove = __devexit_p(nvme_remove),
1299 .suspend = nvme_suspend,
1300 .resume = nvme_resume,
1301 .err_handler = &nvme_err_handler,
1302};
1303
1304static int __init nvme_init(void)
1305{
1306 int result;
1307
1308 nvme_major = register_blkdev(nvme_major, "nvme");
1309 if (nvme_major <= 0)
1310 return -EBUSY;
1311
1312 result = pci_register_driver(&nvme_driver);
1313 if (!result)
1314 return 0;
1315
1316 unregister_blkdev(nvme_major, "nvme");
1317 return result;
1318}
1319
1320static void __exit nvme_exit(void)
1321{
1322 pci_unregister_driver(&nvme_driver);
1323 unregister_blkdev(nvme_major, "nvme");
1324}
1325
1326MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
1327MODULE_LICENSE("GPL");
Matthew Wilcoxdb5d0c12011-02-03 14:36:07 -05001328MODULE_VERSION("0.2");
Matthew Wilcoxb60503b2011-01-20 12:50:14 -05001329module_init(nvme_init);
1330module_exit(nvme_exit);