blob: b179f5a357f66b43bdfce849051064741a9e1b38 [file] [log] [blame]
/*
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */
13
14#include <linux/device.h>
15#include <linux/eventfd.h>
16#include <linux/interrupt.h>
17#include <linux/iommu.h>
18#include <linux/module.h>
19#include <linux/mutex.h>
20#include <linux/notifier.h>
21#include <linux/pci.h>
22#include <linux/pm_runtime.h>
23#include <linux/slab.h>
24#include <linux/types.h>
25#include <linux/uaccess.h>
26#include <linux/vfio.h>
27
28#include "vfio_pci_private.h"
29
30#define DRIVER_VERSION "0.2"
31#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
32#define DRIVER_DESC "VFIO PCI - User Level meta-driver"
33
34static bool nointxmask;
35module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
36MODULE_PARM_DESC(nointxmask,
37 "Disable support for PCI 2.3 style INTx masking. If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");
38
39static int vfio_pci_enable(struct vfio_pci_device *vdev)
40{
41 struct pci_dev *pdev = vdev->pdev;
42 int ret;
43 u16 cmd;
44 u8 msix_pos;
45
46 vdev->reset_works = (pci_reset_function(pdev) == 0);
47 pci_save_state(pdev);
48 vdev->pci_saved_state = pci_store_saved_state(pdev);
49 if (!vdev->pci_saved_state)
50 pr_debug("%s: Couldn't store %s saved state\n",
51 __func__, dev_name(&pdev->dev));
52
53 ret = vfio_config_init(vdev);
54 if (ret)
55 goto out;
56
57 if (likely(!nointxmask))
58 vdev->pci_2_3 = pci_intx_mask_supported(pdev);
59
60 pci_read_config_word(pdev, PCI_COMMAND, &cmd);
61 if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
62 cmd &= ~PCI_COMMAND_INTX_DISABLE;
63 pci_write_config_word(pdev, PCI_COMMAND, cmd);
64 }
65
66 msix_pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
67 if (msix_pos) {
68 u16 flags;
69 u32 table;
70
71 pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
72 pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);
73
74 vdev->msix_bar = table & PCI_MSIX_FLAGS_BIRMASK;
75 vdev->msix_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
76 vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
77 } else
78 vdev->msix_bar = 0xFF;
79
80 ret = pci_enable_device(pdev);
81 if (ret)
82 goto out;
83
84 return ret;
85
86out:
87 kfree(vdev->pci_saved_state);
88 vdev->pci_saved_state = NULL;
89 vfio_config_free(vdev);
90 return ret;
91}
92
93static void vfio_pci_disable(struct vfio_pci_device *vdev)
94{
Alex Williamson20077222012-12-07 13:43:50 -070095 struct pci_dev *pdev = vdev->pdev;
Alex Williamson89e1f7d2012-07-31 08:16:24 -060096 int bar;
97
Alex Williamson20077222012-12-07 13:43:50 -070098 pci_disable_device(pdev);
Alex Williamson89e1f7d2012-07-31 08:16:24 -060099
100 vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
101 VFIO_IRQ_SET_ACTION_TRIGGER,
102 vdev->irq_type, 0, 0, NULL);
103
104 vdev->virq_disabled = false;
105
106 vfio_config_free(vdev);
107
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600108 for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
109 if (!vdev->barmap[bar])
110 continue;
Alex Williamson20077222012-12-07 13:43:50 -0700111 pci_iounmap(pdev, vdev->barmap[bar]);
112 pci_release_selected_regions(pdev, 1 << bar);
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600113 vdev->barmap[bar] = NULL;
114 }
Alex Williamson20077222012-12-07 13:43:50 -0700115
116 /*
117 * If we have saved state, restore it. If we can reset the device,
118 * even better. Resetting with current state seems better than
119 * nothing, but saving and restoring current state without reset
120 * is just busy work.
121 */
122 if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
123 pr_info("%s: Couldn't reload %s saved state\n",
124 __func__, dev_name(&pdev->dev));
125
126 if (!vdev->reset_works)
127 return;
128
129 pci_save_state(pdev);
130 }
131
132 /*
133 * Disable INTx and MSI, presumably to avoid spurious interrupts
134 * during reset. Stolen from pci_reset_function()
135 */
136 pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
137
138 if (vdev->reset_works)
139 __pci_reset_function(pdev);
140
141 pci_restore_state(pdev);
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600142}
143
144static void vfio_pci_release(void *device_data)
145{
146 struct vfio_pci_device *vdev = device_data;
147
148 if (atomic_dec_and_test(&vdev->refcnt))
149 vfio_pci_disable(vdev);
150
151 module_put(THIS_MODULE);
152}
153
154static int vfio_pci_open(void *device_data)
155{
156 struct vfio_pci_device *vdev = device_data;
157
158 if (!try_module_get(THIS_MODULE))
159 return -ENODEV;
160
161 if (atomic_inc_return(&vdev->refcnt) == 1) {
162 int ret = vfio_pci_enable(vdev);
163 if (ret) {
164 module_put(THIS_MODULE);
165 return ret;
166 }
167 }
168
169 return 0;
170}
171
172static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
173{
174 if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
175 u8 pin;
176 pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
177 if (pin)
178 return 1;
179
180 } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
181 u8 pos;
182 u16 flags;
183
184 pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSI);
185 if (pos) {
186 pci_read_config_word(vdev->pdev,
187 pos + PCI_MSI_FLAGS, &flags);
188
189 return 1 << (flags & PCI_MSI_FLAGS_QMASK);
190 }
191 } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
192 u8 pos;
193 u16 flags;
194
195 pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSIX);
196 if (pos) {
197 pci_read_config_word(vdev->pdev,
198 pos + PCI_MSIX_FLAGS, &flags);
199
200 return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
201 }
202 }
203
204 return 0;
205}
206
207static long vfio_pci_ioctl(void *device_data,
208 unsigned int cmd, unsigned long arg)
209{
210 struct vfio_pci_device *vdev = device_data;
211 unsigned long minsz;
212
213 if (cmd == VFIO_DEVICE_GET_INFO) {
214 struct vfio_device_info info;
215
216 minsz = offsetofend(struct vfio_device_info, num_irqs);
217
218 if (copy_from_user(&info, (void __user *)arg, minsz))
219 return -EFAULT;
220
221 if (info.argsz < minsz)
222 return -EINVAL;
223
224 info.flags = VFIO_DEVICE_FLAGS_PCI;
225
226 if (vdev->reset_works)
227 info.flags |= VFIO_DEVICE_FLAGS_RESET;
228
229 info.num_regions = VFIO_PCI_NUM_REGIONS;
230 info.num_irqs = VFIO_PCI_NUM_IRQS;
231
232 return copy_to_user((void __user *)arg, &info, minsz);
233
234 } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
235 struct pci_dev *pdev = vdev->pdev;
236 struct vfio_region_info info;
237
238 minsz = offsetofend(struct vfio_region_info, offset);
239
240 if (copy_from_user(&info, (void __user *)arg, minsz))
241 return -EFAULT;
242
243 if (info.argsz < minsz)
244 return -EINVAL;
245
246 switch (info.index) {
247 case VFIO_PCI_CONFIG_REGION_INDEX:
248 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
249 info.size = pdev->cfg_size;
250 info.flags = VFIO_REGION_INFO_FLAG_READ |
251 VFIO_REGION_INFO_FLAG_WRITE;
252 break;
253 case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
254 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
255 info.size = pci_resource_len(pdev, info.index);
256 if (!info.size) {
257 info.flags = 0;
258 break;
259 }
260
261 info.flags = VFIO_REGION_INFO_FLAG_READ |
262 VFIO_REGION_INFO_FLAG_WRITE;
263 if (pci_resource_flags(pdev, info.index) &
264 IORESOURCE_MEM && info.size >= PAGE_SIZE)
265 info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
266 break;
267 case VFIO_PCI_ROM_REGION_INDEX:
268 {
269 void __iomem *io;
270 size_t size;
271
272 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
273 info.flags = 0;
274
275 /* Report the BAR size, not the ROM size */
276 info.size = pci_resource_len(pdev, info.index);
277 if (!info.size)
278 break;
279
280 /* Is it really there? */
281 io = pci_map_rom(pdev, &size);
282 if (!io || !size) {
283 info.size = 0;
284 break;
285 }
286 pci_unmap_rom(pdev, io);
287
288 info.flags = VFIO_REGION_INFO_FLAG_READ;
289 break;
290 }
291 default:
292 return -EINVAL;
293 }
294
295 return copy_to_user((void __user *)arg, &info, minsz);
296
297 } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
298 struct vfio_irq_info info;
299
300 minsz = offsetofend(struct vfio_irq_info, count);
301
302 if (copy_from_user(&info, (void __user *)arg, minsz))
303 return -EFAULT;
304
305 if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
306 return -EINVAL;
307
308 info.flags = VFIO_IRQ_INFO_EVENTFD;
309
310 info.count = vfio_pci_get_irq_count(vdev, info.index);
311
312 if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
313 info.flags |= (VFIO_IRQ_INFO_MASKABLE |
314 VFIO_IRQ_INFO_AUTOMASKED);
315 else
316 info.flags |= VFIO_IRQ_INFO_NORESIZE;
317
318 return copy_to_user((void __user *)arg, &info, minsz);
319
320 } else if (cmd == VFIO_DEVICE_SET_IRQS) {
321 struct vfio_irq_set hdr;
322 u8 *data = NULL;
323 int ret = 0;
324
325 minsz = offsetofend(struct vfio_irq_set, count);
326
327 if (copy_from_user(&hdr, (void __user *)arg, minsz))
328 return -EFAULT;
329
330 if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
331 hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
332 VFIO_IRQ_SET_ACTION_TYPE_MASK))
333 return -EINVAL;
334
335 if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
336 size_t size;
337
338 if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
339 size = sizeof(uint8_t);
340 else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
341 size = sizeof(int32_t);
342 else
343 return -EINVAL;
344
345 if (hdr.argsz - minsz < hdr.count * size ||
346 hdr.count > vfio_pci_get_irq_count(vdev, hdr.index))
347 return -EINVAL;
348
Fengguang Wu3a1f7042012-12-07 13:43:49 -0700349 data = memdup_user((void __user *)(arg + minsz),
350 hdr.count * size);
351 if (IS_ERR(data))
352 return PTR_ERR(data);
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600353 }
354
355 mutex_lock(&vdev->igate);
356
357 ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
358 hdr.start, hdr.count, data);
359
360 mutex_unlock(&vdev->igate);
361 kfree(data);
362
363 return ret;
364
365 } else if (cmd == VFIO_DEVICE_RESET)
366 return vdev->reset_works ?
367 pci_reset_function(vdev->pdev) : -EINVAL;
368
369 return -ENOTTY;
370}
371
372static ssize_t vfio_pci_read(void *device_data, char __user *buf,
373 size_t count, loff_t *ppos)
374{
375 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
376 struct vfio_pci_device *vdev = device_data;
377 struct pci_dev *pdev = vdev->pdev;
378
379 if (index >= VFIO_PCI_NUM_REGIONS)
380 return -EINVAL;
381
382 if (index == VFIO_PCI_CONFIG_REGION_INDEX)
383 return vfio_pci_config_readwrite(vdev, buf, count, ppos, false);
384 else if (index == VFIO_PCI_ROM_REGION_INDEX)
385 return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);
386 else if (pci_resource_flags(pdev, index) & IORESOURCE_IO)
387 return vfio_pci_io_readwrite(vdev, buf, count, ppos, false);
388 else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM)
389 return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);
390
391 return -EINVAL;
392}
393
394static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
395 size_t count, loff_t *ppos)
396{
397 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
398 struct vfio_pci_device *vdev = device_data;
399 struct pci_dev *pdev = vdev->pdev;
400
401 if (index >= VFIO_PCI_NUM_REGIONS)
402 return -EINVAL;
403
404 if (index == VFIO_PCI_CONFIG_REGION_INDEX)
405 return vfio_pci_config_readwrite(vdev, (char __user *)buf,
406 count, ppos, true);
407 else if (index == VFIO_PCI_ROM_REGION_INDEX)
408 return -EINVAL;
409 else if (pci_resource_flags(pdev, index) & IORESOURCE_IO)
410 return vfio_pci_io_readwrite(vdev, (char __user *)buf,
411 count, ppos, true);
412 else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM) {
413 return vfio_pci_mem_readwrite(vdev, (char __user *)buf,
414 count, ppos, true);
415 }
416
417 return -EINVAL;
418}
419
420static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
421{
422 struct vfio_pci_device *vdev = device_data;
423 struct pci_dev *pdev = vdev->pdev;
424 unsigned int index;
Alex Williamson34002f52012-10-10 09:10:31 -0600425 u64 phys_len, req_len, pgoff, req_start;
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600426 int ret;
427
428 index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
429
430 if (vma->vm_end < vma->vm_start)
431 return -EINVAL;
432 if ((vma->vm_flags & VM_SHARED) == 0)
433 return -EINVAL;
434 if (index >= VFIO_PCI_ROM_REGION_INDEX)
435 return -EINVAL;
436 if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
437 return -EINVAL;
438
439 phys_len = pci_resource_len(pdev, index);
440 req_len = vma->vm_end - vma->vm_start;
441 pgoff = vma->vm_pgoff &
442 ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
443 req_start = pgoff << PAGE_SHIFT;
444
445 if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
446 return -EINVAL;
447
448 if (index == vdev->msix_bar) {
449 /*
450 * Disallow mmaps overlapping the MSI-X table; users don't
451 * get to touch this directly. We could find somewhere
452 * else to map the overlap, but page granularity is only
453 * a recommendation, not a requirement, so the user needs
454 * to know which bits are real. Requiring them to mmap
455 * around the table makes that clear.
456 */
457
458 /* If neither entirely above nor below, then it overlaps */
459 if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
460 req_start + req_len <= vdev->msix_offset))
461 return -EINVAL;
462 }
463
464 /*
465 * Even though we don't make use of the barmap for the mmap,
466 * we need to request the region and the barmap tracks that.
467 */
468 if (!vdev->barmap[index]) {
469 ret = pci_request_selected_regions(pdev,
470 1 << index, "vfio-pci");
471 if (ret)
472 return ret;
473
474 vdev->barmap[index] = pci_iomap(pdev, index, 0);
475 }
476
477 vma->vm_private_data = vdev;
Linus Torvalds547b1e82012-10-09 21:06:41 +0900478 vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600479 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
Alex Williamson34002f52012-10-10 09:10:31 -0600480 vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600481
Alex Williamson34002f52012-10-10 09:10:31 -0600482 return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600483 req_len, vma->vm_page_prot);
484}
485
486static const struct vfio_device_ops vfio_pci_ops = {
487 .name = "vfio-pci",
488 .open = vfio_pci_open,
489 .release = vfio_pci_release,
490 .ioctl = vfio_pci_ioctl,
491 .read = vfio_pci_read,
492 .write = vfio_pci_write,
493 .mmap = vfio_pci_mmap,
494};
495
496static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
497{
498 u8 type;
499 struct vfio_pci_device *vdev;
500 struct iommu_group *group;
501 int ret;
502
503 pci_read_config_byte(pdev, PCI_HEADER_TYPE, &type);
504 if ((type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL)
505 return -EINVAL;
506
507 group = iommu_group_get(&pdev->dev);
508 if (!group)
509 return -EINVAL;
510
511 vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
512 if (!vdev) {
513 iommu_group_put(group);
514 return -ENOMEM;
515 }
516
517 vdev->pdev = pdev;
518 vdev->irq_type = VFIO_PCI_NUM_IRQS;
519 mutex_init(&vdev->igate);
520 spin_lock_init(&vdev->irqlock);
521 atomic_set(&vdev->refcnt, 0);
522
523 ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
524 if (ret) {
525 iommu_group_put(group);
526 kfree(vdev);
527 }
528
529 return ret;
530}
531
532static void vfio_pci_remove(struct pci_dev *pdev)
533{
534 struct vfio_pci_device *vdev;
535
536 vdev = vfio_del_group_dev(&pdev->dev);
537 if (!vdev)
538 return;
539
540 iommu_group_put(pdev->dev.iommu_group);
541 kfree(vdev);
542}
543
544static struct pci_driver vfio_pci_driver = {
545 .name = "vfio-pci",
546 .id_table = NULL, /* only dynamic ids */
547 .probe = vfio_pci_probe,
548 .remove = vfio_pci_remove,
549};
550
551static void __exit vfio_pci_cleanup(void)
552{
553 pci_unregister_driver(&vfio_pci_driver);
554 vfio_pci_virqfd_exit();
555 vfio_pci_uninit_perm_bits();
556}
557
558static int __init vfio_pci_init(void)
559{
560 int ret;
561
562 /* Allocate shared config space permision data used by all devices */
563 ret = vfio_pci_init_perm_bits();
564 if (ret)
565 return ret;
566
567 /* Start the virqfd cleanup handler */
568 ret = vfio_pci_virqfd_init();
569 if (ret)
570 goto out_virqfd;
571
572 /* Register and scan for devices */
573 ret = pci_register_driver(&vfio_pci_driver);
574 if (ret)
575 goto out_driver;
576
577 return 0;
578
579out_virqfd:
580 vfio_pci_virqfd_exit();
581out_driver:
582 vfio_pci_uninit_perm_bits();
583 return ret;
584}
585
586module_init(vfio_pci_init);
587module_exit(vfio_pci_cleanup);
588
589MODULE_VERSION(DRIVER_VERSION);
590MODULE_LICENSE("GPL v2");
591MODULE_AUTHOR(DRIVER_AUTHOR);
592MODULE_DESCRIPTION(DRIVER_DESC);