blob: 334c9a7b899199434eb4e423ee18ac763e3913d7 [file] [log] [blame]
Jake Oshins4daace02016-02-16 21:56:23 +00001/*
2 * Copyright (c) Microsoft Corporation.
3 *
4 * Author:
5 * Jake Oshins <jakeo@microsoft.com>
6 *
7 * This driver acts as a paravirtual front-end for PCI Express root buses.
8 * When a PCI Express function (either an entire device or an SR-IOV
9 * Virtual Function) is being passed through to the VM, this driver exposes
10 * a new bus to the guest VM. This is modeled as a root PCI bus because
11 * no bridges are being exposed to the VM. In fact, with a "Generation 2"
12 * VM within Hyper-V, there may seem to be no PCI bus at all in the VM
 13 * until a device has been exposed using this driver.
14 *
15 * Each root PCI bus has its own PCI domain, which is called "Segment" in
16 * the PCI Firmware Specifications. Thus while each device passed through
17 * to the VM using this front-end will appear at "device 0", the domain will
18 * be unique. Typically, each bus will have one PCI function on it, though
19 * this driver does support more than one.
20 *
21 * In order to map the interrupts from the device through to the guest VM,
22 * this driver also implements an IRQ Domain, which handles interrupts (either
23 * MSI or MSI-X) associated with the functions on the bus. As interrupts are
24 * set up, torn down, or reaffined, this driver communicates with the
25 * underlying hypervisor to adjust the mappings in the I/O MMU so that each
26 * interrupt will be delivered to the correct virtual processor at the right
27 * vector. This driver does not support level-triggered (line-based)
28 * interrupts, and will report that the Interrupt Line register in the
29 * function's configuration space is zero.
30 *
31 * The rest of this driver mostly maps PCI concepts onto underlying Hyper-V
32 * facilities. For instance, the configuration space of a function exposed
33 * by Hyper-V is mapped into a single page of memory space, and the
34 * read and write handlers for config space must be aware of this mechanism.
35 * Similarly, device setup and teardown involves messages sent to and from
36 * the PCI back-end driver in Hyper-V.
37 *
38 * This program is free software; you can redistribute it and/or modify it
39 * under the terms of the GNU General Public License version 2 as published
40 * by the Free Software Foundation.
41 *
42 * This program is distributed in the hope that it will be useful, but
43 * WITHOUT ANY WARRANTY; without even the implied warranty of
44 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
45 * NON INFRINGEMENT. See the GNU General Public License for more
46 * details.
47 *
48 */
49
50#include <linux/kernel.h>
51#include <linux/module.h>
52#include <linux/pci.h>
Stephen Hemminger80bfeeb2017-07-31 16:48:29 -070053#include <linux/delay.h>
Jake Oshins4daace02016-02-16 21:56:23 +000054#include <linux/semaphore.h>
55#include <linux/irqdomain.h>
56#include <asm/irqdomain.h>
57#include <asm/apic.h>
58#include <linux/msi.h>
59#include <linux/hyperv.h>
Elena Reshetova24196f02017-04-18 09:02:48 -050060#include <linux/refcount.h>
Jake Oshins4daace02016-02-16 21:56:23 +000061#include <asm/mshyperv.h>
62
/*
 * Protocol versions. The low word is the minor version, the high word the
 * major version.
 */

#define PCI_MAKE_VERSION(major, minor) ((u32)(((major) << 16) | (minor)))
#define PCI_MAJOR_VERSION(version) ((u32)(version) >> 16)
/*
 * The minor version is the full low 16-bit word, matching PCI_MAKE_VERSION
 * above; the previous 0xff mask would silently truncate any minor version
 * greater than 255.
 */
#define PCI_MINOR_VERSION(version) ((u32)(version) & 0xffff)
71
Jork Loeserb1db7e72017-05-24 13:41:27 -070072enum pci_protocol_version_t {
73 PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1), /* Win10 */
Jork Loeser7dcf90e2017-05-24 13:41:28 -070074 PCI_PROTOCOL_VERSION_1_2 = PCI_MAKE_VERSION(1, 2), /* RS1 */
Jake Oshins4daace02016-02-16 21:56:23 +000075};
76
K. Y. Srinivasan433fcf62017-03-24 11:07:21 -070077#define CPU_AFFINITY_ALL -1ULL
Jork Loeserb1db7e72017-05-24 13:41:27 -070078
79/*
80 * Supported protocol versions in the order of probing - highest go
81 * first.
82 */
83static enum pci_protocol_version_t pci_protocol_versions[] = {
Jork Loeser7dcf90e2017-05-24 13:41:28 -070084 PCI_PROTOCOL_VERSION_1_2,
Jork Loeserb1db7e72017-05-24 13:41:27 -070085 PCI_PROTOCOL_VERSION_1_1,
86};
87
88/*
89 * Protocol version negotiated by hv_pci_protocol_negotiation().
90 */
91static enum pci_protocol_version_t pci_protocol_version;
92
Jake Oshins4daace02016-02-16 21:56:23 +000093#define PCI_CONFIG_MMIO_LENGTH 0x2000
94#define CFG_PAGE_OFFSET 0x1000
95#define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET)
96
97#define MAX_SUPPORTED_MSI_MESSAGES 0x400
98
Jork Loeserb1db7e72017-05-24 13:41:27 -070099#define STATUS_REVISION_MISMATCH 0xC0000059
100
Jake Oshins4daace02016-02-16 21:56:23 +0000101/*
102 * Message Types
103 */
104
105enum pci_message_type {
106 /*
107 * Version 1.1
108 */
109 PCI_MESSAGE_BASE = 0x42490000,
110 PCI_BUS_RELATIONS = PCI_MESSAGE_BASE + 0,
111 PCI_QUERY_BUS_RELATIONS = PCI_MESSAGE_BASE + 1,
112 PCI_POWER_STATE_CHANGE = PCI_MESSAGE_BASE + 4,
113 PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5,
114 PCI_QUERY_RESOURCE_RESOURCES = PCI_MESSAGE_BASE + 6,
115 PCI_BUS_D0ENTRY = PCI_MESSAGE_BASE + 7,
116 PCI_BUS_D0EXIT = PCI_MESSAGE_BASE + 8,
117 PCI_READ_BLOCK = PCI_MESSAGE_BASE + 9,
118 PCI_WRITE_BLOCK = PCI_MESSAGE_BASE + 0xA,
119 PCI_EJECT = PCI_MESSAGE_BASE + 0xB,
120 PCI_QUERY_STOP = PCI_MESSAGE_BASE + 0xC,
121 PCI_REENABLE = PCI_MESSAGE_BASE + 0xD,
122 PCI_QUERY_STOP_FAILED = PCI_MESSAGE_BASE + 0xE,
123 PCI_EJECTION_COMPLETE = PCI_MESSAGE_BASE + 0xF,
124 PCI_RESOURCES_ASSIGNED = PCI_MESSAGE_BASE + 0x10,
125 PCI_RESOURCES_RELEASED = PCI_MESSAGE_BASE + 0x11,
126 PCI_INVALIDATE_BLOCK = PCI_MESSAGE_BASE + 0x12,
127 PCI_QUERY_PROTOCOL_VERSION = PCI_MESSAGE_BASE + 0x13,
128 PCI_CREATE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x14,
129 PCI_DELETE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x15,
Jork Loeser7dcf90e2017-05-24 13:41:28 -0700130 PCI_RESOURCES_ASSIGNED2 = PCI_MESSAGE_BASE + 0x16,
131 PCI_CREATE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x17,
132 PCI_DELETE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x18, /* unused */
Jake Oshins4daace02016-02-16 21:56:23 +0000133 PCI_MESSAGE_MAXIMUM
134};
135
136/*
137 * Structures defining the virtual PCI Express protocol.
138 */
139
140union pci_version {
141 struct {
142 u16 minor_version;
143 u16 major_version;
144 } parts;
145 u32 version;
146} __packed;
147
148/*
149 * Function numbers are 8-bits wide on Express, as interpreted through ARI,
150 * which is all this driver does. This representation is the one used in
151 * Windows, which is what is expected when sending this back and forth with
152 * the Hyper-V parent partition.
153 */
154union win_slot_encoding {
155 struct {
Dexuan Cui60e2e2f2017-02-10 15:18:46 -0600156 u32 dev:5;
157 u32 func:3;
Jake Oshins4daace02016-02-16 21:56:23 +0000158 u32 reserved:24;
159 } bits;
160 u32 slot;
161} __packed;
162
163/*
164 * Pretty much as defined in the PCI Specifications.
165 */
166struct pci_function_description {
167 u16 v_id; /* vendor ID */
168 u16 d_id; /* device ID */
169 u8 rev;
170 u8 prog_intf;
171 u8 subclass;
172 u8 base_class;
173 u32 subsystem_id;
174 union win_slot_encoding win_slot;
175 u32 ser; /* serial number */
176} __packed;
177
178/**
179 * struct hv_msi_desc
180 * @vector: IDT entry
181 * @delivery_mode: As defined in Intel's Programmer's
182 * Reference Manual, Volume 3, Chapter 8.
183 * @vector_count: Number of contiguous entries in the
184 * Interrupt Descriptor Table that are
185 * occupied by this Message-Signaled
186 * Interrupt. For "MSI", as first defined
187 * in PCI 2.2, this can be between 1 and
188 * 32. For "MSI-X," as first defined in PCI
189 * 3.0, this must be 1, as each MSI-X table
190 * entry would have its own descriptor.
191 * @reserved: Empty space
192 * @cpu_mask: All the target virtual processors.
193 */
194struct hv_msi_desc {
195 u8 vector;
196 u8 delivery_mode;
197 u16 vector_count;
198 u32 reserved;
199 u64 cpu_mask;
200} __packed;
201
202/**
Jork Loeser7dcf90e2017-05-24 13:41:28 -0700203 * struct hv_msi_desc2 - 1.2 version of hv_msi_desc
204 * @vector: IDT entry
205 * @delivery_mode: As defined in Intel's Programmer's
206 * Reference Manual, Volume 3, Chapter 8.
207 * @vector_count: Number of contiguous entries in the
208 * Interrupt Descriptor Table that are
209 * occupied by this Message-Signaled
210 * Interrupt. For "MSI", as first defined
211 * in PCI 2.2, this can be between 1 and
212 * 32. For "MSI-X," as first defined in PCI
213 * 3.0, this must be 1, as each MSI-X table
214 * entry would have its own descriptor.
215 * @processor_count: number of bits enabled in array.
216 * @processor_array: All the target virtual processors.
217 */
218struct hv_msi_desc2 {
219 u8 vector;
220 u8 delivery_mode;
221 u16 vector_count;
222 u16 processor_count;
223 u16 processor_array[32];
224} __packed;
225
226/**
Jake Oshins4daace02016-02-16 21:56:23 +0000227 * struct tran_int_desc
228 * @reserved: unused, padding
229 * @vector_count: same as in hv_msi_desc
230 * @data: This is the "data payload" value that is
231 * written by the device when it generates
232 * a message-signaled interrupt, either MSI
233 * or MSI-X.
234 * @address: This is the address to which the data
235 * payload is written on interrupt
236 * generation.
237 */
238struct tran_int_desc {
239 u16 reserved;
240 u16 vector_count;
241 u32 data;
242 u64 address;
243} __packed;
244
245/*
246 * A generic message format for virtual PCI.
247 * Specific message formats are defined later in the file.
248 */
249
250struct pci_message {
Dexuan Cui0c6045d2016-08-23 04:45:51 +0000251 u32 type;
Jake Oshins4daace02016-02-16 21:56:23 +0000252} __packed;
253
254struct pci_child_message {
Dexuan Cui0c6045d2016-08-23 04:45:51 +0000255 struct pci_message message_type;
Jake Oshins4daace02016-02-16 21:56:23 +0000256 union win_slot_encoding wslot;
257} __packed;
258
259struct pci_incoming_message {
260 struct vmpacket_descriptor hdr;
261 struct pci_message message_type;
262} __packed;
263
264struct pci_response {
265 struct vmpacket_descriptor hdr;
266 s32 status; /* negative values are failures */
267} __packed;
268
269struct pci_packet {
270 void (*completion_func)(void *context, struct pci_response *resp,
271 int resp_packet_size);
272 void *compl_ctxt;
Dexuan Cui0c6045d2016-08-23 04:45:51 +0000273
274 struct pci_message message[0];
Jake Oshins4daace02016-02-16 21:56:23 +0000275};
276
277/*
278 * Specific message types supporting the PCI protocol.
279 */
280
281/*
282 * Version negotiation message. Sent from the guest to the host.
283 * The guest is free to try different versions until the host
284 * accepts the version.
285 *
286 * pci_version: The protocol version requested.
287 * is_last_attempt: If TRUE, this is the last version guest will request.
288 * reservedz: Reserved field, set to zero.
289 */
290
291struct pci_version_request {
292 struct pci_message message_type;
Jork Loeser691ac1d2017-05-24 13:41:24 -0700293 u32 protocol_version;
Jake Oshins4daace02016-02-16 21:56:23 +0000294} __packed;
295
296/*
297 * Bus D0 Entry. This is sent from the guest to the host when the virtual
298 * bus (PCI Express port) is ready for action.
299 */
300
301struct pci_bus_d0_entry {
302 struct pci_message message_type;
303 u32 reserved;
304 u64 mmio_base;
305} __packed;
306
307struct pci_bus_relations {
308 struct pci_incoming_message incoming;
309 u32 device_count;
Dexuan Cui7d0f8ee2016-08-23 04:46:39 +0000310 struct pci_function_description func[0];
Jake Oshins4daace02016-02-16 21:56:23 +0000311} __packed;
312
313struct pci_q_res_req_response {
314 struct vmpacket_descriptor hdr;
315 s32 status; /* negative values are failures */
316 u32 probed_bar[6];
317} __packed;
318
319struct pci_set_power {
320 struct pci_message message_type;
321 union win_slot_encoding wslot;
322 u32 power_state; /* In Windows terms */
323 u32 reserved;
324} __packed;
325
326struct pci_set_power_response {
327 struct vmpacket_descriptor hdr;
328 s32 status; /* negative values are failures */
329 union win_slot_encoding wslot;
330 u32 resultant_state; /* In Windows terms */
331 u32 reserved;
332} __packed;
333
334struct pci_resources_assigned {
335 struct pci_message message_type;
336 union win_slot_encoding wslot;
337 u8 memory_range[0x14][6]; /* not used here */
338 u32 msi_descriptors;
339 u32 reserved[4];
340} __packed;
341
Jork Loeser7dcf90e2017-05-24 13:41:28 -0700342struct pci_resources_assigned2 {
343 struct pci_message message_type;
344 union win_slot_encoding wslot;
345 u8 memory_range[0x14][6]; /* not used here */
346 u32 msi_descriptor_count;
347 u8 reserved[70];
348} __packed;
349
Jake Oshins4daace02016-02-16 21:56:23 +0000350struct pci_create_interrupt {
351 struct pci_message message_type;
352 union win_slot_encoding wslot;
353 struct hv_msi_desc int_desc;
354} __packed;
355
356struct pci_create_int_response {
357 struct pci_response response;
358 u32 reserved;
359 struct tran_int_desc int_desc;
360} __packed;
361
Jork Loeser7dcf90e2017-05-24 13:41:28 -0700362struct pci_create_interrupt2 {
363 struct pci_message message_type;
364 union win_slot_encoding wslot;
365 struct hv_msi_desc2 int_desc;
366} __packed;
367
Jake Oshins4daace02016-02-16 21:56:23 +0000368struct pci_delete_interrupt {
369 struct pci_message message_type;
370 union win_slot_encoding wslot;
371 struct tran_int_desc int_desc;
372} __packed;
373
374struct pci_dev_incoming {
375 struct pci_incoming_message incoming;
376 union win_slot_encoding wslot;
377} __packed;
378
379struct pci_eject_response {
Dexuan Cui0c6045d2016-08-23 04:45:51 +0000380 struct pci_message message_type;
Jake Oshins4daace02016-02-16 21:56:23 +0000381 union win_slot_encoding wslot;
382 u32 status;
383} __packed;
384
385static int pci_ring_size = (4 * PAGE_SIZE);
386
387/*
 388 * Definitions of the interrupt-steering hypercall.
389 */
390#define HV_PARTITION_ID_SELF ((u64)-1)
391#define HVCALL_RETARGET_INTERRUPT 0x7e
392
Jork Loeser7dcf90e2017-05-24 13:41:28 -0700393struct hv_interrupt_entry {
Jake Oshins4daace02016-02-16 21:56:23 +0000394 u32 source; /* 1 for MSI(-X) */
395 u32 reserved1;
396 u32 address;
397 u32 data;
Jork Loeser7dcf90e2017-05-24 13:41:28 -0700398};
399
400#define HV_VP_SET_BANK_COUNT_MAX 5 /* current implementation limit */
401
402struct hv_vp_set {
403 u64 format; /* 0 (HvGenericSetSparse4k) */
404 u64 valid_banks;
405 u64 masks[HV_VP_SET_BANK_COUNT_MAX];
406};
407
408/*
409 * flags for hv_device_interrupt_target.flags
410 */
411#define HV_DEVICE_INTERRUPT_TARGET_MULTICAST 1
412#define HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET 2
413
414struct hv_device_interrupt_target {
Jake Oshins4daace02016-02-16 21:56:23 +0000415 u32 vector;
416 u32 flags;
Jork Loeser7dcf90e2017-05-24 13:41:28 -0700417 union {
418 u64 vp_mask;
419 struct hv_vp_set vp_set;
420 };
421};
422
423struct retarget_msi_interrupt {
424 u64 partition_id; /* use "self" */
425 u64 device_id;
426 struct hv_interrupt_entry int_entry;
427 u64 reserved2;
428 struct hv_device_interrupt_target int_target;
Jake Oshins4daace02016-02-16 21:56:23 +0000429} __packed;
430
431/*
432 * Driver specific state.
433 */
434
435enum hv_pcibus_state {
436 hv_pcibus_init = 0,
437 hv_pcibus_probed,
438 hv_pcibus_installed,
Long Lid3a78d82017-03-23 14:58:10 -0700439 hv_pcibus_removed,
Jake Oshins4daace02016-02-16 21:56:23 +0000440 hv_pcibus_maximum
441};
442
/* Per-virtual-bus state; each instance is its own PCI domain ("segment"). */
struct hv_pcibus_device {
	struct pci_sysdata sysdata;	/* must be first: PCI core casts sysdata */
	enum hv_pcibus_state state;
	/* NOTE(review): looks like a refcount paired with remove_event via
	 * get_hvpcibus()/put_hvpcibus() -- bodies not visible here, confirm. */
	atomic_t remove_lock;
	struct hv_device *hdev;		/* underlying VMBus channel device */
	resource_size_t low_mmio_space;
	resource_size_t high_mmio_space;
	struct resource *mem_config;	/* the two-page config window (MMIO) */
	struct resource *low_mmio_res;
	struct resource *high_mmio_res;
	struct completion *survey_event;
	struct completion remove_event;
	struct pci_bus *pci_bus;
	spinlock_t config_lock;	/* Avoid two threads writing index page */
	spinlock_t device_list_lock;	/* Protect lists below */
	void __iomem *cfg_addr;		/* mapping of mem_config */

	struct semaphore enum_sem;
	struct list_head resources_for_children;

	struct list_head children;	/* struct hv_pci_dev entries */
	struct list_head dr_list;	/* deferred "bus relations" messages */

	struct msi_domain_info msi_info;
	struct msi_controller msi_chip;
	struct irq_domain *irq_domain;

	/* hypercall arg, must not cross page boundary */
	struct retarget_msi_interrupt retarget_msi_interrupt_params;

	spinlock_t retarget_msi_interrupt_lock;
};
475
476/*
477 * Tracks "Device Relations" messages from the host, which must be both
478 * processed in order and deferred so that they don't run in the context
479 * of the incoming packet callback.
480 */
481struct hv_dr_work {
482 struct work_struct wrk;
483 struct hv_pcibus_device *bus;
484};
485
486struct hv_dr_state {
487 struct list_head list_entry;
488 u32 device_count;
Dexuan Cui7d0f8ee2016-08-23 04:46:39 +0000489 struct pci_function_description func[0];
Jake Oshins4daace02016-02-16 21:56:23 +0000490};
491
492enum hv_pcichild_state {
493 hv_pcichild_init = 0,
494 hv_pcichild_requirements,
495 hv_pcichild_resourced,
496 hv_pcichild_ejecting,
497 hv_pcichild_maximum
498};
499
500enum hv_pcidev_ref_reason {
501 hv_pcidev_ref_invalid = 0,
502 hv_pcidev_ref_initial,
503 hv_pcidev_ref_by_slot,
504 hv_pcidev_ref_packet,
505 hv_pcidev_ref_pnp,
506 hv_pcidev_ref_childlist,
507 hv_pcidev_irqdata,
508 hv_pcidev_ref_max
509};
510
/* State tracking one pass-through PCI function exposed by the host. */
struct hv_pci_dev {
	/* List protected by pci_rescan_remove_lock */
	struct list_head list_entry;
	refcount_t refs;	/* reasons enumerated in hv_pcidev_ref_reason */
	enum hv_pcichild_state state;
	struct pci_function_description desc;	/* as reported by the host */
	bool reported_missing;
	struct hv_pcibus_device *hbus;		/* owning virtual bus */
	struct work_struct wrk;

	/*
	 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
	 * read it back, for each of the BAR offsets within config space.
	 */
	u32 probed_bar[6];
};
527
528struct hv_pci_compl {
529 struct completion host_event;
530 s32 completion_status;
531};
532
533/**
534 * hv_pci_generic_compl() - Invoked for a completion packet
535 * @context: Set up by the sender of the packet.
536 * @resp: The response packet
537 * @resp_packet_size: Size in bytes of the packet
538 *
539 * This function is used to trigger an event and report status
540 * for any message for which the completion packet contains a
541 * status and nothing else.
542 */
Dexuan Cuia5b45b72016-08-23 04:49:22 +0000543static void hv_pci_generic_compl(void *context, struct pci_response *resp,
544 int resp_packet_size)
Jake Oshins4daace02016-02-16 21:56:23 +0000545{
546 struct hv_pci_compl *comp_pkt = context;
547
548 if (resp_packet_size >= offsetofend(struct pci_response, status))
549 comp_pkt->completion_status = resp->status;
Dexuan Cuia5b45b72016-08-23 04:49:22 +0000550 else
551 comp_pkt->completion_status = -1;
552
Jake Oshins4daace02016-02-16 21:56:23 +0000553 complete(&comp_pkt->host_event);
554}
555
556static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
557 u32 wslot);
558static void get_pcichild(struct hv_pci_dev *hv_pcidev,
559 enum hv_pcidev_ref_reason reason);
560static void put_pcichild(struct hv_pci_dev *hv_pcidev,
561 enum hv_pcidev_ref_reason reason);
562
563static void get_hvpcibus(struct hv_pcibus_device *hv_pcibus);
564static void put_hvpcibus(struct hv_pcibus_device *hv_pcibus);
565
Jork Loeser02c37642017-05-24 13:41:26 -0700566
567/*
568 * Temporary CPU to vCPU mapping to address transitioning
569 * vmbus_cpu_number_to_vp_number() being migrated to
570 * hv_cpu_number_to_vp_number() in a separate patch. Once that patch
571 * has been picked up in the main line, remove this code here and use
572 * the official code.
573 */
574static struct hv_tmpcpumap
575{
576 bool initialized;
577 u32 vp_index[NR_CPUS];
578} hv_tmpcpumap;
579
/* Per-CPU callback: record the running CPU's Hyper-V VP index in the map. */
static void hv_tmpcpumap_init_cpu(void *_unused)
{
	int cpu = smp_processor_id();
	u64 vp_index;

	hv_get_vp_index(vp_index);

	hv_tmpcpumap.vp_index[cpu] = vp_index;
}
589
/*
 * Populate hv_tmpcpumap once; entries for CPUs that never run the callback
 * keep the 0xffffffff value from the memset.
 * NOTE(review): no locking around the initialized flag -- presumably only
 * called from single-threaded probe context; confirm against callers.
 */
static void hv_tmpcpumap_init(void)
{
	if (hv_tmpcpumap.initialized)
		return;

	memset(hv_tmpcpumap.vp_index, -1, sizeof(hv_tmpcpumap.vp_index));
	on_each_cpu(hv_tmpcpumap_init_cpu, NULL, true);
	hv_tmpcpumap.initialized = true;
}
599
600/**
601 * hv_tmp_cpu_nr_to_vp_nr() - Convert Linux CPU nr to Hyper-V vCPU nr
602 *
603 * Remove once vmbus_cpu_number_to_vp_number() has been converted to
604 * hv_cpu_number_to_vp_number() and replace callers appropriately.
605 */
static u32 hv_tmp_cpu_nr_to_vp_nr(int cpu)
{
	/* Returns 0xffffffff for CPUs not seen by hv_tmpcpumap_init(). */
	return hv_tmpcpumap.vp_index[cpu];
}
610
611
Jake Oshins4daace02016-02-16 21:56:23 +0000612/**
613 * devfn_to_wslot() - Convert from Linux PCI slot to Windows
614 * @devfn: The Linux representation of PCI slot
615 *
616 * Windows uses a slightly different representation of PCI slot.
617 *
618 * Return: The Windows representation
619 */
620static u32 devfn_to_wslot(int devfn)
621{
622 union win_slot_encoding wslot;
623
624 wslot.slot = 0;
Dexuan Cui60e2e2f2017-02-10 15:18:46 -0600625 wslot.bits.dev = PCI_SLOT(devfn);
626 wslot.bits.func = PCI_FUNC(devfn);
Jake Oshins4daace02016-02-16 21:56:23 +0000627
628 return wslot.slot;
629}
630
631/**
632 * wslot_to_devfn() - Convert from Windows PCI slot to Linux
633 * @wslot: The Windows representation of PCI slot
634 *
635 * Windows uses a slightly different representation of PCI slot.
636 *
637 * Return: The Linux representation
638 */
639static int wslot_to_devfn(u32 wslot)
640{
641 union win_slot_encoding slot_no;
642
643 slot_no.slot = wslot;
Dexuan Cui60e2e2f2017-02-10 15:18:46 -0600644 return PCI_DEVFN(slot_no.bits.dev, slot_no.bits.func);
Jake Oshins4daace02016-02-16 21:56:23 +0000645}
646
647/*
648 * PCI Configuration Space for these root PCI buses is implemented as a pair
649 * of pages in memory-mapped I/O space. Writing to the first page chooses
650 * the PCI function being written or read. Once the first page has been
651 * written to, the following page maps in the entire configuration space of
652 * the function.
653 */
654
655/**
656 * _hv_pcifront_read_config() - Internal PCI config read
657 * @hpdev: The PCI driver's representation of the device
658 * @where: Offset within config space
659 * @size: Size of the transfer
660 * @val: Pointer to the buffer receiving the data
661 */
static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
				     int size, u32 *val)
{
	unsigned long flags;
	/* Config space of the chosen function appears at the second page. */
	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;

	/*
	 * If the attempt is to read the IDs or the ROM BAR, simulate that.
	 * These ranges are answered from the cached function description
	 * rather than the host-backed config window.
	 */
	if (where + size <= PCI_COMMAND) {
		memcpy(val, ((u8 *)&hpdev->desc.v_id) + where, size);
	} else if (where >= PCI_CLASS_REVISION && where + size <=
		   PCI_CACHE_LINE_SIZE) {
		memcpy(val, ((u8 *)&hpdev->desc.rev) + where -
		       PCI_CLASS_REVISION, size);
	} else if (where >= PCI_SUBSYSTEM_VENDOR_ID && where + size <=
		   PCI_ROM_ADDRESS) {
		memcpy(val, (u8 *)&hpdev->desc.subsystem_id + where -
		       PCI_SUBSYSTEM_VENDOR_ID, size);
	} else if (where >= PCI_ROM_ADDRESS && where + size <=
		   PCI_CAPABILITY_LIST) {
		/* ROM BARs are unimplemented */
		*val = 0;
	} else if (where >= PCI_INTERRUPT_LINE && where + size <=
		   PCI_INTERRUPT_PIN) {
		/*
		 * Interrupt Line and Interrupt PIN are hard-wired to zero
		 * because this front-end only supports message-signaled
		 * interrupts.
		 */
		*val = 0;
	} else if (where + size <= CFG_PAGE_SIZE) {
		spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
		/* Choose the function to be read. (See comment above) */
		writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
		/* Make sure the function was chosen before we start reading. */
		mb();
		/* Read from that function's config space. */
		switch (size) {
		case 1:
			*val = readb(addr);
			break;
		case 2:
			*val = readw(addr);
			break;
		default:
			*val = readl(addr);
			break;
		}
		/*
		 * Make sure the write was done before we release the spinlock
		 * allowing consecutive reads/writes.
		 */
		mb();
		spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
	} else {
		dev_err(&hpdev->hbus->hdev->device,
			"Attempt to read beyond a function's config space.\n");
	}
}
722
723/**
724 * _hv_pcifront_write_config() - Internal PCI config write
725 * @hpdev: The PCI driver's representation of the device
726 * @where: Offset within config space
727 * @size: Size of the transfer
728 * @val: The data being transferred
729 */
static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where,
				      int size, u32 val)
{
	unsigned long flags;
	/* Config space of the chosen function appears at the second page. */
	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;

	if (where >= PCI_SUBSYSTEM_VENDOR_ID &&
	    where + size <= PCI_CAPABILITY_LIST) {
		/* SSIDs and ROM BARs are read-only */
	} else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) {
		spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
		/* Choose the function to be written. (See comment above) */
		writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
		/* Make sure the function was chosen before we start writing. */
		wmb();
		/* Write to that function's config space. */
		switch (size) {
		case 1:
			writeb(val, addr);
			break;
		case 2:
			writew(val, addr);
			break;
		default:
			writel(val, addr);
			break;
		}
		/*
		 * Make sure the write was done before we release the spinlock
		 * allowing consecutive reads/writes.
		 */
		mb();
		spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
	} else {
		dev_err(&hpdev->hbus->hdev->device,
			"Attempt to write beyond a function's config space.\n");
	}
}
768
769/**
770 * hv_pcifront_read_config() - Read configuration space
771 * @bus: PCI Bus structure
772 * @devfn: Device/function
773 * @where: Offset from base
774 * @size: Byte/word/dword
775 * @val: Value to be read
776 *
777 * Return: PCIBIOS_SUCCESSFUL on success
778 * PCIBIOS_DEVICE_NOT_FOUND on failure
779 */
780static int hv_pcifront_read_config(struct pci_bus *bus, unsigned int devfn,
781 int where, int size, u32 *val)
782{
783 struct hv_pcibus_device *hbus =
784 container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
785 struct hv_pci_dev *hpdev;
786
787 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
788 if (!hpdev)
789 return PCIBIOS_DEVICE_NOT_FOUND;
790
791 _hv_pcifront_read_config(hpdev, where, size, val);
792
793 put_pcichild(hpdev, hv_pcidev_ref_by_slot);
794 return PCIBIOS_SUCCESSFUL;
795}
796
797/**
798 * hv_pcifront_write_config() - Write configuration space
799 * @bus: PCI Bus structure
800 * @devfn: Device/function
801 * @where: Offset from base
802 * @size: Byte/word/dword
803 * @val: Value to be written to device
804 *
805 * Return: PCIBIOS_SUCCESSFUL on success
806 * PCIBIOS_DEVICE_NOT_FOUND on failure
807 */
808static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn,
809 int where, int size, u32 val)
810{
811 struct hv_pcibus_device *hbus =
812 container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
813 struct hv_pci_dev *hpdev;
814
815 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
816 if (!hpdev)
817 return PCIBIOS_DEVICE_NOT_FOUND;
818
819 _hv_pcifront_write_config(hpdev, where, size, val);
820
821 put_pcichild(hpdev, hv_pcidev_ref_by_slot);
822 return PCIBIOS_SUCCESSFUL;
823}
824
/* PCIe config-space accessors handed to the PCI core for buses we create. */
static struct pci_ops hv_pcifront_ops = {
	.read = hv_pcifront_read_config,
	.write = hv_pcifront_write_config,
};
830
831/* Interrupt management hooks */
/**
 * hv_int_desc_free() - Tell the host an interrupt mapping is gone
 * @hpdev:	The device that owned the interrupt
 * @int_desc:	The host-allocated descriptor being released; freed here,
 *		so the caller must not touch it afterwards.
 *
 * Sends PCI_DELETE_INTERRUPT_MESSAGE fire-and-forget: no completion
 * callback is registered and the vmbus_sendpacket() result is ignored.
 */
static void hv_int_desc_free(struct hv_pci_dev *hpdev,
			     struct tran_int_desc *int_desc)
{
	struct pci_delete_interrupt *int_pkt;
	/* On-stack packet: safe because no completion is awaited. */
	struct {
		struct pci_packet pkt;
		u8 buffer[sizeof(struct pci_delete_interrupt)];
	} ctxt;

	memset(&ctxt, 0, sizeof(ctxt));
	int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
	int_pkt->message_type.type =
		PCI_DELETE_INTERRUPT_MESSAGE;
	int_pkt->wslot.slot = hpdev->desc.win_slot.slot;
	int_pkt->int_desc = *int_desc;
	vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, sizeof(*int_pkt),
			 (unsigned long)&ctxt.pkt, VM_PKT_DATA_INBAND, 0);
	kfree(int_desc);
}
851
852/**
853 * hv_msi_free() - Free the MSI.
854 * @domain: The interrupt domain pointer
855 * @info: Extra MSI-related context
856 * @irq: Identifies the IRQ.
857 *
858 * The Hyper-V parent partition and hypervisor are tracking the
859 * messages that are in use, keeping the interrupt redirection
860 * table up to date. This callback sends a message that frees
861 * the IRT entry and related tracking nonsense.
862 */
static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info,
			unsigned int irq)
{
	struct hv_pcibus_device *hbus;
	struct hv_pci_dev *hpdev;
	struct pci_dev *pdev;
	struct tran_int_desc *int_desc;
	struct irq_data *irq_data = irq_domain_get_irq_data(domain, irq);
	struct msi_desc *msi = irq_data_get_msi_desc(irq_data);

	pdev = msi_desc_to_pci_dev(msi);
	hbus = info->data;
	int_desc = irq_data_get_irq_chip_data(irq_data);
	/* Nothing was ever mapped for this IRQ; nothing to free. */
	if (!int_desc)
		return;

	/* Detach the descriptor before freeing so it can't be seen again. */
	irq_data->chip_data = NULL;
	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
	if (!hpdev) {
		/* Device already gone: no host to notify, just free locally. */
		kfree(int_desc);
		return;
	}

	hv_int_desc_free(hpdev, int_desc);
	put_pcichild(hpdev, hv_pcidev_ref_by_slot);
}
889
890static int hv_set_affinity(struct irq_data *data, const struct cpumask *dest,
891 bool force)
892{
893 struct irq_data *parent = data->parent_data;
894
895 return parent->chip->irq_set_affinity(parent, dest, force);
896}
897
Tobias Klauser542ccf42016-10-31 12:04:09 +0100898static void hv_irq_mask(struct irq_data *data)
Jake Oshins4daace02016-02-16 21:56:23 +0000899{
900 pci_msi_mask_irq(data);
901}
902
/**
 * hv_irq_unmask() - "Unmask" the IRQ by setting its current
 * affinity.
 * @data: Describes the IRQ
 *
 * Build a new destination for the MSI and make a hypercall to
 * update the Interrupt Redirection Table. "Device Logical ID"
 * is built out of this PCI bus's instance GUID and the function
 * number of the device.
 */
static void hv_irq_unmask(struct irq_data *data)
{
	struct msi_desc *msi_desc = irq_data_get_msi_desc(data);
	struct irq_cfg *cfg = irqd_cfg(data);
	struct retarget_msi_interrupt *params;
	struct hv_pcibus_device *hbus;
	struct cpumask *dest;
	struct pci_bus *pbus;
	struct pci_dev *pdev;
	unsigned long flags;
	u32 var_size = 0;
	int cpu_vmbus;
	int cpu;
	u64 res;

	dest = irq_data_get_affinity_mask(data);
	pdev = msi_desc_to_pci_dev(msi_desc);
	pbus = pdev->bus;
	hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);

	/*
	 * The hypercall input buffer lives in the bus-global hbus structure
	 * rather than on the stack (NOTE(review): presumably to satisfy the
	 * hypercall's input alignment requirements — confirm); the spinlock
	 * serializes all users of that shared buffer.
	 */
	spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags);

	params = &hbus->retarget_msi_interrupt_params;
	memset(params, 0, sizeof(*params));
	params->partition_id = HV_PARTITION_ID_SELF;
	params->int_entry.source = 1; /* MSI(-X) */
	params->int_entry.address = msi_desc->msg.address_lo;
	params->int_entry.data = msi_desc->msg.data;
	/* "Device Logical ID": instance GUID bytes + PCI function number. */
	params->device_id = (hbus->hdev->dev_instance.b[5] << 24) |
			   (hbus->hdev->dev_instance.b[4] << 16) |
			   (hbus->hdev->dev_instance.b[7] << 8) |
			   (hbus->hdev->dev_instance.b[6] & 0xf8) |
			   PCI_FUNC(pdev->devfn);
	params->int_target.vector = cfg->vector;

	/*
	 * Honoring apic->irq_delivery_mode set to dest_Fixed by
	 * setting the HV_DEVICE_INTERRUPT_TARGET_MULTICAST flag results in a
	 * spurious interrupt storm. Not doing so does not seem to have a
	 * negative effect (yet?).
	 */

	if (pci_protocol_version >= PCI_PROTOCOL_VERSION_1_2) {
		/*
		 * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the
		 * HVCALL_RETARGET_INTERRUPT hypercall, which also coincides
		 * with >64 VP support.
		 * ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED
		 * is not sufficient for this hypercall.
		 */
		params->int_target.flags |=
			HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
		params->int_target.vp_set.valid_banks =
			(1ull << HV_VP_SET_BANK_COUNT_MAX) - 1;

		/*
		 * var-sized hypercall, var-size starts after vp_mask (thus
		 * vp_set.format does not count, but vp_set.valid_banks does).
		 */
		var_size = 1 + HV_VP_SET_BANK_COUNT_MAX;

		for_each_cpu_and(cpu, dest, cpu_online_mask) {
			cpu_vmbus = hv_tmp_cpu_nr_to_vp_nr(cpu);

			/* A VP number beyond the bank array cannot be encoded. */
			if (cpu_vmbus >= HV_VP_SET_BANK_COUNT_MAX * 64) {
				dev_err(&hbus->hdev->device,
					"too high CPU %d", cpu_vmbus);
				res = 1;
				goto exit_unlock;
			}

			params->int_target.vp_set.masks[cpu_vmbus / 64] |=
				(1ULL << (cpu_vmbus & 63));
		}
	} else {
		/* Legacy protocol: a single 64-bit VP bitmask suffices. */
		for_each_cpu_and(cpu, dest, cpu_online_mask) {
			params->int_target.vp_mask |=
				(1ULL << hv_tmp_cpu_nr_to_vp_nr(cpu));
		}
	}

	/* var_size (in 8-byte units) is encoded in bits 17+ of the control. */
	res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17),
			      params, NULL);

exit_unlock:
	spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags);

	if (res) {
		dev_err(&hbus->hdev->device,
			"%s() failed: %#llx", __func__, res);
		return;
	}

	pci_msi_unmask_irq(data);
}
1008
/*
 * Completion context for a PCI_CREATE_INTERRUPT_MESSAGE exchange:
 * hv_pci_compose_compl() records the host's status in @comp_pkt and
 * copies the returned interrupt descriptor into @int_desc.
 */
struct compose_comp_ctxt {
	struct hv_pci_compl comp_pkt;
	struct tran_int_desc int_desc;
};
1013
1014static void hv_pci_compose_compl(void *context, struct pci_response *resp,
1015 int resp_packet_size)
1016{
1017 struct compose_comp_ctxt *comp_pkt = context;
1018 struct pci_create_int_response *int_resp =
1019 (struct pci_create_int_response *)resp;
1020
1021 comp_pkt->comp_pkt.completion_status = resp->status;
1022 comp_pkt->int_desc = int_resp->int_desc;
1023 complete(&comp_pkt->comp_pkt.host_event);
1024}
1025
Jork Loeser7dcf90e2017-05-24 13:41:28 -07001026static u32 hv_compose_msi_req_v1(
1027 struct pci_create_interrupt *int_pkt, struct cpumask *affinity,
1028 u32 slot, u8 vector)
1029{
1030 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
1031 int_pkt->wslot.slot = slot;
1032 int_pkt->int_desc.vector = vector;
1033 int_pkt->int_desc.vector_count = 1;
1034 int_pkt->int_desc.delivery_mode =
1035 (apic->irq_delivery_mode == dest_LowestPrio) ?
1036 dest_LowestPrio : dest_Fixed;
1037
1038 /*
1039 * Create MSI w/ dummy vCPU set, overwritten by subsequent retarget in
1040 * hv_irq_unmask().
1041 */
1042 int_pkt->int_desc.cpu_mask = CPU_AFFINITY_ALL;
1043
1044 return sizeof(*int_pkt);
1045}
1046
1047static u32 hv_compose_msi_req_v2(
1048 struct pci_create_interrupt2 *int_pkt, struct cpumask *affinity,
1049 u32 slot, u8 vector)
1050{
1051 int cpu;
1052
1053 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2;
1054 int_pkt->wslot.slot = slot;
1055 int_pkt->int_desc.vector = vector;
1056 int_pkt->int_desc.vector_count = 1;
1057 int_pkt->int_desc.delivery_mode =
1058 (apic->irq_delivery_mode == dest_LowestPrio) ?
1059 dest_LowestPrio : dest_Fixed;
1060
1061 /*
1062 * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten
1063 * by subsequent retarget in hv_irq_unmask().
1064 */
1065 cpu = cpumask_first_and(affinity, cpu_online_mask);
1066 int_pkt->int_desc.processor_array[0] =
1067 hv_tmp_cpu_nr_to_vp_nr(cpu);
1068 int_pkt->int_desc.processor_count = 1;
1069
1070 return sizeof(*int_pkt);
1071}
1072
Jake Oshins4daace02016-02-16 21:56:23 +00001073/**
1074 * hv_compose_msi_msg() - Supplies a valid MSI address/data
1075 * @data: Everything about this MSI
1076 * @msg: Buffer that is filled in by this function
1077 *
1078 * This function unpacks the IRQ looking for target CPU set, IDT
1079 * vector and mode and sends a message to the parent partition
1080 * asking for a mapping for that tuple in this partition. The
1081 * response supplies a data value and address to which that data
1082 * should be written to trigger that interrupt.
1083 */
1084static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
1085{
1086 struct irq_cfg *cfg = irqd_cfg(data);
1087 struct hv_pcibus_device *hbus;
1088 struct hv_pci_dev *hpdev;
1089 struct pci_bus *pbus;
1090 struct pci_dev *pdev;
Jake Oshins4daace02016-02-16 21:56:23 +00001091 struct compose_comp_ctxt comp;
1092 struct tran_int_desc *int_desc;
Jake Oshins4daace02016-02-16 21:56:23 +00001093 struct {
Jork Loeser7dcf90e2017-05-24 13:41:28 -07001094 struct pci_packet pci_pkt;
1095 union {
1096 struct pci_create_interrupt v1;
1097 struct pci_create_interrupt2 v2;
1098 } int_pkts;
1099 } __packed ctxt;
1100
1101 u32 size;
Jake Oshins4daace02016-02-16 21:56:23 +00001102 int ret;
1103
1104 pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data));
1105 pbus = pdev->bus;
1106 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
1107 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
1108 if (!hpdev)
1109 goto return_null_message;
1110
1111 /* Free any previous message that might have already been composed. */
1112 if (data->chip_data) {
1113 int_desc = data->chip_data;
1114 data->chip_data = NULL;
1115 hv_int_desc_free(hpdev, int_desc);
1116 }
1117
K. Y. Srinivasan59c58cee2017-03-24 11:07:22 -07001118 int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC);
Jake Oshins4daace02016-02-16 21:56:23 +00001119 if (!int_desc)
1120 goto drop_reference;
1121
1122 memset(&ctxt, 0, sizeof(ctxt));
1123 init_completion(&comp.comp_pkt.host_event);
Jork Loeser7dcf90e2017-05-24 13:41:28 -07001124 ctxt.pci_pkt.completion_func = hv_pci_compose_compl;
1125 ctxt.pci_pkt.compl_ctxt = &comp;
Jake Oshins4daace02016-02-16 21:56:23 +00001126
Jork Loeser7dcf90e2017-05-24 13:41:28 -07001127 switch (pci_protocol_version) {
1128 case PCI_PROTOCOL_VERSION_1_1:
1129 size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1,
1130 irq_data_get_affinity_mask(data),
1131 hpdev->desc.win_slot.slot,
1132 cfg->vector);
1133 break;
1134
1135 case PCI_PROTOCOL_VERSION_1_2:
1136 size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2,
1137 irq_data_get_affinity_mask(data),
1138 hpdev->desc.win_slot.slot,
1139 cfg->vector);
1140 break;
1141
1142 default:
1143 /* As we only negotiate protocol versions known to this driver,
1144 * this path should never hit. However, this is it not a hot
1145 * path so we print a message to aid future updates.
1146 */
1147 dev_err(&hbus->hdev->device,
1148 "Unexpected vPCI protocol, update driver.");
1149 goto free_int_desc;
Jake Oshins4daace02016-02-16 21:56:23 +00001150 }
1151
Jork Loeser7dcf90e2017-05-24 13:41:28 -07001152 ret = vmbus_sendpacket(hpdev->hbus->hdev->channel, &ctxt.int_pkts,
1153 size, (unsigned long)&ctxt.pci_pkt,
Jake Oshins4daace02016-02-16 21:56:23 +00001154 VM_PKT_DATA_INBAND,
1155 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
Jork Loeser7dcf90e2017-05-24 13:41:28 -07001156 if (ret) {
1157 dev_err(&hbus->hdev->device,
1158 "Sending request for interrupt failed: 0x%x",
1159 comp.comp_pkt.completion_status);
Dexuan Cui665e2242016-08-23 04:48:11 +00001160 goto free_int_desc;
Jork Loeser7dcf90e2017-05-24 13:41:28 -07001161 }
Dexuan Cui665e2242016-08-23 04:48:11 +00001162
Stephen Hemminger80bfeeb2017-07-31 16:48:29 -07001163 /*
1164 * Since this function is called with IRQ locks held, can't
1165 * do normal wait for completion; instead poll.
1166 */
1167 while (!try_wait_for_completion(&comp.comp_pkt.host_event))
1168 udelay(100);
Jake Oshins4daace02016-02-16 21:56:23 +00001169
1170 if (comp.comp_pkt.completion_status < 0) {
1171 dev_err(&hbus->hdev->device,
1172 "Request for interrupt failed: 0x%x",
1173 comp.comp_pkt.completion_status);
1174 goto free_int_desc;
1175 }
1176
1177 /*
1178 * Record the assignment so that this can be unwound later. Using
1179 * irq_set_chip_data() here would be appropriate, but the lock it takes
1180 * is already held.
1181 */
1182 *int_desc = comp.int_desc;
1183 data->chip_data = int_desc;
1184
1185 /* Pass up the result. */
1186 msg->address_hi = comp.int_desc.address >> 32;
1187 msg->address_lo = comp.int_desc.address & 0xffffffff;
1188 msg->data = comp.int_desc.data;
1189
1190 put_pcichild(hpdev, hv_pcidev_ref_by_slot);
1191 return;
1192
1193free_int_desc:
1194 kfree(int_desc);
1195drop_reference:
1196 put_pcichild(hpdev, hv_pcidev_ref_by_slot);
1197return_null_message:
1198 msg->address_hi = 0;
1199 msg->address_lo = 0;
1200 msg->data = 0;
1201}
1202
/*
 * HW Interrupt Chip Descriptor. Ack is forwarded to the parent (x86
 * vector) domain; mask is a plain PCI MSI mask, while unmask also
 * retargets the interrupt at the hypervisor (see hv_irq_unmask()).
 */
static struct irq_chip hv_msi_irq_chip = {
	.name = "Hyper-V PCIe MSI",
	.irq_compose_msi_msg = hv_compose_msi_msg,
	.irq_set_affinity = hv_set_affinity,
	.irq_ack = irq_chip_ack_parent,
	.irq_mask = hv_irq_mask,
	.irq_unmask = hv_irq_unmask,
};
1212
/* Return the hwirq number the MSI core already chose for this allocation. */
static irq_hw_number_t hv_msi_domain_ops_get_hwirq(struct msi_domain_info *info,
						   msi_alloc_info_t *arg)
{
	return arg->msi_hwirq;
}
1218
/*
 * MSI domain callbacks: generic PCI MSI helpers for prepare/set_desc,
 * plus the driver's own hwirq lookup and per-IRQ teardown (hv_msi_free).
 */
static struct msi_domain_ops hv_msi_ops = {
	.get_hwirq = hv_msi_domain_ops_get_hwirq,
	.msi_prepare = pci_msi_prepare,
	.set_desc = pci_msi_set_desc,
	.msi_free = hv_msi_free,
};
1225
/**
 * hv_pcie_init_irq_domain() - Initialize IRQ domain
 * @hbus: The root PCI bus
 *
 * This function creates an IRQ domain which will be used for
 * interrupts from devices that have been passed through. These
 * devices only support MSI and MSI-X, not line-based interrupts
 * or simulations of line-based interrupts through PCIe's
 * fabric-layer messages. Because interrupts are remapped, we
 * can support multi-message MSI here.
 *
 * Return: '0' on success and error value on failure
 */
static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus)
{
	hbus->msi_info.chip = &hv_msi_irq_chip;
	hbus->msi_info.ops = &hv_msi_ops;
	hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS |
		MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI |
		MSI_FLAG_PCI_MSIX);
	hbus->msi_info.handler = handle_edge_irq;
	hbus->msi_info.handler_name = "edge";
	hbus->msi_info.data = hbus;
	/* Stack this MSI domain on top of the x86 vector domain. */
	hbus->irq_domain = pci_msi_create_irq_domain(hbus->sysdata.fwnode,
						     &hbus->msi_info,
						     x86_vector_domain);
	if (!hbus->irq_domain) {
		dev_err(&hbus->hdev->device,
			"Failed to build an MSI IRQ domain\n");
		return -ENODEV;
	}

	return 0;
}
1260
1261/**
1262 * get_bar_size() - Get the address space consumed by a BAR
1263 * @bar_val: Value that a BAR returned after -1 was written
1264 * to it.
1265 *
1266 * This function returns the size of the BAR, rounded up to 1
1267 * page. It has to be rounded up because the hypervisor's page
1268 * table entry that maps the BAR into the VM can't specify an
1269 * offset within a page. The invariant is that the hypervisor
1270 * must place any BARs of smaller than page length at the
1271 * beginning of a page.
1272 *
1273 * Return: Size in bytes of the consumed MMIO space.
1274 */
1275static u64 get_bar_size(u64 bar_val)
1276{
1277 return round_up((1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK)),
1278 PAGE_SIZE);
1279}
1280
1281/**
1282 * survey_child_resources() - Total all MMIO requirements
1283 * @hbus: Root PCI bus, as understood by this driver
1284 */
1285static void survey_child_resources(struct hv_pcibus_device *hbus)
1286{
1287 struct list_head *iter;
1288 struct hv_pci_dev *hpdev;
1289 resource_size_t bar_size = 0;
1290 unsigned long flags;
1291 struct completion *event;
1292 u64 bar_val;
1293 int i;
1294
1295 /* If nobody is waiting on the answer, don't compute it. */
1296 event = xchg(&hbus->survey_event, NULL);
1297 if (!event)
1298 return;
1299
1300 /* If the answer has already been computed, go with it. */
1301 if (hbus->low_mmio_space || hbus->high_mmio_space) {
1302 complete(event);
1303 return;
1304 }
1305
1306 spin_lock_irqsave(&hbus->device_list_lock, flags);
1307
1308 /*
1309 * Due to an interesting quirk of the PCI spec, all memory regions
1310 * for a child device are a power of 2 in size and aligned in memory,
1311 * so it's sufficient to just add them up without tracking alignment.
1312 */
1313 list_for_each(iter, &hbus->children) {
1314 hpdev = container_of(iter, struct hv_pci_dev, list_entry);
1315 for (i = 0; i < 6; i++) {
1316 if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO)
1317 dev_err(&hbus->hdev->device,
1318 "There's an I/O BAR in this list!\n");
1319
1320 if (hpdev->probed_bar[i] != 0) {
1321 /*
1322 * A probed BAR has all the upper bits set that
1323 * can be changed.
1324 */
1325
1326 bar_val = hpdev->probed_bar[i];
1327 if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
1328 bar_val |=
1329 ((u64)hpdev->probed_bar[++i] << 32);
1330 else
1331 bar_val |= 0xffffffff00000000ULL;
1332
1333 bar_size = get_bar_size(bar_val);
1334
1335 if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
1336 hbus->high_mmio_space += bar_size;
1337 else
1338 hbus->low_mmio_space += bar_size;
1339 }
1340 }
1341 }
1342
1343 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1344 complete(event);
1345}
1346
1347/**
1348 * prepopulate_bars() - Fill in BARs with defaults
1349 * @hbus: Root PCI bus, as understood by this driver
1350 *
1351 * The core PCI driver code seems much, much happier if the BARs
1352 * for a device have values upon first scan. So fill them in.
1353 * The algorithm below works down from large sizes to small,
1354 * attempting to pack the assignments optimally. The assumption,
1355 * enforced in other parts of the code, is that the beginning of
1356 * the memory-mapped I/O space will be aligned on the largest
1357 * BAR size.
1358 */
1359static void prepopulate_bars(struct hv_pcibus_device *hbus)
1360{
1361 resource_size_t high_size = 0;
1362 resource_size_t low_size = 0;
1363 resource_size_t high_base = 0;
1364 resource_size_t low_base = 0;
1365 resource_size_t bar_size;
1366 struct hv_pci_dev *hpdev;
1367 struct list_head *iter;
1368 unsigned long flags;
1369 u64 bar_val;
1370 u32 command;
1371 bool high;
1372 int i;
1373
1374 if (hbus->low_mmio_space) {
1375 low_size = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
1376 low_base = hbus->low_mmio_res->start;
1377 }
1378
1379 if (hbus->high_mmio_space) {
1380 high_size = 1ULL <<
1381 (63 - __builtin_clzll(hbus->high_mmio_space));
1382 high_base = hbus->high_mmio_res->start;
1383 }
1384
1385 spin_lock_irqsave(&hbus->device_list_lock, flags);
1386
1387 /* Pick addresses for the BARs. */
1388 do {
1389 list_for_each(iter, &hbus->children) {
1390 hpdev = container_of(iter, struct hv_pci_dev,
1391 list_entry);
1392 for (i = 0; i < 6; i++) {
1393 bar_val = hpdev->probed_bar[i];
1394 if (bar_val == 0)
1395 continue;
1396 high = bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64;
1397 if (high) {
1398 bar_val |=
1399 ((u64)hpdev->probed_bar[i + 1]
1400 << 32);
1401 } else {
1402 bar_val |= 0xffffffffULL << 32;
1403 }
1404 bar_size = get_bar_size(bar_val);
1405 if (high) {
1406 if (high_size != bar_size) {
1407 i++;
1408 continue;
1409 }
1410 _hv_pcifront_write_config(hpdev,
1411 PCI_BASE_ADDRESS_0 + (4 * i),
1412 4,
1413 (u32)(high_base & 0xffffff00));
1414 i++;
1415 _hv_pcifront_write_config(hpdev,
1416 PCI_BASE_ADDRESS_0 + (4 * i),
1417 4, (u32)(high_base >> 32));
1418 high_base += bar_size;
1419 } else {
1420 if (low_size != bar_size)
1421 continue;
1422 _hv_pcifront_write_config(hpdev,
1423 PCI_BASE_ADDRESS_0 + (4 * i),
1424 4,
1425 (u32)(low_base & 0xffffff00));
1426 low_base += bar_size;
1427 }
1428 }
1429 if (high_size <= 1 && low_size <= 1) {
1430 /* Set the memory enable bit. */
1431 _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2,
1432 &command);
1433 command |= PCI_COMMAND_MEMORY;
1434 _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2,
1435 command);
1436 break;
1437 }
1438 }
1439
1440 high_size >>= 1;
1441 low_size >>= 1;
1442 } while (high_size || low_size);
1443
1444 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1445}
1446
/**
 * create_root_hv_pci_bus() - Expose a new root PCI bus
 * @hbus: Root PCI bus, as understood by this driver
 *
 * Registers the root bus with the PCI core, scans it, assigns the
 * child resources, and marks the bus installed.
 *
 * Return: 0 on success, -errno on failure
 */
static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus)
{
	/* Register the device */
	hbus->pci_bus = pci_create_root_bus(&hbus->hdev->device,
					    0, /* bus number is always zero */
					    &hv_pcifront_ops,
					    &hbus->sysdata,
					    &hbus->resources_for_children);
	if (!hbus->pci_bus)
		return -ENODEV;

	/* Hook this bus's MSI handling to the driver's MSI controller. */
	hbus->pci_bus->msi = &hbus->msi_chip;
	hbus->pci_bus->msi->dev = &hbus->hdev->device;

	/* Scanning and adding devices must be done under the rescan lock. */
	pci_lock_rescan_remove();
	pci_scan_child_bus(hbus->pci_bus);
	pci_bus_assign_resources(hbus->pci_bus);
	pci_bus_add_devices(hbus->pci_bus);
	pci_unlock_rescan_remove();
	hbus->state = hv_pcibus_installed;
	return 0;
}
1475
/*
 * Completion context for a Query Resource Requirements exchange;
 * q_resource_requirements() fills @hpdev's probed BARs from the
 * response and signals @host_event.
 */
struct q_res_req_compl {
	struct completion host_event;
	struct hv_pci_dev *hpdev;
};
1480
1481/**
1482 * q_resource_requirements() - Query Resource Requirements
1483 * @context: The completion context.
1484 * @resp: The response that came from the host.
1485 * @resp_packet_size: The size in bytes of resp.
1486 *
1487 * This function is invoked on completion of a Query Resource
1488 * Requirements packet.
1489 */
1490static void q_resource_requirements(void *context, struct pci_response *resp,
1491 int resp_packet_size)
1492{
1493 struct q_res_req_compl *completion = context;
1494 struct pci_q_res_req_response *q_res_req =
1495 (struct pci_q_res_req_response *)resp;
1496 int i;
1497
1498 if (resp->status < 0) {
1499 dev_err(&completion->hpdev->hbus->hdev->device,
1500 "query resource requirements failed: %x\n",
1501 resp->status);
1502 } else {
1503 for (i = 0; i < 6; i++) {
1504 completion->hpdev->probed_bar[i] =
1505 q_res_req->probed_bar[i];
1506 }
1507 }
1508
1509 complete(&completion->host_event);
1510}
1511
/*
 * Take a reference on a child device. The @reason tag documents why the
 * reference is held; it is not recorded by this implementation.
 */
static void get_pcichild(struct hv_pci_dev *hpdev,
			 enum hv_pcidev_ref_reason reason)
{
	refcount_inc(&hpdev->refs);
}
1517
/*
 * Drop a reference on a child device, freeing it when the last
 * reference goes away. @reason mirrors get_pcichild() and is not
 * recorded.
 */
static void put_pcichild(struct hv_pci_dev *hpdev,
			 enum hv_pcidev_ref_reason reason)
{
	if (refcount_dec_and_test(&hpdev->refs))
		kfree(hpdev);
}
1524
/**
 * new_pcichild_device() - Create a new child device
 * @hbus:	The internal struct tracking this root PCI bus.
 * @desc:	The information supplied so far from the host
 *              about the device.
 *
 * This function creates the tracking structure for a new child
 * device and kicks off the process of figuring out what it is.
 *
 * Return: Pointer to the new tracking struct
 */
static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus,
		struct pci_function_description *desc)
{
	struct hv_pci_dev *hpdev;
	struct pci_child_message *res_req;
	struct q_res_req_compl comp_pkt;
	struct {
		struct pci_packet init_packet;
		u8 buffer[sizeof(struct pci_child_message)];
	} pkt;
	unsigned long flags;
	int ret;

	/* NOTE(review): GFP_ATOMIC suggests callers may hold locks or run
	 * in atomic context — confirm against pci_devices_present_work().
	 */
	hpdev = kzalloc(sizeof(*hpdev), GFP_ATOMIC);
	if (!hpdev)
		return NULL;

	hpdev->hbus = hbus;

	/* Ask the host for this function's resource requirements. */
	memset(&pkt, 0, sizeof(pkt));
	init_completion(&comp_pkt.host_event);
	comp_pkt.hpdev = hpdev;
	pkt.init_packet.compl_ctxt = &comp_pkt;
	pkt.init_packet.completion_func = q_resource_requirements;
	res_req = (struct pci_child_message *)&pkt.init_packet.message;
	res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS;
	res_req->wslot.slot = desc->win_slot.slot;

	ret = vmbus_sendpacket(hbus->hdev->channel, res_req,
			       sizeof(struct pci_child_message),
			       (unsigned long)&pkt.init_packet,
			       VM_PKT_DATA_INBAND,
			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	if (ret)
		goto error;

	/* q_resource_requirements() fills hpdev->probed_bar[] on success. */
	wait_for_completion(&comp_pkt.host_event);

	hpdev->desc = *desc;
	refcount_set(&hpdev->refs, 1);
	get_pcichild(hpdev, hv_pcidev_ref_childlist);
	spin_lock_irqsave(&hbus->device_list_lock, flags);

	/*
	 * When a device is being added to the bus, we set the PCI domain
	 * number to be the device serial number, which is non-zero and
	 * unique on the same VM. The serial numbers start with 1, and
	 * increase by 1 for each device. So device names including this
	 * can have shorter names than based on the bus instance UUID.
	 * Only the first device serial number is used for domain, so the
	 * domain number will not change after the first device is added.
	 */
	if (list_empty(&hbus->children))
		hbus->sysdata.domain = desc->ser;
	list_add_tail(&hpdev->list_entry, &hbus->children);
	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
	return hpdev;

error:
	kfree(hpdev);
	return NULL;
}
1598
1599/**
1600 * get_pcichild_wslot() - Find device from slot
1601 * @hbus: Root PCI bus, as understood by this driver
1602 * @wslot: Location on the bus
1603 *
1604 * This function looks up a PCI device and returns the internal
1605 * representation of it. It acquires a reference on it, so that
1606 * the device won't be deleted while somebody is using it. The
1607 * caller is responsible for calling put_pcichild() to release
1608 * this reference.
1609 *
1610 * Return: Internal representation of a PCI device
1611 */
1612static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
1613 u32 wslot)
1614{
1615 unsigned long flags;
1616 struct hv_pci_dev *iter, *hpdev = NULL;
1617
1618 spin_lock_irqsave(&hbus->device_list_lock, flags);
1619 list_for_each_entry(iter, &hbus->children, list_entry) {
1620 if (iter->desc.win_slot.slot == wslot) {
1621 hpdev = iter;
1622 get_pcichild(hpdev, hv_pcidev_ref_by_slot);
1623 break;
1624 }
1625 }
1626 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1627
1628 return hpdev;
1629}
1630
/**
 * pci_devices_present_work() - Handle new list of child devices
 * @work: Work struct embedded in struct hv_dr_work
 *
 * "Bus Relations" is the Windows term for "children of this
 * bus." The terminology is preserved here for people trying to
 * debug the interaction between Hyper-V and Linux. This
 * function is called when the parent partition reports a list
 * of functions that should be observed under this PCI Express
 * port (bus).
 *
 * This function updates the list, and must tolerate being
 * called multiple times with the same information. The typical
 * number of child devices is one, with very atypical cases
 * involving three or four, so the algorithms used here can be
 * simple and inefficient.
 *
 * It must also treat the omission of a previously observed device as
 * notification that the device no longer exists.
 *
 * Note that this function is a work item, and it may not be
 * invoked in the order that it was queued. Back to back
 * updates of the list of present devices may involve queuing
 * multiple work items, and this one may run before ones that
 * were sent later. As such, this function only does something
 * if it is the last one in the queue.
 */
static void pci_devices_present_work(struct work_struct *work)
{
	u32 child_no;
	bool found;
	struct list_head *iter;
	struct pci_function_description *new_desc;
	struct hv_pci_dev *hpdev;
	struct hv_pcibus_device *hbus;
	struct list_head removed;
	struct hv_dr_work *dr_wrk;
	struct hv_dr_state *dr = NULL;
	unsigned long flags;

	dr_wrk = container_of(work, struct hv_dr_work, wrk);
	hbus = dr_wrk->bus;
	kfree(dr_wrk);

	INIT_LIST_HEAD(&removed);

	/* enum_sem serializes device enumeration; bail if interrupted. */
	if (down_interruptible(&hbus->enum_sem)) {
		put_hvpcibus(hbus);
		return;
	}

	/* Pull this off the queue and process it if it was the last one. */
	spin_lock_irqsave(&hbus->device_list_lock, flags);
	while (!list_empty(&hbus->dr_list)) {
		dr = list_first_entry(&hbus->dr_list, struct hv_dr_state,
				      list_entry);
		list_del(&dr->list_entry);

		/* Throw this away if the list still has stuff in it. */
		if (!list_empty(&hbus->dr_list)) {
			kfree(dr);
			continue;
		}
	}
	spin_unlock_irqrestore(&hbus->device_list_lock, flags);

	/* A newer work item already consumed the queue; nothing to do. */
	if (!dr) {
		up(&hbus->enum_sem);
		put_hvpcibus(hbus);
		return;
	}

	/* First, mark all existing children as reported missing. */
	spin_lock_irqsave(&hbus->device_list_lock, flags);
	list_for_each(iter, &hbus->children) {
		hpdev = container_of(iter, struct hv_pci_dev,
				     list_entry);
		hpdev->reported_missing = true;
	}
	spin_unlock_irqrestore(&hbus->device_list_lock, flags);

	/* Next, add back any reported devices. */
	for (child_no = 0; child_no < dr->device_count; child_no++) {
		found = false;
		new_desc = &dr->func[child_no];

		spin_lock_irqsave(&hbus->device_list_lock, flags);
		list_for_each(iter, &hbus->children) {
			hpdev = container_of(iter, struct hv_pci_dev,
					     list_entry);
			/* Same slot, IDs and serial => same device. */
			if ((hpdev->desc.win_slot.slot ==
			     new_desc->win_slot.slot) &&
			    (hpdev->desc.v_id == new_desc->v_id) &&
			    (hpdev->desc.d_id == new_desc->d_id) &&
			    (hpdev->desc.ser == new_desc->ser)) {
				hpdev->reported_missing = false;
				found = true;
			}
		}
		spin_unlock_irqrestore(&hbus->device_list_lock, flags);

		if (!found) {
			hpdev = new_pcichild_device(hbus, new_desc);
			if (!hpdev)
				dev_err(&hbus->hdev->device,
					"couldn't record a child device.\n");
		}
	}

	/* Move missing children to a list on the stack. */
	spin_lock_irqsave(&hbus->device_list_lock, flags);
	do {
		found = false;
		list_for_each(iter, &hbus->children) {
			hpdev = container_of(iter, struct hv_pci_dev,
					     list_entry);
			if (hpdev->reported_missing) {
				found = true;
				put_pcichild(hpdev, hv_pcidev_ref_childlist);
				list_move_tail(&hpdev->list_entry, &removed);
				/* Restart: the list was just modified. */
				break;
			}
		}
	} while (found);
	spin_unlock_irqrestore(&hbus->device_list_lock, flags);

	/* Delete everything that should no longer exist. */
	while (!list_empty(&removed)) {
		hpdev = list_first_entry(&removed, struct hv_pci_dev,
					 list_entry);
		list_del(&hpdev->list_entry);
		put_pcichild(hpdev, hv_pcidev_ref_initial);
	}

	switch (hbus->state) {
	case hv_pcibus_installed:
		/*
		 * Tell the core to rescan bus
		 * because there may have been changes.
		 */
		pci_lock_rescan_remove();
		pci_scan_child_bus(hbus->pci_bus);
		pci_unlock_rescan_remove();
		break;

	case hv_pcibus_init:
	case hv_pcibus_probed:
		survey_child_resources(hbus);
		break;

	default:
		break;
	}

	up(&hbus->enum_sem);
	put_hvpcibus(hbus);
	kfree(dr);
}
1789
1790/**
1791 * hv_pci_devices_present() - Handles list of new children
1792 * @hbus: Root PCI bus, as understood by this driver
1793 * @relations: Packet from host listing children
1794 *
1795 * This function is invoked whenever a new list of devices for
1796 * this bus appears.
1797 */
1798static void hv_pci_devices_present(struct hv_pcibus_device *hbus,
1799 struct pci_bus_relations *relations)
1800{
1801 struct hv_dr_state *dr;
1802 struct hv_dr_work *dr_wrk;
1803 unsigned long flags;
1804
1805 dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT);
1806 if (!dr_wrk)
1807 return;
1808
1809 dr = kzalloc(offsetof(struct hv_dr_state, func) +
1810 (sizeof(struct pci_function_description) *
1811 (relations->device_count)), GFP_NOWAIT);
1812 if (!dr) {
1813 kfree(dr_wrk);
1814 return;
1815 }
1816
1817 INIT_WORK(&dr_wrk->wrk, pci_devices_present_work);
1818 dr_wrk->bus = hbus;
1819 dr->device_count = relations->device_count;
1820 if (dr->device_count != 0) {
1821 memcpy(dr->func, relations->func,
1822 sizeof(struct pci_function_description) *
1823 dr->device_count);
1824 }
1825
1826 spin_lock_irqsave(&hbus->device_list_lock, flags);
1827 list_add_tail(&dr->list_entry, &hbus->dr_list);
1828 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1829
1830 get_hvpcibus(hbus);
1831 schedule_work(&dr_wrk->wrk);
1832}
1833
/**
 * hv_eject_device_work() - Asynchronously handles ejection
 * @work: Work struct embedded in internal device struct
 *
 * This function handles ejecting a device. Windows will
 * attempt to gracefully eject a device, waiting 60 seconds to
 * hear back from the guest OS that this completed successfully.
 * If this timer expires, the device will be forcibly removed.
 */
static void hv_eject_device_work(struct work_struct *work)
{
	struct pci_eject_response *ejct_pkt;
	struct hv_pci_dev *hpdev;
	struct pci_dev *pdev;
	unsigned long flags;
	int wslot;
	struct {
		struct pci_packet pkt;
		u8 buffer[sizeof(struct pci_eject_response)];
	} ctxt;

	hpdev = container_of(work, struct hv_pci_dev, wrk);

	/* Only proceed if this device is actually mid-ejection. */
	if (hpdev->state != hv_pcichild_ejecting) {
		put_pcichild(hpdev, hv_pcidev_ref_pnp);
		return;
	}

	/*
	 * Ejection can come before or after the PCI bus has been set up, so
	 * attempt to find it and tear down the bus state, if it exists. This
	 * must be done without constructs like pci_domain_nr(hbus->pci_bus)
	 * because hbus->pci_bus may not exist yet.
	 */
	wslot = wslot_to_devfn(hpdev->desc.win_slot.slot);
	pdev = pci_get_domain_bus_and_slot(hpdev->hbus->sysdata.domain, 0,
					   wslot);
	if (pdev) {
		pci_lock_rescan_remove();
		pci_stop_and_remove_bus_device(pdev);
		pci_dev_put(pdev);
		pci_unlock_rescan_remove();
	}

	/* Unhook the device from the bus's child list. */
	spin_lock_irqsave(&hpdev->hbus->device_list_lock, flags);
	list_del(&hpdev->list_entry);
	spin_unlock_irqrestore(&hpdev->hbus->device_list_lock, flags);

	/* Tell the host the ejection is complete (no reply expected). */
	memset(&ctxt, 0, sizeof(ctxt));
	ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message;
	ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE;
	ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot;
	vmbus_sendpacket(hpdev->hbus->hdev->channel, ejct_pkt,
			 sizeof(*ejct_pkt), (unsigned long)&ctxt.pkt,
			 VM_PKT_DATA_INBAND, 0);

	/* Drop the child-list reference and the one taken for this work. */
	put_pcichild(hpdev, hv_pcidev_ref_childlist);
	put_pcichild(hpdev, hv_pcidev_ref_pnp);
	put_hvpcibus(hpdev->hbus);
}
1894
/**
 * hv_pci_eject_device() - Handles device ejection
 * @hpdev: Internal device tracking struct
 *
 * This function is invoked when an ejection packet arrives. It
 * just schedules work so that we don't re-enter the packet
 * delivery code handling the ejection.
 */
static void hv_pci_eject_device(struct hv_pci_dev *hpdev)
{
	/* Mark the child first; hv_eject_device_work() checks this state. */
	hpdev->state = hv_pcichild_ejecting;
	/* Both references are dropped at the end of hv_eject_device_work(). */
	get_pcichild(hpdev, hv_pcidev_ref_pnp);
	INIT_WORK(&hpdev->wrk, hv_eject_device_work);
	get_hvpcibus(hpdev->hbus);
	schedule_work(&hpdev->wrk);
}
1911
/**
 * hv_pci_onchannelcallback() - Handles incoming packets
 * @context: Internal bus tracking struct
 *
 * This function is invoked whenever the host sends a packet to
 * this channel (which is private to this root PCI bus).
 */
static void hv_pci_onchannelcallback(void *context)
{
	const int packet_size = 0x100;
	int ret;
	struct hv_pcibus_device *hbus = context;
	u32 bytes_recvd;
	u64 req_id;
	struct vmpacket_descriptor *desc;
	unsigned char *buffer;
	int bufferlen = packet_size;
	struct pci_packet *comp_packet;
	struct pci_response *response;
	struct pci_incoming_message *new_message;
	struct pci_bus_relations *bus_rel;
	struct pci_dev_incoming *dev_message;
	struct hv_pci_dev *hpdev;

	/* Channel callback context: use a non-sleeping allocation. */
	buffer = kmalloc(bufferlen, GFP_ATOMIC);
	if (!buffer)
		return;

	/* Drain every packet currently queued on the ring buffer. */
	while (1) {
		ret = vmbus_recvpacket_raw(hbus->hdev->channel, buffer,
					   bufferlen, &bytes_recvd, &req_id);

		if (ret == -ENOBUFS) {
			kfree(buffer);
			/* Handle large packet */
			bufferlen = bytes_recvd;
			buffer = kmalloc(bytes_recvd, GFP_ATOMIC);
			if (!buffer)
				return;
			continue;
		}

		/* Zero length indicates there are no more packets. */
		if (ret || !bytes_recvd)
			continue;

		/*
		 * All incoming packets must be at least as large as a
		 * response.
		 */
		if (bytes_recvd <= sizeof(struct pci_response))
			continue;
		desc = (struct vmpacket_descriptor *)buffer;

		switch (desc->type) {
		case VM_PKT_COMP:

			/*
			 * The host is trusted, and thus it's safe to interpret
			 * this transaction ID as a pointer.
			 */
			comp_packet = (struct pci_packet *)req_id;
			response = (struct pci_response *)buffer;
			comp_packet->completion_func(comp_packet->compl_ctxt,
						     response,
						     bytes_recvd);
			break;

		case VM_PKT_DATA_INBAND:

			new_message = (struct pci_incoming_message *)buffer;
			switch (new_message->message_type.type) {
			case PCI_BUS_RELATIONS:

				bus_rel = (struct pci_bus_relations *)buffer;
				/* Reject a truncated device list. */
				if (bytes_recvd <
				    offsetof(struct pci_bus_relations, func) +
				    (sizeof(struct pci_function_description) *
				     (bus_rel->device_count))) {
					dev_err(&hbus->hdev->device,
						"bus relations too small\n");
					break;
				}

				hv_pci_devices_present(hbus, bus_rel);
				break;

			case PCI_EJECT:

				dev_message = (struct pci_dev_incoming *)buffer;
				hpdev = get_pcichild_wslot(hbus,
						      dev_message->wslot.slot);
				if (hpdev) {
					hv_pci_eject_device(hpdev);
					put_pcichild(hpdev,
							hv_pcidev_ref_by_slot);
				}
				break;

			default:
				dev_warn(&hbus->hdev->device,
					"Unimplemented protocol message %x\n",
					new_message->message_type.type);
				break;
			}
			break;

		default:
			dev_err(&hbus->hdev->device,
				"unhandled packet type %d, tid %llx len %d\n",
				desc->type, req_id, bytes_recvd);
			break;
		}
	}

	kfree(buffer);
}
2029
2030/**
2031 * hv_pci_protocol_negotiation() - Set up protocol
2032 * @hdev: VMBus's tracking struct for this root PCI bus
2033 *
2034 * This driver is intended to support running on Windows 10
2035 * (server) and later versions. It will not run on earlier
2036 * versions, as they assume that many of the operations which
2037 * Linux needs accomplished with a spinlock held were done via
2038 * asynchronous messaging via VMBus. Windows 10 increases the
2039 * surface area of PCI emulation so that these actions can take
2040 * place by suspending a virtual processor for their duration.
2041 *
2042 * This function negotiates the channel protocol version,
2043 * failing if the host doesn't support the necessary protocol
2044 * level.
2045 */
2046static int hv_pci_protocol_negotiation(struct hv_device *hdev)
2047{
2048 struct pci_version_request *version_req;
2049 struct hv_pci_compl comp_pkt;
2050 struct pci_packet *pkt;
2051 int ret;
Jork Loeserb1db7e72017-05-24 13:41:27 -07002052 int i;
Jake Oshins4daace02016-02-16 21:56:23 +00002053
2054 /*
2055 * Initiate the handshake with the host and negotiate
2056 * a version that the host can support. We start with the
2057 * highest version number and go down if the host cannot
2058 * support it.
2059 */
2060 pkt = kzalloc(sizeof(*pkt) + sizeof(*version_req), GFP_KERNEL);
2061 if (!pkt)
2062 return -ENOMEM;
2063
2064 init_completion(&comp_pkt.host_event);
2065 pkt->completion_func = hv_pci_generic_compl;
2066 pkt->compl_ctxt = &comp_pkt;
2067 version_req = (struct pci_version_request *)&pkt->message;
Dexuan Cui0c6045d2016-08-23 04:45:51 +00002068 version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION;
Jake Oshins4daace02016-02-16 21:56:23 +00002069
Jork Loeserb1db7e72017-05-24 13:41:27 -07002070 for (i = 0; i < ARRAY_SIZE(pci_protocol_versions); i++) {
2071 version_req->protocol_version = pci_protocol_versions[i];
2072 ret = vmbus_sendpacket(hdev->channel, version_req,
2073 sizeof(struct pci_version_request),
2074 (unsigned long)pkt, VM_PKT_DATA_INBAND,
2075 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2076 if (ret) {
2077 dev_err(&hdev->device,
2078 "PCI Pass-through VSP failed sending version reqquest: %#x",
2079 ret);
2080 goto exit;
2081 }
Jake Oshins4daace02016-02-16 21:56:23 +00002082
Jork Loeserb1db7e72017-05-24 13:41:27 -07002083 wait_for_completion(&comp_pkt.host_event);
Jake Oshins4daace02016-02-16 21:56:23 +00002084
Jork Loeserb1db7e72017-05-24 13:41:27 -07002085 if (comp_pkt.completion_status >= 0) {
2086 pci_protocol_version = pci_protocol_versions[i];
2087 dev_info(&hdev->device,
2088 "PCI VMBus probing: Using version %#x\n",
2089 pci_protocol_version);
2090 goto exit;
2091 }
2092
2093 if (comp_pkt.completion_status != STATUS_REVISION_MISMATCH) {
2094 dev_err(&hdev->device,
2095 "PCI Pass-through VSP failed version request: %#x",
2096 comp_pkt.completion_status);
2097 ret = -EPROTO;
2098 goto exit;
2099 }
2100
2101 reinit_completion(&comp_pkt.host_event);
Jake Oshins4daace02016-02-16 21:56:23 +00002102 }
2103
Jork Loeserb1db7e72017-05-24 13:41:27 -07002104 dev_err(&hdev->device,
2105 "PCI pass-through VSP failed to find supported version");
2106 ret = -EPROTO;
Jake Oshins4daace02016-02-16 21:56:23 +00002107
2108exit:
2109 kfree(pkt);
2110 return ret;
2111}
2112
2113/**
2114 * hv_pci_free_bridge_windows() - Release memory regions for the
2115 * bus
2116 * @hbus: Root PCI bus, as understood by this driver
2117 */
2118static void hv_pci_free_bridge_windows(struct hv_pcibus_device *hbus)
2119{
2120 /*
2121 * Set the resources back to the way they looked when they
2122 * were allocated by setting IORESOURCE_BUSY again.
2123 */
2124
2125 if (hbus->low_mmio_space && hbus->low_mmio_res) {
2126 hbus->low_mmio_res->flags |= IORESOURCE_BUSY;
Jake Oshins696ca5e2016-04-05 10:22:52 -07002127 vmbus_free_mmio(hbus->low_mmio_res->start,
2128 resource_size(hbus->low_mmio_res));
Jake Oshins4daace02016-02-16 21:56:23 +00002129 }
2130
2131 if (hbus->high_mmio_space && hbus->high_mmio_res) {
2132 hbus->high_mmio_res->flags |= IORESOURCE_BUSY;
Jake Oshins696ca5e2016-04-05 10:22:52 -07002133 vmbus_free_mmio(hbus->high_mmio_res->start,
2134 resource_size(hbus->high_mmio_res));
Jake Oshins4daace02016-02-16 21:56:23 +00002135 }
2136}
2137
/**
 * hv_pci_allocate_bridge_windows() - Allocate memory regions
 * for the bus
 * @hbus: Root PCI bus, as understood by this driver
 *
 * This function calls vmbus_allocate_mmio(), which is itself a
 * bit of a compromise. Ideally, we might change the pnp layer
 * in the kernel such that it comprehends either PCI devices
 * which are "grandchildren of ACPI," with some intermediate bus
 * node (in this case, VMBus) or change it such that it
 * understands VMBus. The pnp layer, however, has been declared
 * deprecated, and not subject to change.
 *
 * The workaround, implemented here, is to ask VMBus to allocate
 * MMIO space for this bus. VMBus itself knows which ranges are
 * appropriate by looking at its own ACPI objects. Then, after
 * these ranges are claimed, they're modified to look like they
 * would have looked if the ACPI and pnp code had allocated
 * bridge windows. These descriptors have to exist in this form
 * in order to satisfy the code which will get invoked when the
 * endpoint PCI function driver calls request_mem_region() or
 * request_mem_region_exclusive().
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_pci_allocate_bridge_windows(struct hv_pcibus_device *hbus)
{
	resource_size_t align;
	int ret;

	if (hbus->low_mmio_space) {
		/* Largest power of two not exceeding the requested size. */
		align = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
		/* Below-4GB window, for 32-bit BARs. */
		ret = vmbus_allocate_mmio(&hbus->low_mmio_res, hbus->hdev, 0,
					  (u64)(u32)0xffffffff,
					  hbus->low_mmio_space,
					  align, false);
		if (ret) {
			dev_err(&hbus->hdev->device,
				"Need %#llx of low MMIO space. Consider reconfiguring the VM.\n",
				hbus->low_mmio_space);
			return ret;
		}

		/* Modify this resource to become a bridge window. */
		hbus->low_mmio_res->flags |= IORESOURCE_WINDOW;
		hbus->low_mmio_res->flags &= ~IORESOURCE_BUSY;
		pci_add_resource(&hbus->resources_for_children,
				 hbus->low_mmio_res);
	}

	if (hbus->high_mmio_space) {
		/* Largest power of two not exceeding the requested size. */
		align = 1ULL << (63 - __builtin_clzll(hbus->high_mmio_space));
		/* Above-4GB window, for 64-bit BARs. */
		ret = vmbus_allocate_mmio(&hbus->high_mmio_res, hbus->hdev,
					  0x100000000, -1,
					  hbus->high_mmio_space, align,
					  false);
		if (ret) {
			dev_err(&hbus->hdev->device,
				"Need %#llx of high MMIO space. Consider reconfiguring the VM.\n",
				hbus->high_mmio_space);
			goto release_low_mmio;
		}

		/* Modify this resource to become a bridge window. */
		hbus->high_mmio_res->flags |= IORESOURCE_WINDOW;
		hbus->high_mmio_res->flags &= ~IORESOURCE_BUSY;
		pci_add_resource(&hbus->resources_for_children,
				 hbus->high_mmio_res);
	}

	return 0;

release_low_mmio:
	if (hbus->low_mmio_res) {
		vmbus_free_mmio(hbus->low_mmio_res->start,
				resource_size(hbus->low_mmio_res));
	}

	return ret;
}
2218
2219/**
2220 * hv_allocate_config_window() - Find MMIO space for PCI Config
2221 * @hbus: Root PCI bus, as understood by this driver
2222 *
2223 * This function claims memory-mapped I/O space for accessing
2224 * configuration space for the functions on this bus.
2225 *
2226 * Return: 0 on success, -errno on failure
2227 */
2228static int hv_allocate_config_window(struct hv_pcibus_device *hbus)
2229{
2230 int ret;
2231
2232 /*
2233 * Set up a region of MMIO space to use for accessing configuration
2234 * space.
2235 */
2236 ret = vmbus_allocate_mmio(&hbus->mem_config, hbus->hdev, 0, -1,
2237 PCI_CONFIG_MMIO_LENGTH, 0x1000, false);
2238 if (ret)
2239 return ret;
2240
2241 /*
2242 * vmbus_allocate_mmio() gets used for allocating both device endpoint
2243 * resource claims (those which cannot be overlapped) and the ranges
2244 * which are valid for the children of this bus, which are intended
2245 * to be overlapped by those children. Set the flag on this claim
2246 * meaning that this region can't be overlapped.
2247 */
2248
2249 hbus->mem_config->flags |= IORESOURCE_BUSY;
2250
2251 return 0;
2252}
2253
/* Release the config-space MMIO claim made by hv_allocate_config_window(). */
static void hv_free_config_window(struct hv_pcibus_device *hbus)
{
	vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH);
}
2258
/**
 * hv_pci_enter_d0() - Bring the "bus" into the D0 power state
 * @hdev: VMBus's tracking struct for this root PCI bus
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_pci_enter_d0(struct hv_device *hdev)
{
	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
	struct pci_bus_d0_entry *d0_entry;
	struct hv_pci_compl comp_pkt;
	struct pci_packet *pkt;
	int ret;

	/*
	 * Tell the host that the bus is ready to use, and moved into the
	 * powered-on state. This includes telling the host which region
	 * of memory-mapped I/O space has been chosen for configuration space
	 * access.
	 */
	pkt = kzalloc(sizeof(*pkt) + sizeof(*d0_entry), GFP_KERNEL);
	if (!pkt)
		return -ENOMEM;

	init_completion(&comp_pkt.host_event);
	pkt->completion_func = hv_pci_generic_compl;
	pkt->compl_ctxt = &comp_pkt;
	d0_entry = (struct pci_bus_d0_entry *)&pkt->message;
	d0_entry->message_type.type = PCI_BUS_D0ENTRY;
	/* Base of the window claimed by hv_allocate_config_window(). */
	d0_entry->mmio_base = hbus->mem_config->start;

	ret = vmbus_sendpacket(hdev->channel, d0_entry, sizeof(*d0_entry),
			       (unsigned long)pkt, VM_PKT_DATA_INBAND,
			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	if (ret)
		goto exit;

	/* The completion callback fires host_event when the host replies. */
	wait_for_completion(&comp_pkt.host_event);

	if (comp_pkt.completion_status < 0) {
		dev_err(&hdev->device,
			"PCI Pass-through VSP failed D0 Entry with status %x\n",
			comp_pkt.completion_status);
		ret = -EPROTO;
		goto exit;
	}

	ret = 0;

exit:
	kfree(pkt);
	return ret;
}
2312
2313/**
2314 * hv_pci_query_relations() - Ask host to send list of child
2315 * devices
2316 * @hdev: VMBus's tracking struct for this root PCI bus
2317 *
2318 * Return: 0 on success, -errno on failure
2319 */
2320static int hv_pci_query_relations(struct hv_device *hdev)
2321{
2322 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2323 struct pci_message message;
2324 struct completion comp;
2325 int ret;
2326
2327 /* Ask the host to send along the list of child devices */
2328 init_completion(&comp);
2329 if (cmpxchg(&hbus->survey_event, NULL, &comp))
2330 return -ENOTEMPTY;
2331
2332 memset(&message, 0, sizeof(message));
Dexuan Cui0c6045d2016-08-23 04:45:51 +00002333 message.type = PCI_QUERY_BUS_RELATIONS;
Jake Oshins4daace02016-02-16 21:56:23 +00002334
2335 ret = vmbus_sendpacket(hdev->channel, &message, sizeof(message),
2336 0, VM_PKT_DATA_INBAND, 0);
2337 if (ret)
2338 return ret;
2339
2340 wait_for_completion(&comp);
2341 return 0;
2342}
2343
/**
 * hv_send_resources_allocated() - Report local resource choices
 * @hdev: VMBus's tracking struct for this root PCI bus
 *
 * The host OS is expecting to be sent a request as a message
 * which contains all the resources that the device will use.
 * The response contains those same resources, "translated"
 * which is to say, the values which should be used by the
 * hardware, when it delivers an interrupt. (MMIO resources are
 * used in local terms.) This is nice for Windows, and lines up
 * with the FDO/PDO split, which doesn't exist in Linux. Linux
 * is deeply expecting to scan an emulated PCI configuration
 * space. So this message is sent here only to drive the state
 * machine on the host forward.
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_send_resources_allocated(struct hv_device *hdev)
{
	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
	struct pci_resources_assigned *res_assigned;
	struct pci_resources_assigned2 *res_assigned2;
	struct hv_pci_compl comp_pkt;
	struct hv_pci_dev *hpdev;
	struct pci_packet *pkt;
	size_t size_res;
	u32 wslot;
	int ret;

	/* The wire format differs between protocol 1.1 and 1.2 hosts. */
	size_res = (pci_protocol_version < PCI_PROTOCOL_VERSION_1_2)
			? sizeof(*res_assigned) : sizeof(*res_assigned2);

	/* One packet, reused for every child slot. */
	pkt = kmalloc(sizeof(*pkt) + size_res, GFP_KERNEL);
	if (!pkt)
		return -ENOMEM;

	ret = 0;

	/* Walk all possible slots; get_pcichild_wslot() skips empty ones. */
	for (wslot = 0; wslot < 256; wslot++) {
		hpdev = get_pcichild_wslot(hbus, wslot);
		if (!hpdev)
			continue;

		memset(pkt, 0, sizeof(*pkt) + size_res);
		init_completion(&comp_pkt.host_event);
		pkt->completion_func = hv_pci_generic_compl;
		pkt->compl_ctxt = &comp_pkt;

		/* Build the version-appropriate message in place. */
		if (pci_protocol_version < PCI_PROTOCOL_VERSION_1_2) {
			res_assigned =
				(struct pci_resources_assigned *)&pkt->message;
			res_assigned->message_type.type =
				PCI_RESOURCES_ASSIGNED;
			res_assigned->wslot.slot = hpdev->desc.win_slot.slot;
		} else {
			res_assigned2 =
				(struct pci_resources_assigned2 *)&pkt->message;
			res_assigned2->message_type.type =
				PCI_RESOURCES_ASSIGNED2;
			res_assigned2->wslot.slot = hpdev->desc.win_slot.slot;
		}
		/* Drop the lookup reference; the slot number was captured. */
		put_pcichild(hpdev, hv_pcidev_ref_by_slot);

		ret = vmbus_sendpacket(hdev->channel, &pkt->message,
				size_res, (unsigned long)pkt,
				VM_PKT_DATA_INBAND,
				VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
		if (ret)
			break;

		wait_for_completion(&comp_pkt.host_event);

		if (comp_pkt.completion_status < 0) {
			ret = -EPROTO;
			dev_err(&hdev->device,
				"resource allocated returned 0x%x",
				comp_pkt.completion_status);
			break;
		}
	}

	kfree(pkt);
	return ret;
}
2428
/**
 * hv_send_resources_released() - Report local resources
 * released
 * @hdev: VMBus's tracking struct for this root PCI bus
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_send_resources_released(struct hv_device *hdev)
{
	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
	struct pci_child_message pkt;
	struct hv_pci_dev *hpdev;
	u32 wslot;
	int ret;

	/* Walk all possible slots; get_pcichild_wslot() skips empty ones. */
	for (wslot = 0; wslot < 256; wslot++) {
		hpdev = get_pcichild_wslot(hbus, wslot);
		if (!hpdev)
			continue;

		memset(&pkt, 0, sizeof(pkt));
		pkt.message_type.type = PCI_RESOURCES_RELEASED;
		pkt.wslot.slot = hpdev->desc.win_slot.slot;

		/* Drop the lookup reference; the slot number was captured. */
		put_pcichild(hpdev, hv_pcidev_ref_by_slot);

		/* Fire-and-forget: no completion is requested. */
		ret = vmbus_sendpacket(hdev->channel, &pkt, sizeof(pkt), 0,
				       VM_PKT_DATA_INBAND, 0);
		if (ret)
			return ret;
	}

	return 0;
}
2463
/* Take a reference that delays completion of hbus->remove_event. */
static void get_hvpcibus(struct hv_pcibus_device *hbus)
{
	atomic_inc(&hbus->remove_lock);
}
2468
2469static void put_hvpcibus(struct hv_pcibus_device *hbus)
2470{
2471 if (atomic_dec_and_test(&hbus->remove_lock))
2472 complete(&hbus->remove_event);
2473}
2474
/**
 * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
 * @hdev: VMBus's tracking struct for this root PCI bus
 * @dev_id: Identifies the device itself
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_pci_probe(struct hv_device *hdev,
			const struct hv_vmbus_device_id *dev_id)
{
	struct hv_pcibus_device *hbus;
	int ret;

	/*
	 * hv_pcibus_device contains the hypercall arguments for retargeting in
	 * hv_irq_unmask(). Those must not cross a page boundary.
	 */
	BUILD_BUG_ON(sizeof(*hbus) > PAGE_SIZE);

	/* A whole zeroed page, so the hypercall arguments can't straddle one. */
	hbus = (struct hv_pcibus_device *)get_zeroed_page(GFP_KERNEL);
	if (!hbus)
		return -ENOMEM;
	hbus->state = hv_pcibus_init;

	hv_tmpcpumap_init();

	/*
	 * The PCI bus "domain" is what is called "segment" in ACPI and
	 * other specs. Pull it from the instance ID, to get something
	 * unique. Bytes 8 and 9 are what is used in Windows guests, so
	 * do the same thing for consistency. Note that, since this code
	 * only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
	 * that (1) the only domain in use for something that looks like
	 * a physical PCI bus (which is actually emulated by the
	 * hypervisor) is domain 0 and (2) there will be no overlap
	 * between domains derived from these instance IDs in the same
	 * VM.
	 */
	hbus->sysdata.domain = hdev->dev_instance.b[9] |
			       hdev->dev_instance.b[8] << 8;

	hbus->hdev = hdev;
	/* Initial reference; dropped by put_hvpcibus() in hv_pci_remove(). */
	atomic_inc(&hbus->remove_lock);
	INIT_LIST_HEAD(&hbus->children);
	INIT_LIST_HEAD(&hbus->dr_list);
	INIT_LIST_HEAD(&hbus->resources_for_children);
	spin_lock_init(&hbus->config_lock);
	spin_lock_init(&hbus->device_list_lock);
	spin_lock_init(&hbus->retarget_msi_interrupt_lock);
	sema_init(&hbus->enum_sem, 1);
	init_completion(&hbus->remove_event);

	ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
			 hv_pci_onchannelcallback, hbus);
	if (ret)
		goto free_bus;

	hv_set_drvdata(hdev, hbus);

	/* Bring up the bus: negotiate, map config space, wire up IRQs. */
	ret = hv_pci_protocol_negotiation(hdev);
	if (ret)
		goto close;

	ret = hv_allocate_config_window(hbus);
	if (ret)
		goto close;

	hbus->cfg_addr = ioremap(hbus->mem_config->start,
				 PCI_CONFIG_MMIO_LENGTH);
	if (!hbus->cfg_addr) {
		dev_err(&hdev->device,
			"Unable to map a virtual address for config space\n");
		ret = -ENOMEM;
		goto free_config;
	}

	hbus->sysdata.fwnode = irq_domain_alloc_fwnode(hbus);
	if (!hbus->sysdata.fwnode) {
		ret = -ENOMEM;
		goto unmap;
	}

	ret = hv_pcie_init_irq_domain(hbus);
	if (ret)
		goto free_fwnode;

	ret = hv_pci_query_relations(hdev);
	if (ret)
		goto free_irq_domain;

	ret = hv_pci_enter_d0(hdev);
	if (ret)
		goto free_irq_domain;

	ret = hv_pci_allocate_bridge_windows(hbus);
	if (ret)
		goto free_irq_domain;

	ret = hv_send_resources_allocated(hdev);
	if (ret)
		goto free_windows;

	prepopulate_bars(hbus);

	hbus->state = hv_pcibus_probed;

	ret = create_root_hv_pci_bus(hbus);
	if (ret)
		goto free_windows;

	return 0;

	/* Unwind in reverse order of acquisition. */
free_windows:
	hv_pci_free_bridge_windows(hbus);
free_irq_domain:
	irq_domain_remove(hbus->irq_domain);
free_fwnode:
	irq_domain_free_fwnode(hbus->sysdata.fwnode);
unmap:
	iounmap(hbus->cfg_addr);
free_config:
	hv_free_config_window(hbus);
close:
	vmbus_close(hdev->channel);
free_bus:
	free_page((unsigned long)hbus);
	return ret;
}
2603
/*
 * hv_pci_bus_exit() - Notify the host that this bus is going away:
 * remove any remaining children, release their resources, and send
 * the D0-exit message.
 */
static void hv_pci_bus_exit(struct hv_device *hdev)
{
	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
	struct {
		struct pci_packet teardown_packet;
		/* Space for the message payload following the header. */
		u8 buffer[sizeof(struct pci_message)];
	} pkt;
	struct pci_bus_relations relations;
	struct hv_pci_compl comp_pkt;
	int ret;

	/*
	 * After the host sends the RESCIND_CHANNEL message, it doesn't
	 * access the per-channel ringbuffer any longer.
	 */
	if (hdev->channel->rescind)
		return;

	/*
	 * Delete any children which might still exist: an empty (zeroed)
	 * relations list tells hv_pci_devices_present() that no devices
	 * remain.
	 */
	memset(&relations, 0, sizeof(relations));
	hv_pci_devices_present(hbus, &relations);

	ret = hv_send_resources_released(hdev);
	if (ret)
		dev_err(&hdev->device,
			"Couldn't send resources released packet(s)\n");

	memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet));
	init_completion(&comp_pkt.host_event);
	pkt.teardown_packet.completion_func = hv_pci_generic_compl;
	pkt.teardown_packet.compl_ctxt = &comp_pkt;
	pkt.teardown_packet.message[0].type = PCI_BUS_D0EXIT;

	ret = vmbus_sendpacket(hdev->channel, &pkt.teardown_packet.message,
			       sizeof(struct pci_message),
			       (unsigned long)&pkt.teardown_packet,
			       VM_PKT_DATA_INBAND,
			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	/* Bounded wait: don't hang removal if the host never answers. */
	if (!ret)
		wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ);
}
Jake Oshins4daace02016-02-16 21:56:23 +00002645
Dexuan Cui179785242016-11-10 07:18:47 +00002646/**
2647 * hv_pci_remove() - Remove routine for this VMBus channel
2648 * @hdev: VMBus's tracking struct for this root PCI bus
2649 *
2650 * Return: 0 on success, -errno on failure
2651 */
2652static int hv_pci_remove(struct hv_device *hdev)
2653{
2654 struct hv_pcibus_device *hbus;
2655
2656 hbus = hv_get_drvdata(hdev);
Jake Oshins4daace02016-02-16 21:56:23 +00002657 if (hbus->state == hv_pcibus_installed) {
2658 /* Remove the bus from PCI's point of view. */
2659 pci_lock_rescan_remove();
2660 pci_stop_root_bus(hbus->pci_bus);
2661 pci_remove_root_bus(hbus->pci_bus);
2662 pci_unlock_rescan_remove();
Long Lid3a78d82017-03-23 14:58:10 -07002663 hbus->state = hv_pcibus_removed;
Jake Oshins4daace02016-02-16 21:56:23 +00002664 }
2665
Dexuan Cui179785242016-11-10 07:18:47 +00002666 hv_pci_bus_exit(hdev);
Vitaly Kuznetsovdeb22e52016-04-29 11:39:10 +02002667
Jake Oshins4daace02016-02-16 21:56:23 +00002668 vmbus_close(hdev->channel);
2669
Jake Oshins4daace02016-02-16 21:56:23 +00002670 iounmap(hbus->cfg_addr);
2671 hv_free_config_window(hbus);
2672 pci_free_resource_list(&hbus->resources_for_children);
2673 hv_pci_free_bridge_windows(hbus);
2674 irq_domain_remove(hbus->irq_domain);
2675 irq_domain_free_fwnode(hbus->sysdata.fwnode);
2676 put_hvpcibus(hbus);
2677 wait_for_completion(&hbus->remove_event);
Jork Loeserbe66b672017-05-24 13:41:25 -07002678 free_page((unsigned long)hbus);
Jake Oshins4daace02016-02-16 21:56:23 +00002679 return 0;
2680}
2681
static const struct hv_vmbus_device_id hv_pci_id_table[] = {
	/* PCI Pass-through Class ID */
	/* 44C4F61D-4444-4400-9D52-802E27EDE19F */
	{ HV_PCIE_GUID, },
	{ },	/* terminator */
};

MODULE_DEVICE_TABLE(vmbus, hv_pci_id_table);
2690
/* VMBus driver object registered in init_hv_pci_drv(). */
static struct hv_driver hv_pci_drv = {
	.name = "hv_pci",
	.id_table = hv_pci_id_table,
	.probe = hv_pci_probe,
	.remove = hv_pci_remove,
};
2697
/* Module unload: unregister the driver from the VMBus core. */
static void __exit exit_hv_pci_drv(void)
{
	vmbus_driver_unregister(&hv_pci_drv);
}
2702
/* Module load: register the driver with the VMBus core. */
static int __init init_hv_pci_drv(void)
{
	return vmbus_driver_register(&hv_pci_drv);
}
2707
/* Module entry/exit points and metadata. */
module_init(init_hv_pci_drv);
module_exit(exit_hv_pci_drv);

MODULE_DESCRIPTION("Hyper-V PCI");
MODULE_LICENSE("GPL v2");