Merge branch 'kvm-master' into HEAD
Merge more important SMM fixes.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 22a4b68..73588fc 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1553,6 +1553,7 @@
nosid disable Source ID checking
no_x2apic_optout
BIOS x2APIC opt-out request will be ignored
+ nopost disable Interrupt Posting
iomem= Disable strict checking of access to MMIO memory
strict regions from userspace.
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index d9eccee..34cc068 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -401,10 +401,9 @@
Architectures: x86, ppc, mips
Type: vcpu ioctl
Parameters: struct kvm_interrupt (in)
-Returns: 0 on success, -1 on error
+Returns: 0 on success, negative on failure.
-Queues a hardware interrupt vector to be injected. This is only
-useful if in-kernel local APIC or equivalent is not used.
+Queues a hardware interrupt vector to be injected.
/* for KVM_INTERRUPT */
struct kvm_interrupt {
@@ -414,7 +413,14 @@
X86:
-Note 'irq' is an interrupt vector, not an interrupt pin or line.
+Returns: 0 on success,
+ -EEXIST if an interrupt is already enqueued
+ -EINVAL the the irq number is invalid
+ -ENXIO if the PIC is in the kernel
+ -EFAULT if the pointer is invalid
+
+Note 'irq' is an interrupt vector, not an interrupt pin or line. This
+ioctl is useful if the in-kernel PIC is not used.
PPC:
@@ -1598,7 +1604,7 @@
struct kvm_ioeventfd {
__u64 datamatch;
__u64 addr; /* legal pio/mmio address */
- __u32 len; /* 1, 2, 4, or 8 bytes */
+ __u32 len; /* 0, 1, 2, 4, or 8 bytes */
__s32 fd;
__u32 flags;
__u8 pad[36];
@@ -1621,6 +1627,10 @@
For virtio-ccw devices, addr contains the subchannel id and datamatch the
virtqueue index.
+With KVM_CAP_IOEVENTFD_ANY_LENGTH, a zero length ioeventfd is allowed, and
+the kernel will ignore the length of guest write and may get a faster vmexit.
+The speedup may only apply to specific architectures, but the ioeventfd will
+work anyway.
4.60 KVM_DIRTY_TLB
@@ -3309,6 +3319,18 @@
to ignore the request, or to gather VM memory core dump and/or
reset/shutdown of the VM.
+ /* KVM_EXIT_IOAPIC_EOI */
+ struct {
+ __u8 vector;
+ } eoi;
+
+Indicates that the VCPU's in-kernel local APIC received an EOI for a
+level-triggered IOAPIC interrupt. This exit only triggers when the
+IOAPIC is implemented in userspace (i.e. KVM_CAP_SPLIT_IRQCHIP is enabled);
+the userspace IOAPIC should process the EOI and retrigger the interrupt if
+it is still asserted. Vector is the LAPIC interrupt vector for which the
+EOI was received.
+
/* Fix the size of the union. */
char padding[256];
};
@@ -3627,6 +3649,26 @@
KVM handlers should exit to userspace with rc = -EREMOTE.
+7.5 KVM_CAP_SPLIT_IRQCHIP
+
+Architectures: x86
+Parameters: args[0] - number of routes reserved for userspace IOAPICs
+Returns: 0 on success, -1 on error
+
+Create a local apic for each processor in the kernel. This can be used
+instead of KVM_CREATE_IRQCHIP if the userspace VMM wishes to emulate the
+IOAPIC and PIC (and also the PIT, even though this has to be enabled
+separately).
+
+This capability also enables in kernel routing of interrupt requests;
+when KVM_CAP_SPLIT_IRQCHIP only routes of KVM_IRQ_ROUTING_MSI type are
+used in the IRQ routing table. The first args[0] MSI routes are reserved
+for the IOAPIC pins. Whenever the LAPIC receives an EOI for these routes,
+a KVM_EXIT_IOAPIC_EOI vmexit will be reported to userspace.
+
+Fails if VCPU has already been created, or if the irqchip is already in the
+kernel (i.e. KVM_CREATE_IRQCHIP has already been called).
+
8. Other capabilities.
----------------------
diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
index d68af4d..19f94a6 100644
--- a/Documentation/virtual/kvm/locking.txt
+++ b/Documentation/virtual/kvm/locking.txt
@@ -166,3 +166,15 @@
MMIO/PIO address->device structure mapping (kvm->buses).
The srcu index can be stored in kvm_vcpu->srcu_idx per vcpu
if it is needed by multiple functions.
+
+Name: blocked_vcpu_on_cpu_lock
+Type: spinlock_t
+Arch: x86
+Protects: blocked_vcpu_on_cpu
+Comment: This is a per-CPU lock and it is used for VT-d posted-interrupts.
+ When VT-d posted-interrupts is supported and the VM has assigned
+ devices, we put the blocked vCPU on the list blocked_vcpu_on_cpu
+ protected by blocked_vcpu_on_cpu_lock, when VT-d hardware issues
+ wakeup notification event since external interrupts from the
+ assigned devices happens, we will find the vCPU on the list to
+ wakeup.
diff --git a/MAINTAINERS b/MAINTAINERS
index 797236b..3b738cb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11177,6 +11177,13 @@
S: Maintained
F: drivers/net/ethernet/via/via-velocity.*
+VIRT LIB
+M: Alex Williamson <alex.williamson@redhat.com>
+M: Paolo Bonzini <pbonzini@redhat.com>
+L: kvm@vger.kernel.org
+S: Supported
+F: virt/lib/
+
VIVID VIRTUAL VIDEO DRIVER
M: Hans Verkuil <hverkuil@xs4all.nl>
L: linux-media@vger.kernel.org
diff --git a/Makefile b/Makefile
index 1d341eb..7c6c976 100644
--- a/Makefile
+++ b/Makefile
@@ -550,6 +550,7 @@
net-y := net/
libs-y := lib/
core-y := usr/
+virt-y := virt/
endif # KBUILD_EXTMOD
ifeq ($(dot-config),1)
@@ -882,10 +883,10 @@
vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
$(core-y) $(core-m) $(drivers-y) $(drivers-m) \
- $(net-y) $(net-m) $(libs-y) $(libs-m)))
+ $(net-y) $(net-m) $(libs-y) $(libs-m) $(virt-y)))
vmlinux-alldirs := $(sort $(vmlinux-dirs) $(patsubst %/,%,$(filter %/, \
- $(init-) $(core-) $(drivers-) $(net-) $(libs-))))
+ $(init-) $(core-) $(drivers-) $(net-) $(libs-) $(virt-))))
init-y := $(patsubst %/, %/built-in.o, $(init-y))
core-y := $(patsubst %/, %/built-in.o, $(core-y))
@@ -894,14 +895,15 @@
libs-y1 := $(patsubst %/, %/lib.a, $(libs-y))
libs-y2 := $(patsubst %/, %/built-in.o, $(libs-y))
libs-y := $(libs-y1) $(libs-y2)
+virt-y := $(patsubst %/, %/built-in.o, $(virt-y))
# Externally visible symbols (used by link-vmlinux.sh)
export KBUILD_VMLINUX_INIT := $(head-y) $(init-y)
-export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y) $(drivers-y) $(net-y)
+export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y) $(drivers-y) $(net-y) $(virt-y)
export KBUILD_LDS := arch/$(SRCARCH)/kernel/vmlinux.lds
export LDFLAGS_vmlinux
# used by scripts/pacmage/Makefile
-export KBUILD_ALLDIRS := $(sort $(filter-out arch/%,$(vmlinux-alldirs)) arch Documentation include samples scripts tools virt)
+export KBUILD_ALLDIRS := $(sort $(filter-out arch/%,$(vmlinux-alldirs)) arch Documentation include samples scripts tools)
vmlinux-deps := $(KBUILD_LDS) $(KBUILD_VMLINUX_INIT) $(KBUILD_VMLINUX_MAIN)
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 5c2c169..373e323 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -51,11 +51,9 @@
static int psw_interrupts_disabled(struct kvm_vcpu *vcpu)
{
- if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) ||
- (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_IO) ||
- (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT))
- return 0;
- return 1;
+ return psw_extint_disabled(vcpu) &&
+ psw_ioint_disabled(vcpu) &&
+ psw_mchk_disabled(vcpu);
}
static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu)
@@ -71,13 +69,8 @@
static int ckc_irq_pending(struct kvm_vcpu *vcpu)
{
- preempt_disable();
- if (!(vcpu->arch.sie_block->ckc <
- get_tod_clock_fast() + vcpu->arch.sie_block->epoch)) {
- preempt_enable();
+ if (vcpu->arch.sie_block->ckc >= kvm_s390_get_tod_clock_fast(vcpu->kvm))
return 0;
- }
- preempt_enable();
return ckc_interrupts_enabled(vcpu);
}
@@ -109,14 +102,10 @@
return (int_word & 0x38000000) >> 27;
}
-static inline unsigned long pending_floating_irqs(struct kvm_vcpu *vcpu)
+static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu)
{
- return vcpu->kvm->arch.float_int.pending_irqs;
-}
-
-static inline unsigned long pending_local_irqs(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.local_int.pending_irqs;
+ return vcpu->kvm->arch.float_int.pending_irqs |
+ vcpu->arch.local_int.pending_irqs;
}
static unsigned long disable_iscs(struct kvm_vcpu *vcpu,
@@ -135,8 +124,7 @@
{
unsigned long active_mask;
- active_mask = pending_local_irqs(vcpu);
- active_mask |= pending_floating_irqs(vcpu);
+ active_mask = pending_irqs(vcpu);
if (!active_mask)
return 0;
@@ -204,7 +192,7 @@
static void set_intercept_indicators_io(struct kvm_vcpu *vcpu)
{
- if (!(pending_floating_irqs(vcpu) & IRQ_PEND_IO_MASK))
+ if (!(pending_irqs(vcpu) & IRQ_PEND_IO_MASK))
return;
else if (psw_ioint_disabled(vcpu))
__set_cpuflag(vcpu, CPUSTAT_IO_INT);
@@ -214,7 +202,7 @@
static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu)
{
- if (!(pending_local_irqs(vcpu) & IRQ_PEND_EXT_MASK))
+ if (!(pending_irqs(vcpu) & IRQ_PEND_EXT_MASK))
return;
if (psw_extint_disabled(vcpu))
__set_cpuflag(vcpu, CPUSTAT_EXT_INT);
@@ -224,7 +212,7 @@
static void set_intercept_indicators_mchk(struct kvm_vcpu *vcpu)
{
- if (!(pending_local_irqs(vcpu) & IRQ_PEND_MCHK_MASK))
+ if (!(pending_irqs(vcpu) & IRQ_PEND_MCHK_MASK))
return;
if (psw_mchk_disabled(vcpu))
vcpu->arch.sie_block->ictl |= ICTL_LPSW;
@@ -815,23 +803,21 @@
int kvm_s390_vcpu_has_irq(struct kvm_vcpu *vcpu, int exclude_stop)
{
- int rc;
+ if (deliverable_irqs(vcpu))
+ return 1;
- rc = !!deliverable_irqs(vcpu);
-
- if (!rc && kvm_cpu_has_pending_timer(vcpu))
- rc = 1;
+ if (kvm_cpu_has_pending_timer(vcpu))
+ return 1;
/* external call pending and deliverable */
- if (!rc && kvm_s390_ext_call_pending(vcpu) &&
+ if (kvm_s390_ext_call_pending(vcpu) &&
!psw_extint_disabled(vcpu) &&
(vcpu->arch.sie_block->gcr[0] & 0x2000ul))
- rc = 1;
+ return 1;
- if (!rc && !exclude_stop && kvm_s390_is_stop_irq_pending(vcpu))
- rc = 1;
-
- return rc;
+ if (!exclude_stop && kvm_s390_is_stop_irq_pending(vcpu))
+ return 1;
+ return 0;
}
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
@@ -846,7 +832,7 @@
vcpu->stat.exit_wait_state++;
/* fast path */
- if (kvm_cpu_has_pending_timer(vcpu) || kvm_arch_vcpu_runnable(vcpu))
+ if (kvm_arch_vcpu_runnable(vcpu))
return 0;
if (psw_interrupts_disabled(vcpu)) {
@@ -860,9 +846,7 @@
goto no_timer;
}
- preempt_disable();
- now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch;
- preempt_enable();
+ now = kvm_s390_get_tod_clock_fast(vcpu->kvm);
sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
/* underflow */
@@ -901,9 +885,7 @@
u64 now, sltime;
vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer);
- preempt_disable();
- now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch;
- preempt_enable();
+ now = kvm_s390_get_tod_clock_fast(vcpu->kvm);
sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
/*
@@ -981,39 +963,30 @@
trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
irq->u.pgm.code, 0);
- li->irq.pgm = irq->u.pgm;
+ if (irq->u.pgm.code == PGM_PER) {
+ li->irq.pgm.code |= PGM_PER;
+ /* only modify PER related information */
+ li->irq.pgm.per_address = irq->u.pgm.per_address;
+ li->irq.pgm.per_code = irq->u.pgm.per_code;
+ li->irq.pgm.per_atmid = irq->u.pgm.per_atmid;
+ li->irq.pgm.per_access_id = irq->u.pgm.per_access_id;
+ } else if (!(irq->u.pgm.code & PGM_PER)) {
+ li->irq.pgm.code = (li->irq.pgm.code & PGM_PER) |
+ irq->u.pgm.code;
+ /* only modify non-PER information */
+ li->irq.pgm.trans_exc_code = irq->u.pgm.trans_exc_code;
+ li->irq.pgm.mon_code = irq->u.pgm.mon_code;
+ li->irq.pgm.data_exc_code = irq->u.pgm.data_exc_code;
+ li->irq.pgm.mon_class_nr = irq->u.pgm.mon_class_nr;
+ li->irq.pgm.exc_access_id = irq->u.pgm.exc_access_id;
+ li->irq.pgm.op_access_id = irq->u.pgm.op_access_id;
+ } else {
+ li->irq.pgm = irq->u.pgm;
+ }
set_bit(IRQ_PEND_PROG, &li->pending_irqs);
return 0;
}
-int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
-{
- struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
- struct kvm_s390_irq irq;
-
- spin_lock(&li->lock);
- irq.u.pgm.code = code;
- __inject_prog(vcpu, &irq);
- BUG_ON(waitqueue_active(li->wq));
- spin_unlock(&li->lock);
- return 0;
-}
-
-int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu,
- struct kvm_s390_pgm_info *pgm_info)
-{
- struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
- struct kvm_s390_irq irq;
- int rc;
-
- spin_lock(&li->lock);
- irq.u.pgm = *pgm_info;
- rc = __inject_prog(vcpu, &irq);
- BUG_ON(waitqueue_active(li->wq));
- spin_unlock(&li->lock);
- return rc;
-}
-
static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -1390,12 +1363,9 @@
static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
{
- struct kvm_s390_float_interrupt *fi;
u64 type = READ_ONCE(inti->type);
int rc;
- fi = &kvm->arch.float_int;
-
switch (type) {
case KVM_S390_MCHK:
rc = __inject_float_mchk(kvm, inti);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 0a67c40..618c854 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -521,27 +521,12 @@
static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
{
- struct kvm_vcpu *cur_vcpu;
- unsigned int vcpu_idx;
- u64 host_tod, gtod;
- int r;
+ u64 gtod;
if (copy_from_user(>od, (void __user *)attr->addr, sizeof(gtod)))
return -EFAULT;
- r = store_tod_clock(&host_tod);
- if (r)
- return r;
-
- mutex_lock(&kvm->lock);
- preempt_disable();
- kvm->arch.epoch = gtod - host_tod;
- kvm_s390_vcpu_block_all(kvm);
- kvm_for_each_vcpu(vcpu_idx, cur_vcpu, kvm)
- cur_vcpu->arch.sie_block->epoch = kvm->arch.epoch;
- kvm_s390_vcpu_unblock_all(kvm);
- preempt_enable();
- mutex_unlock(&kvm->lock);
+ kvm_s390_set_tod_clock(kvm, gtod);
VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx\n", gtod);
return 0;
}
@@ -581,16 +566,9 @@
static int kvm_s390_get_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
{
- u64 host_tod, gtod;
- int r;
+ u64 gtod;
- r = store_tod_clock(&host_tod);
- if (r)
- return r;
-
- preempt_disable();
- gtod = host_tod + kvm->arch.epoch;
- preempt_enable();
+ gtod = kvm_s390_get_tod_clock_fast(kvm);
if (copy_to_user((void __user *)attr->addr, >od, sizeof(gtod)))
return -EFAULT;
VM_EVENT(kvm, 3, "QUERY: TOD base: 0x%llx\n", gtod);
@@ -1916,6 +1894,22 @@
return 0;
}
+void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ mutex_lock(&kvm->lock);
+ preempt_disable();
+ kvm->arch.epoch = tod - get_tod_clock();
+ kvm_s390_vcpu_block_all(kvm);
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ vcpu->arch.sie_block->epoch = kvm->arch.epoch;
+ kvm_s390_vcpu_unblock_all(kvm);
+ preempt_enable();
+ mutex_unlock(&kvm->lock);
+}
+
/**
* kvm_arch_fault_in_page - fault-in guest page if necessary
* @vcpu: The corresponding virtual cpu
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index c446aab..1e70e00 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -175,6 +175,7 @@
return kvm->arch.user_cpu_state_ctrl != 0;
}
+/* implemented in interrupt.c */
int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu);
enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
@@ -185,7 +186,25 @@
struct kvm_s390_interrupt *s390int);
int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
struct kvm_s390_irq *irq);
-int __must_check kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code);
+static inline int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu,
+ struct kvm_s390_pgm_info *pgm_info)
+{
+ struct kvm_s390_irq irq = {
+ .type = KVM_S390_PROGRAM_INT,
+ .u.pgm = *pgm_info,
+ };
+
+ return kvm_s390_inject_vcpu(vcpu, &irq);
+}
+static inline int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
+{
+ struct kvm_s390_irq irq = {
+ .type = KVM_S390_PROGRAM_INT,
+ .u.pgm.code = code,
+ };
+
+ return kvm_s390_inject_vcpu(vcpu, &irq);
+}
struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
u64 isc_mask, u32 schid);
int kvm_s390_reinject_io_int(struct kvm *kvm,
@@ -212,6 +231,7 @@
int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
/* implemented in kvm-s390.c */
+void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod);
long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu,
@@ -231,9 +251,6 @@
/* implemented in diag.c */
int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
-/* implemented in interrupt.c */
-int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu,
- struct kvm_s390_pgm_info *pgm_info);
static inline void kvm_s390_vcpu_block_all(struct kvm *kvm)
{
@@ -254,6 +271,16 @@
kvm_s390_vcpu_unblock(vcpu);
}
+static inline u64 kvm_s390_get_tod_clock_fast(struct kvm *kvm)
+{
+ u64 rc;
+
+ preempt_disable();
+ rc = get_tod_clock_fast() + kvm->arch.epoch;
+ preempt_enable();
+ return rc;
+}
+
/**
* kvm_s390_inject_prog_cond - conditionally inject a program check
* @vcpu: virtual cpu
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 4d21dc4..77191b8 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -33,11 +33,9 @@
/* Handle SCK (SET CLOCK) interception */
static int handle_set_clock(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *cpup;
- s64 hostclk, val;
- int i, rc;
+ int rc;
ar_t ar;
- u64 op2;
+ u64 op2, val;
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
@@ -49,19 +47,8 @@
if (rc)
return kvm_s390_inject_prog_cond(vcpu, rc);
- if (store_tod_clock(&hostclk)) {
- kvm_s390_set_psw_cc(vcpu, 3);
- return 0;
- }
VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", val);
- val = (val - hostclk) & ~0x3fUL;
-
- mutex_lock(&vcpu->kvm->lock);
- preempt_disable();
- kvm_for_each_vcpu(i, cpup, vcpu->kvm)
- cpup->arch.sie_block->epoch = val;
- preempt_enable();
- mutex_unlock(&vcpu->kvm->lock);
+ kvm_s390_set_tod_clock(vcpu->kvm, val);
kvm_s390_set_psw_cc(vcpu, 0);
return 0;
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index ebf6d5e..a30316b 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -115,6 +115,59 @@
return msr & X2APIC_ENABLE;
}
+extern void enable_IR_x2apic(void);
+
+extern int get_physical_broadcast(void);
+
+extern int lapic_get_maxlvt(void);
+extern void clear_local_APIC(void);
+extern void disconnect_bsp_APIC(int virt_wire_setup);
+extern void disable_local_APIC(void);
+extern void lapic_shutdown(void);
+extern void sync_Arb_IDs(void);
+extern void init_bsp_APIC(void);
+extern void setup_local_APIC(void);
+extern void init_apic_mappings(void);
+void register_lapic_address(unsigned long address);
+extern void setup_boot_APIC_clock(void);
+extern void setup_secondary_APIC_clock(void);
+extern int APIC_init_uniprocessor(void);
+
+#ifdef CONFIG_X86_64
+static inline int apic_force_enable(unsigned long addr)
+{
+ return -1;
+}
+#else
+extern int apic_force_enable(unsigned long addr);
+#endif
+
+extern int apic_bsp_setup(bool upmode);
+extern void apic_ap_setup(void);
+
+/*
+ * On 32bit this is mach-xxx local
+ */
+#ifdef CONFIG_X86_64
+extern int apic_is_clustered_box(void);
+#else
+static inline int apic_is_clustered_box(void)
+{
+ return 0;
+}
+#endif
+
+extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask);
+
+#else /* !CONFIG_X86_LOCAL_APIC */
+static inline void lapic_shutdown(void) { }
+#define local_apic_timer_c2_ok 1
+static inline void init_apic_mappings(void) { }
+static inline void disable_local_APIC(void) { }
+# define setup_boot_APIC_clock x86_init_noop
+# define setup_secondary_APIC_clock x86_init_noop
+#endif /* !CONFIG_X86_LOCAL_APIC */
+
#ifdef CONFIG_X86_X2APIC
/*
* Make previous memory operations globally visible before
@@ -186,67 +239,14 @@
}
#define x2apic_supported() (cpu_has_x2apic)
-#else
+#else /* !CONFIG_X86_X2APIC */
static inline void check_x2apic(void) { }
static inline void x2apic_setup(void) { }
static inline int x2apic_enabled(void) { return 0; }
#define x2apic_mode (0)
#define x2apic_supported() (0)
-#endif
-
-extern void enable_IR_x2apic(void);
-
-extern int get_physical_broadcast(void);
-
-extern int lapic_get_maxlvt(void);
-extern void clear_local_APIC(void);
-extern void disconnect_bsp_APIC(int virt_wire_setup);
-extern void disable_local_APIC(void);
-extern void lapic_shutdown(void);
-extern void sync_Arb_IDs(void);
-extern void init_bsp_APIC(void);
-extern void setup_local_APIC(void);
-extern void init_apic_mappings(void);
-void register_lapic_address(unsigned long address);
-extern void setup_boot_APIC_clock(void);
-extern void setup_secondary_APIC_clock(void);
-extern int APIC_init_uniprocessor(void);
-
-#ifdef CONFIG_X86_64
-static inline int apic_force_enable(unsigned long addr)
-{
- return -1;
-}
-#else
-extern int apic_force_enable(unsigned long addr);
-#endif
-
-extern int apic_bsp_setup(bool upmode);
-extern void apic_ap_setup(void);
-
-/*
- * On 32bit this is mach-xxx local
- */
-#ifdef CONFIG_X86_64
-extern int apic_is_clustered_box(void);
-#else
-static inline int apic_is_clustered_box(void)
-{
- return 0;
-}
-#endif
-
-extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask);
-
-#else /* !CONFIG_X86_LOCAL_APIC */
-static inline void lapic_shutdown(void) { }
-#define local_apic_timer_c2_ok 1
-static inline void init_apic_mappings(void) { }
-static inline void disable_local_APIC(void) { }
-# define setup_boot_APIC_clock x86_init_noop
-# define setup_secondary_APIC_clock x86_init_noop
-#endif /* !CONFIG_X86_LOCAL_APIC */
+#endif /* !CONFIG_X86_X2APIC */
#ifdef CONFIG_X86_64
#define SET_APIC_ID(x) (apic->set_apic_id(x))
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 046c7fb..a210eba 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -33,6 +33,11 @@
IRQ_POSTING_CAP = 0,
};
+struct vcpu_data {
+ u64 pi_desc_addr; /* Physical address of PI Descriptor */
+ u32 vector; /* Guest vector of the interrupt */
+};
+
#ifdef CONFIG_IRQ_REMAP
extern bool irq_remapping_cap(enum irq_remap_cap cap);
@@ -58,11 +63,6 @@
return x86_vector_domain;
}
-struct vcpu_data {
- u64 pi_desc_addr; /* Physical address of PI Descriptor */
- u32 vector; /* Guest vector of the interrupt */
-};
-
#else /* CONFIG_IRQ_REMAP */
static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; }
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3a36ee7..53deb27 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -24,6 +24,7 @@
#include <linux/perf_event.h>
#include <linux/pvclock_gtod.h>
#include <linux/clocksource.h>
+#include <linux/irqbypass.h>
#include <asm/pvclock-abi.h>
#include <asm/desc.h>
@@ -176,6 +177,8 @@
*/
#define KVM_APIC_PV_EOI_PENDING 1
+struct kvm_kernel_irq_routing_entry;
+
/*
* We don't want allocation failures within the mmu code, so we preallocate
* enough memory for a single page fault in a cache.
@@ -374,6 +377,7 @@
/* Hyper-V per vcpu emulation context */
struct kvm_vcpu_hv {
u64 hv_vapic;
+ s64 runtime_offset;
};
struct kvm_vcpu_arch {
@@ -396,6 +400,7 @@
u64 efer;
u64 apic_base;
struct kvm_lapic *apic; /* kernel irqchip context */
+ u64 eoi_exit_bitmap[4];
unsigned long apic_attention;
int32_t apic_arb_prio;
int mp_state;
@@ -573,6 +578,9 @@
struct {
bool pv_unhalted;
} pv;
+
+ int pending_ioapic_eoi;
+ int pending_external_vector;
};
struct kvm_lpage_info {
@@ -683,6 +691,9 @@
u32 bsp_vcpu_id;
u64 disabled_quirks;
+
+ bool irqchip_split;
+ u8 nr_reserved_ioapic_pins;
};
struct kvm_vm_stat {
@@ -819,10 +830,10 @@
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
- int (*vm_has_apicv)(struct kvm *kvm);
+ int (*cpu_uses_apicv)(struct kvm_vcpu *vcpu);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
void (*hwapic_isr_update)(struct kvm *kvm, int isr);
- void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+ void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu);
void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
@@ -887,6 +898,20 @@
gfn_t offset, unsigned long mask);
/* pmu operations of sub-arch */
const struct kvm_pmu_ops *pmu_ops;
+
+ /*
+ * Architecture specific hooks for vCPU blocking due to
+ * HLT instruction.
+ * Returns for .pre_block():
+ * - 0 means continue to block the vCPU.
+ * - 1 means we cannot block the vCPU since some event
+ * happens during this period, such as, 'ON' bit in
+ * posted-interrupts descriptor is set.
+ */
+ int (*pre_block)(struct kvm_vcpu *vcpu);
+ void (*post_block)(struct kvm_vcpu *vcpu);
+ int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set);
};
struct kvm_arch_async_pf {
@@ -1231,4 +1256,9 @@
bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
+bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu);
+
+void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm_lapic_irq *irq);
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 448b7ca..d25f32a 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -72,7 +72,7 @@
#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
#define SECONDARY_EXEC_ENABLE_PML 0x00020000
#define SECONDARY_EXEC_XSAVES 0x00100000
-
+#define SECONDARY_EXEC_PCOMMIT 0x00200000
#define PIN_BASED_EXT_INTR_MASK 0x00000001
#define PIN_BASED_NMI_EXITING 0x00000008
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index f0412c5..2677a0a 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -153,6 +153,12 @@
/* MSR used to provide vcpu index */
#define HV_X64_MSR_VP_INDEX 0x40000002
+/* MSR used to reset the guest OS. */
+#define HV_X64_MSR_RESET 0x40000003
+
+/* MSR used to provide vcpu runtime in 100ns units */
+#define HV_X64_MSR_VP_RUNTIME 0x40000010
+
/* MSR used to read the per-partition time reference counter */
#define HV_X64_MSR_TIME_REF_COUNT 0x40000020
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 37fee27..5b15d94 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -78,6 +78,7 @@
#define EXIT_REASON_PML_FULL 62
#define EXIT_REASON_XSAVES 63
#define EXIT_REASON_XRSTORS 64
+#define EXIT_REASON_PCOMMIT 65
#define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -126,7 +127,8 @@
{ EXIT_REASON_INVVPID, "INVVPID" }, \
{ EXIT_REASON_INVPCID, "INVPCID" }, \
{ EXIT_REASON_XSAVES, "XSAVES" }, \
- { EXIT_REASON_XRSTORS, "XRSTORS" }
+ { EXIT_REASON_XRSTORS, "XRSTORS" }, \
+ { EXIT_REASON_PCOMMIT, "PCOMMIT" }
#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1
#define VMX_ABORT_LOAD_HOST_MSR_FAIL 4
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 2c7aafa..2bd81e3 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -32,6 +32,7 @@
static int kvmclock = 1;
static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
+static cycle_t kvm_sched_clock_offset;
static int parse_no_kvmclock(char *arg)
{
@@ -92,6 +93,29 @@
return kvm_clock_read();
}
+static cycle_t kvm_sched_clock_read(void)
+{
+ return kvm_clock_read() - kvm_sched_clock_offset;
+}
+
+static inline void kvm_sched_clock_init(bool stable)
+{
+ if (!stable) {
+ pv_time_ops.sched_clock = kvm_clock_read;
+ return;
+ }
+
+ kvm_sched_clock_offset = kvm_clock_read();
+ pv_time_ops.sched_clock = kvm_sched_clock_read;
+ set_sched_clock_stable();
+
+ printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n",
+ kvm_sched_clock_offset);
+
+ BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
+ sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
+}
+
/*
* If we don't do that, there is the possibility that the guest
* will calibrate under heavy load - thus, getting a lower lpj -
@@ -248,7 +272,17 @@
memblock_free(mem, size);
return;
}
- pv_time_ops.sched_clock = kvm_clock_read;
+
+ if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
+ pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+
+ cpu = get_cpu();
+ vcpu_time = &hv_clock[cpu].pvti;
+ flags = pvclock_read_flags(vcpu_time);
+
+ kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
+ put_cpu();
+
x86_platform.calibrate_tsc = kvm_get_tsc_khz;
x86_platform.get_wallclock = kvm_get_wallclock;
x86_platform.set_wallclock = kvm_set_wallclock;
@@ -265,16 +299,6 @@
kvm_get_preset_lpj();
clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
pv_info.name = "KVM";
-
- if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
- pvclock_set_flags(~0);
-
- cpu = get_cpu();
- vcpu_time = &hv_clock[cpu].pvti;
- flags = pvclock_read_flags(vcpu_time);
- if (flags & PVCLOCK_COUNTS_FROM_ZERO)
- set_sched_clock_stable();
- put_cpu();
}
int __init kvm_setup_vsyscall_timeinfo(void)
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index d8a1d56..639a6e3 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,8 @@
select ANON_INODES
select HAVE_KVM_IRQCHIP
select HAVE_KVM_IRQFD
+ select IRQ_BYPASS_MANAGER
+ select HAVE_KVM_IRQ_BYPASS
select HAVE_KVM_IRQ_ROUTING
select HAVE_KVM_EVENTFD
select KVM_APIC_ARCHITECTURE
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 2fbea25..faeb0b3 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -348,7 +348,7 @@
F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
- F(AVX512CD);
+ F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT);
/* cpuid 0xD.1.eax */
const u32 kvm_supported_word10_x86_features =
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index dd05b9c..d434ee9 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -133,4 +133,20 @@
best = kvm_find_cpuid_entry(vcpu, 7, 0);
return best && (best->ebx & bit(X86_FEATURE_MPX));
}
+
+static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ return best && (best->ebx & bit(X86_FEATURE_PCOMMIT));
+}
+
+static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+ return best && (best->edx & bit(X86_FEATURE_RDTSCP));
+}
#endif
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index a8160d2..62cf8c9 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -41,6 +41,7 @@
case HV_X64_MSR_TIME_REF_COUNT:
case HV_X64_MSR_CRASH_CTL:
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_RESET:
r = true;
break;
}
@@ -163,6 +164,12 @@
data);
case HV_X64_MSR_CRASH_CTL:
return kvm_hv_msr_set_crash_ctl(vcpu, data, host);
+ case HV_X64_MSR_RESET:
+ if (data == 1) {
+ vcpu_debug(vcpu, "hyper-v reset requested\n");
+ kvm_make_request(KVM_REQ_HV_RESET, vcpu);
+ }
+ break;
default:
vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
msr, data);
@@ -171,7 +178,16 @@
return 0;
}
-static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+/* Calculate cpu time spent by current task in 100ns units */
+static u64 current_task_runtime_100ns(void)
+{
+ cputime_t utime, stime;
+
+ task_cputime_adjusted(current, &utime, &stime);
+ return div_u64(cputime_to_nsecs(utime + stime), 100);
+}
+
+static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
{
struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
@@ -205,6 +221,11 @@
return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
case HV_X64_MSR_TPR:
return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
+ case HV_X64_MSR_VP_RUNTIME:
+ if (!host)
+ return 1;
+ hv->runtime_offset = data - current_task_runtime_100ns();
+ break;
default:
vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
msr, data);
@@ -241,6 +262,9 @@
pdata);
case HV_X64_MSR_CRASH_CTL:
return kvm_hv_msr_get_crash_ctl(vcpu, pdata);
+ case HV_X64_MSR_RESET:
+ data = 0;
+ break;
default:
vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
return 1;
@@ -277,6 +301,9 @@
case HV_X64_MSR_APIC_ASSIST_PAGE:
data = hv->hv_vapic;
break;
+ case HV_X64_MSR_VP_RUNTIME:
+ data = current_task_runtime_100ns() + hv->runtime_offset;
+ break;
default:
vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
return 1;
@@ -295,7 +322,7 @@
mutex_unlock(&vcpu->kvm->lock);
return r;
} else
- return kvm_hv_set_msr(vcpu, msr, data);
+ return kvm_hv_set_msr(vcpu, msr, data, host);
}
int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index f90952f..08116ff 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -35,6 +35,7 @@
#include <linux/kvm_host.h>
#include <linux/slab.h>
+#include "ioapic.h"
#include "irq.h"
#include "i8254.h"
#include "x86.h"
@@ -333,7 +334,8 @@
struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
s64 interval;
- if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
+ if (!ioapic_in_kernel(kvm) ||
+ ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
return;
interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 856f791..2dcda0f 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -233,21 +233,7 @@
}
-static void update_handled_vectors(struct kvm_ioapic *ioapic)
-{
- DECLARE_BITMAP(handled_vectors, 256);
- int i;
-
- memset(handled_vectors, 0, sizeof(handled_vectors));
- for (i = 0; i < IOAPIC_NUM_PINS; ++i)
- __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors);
- memcpy(ioapic->handled_vectors, handled_vectors,
- sizeof(handled_vectors));
- smp_wmb();
-}
-
-void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
- u32 *tmr)
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
union kvm_ioapic_redirect_entry *e;
@@ -260,13 +246,9 @@
kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) ||
index == RTC_GSI) {
if (kvm_apic_match_dest(vcpu, NULL, 0,
- e->fields.dest_id, e->fields.dest_mode)) {
+ e->fields.dest_id, e->fields.dest_mode))
__set_bit(e->fields.vector,
(unsigned long *)eoi_exit_bitmap);
- if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG)
- __set_bit(e->fields.vector,
- (unsigned long *)tmr);
- }
}
}
spin_unlock(&ioapic->lock);
@@ -315,7 +297,6 @@
e->bits |= (u32) val;
e->fields.remote_irr = 0;
}
- update_handled_vectors(ioapic);
mask_after = e->fields.mask;
if (mask_before != mask_after)
kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
@@ -599,7 +580,6 @@
ioapic->id = 0;
memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
rtc_irq_eoi_tracking_reset(ioapic);
- update_handled_vectors(ioapic);
}
static const struct kvm_io_device_ops ioapic_mmio_ops = {
@@ -628,8 +608,10 @@
if (ret < 0) {
kvm->arch.vioapic = NULL;
kfree(ioapic);
+ return ret;
}
+ kvm_vcpu_request_scan_ioapic(kvm);
return ret;
}
@@ -666,7 +648,6 @@
memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
ioapic->irr = 0;
ioapic->irr_delivered = 0;
- update_handled_vectors(ioapic);
kvm_vcpu_request_scan_ioapic(kvm);
kvm_ioapic_inject_all(ioapic, state->irr);
spin_unlock(&ioapic->lock);
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index ca0b0b4..084617d 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -9,6 +9,7 @@
struct kvm_vcpu;
#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
+#define MAX_NR_RESERVED_IOAPIC_PINS KVM_MAX_IRQ_ROUTES
#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
#define IOAPIC_EDGE_TRIG 0
#define IOAPIC_LEVEL_TRIG 1
@@ -73,7 +74,6 @@
struct kvm *kvm;
void (*ack_notifier)(void *opaque, int irq);
spinlock_t lock;
- DECLARE_BITMAP(handled_vectors, 256);
struct rtc_status rtc_status;
struct delayed_work eoi_inject;
u32 irq_eoi[IOAPIC_NUM_PINS];
@@ -98,11 +98,12 @@
return kvm->arch.vioapic;
}
-static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
+static inline int ioapic_in_kernel(struct kvm *kvm)
{
- struct kvm_ioapic *ioapic = kvm->arch.vioapic;
- smp_rmb();
- return test_bit(vector, ioapic->handled_vectors);
+ int ret;
+
+ ret = (ioapic_irqchip(kvm) != NULL);
+ return ret;
}
void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
@@ -120,7 +121,7 @@
struct kvm_lapic_irq *irq, unsigned long *dest_map);
int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
-void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
- u32 *tmr);
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
#endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index a1ec6a50..097060e 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -38,14 +38,27 @@
EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
/*
+ * check if there is a pending userspace external interrupt
+ */
+static int pending_userspace_extint(struct kvm_vcpu *v)
+{
+ return v->arch.pending_external_vector != -1;
+}
+
+/*
* check if there is pending interrupt from
* non-APIC source without intack.
*/
static int kvm_cpu_has_extint(struct kvm_vcpu *v)
{
- if (kvm_apic_accept_pic_intr(v))
- return pic_irqchip(v->kvm)->output; /* PIC */
- else
+ u8 accept = kvm_apic_accept_pic_intr(v);
+
+ if (accept) {
+ if (irqchip_split(v->kvm))
+ return pending_userspace_extint(v);
+ else
+ return pic_irqchip(v->kvm)->output;
+ } else
return 0;
}
@@ -57,13 +70,13 @@
*/
int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
{
- if (!irqchip_in_kernel(v->kvm))
+ if (!lapic_in_kernel(v))
return v->arch.interrupt.pending;
if (kvm_cpu_has_extint(v))
return 1;
- if (kvm_apic_vid_enabled(v->kvm))
+ if (kvm_vcpu_apic_vid_enabled(v))
return 0;
return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
@@ -75,7 +88,7 @@
*/
int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
{
- if (!irqchip_in_kernel(v->kvm))
+ if (!lapic_in_kernel(v))
return v->arch.interrupt.pending;
if (kvm_cpu_has_extint(v))
@@ -91,9 +104,16 @@
*/
static int kvm_cpu_get_extint(struct kvm_vcpu *v)
{
- if (kvm_cpu_has_extint(v))
- return kvm_pic_read_irq(v->kvm); /* PIC */
- return -1;
+ if (kvm_cpu_has_extint(v)) {
+ if (irqchip_split(v->kvm)) {
+ int vector = v->arch.pending_external_vector;
+
+ v->arch.pending_external_vector = -1;
+ return vector;
+ } else
+ return kvm_pic_read_irq(v->kvm); /* PIC */
+ } else
+ return -1;
}
/*
@@ -103,7 +123,7 @@
{
int vector;
- if (!irqchip_in_kernel(v->kvm))
+ if (!lapic_in_kernel(v))
return v->arch.interrupt.nr;
vector = kvm_cpu_get_extint(v);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 3d782a2..ae5c78f 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -83,13 +83,38 @@
return kvm->arch.vpic;
}
+static inline int pic_in_kernel(struct kvm *kvm)
+{
+ int ret;
+
+ ret = (pic_irqchip(kvm) != NULL);
+ return ret;
+}
+
+static inline int irqchip_split(struct kvm *kvm)
+{
+ return kvm->arch.irqchip_split;
+}
+
static inline int irqchip_in_kernel(struct kvm *kvm)
{
struct kvm_pic *vpic = pic_irqchip(kvm);
+ bool ret;
+
+ ret = (vpic != NULL);
+ ret |= irqchip_split(kvm);
/* Read vpic before kvm->irq_routing. */
smp_rmb();
- return vpic != NULL;
+ return ret;
+}
+
+static inline int lapic_in_kernel(struct kvm_vcpu *vcpu)
+{
+ /* Same as irqchip_in_kernel(vcpu->kvm), but with less
+ * pointer chasing and no unnecessary memory barriers.
+ */
+ return vcpu->arch.apic != NULL;
}
void kvm_pic_reset(struct kvm_kpic_state *s);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 9efff9e..c892289 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -91,8 +91,8 @@
return r;
}
-static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
- struct kvm_lapic_irq *irq)
+void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm_lapic_irq *irq)
{
trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
@@ -108,6 +108,7 @@
irq->level = 1;
irq->shorthand = 0;
}
+EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level, bool line_status)
@@ -208,7 +209,7 @@
goto unlock;
}
clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
- if (!irqchip_in_kernel(kvm))
+ if (!ioapic_in_kernel(kvm))
goto unlock;
kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
@@ -297,6 +298,33 @@
return r;
}
+bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu)
+{
+ int i, r = 0;
+ struct kvm_vcpu *vcpu;
+
+ if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
+ return true;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (++r == 2)
+ return false;
+
+ *dest_vcpu = vcpu;
+ }
+
+ return r == 1;
+}
+EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
+
#define IOAPIC_ROUTING_ENTRY(irq) \
{ .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
.u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
@@ -328,3 +356,52 @@
return kvm_set_irq_routing(kvm, default_routing,
ARRAY_SIZE(default_routing), 0);
}
+
+static const struct kvm_irq_routing_entry empty_routing[] = {};
+
+int kvm_setup_empty_irq_routing(struct kvm *kvm)
+{
+ return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
+}
+
+void kvm_arch_irq_routing_update(struct kvm *kvm)
+{
+ if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm))
+ return;
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_kernel_irq_routing_entry *entry;
+ struct kvm_irq_routing_table *table;
+ u32 i, nr_ioapic_pins;
+ int idx;
+
+ /* kvm->irq_routing must be read after clearing
+ * KVM_SCAN_IOAPIC. */
+ smp_mb();
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
+ kvm->arch.nr_reserved_ioapic_pins);
+ for (i = 0; i < nr_ioapic_pins; ++i) {
+ hlist_for_each_entry(entry, &table->map[i], link) {
+ u32 dest_id, dest_mode;
+
+ if (entry->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+ dest_id = (entry->msi.address_lo >> 12) & 0xff;
+ dest_mode = (entry->msi.address_lo >> 2) & 0x1;
+ if (kvm_apic_match_dest(vcpu, NULL, 0, dest_id,
+ dest_mode)) {
+ u32 vector = entry->msi.data & 0xff;
+
+ __set_bit(vector,
+ (unsigned long *) eoi_exit_bitmap);
+ }
+ }
+ }
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 8d9013c..944b38a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -209,7 +209,7 @@
if (old)
kfree_rcu(old, rcu);
- kvm_vcpu_request_scan_ioapic(kvm);
+ kvm_make_scan_ioapic_request(kvm);
}
static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
@@ -390,7 +390,7 @@
vcpu = apic->vcpu;
- if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) {
+ if (unlikely(kvm_vcpu_apic_vid_enabled(vcpu))) {
/* try to update RVI */
apic_clear_vector(vec, apic->regs + APIC_IRR);
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -551,15 +551,6 @@
__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
}
-void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
- int i;
-
- for (i = 0; i < 8; i++)
- apic_set_reg(apic, APIC_TMR + 0x10 * i, tmr[i]);
-}
-
static void apic_update_ppr(struct kvm_lapic *apic)
{
u32 tpr, isrv, ppr, old_ppr;
@@ -764,6 +755,65 @@
return ret;
}
+bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu)
+{
+ struct kvm_apic_map *map;
+ bool ret = false;
+ struct kvm_lapic *dst = NULL;
+
+ if (irq->shorthand)
+ return false;
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ if (!map)
+ goto out;
+
+ if (irq->dest_mode == APIC_DEST_PHYSICAL) {
+ if (irq->dest_id == 0xFF)
+ goto out;
+
+ if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
+ goto out;
+
+ dst = map->phys_map[irq->dest_id];
+ if (dst && kvm_apic_present(dst->vcpu))
+ *dest_vcpu = dst->vcpu;
+ else
+ goto out;
+ } else {
+ u16 cid;
+ unsigned long bitmap = 1;
+ int i, r = 0;
+
+ if (!kvm_apic_logical_map_valid(map))
+ goto out;
+
+ apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
+
+ if (cid >= ARRAY_SIZE(map->logical_map))
+ goto out;
+
+ for_each_set_bit(i, &bitmap, 16) {
+ dst = map->logical_map[cid][i];
+ if (++r == 2)
+ goto out;
+ }
+
+ if (dst && kvm_apic_present(dst->vcpu))
+ *dest_vcpu = dst->vcpu;
+ else
+ goto out;
+ }
+
+ ret = true;
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
/*
* Add a pending IRQ into lapic.
* Return 1 if successfully added and 0 if discarded.
@@ -781,6 +831,9 @@
case APIC_DM_LOWEST:
vcpu->arch.apic_arb_prio++;
case APIC_DM_FIXED:
+ if (unlikely(trig_mode && !level))
+ break;
+
/* FIXME add logic for vcpu on reset */
if (unlikely(!apic_enabled(apic)))
break;
@@ -790,6 +843,13 @@
if (dest_map)
__set_bit(vcpu->vcpu_id, dest_map);
+ if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
+ if (trig_mode)
+ apic_set_vector(vector, apic->regs + APIC_TMR);
+ else
+ apic_clear_vector(vector, apic->regs + APIC_TMR);
+ }
+
if (kvm_x86_ops->deliver_posted_interrupt)
kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
else {
@@ -868,16 +928,32 @@
return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
}
+static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
+{
+ return test_bit(vector, (ulong *)apic->vcpu->arch.eoi_exit_bitmap);
+}
+
static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
{
- if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
- int trigger_mode;
- if (apic_test_vector(vector, apic->regs + APIC_TMR))
- trigger_mode = IOAPIC_LEVEL_TRIG;
- else
- trigger_mode = IOAPIC_EDGE_TRIG;
- kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
+ int trigger_mode;
+
+ /* Eoi the ioapic only if the ioapic doesn't own the vector. */
+ if (!kvm_ioapic_handles_vector(apic, vector))
+ return;
+
+ /* Request a KVM exit to inform the userspace IOAPIC. */
+ if (irqchip_split(apic->vcpu->kvm)) {
+ apic->vcpu->arch.pending_ioapic_eoi = vector;
+ kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu);
+ return;
}
+
+ if (apic_test_vector(vector, apic->regs + APIC_TMR))
+ trigger_mode = IOAPIC_LEVEL_TRIG;
+ else
+ trigger_mode = IOAPIC_EDGE_TRIG;
+
+ kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
}
static int apic_set_eoi(struct kvm_lapic *apic)
@@ -1615,7 +1691,7 @@
apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
- apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm);
+ apic->irr_pending = kvm_vcpu_apic_vid_enabled(vcpu);
apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : 0;
apic->highest_isr_cache = -1;
update_divide_count(apic);
@@ -1838,7 +1914,8 @@
kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
apic_find_highest_isr(apic));
kvm_make_request(KVM_REQ_EVENT, vcpu);
- kvm_rtc_eoi_tracking_restore_one(vcpu);
+ if (ioapic_in_kernel(vcpu->kvm))
+ kvm_rtc_eoi_tracking_restore_one(vcpu);
}
void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
@@ -1922,7 +1999,7 @@
/* Cache not set: could be safe but we don't bother. */
apic->highest_isr_cache == -1 ||
/* Need EOI to update ioapic. */
- kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) {
+ kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) {
/*
* PV EOI was disabled by apic_sync_pv_eoi_from_guest
* so we need not do anything here.
@@ -1978,7 +2055,7 @@
struct kvm_lapic *apic = vcpu->arch.apic;
u32 reg = (msr - APIC_BASE_MSR) << 4;
- if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
return 1;
if (reg == APIC_ICR2)
@@ -1995,7 +2072,7 @@
struct kvm_lapic *apic = vcpu->arch.apic;
u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
- if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
return 1;
if (reg == APIC_DFR || reg == APIC_ICR2) {
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 7640379..fde8e35d 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -57,7 +57,6 @@
u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
void kvm_apic_set_version(struct kvm_vcpu *vcpu);
-void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
void __kvm_apic_update_irr(u32 *pir, void *regs);
void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
@@ -144,9 +143,9 @@
return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
}
-static inline bool kvm_apic_vid_enabled(struct kvm *kvm)
+static inline bool kvm_vcpu_apic_vid_enabled(struct kvm_vcpu *vcpu)
{
- return kvm_x86_ops->vm_has_apicv(kvm);
+ return kvm_x86_ops->cpu_uses_apicv(vcpu);
}
static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
@@ -169,4 +168,6 @@
void wait_lapic_expire(struct kvm_vcpu *vcpu);
+bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu);
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ff606f5..c3f39aa 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3427,7 +3427,7 @@
static bool can_do_async_pf(struct kvm_vcpu *vcpu)
{
- if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
+ if (unlikely(!lapic_in_kernel(vcpu) ||
kvm_event_needs_reinjection(vcpu)))
return false;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2f9ed1f..54a8618 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3060,7 +3060,7 @@
u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
/* instruction emulation calls kvm_set_cr8() */
r = cr_interception(svm);
- if (irqchip_in_kernel(svm->vcpu.kvm))
+ if (lapic_in_kernel(&svm->vcpu))
return r;
if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
return r;
@@ -3294,24 +3294,11 @@
static int interrupt_window_interception(struct vcpu_svm *svm)
{
- struct kvm_run *kvm_run = svm->vcpu.run;
-
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
svm_clear_vintr(svm);
svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
mark_dirty(svm->vmcb, VMCB_INTR);
++svm->vcpu.stat.irq_window_exits;
- /*
- * If the user space waits to inject interrupts, exit as soon as
- * possible
- */
- if (!irqchip_in_kernel(svm->vcpu.kvm) &&
- kvm_run->request_interrupt_window &&
- !kvm_cpu_has_interrupt(&svm->vcpu)) {
- kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
- return 0;
- }
-
return 1;
}
@@ -3659,12 +3646,12 @@
return;
}
-static int svm_vm_has_apicv(struct kvm *kvm)
+static int svm_cpu_uses_apicv(struct kvm_vcpu *vcpu)
{
return 0;
}
-static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu)
{
return;
}
@@ -4425,7 +4412,7 @@
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
.set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
- .vm_has_apicv = svm_vm_has_apicv,
+ .cpu_uses_apicv = svm_cpu_uses_apicv,
.load_eoi_exitmap = svm_load_eoi_exitmap,
.sync_pir_to_irr = svm_sync_pir_to_irr,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 4eae7c3..1203025 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -129,6 +129,24 @@
);
/*
+ * Tracepoint for fast mmio.
+ */
+TRACE_EVENT(kvm_fast_mmio,
+ TP_PROTO(u64 gpa),
+ TP_ARGS(gpa),
+
+ TP_STRUCT__entry(
+ __field(u64, gpa)
+ ),
+
+ TP_fast_assign(
+ __entry->gpa = gpa;
+ ),
+
+ TP_printk("fast mmio at gpa 0x%llx", __entry->gpa)
+);
+
+/*
* Tracepoint for cpuid.
*/
TRACE_EVENT(kvm_cpuid,
@@ -974,6 +992,39 @@
__entry->smbase)
);
+/*
+ * Tracepoint for VT-d posted-interrupts.
+ */
+TRACE_EVENT(kvm_pi_irte_update,
+ TP_PROTO(unsigned int vcpu_id, unsigned int gsi,
+ unsigned int gvec, u64 pi_desc_addr, bool set),
+ TP_ARGS(vcpu_id, gsi, gvec, pi_desc_addr, set),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( unsigned int, gsi )
+ __field( unsigned int, gvec )
+ __field( u64, pi_desc_addr )
+ __field( bool, set )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->gsi = gsi;
+ __entry->gvec = gvec;
+ __entry->pi_desc_addr = pi_desc_addr;
+ __entry->set = set;
+ ),
+
+ TP_printk("VT-d PI is %s for this irq, vcpu %u, gsi: 0x%x, "
+ "gvec: 0x%x, pi_desc_addr: 0x%llx",
+ __entry->set ? "enabled and being updated" : "disabled",
+ __entry->vcpu_id,
+ __entry->gsi,
+ __entry->gvec,
+ __entry->pi_desc_addr)
+);
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6a8bc64..8eeba6a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -35,6 +35,7 @@
#include "kvm_cache_regs.h"
#include "x86.h"
+#include <asm/cpu.h>
#include <asm/io.h>
#include <asm/desc.h>
#include <asm/vmx.h>
@@ -45,6 +46,7 @@
#include <asm/debugreg.h>
#include <asm/kexec.h>
#include <asm/apic.h>
+#include <asm/irq_remapping.h>
#include "trace.h"
#include "pmu.h"
@@ -443,11 +445,29 @@
};
#define POSTED_INTR_ON 0
+#define POSTED_INTR_SN 1
+
/* Posted-Interrupt Descriptor */
struct pi_desc {
u32 pir[8]; /* Posted interrupt requested */
- u32 control; /* bit 0 of control is outstanding notification bit */
- u32 rsvd[7];
+ union {
+ struct {
+ /* bit 256 - Outstanding Notification */
+ u16 on : 1,
+ /* bit 257 - Suppress Notification */
+ sn : 1,
+ /* bit 271:258 - Reserved */
+ rsvd_1 : 14;
+ /* bit 279:272 - Notification Vector */
+ u8 nv;
+ /* bit 287:280 - Reserved */
+ u8 rsvd_2;
+ /* bit 319:288 - Notification Destination */
+ u32 ndst;
+ };
+ u64 control;
+ };
+ u32 rsvd[6];
} __aligned(64);
static bool pi_test_and_set_on(struct pi_desc *pi_desc)
@@ -467,6 +487,30 @@
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}
+static inline void pi_clear_sn(struct pi_desc *pi_desc)
+{
+ return clear_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_set_sn(struct pi_desc *pi_desc)
+{
+ return set_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_on(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_sn(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
struct vcpu_vmx {
struct kvm_vcpu vcpu;
unsigned long host_rsp;
@@ -532,8 +576,6 @@
s64 vnmi_blocked_time;
u32 exit_reason;
- bool rdtscp_enabled;
-
/* Posted interrupt descriptor */
struct pi_desc pi_desc;
@@ -563,6 +605,11 @@
return container_of(vcpu, struct vcpu_vmx, vcpu);
}
+static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
+{
+ return &(to_vmx(vcpu)->pi_desc);
+}
+
#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
#define FIELD(number, name) [number] = VMCS12_OFFSET(name)
#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
@@ -809,7 +856,7 @@
static void kvm_cpu_vmxoff(void);
static bool vmx_mpx_supported(void);
static bool vmx_xsaves_supported(void);
-static int vmx_vm_has_apicv(struct kvm *kvm);
+static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -831,6 +878,13 @@
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
+/*
+ * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
+ * can find which vCPU should be waken up.
+ */
+static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
+static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
+
static unsigned long *vmx_io_bitmap_a;
static unsigned long *vmx_io_bitmap_b;
static unsigned long *vmx_msr_bitmap_legacy;
@@ -946,9 +1000,9 @@
return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
}
-static inline bool vm_need_tpr_shadow(struct kvm *kvm)
+static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
{
- return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
+ return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
}
static inline bool cpu_has_secondary_exec_ctrls(void)
@@ -983,7 +1037,8 @@
static inline bool cpu_has_vmx_posted_intr(void)
{
- return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
+ return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
+ vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
}
static inline bool cpu_has_vmx_apicv(void)
@@ -1062,9 +1117,9 @@
SECONDARY_EXEC_PAUSE_LOOP_EXITING;
}
-static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
+static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
{
- return flexpriority_enabled && irqchip_in_kernel(kvm);
+ return flexpriority_enabled && lapic_in_kernel(vcpu);
}
static inline bool cpu_has_vmx_vpid(void)
@@ -1895,6 +1950,52 @@
preempt_enable();
}
+static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old, new;
+ unsigned int dest;
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return;
+
+ do {
+ old.control = new.control = pi_desc->control;
+
+ /*
+ * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there
+ * are two possible cases:
+ * 1. After running 'pre_block', context switch
+ * happened. For this case, 'sn' was set in
+ * vmx_vcpu_put(), so we need to clear it here.
+ * 2. After running 'pre_block', we were blocked,
+ * and woken up by some other guy. For this case,
+ * we don't need to do anything, 'pi_post_block'
+ * will do everything for us. However, we cannot
+ * check whether it is case #1 or case #2 here
+ * (maybe, not needed), so we also clear sn here,
+ * I think it is not a big deal.
+ */
+ if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) {
+ if (vcpu->cpu != cpu) {
+ dest = cpu_physical_id(cpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+ }
+
+ /* set 'NV' to 'notification vector' */
+ new.nv = POSTED_INTR_VECTOR;
+ }
+
+ /* Allow posting non-urgent interrupts */
+ new.sn = 0;
+ } while (cmpxchg(&pi_desc->control, old.control,
+ new.control) != old.control);
+}
/*
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
@@ -1945,10 +2046,27 @@
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
vmx->loaded_vmcs->cpu = cpu;
}
+
+ vmx_vcpu_pi_load(vcpu, cpu);
+}
+
+static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return;
+
+ /* Set SN when the vCPU is preempted */
+ if (vcpu->preempted)
+ pi_set_sn(pi_desc);
}
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
+ vmx_vcpu_pi_put(vcpu);
+
__vmx_load_host_state(to_vmx(vcpu));
if (!vmm_exclusive) {
__loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
@@ -2207,7 +2325,7 @@
if (index >= 0)
move_msr_up(vmx, index, save_nmsrs++);
index = __find_msr_index(vmx, MSR_TSC_AUX);
- if (index >= 0 && vmx->rdtscp_enabled)
+ if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu))
move_msr_up(vmx, index, save_nmsrs++);
/*
* MSR_STAR is only needed on long mode guests, and only
@@ -2377,7 +2495,7 @@
vmx->nested.nested_vmx_pinbased_ctls_high |=
PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;
- if (vmx_vm_has_apicv(vmx->vcpu.kvm))
+ if (vmx_cpu_uses_apicv(&vmx->vcpu))
vmx->nested.nested_vmx_pinbased_ctls_high |=
PIN_BASED_POSTED_INTR;
@@ -2474,7 +2592,8 @@
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_WBINVD_EXITING |
- SECONDARY_EXEC_XSAVES;
+ SECONDARY_EXEC_XSAVES |
+ SECONDARY_EXEC_PCOMMIT;
if (enable_ept) {
/* nested EPT: emulate EPT also to L1 */
@@ -2673,7 +2792,7 @@
msr_info->data = vcpu->arch.ia32_xss;
break;
case MSR_TSC_AUX:
- if (!to_vmx(vcpu)->rdtscp_enabled)
+ if (!guest_cpuid_has_rdtscp(vcpu))
return 1;
/* Otherwise falls through */
default:
@@ -2779,7 +2898,7 @@
clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
break;
case MSR_TSC_AUX:
- if (!vmx->rdtscp_enabled)
+ if (!guest_cpuid_has_rdtscp(vcpu))
return 1;
/* Check reserved bit, higher 32 bits should be zero */
if ((data >> 32) != 0)
@@ -2874,6 +2993,8 @@
return -EBUSY;
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+ INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
+ spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
/*
* Now we can enable the vmclear operation in kdump
@@ -3015,7 +3136,8 @@
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_SHADOW_VMCS |
SECONDARY_EXEC_XSAVES |
- SECONDARY_EXEC_ENABLE_PML;
+ SECONDARY_EXEC_ENABLE_PML |
+ SECONDARY_EXEC_PCOMMIT;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -4323,9 +4445,9 @@
msr, MSR_TYPE_W);
}
-static int vmx_vm_has_apicv(struct kvm *kvm)
+static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu)
{
- return enable_apicv && irqchip_in_kernel(kvm);
+ return enable_apicv && lapic_in_kernel(vcpu);
}
static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
@@ -4369,6 +4491,22 @@
{
#ifdef CONFIG_SMP
if (vcpu->mode == IN_GUEST_MODE) {
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * Currently, we don't support urgent interrupt,
+ * all interrupts are recognized as non-urgent
+ * interrupt, so we cannot post interrupts when
+ * 'SN' is set.
+ *
+ * If the vcpu is in guest mode, it means it is
+ * running instead of being scheduled out and
+ * waiting in the run queue, and that's the only
+ * case when 'SN' is set currently, warning if
+ * 'SN' is set.
+ */
+ WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc));
+
apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
POSTED_INTR_VECTOR);
return true;
@@ -4505,7 +4643,7 @@
{
u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
- if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
+ if (!vmx_cpu_uses_apicv(&vmx->vcpu))
pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
return pin_based_exec_ctrl;
}
@@ -4517,7 +4655,7 @@
if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
exec_control &= ~CPU_BASED_MOV_DR_EXITING;
- if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
+ if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
exec_control &= ~CPU_BASED_TPR_SHADOW;
#ifdef CONFIG_X86_64
exec_control |= CPU_BASED_CR8_STORE_EXITING |
@@ -4534,7 +4672,7 @@
static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
{
u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
- if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+ if (!cpu_need_virtualize_apic_accesses(&vmx->vcpu))
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
if (vmx->vpid == 0)
exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
@@ -4548,7 +4686,7 @@
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
if (!ple_gap)
exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
- if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
+ if (!vmx_cpu_uses_apicv(&vmx->vcpu))
exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
@@ -4561,6 +4699,9 @@
/* PML is enabled/disabled in creating/destorying vcpu */
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+ /* Currently, we allow L1 guest to directly run pcommit instruction. */
+ exec_control &= ~SECONDARY_EXEC_PCOMMIT;
+
return exec_control;
}
@@ -4604,12 +4745,11 @@
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
- if (cpu_has_secondary_exec_ctrls()) {
+ if (cpu_has_secondary_exec_ctrls())
vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
vmx_secondary_exec_control(vmx));
- }
- if (vmx_vm_has_apicv(vmx->vcpu.kvm)) {
+ if (vmx_cpu_uses_apicv(&vmx->vcpu)) {
vmcs_write64(EOI_EXIT_BITMAP0, 0);
vmcs_write64(EOI_EXIT_BITMAP1, 0);
vmcs_write64(EOI_EXIT_BITMAP2, 0);
@@ -4753,7 +4893,7 @@
if (cpu_has_vmx_tpr_shadow() && !init_event) {
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
- if (vm_need_tpr_shadow(vcpu->kvm))
+ if (cpu_need_tpr_shadow(vcpu))
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
__pa(vcpu->arch.apic->regs));
vmcs_write32(TPR_THRESHOLD, 0);
@@ -4761,7 +4901,7 @@
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
- if (vmx_vm_has_apicv(vcpu->kvm))
+ if (vmx_cpu_uses_apicv(vcpu))
memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
if (vmx->vpid != 0)
@@ -5296,7 +5436,7 @@
u8 cr8 = (u8)val;
err = kvm_set_cr8(vcpu, cr8);
kvm_complete_insn_gp(vcpu, err);
- if (irqchip_in_kernel(vcpu->kvm))
+ if (lapic_in_kernel(vcpu))
return 1;
if (cr8_prev <= cr8)
return 1;
@@ -5510,17 +5650,6 @@
kvm_make_request(KVM_REQ_EVENT, vcpu);
++vcpu->stat.irq_window_exits;
-
- /*
- * If the user space waits to inject interrupts, exit as soon as
- * possible
- */
- if (!irqchip_in_kernel(vcpu->kvm) &&
- vcpu->run->request_interrupt_window &&
- !kvm_cpu_has_interrupt(vcpu)) {
- vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
- return 0;
- }
return 1;
}
@@ -5753,6 +5882,7 @@
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
skip_emulated_instruction(vcpu);
+ trace_kvm_fast_mmio(gpa);
return 1;
}
@@ -5910,6 +6040,25 @@
ple_window_grow, INT_MIN);
}
+/*
+ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
+ */
+static void wakeup_handler(void)
+{
+ struct kvm_vcpu *vcpu;
+ int cpu = smp_processor_id();
+
+ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
+ blocked_vcpu_list) {
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (pi_test_on(pi_desc) == 1)
+ kvm_vcpu_kick(vcpu);
+ }
+ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+}
+
static __init int hardware_setup(void)
{
int r = -ENOMEM, i, msr;
@@ -6096,6 +6245,8 @@
kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
}
+ kvm_set_posted_intr_wakeup_handler(wakeup_handler);
+
return alloc_kvm_area();
out8:
@@ -6627,7 +6778,6 @@
static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
{
- u32 exec_control;
if (vmx->nested.current_vmptr == -1ull)
return;
@@ -6640,9 +6790,8 @@
they were modified */
copy_shadow_to_vmcs12(vmx);
vmx->nested.sync_shadow_vmcs = false;
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_SHADOW_VMCS);
vmcs_write64(VMCS_LINK_POINTER, -1ull);
}
vmx->nested.posted_intr_nv = -1;
@@ -7038,7 +7187,6 @@
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
gpa_t vmptr;
- u32 exec_control;
if (!nested_vmx_check_permission(vcpu))
return 1;
@@ -7070,9 +7218,8 @@
vmx->nested.current_vmcs12 = new_vmcs12;
vmx->nested.current_vmcs12_page = page;
if (enable_shadow_vmcs) {
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control |= SECONDARY_EXEC_SHADOW_VMCS;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_SHADOW_VMCS);
vmcs_write64(VMCS_LINK_POINTER,
__pa(vmx->nested.current_shadow_vmcs));
vmx->nested.sync_shadow_vmcs = true;
@@ -7207,6 +7354,13 @@
return 1;
}
+static int handle_pcommit(struct kvm_vcpu *vcpu)
+{
+ /* we never catch pcommit instruct for L1 guest. */
+ WARN_ON(1);
+ return 1;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -7257,6 +7411,7 @@
[EXIT_REASON_XSAVES] = handle_xsaves,
[EXIT_REASON_XRSTORS] = handle_xrstors,
[EXIT_REASON_PML_FULL] = handle_pml_full,
+ [EXIT_REASON_PCOMMIT] = handle_pcommit,
};
static const int kvm_vmx_max_exit_handlers =
@@ -7558,6 +7713,8 @@
* the XSS exit bitmap in vmcs12.
*/
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
+ case EXIT_REASON_PCOMMIT:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT);
default:
return true;
}
@@ -7572,7 +7729,6 @@
static int vmx_enable_pml(struct vcpu_vmx *vmx)
{
struct page *pml_pg;
- u32 exec_control;
pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!pml_pg)
@@ -7583,24 +7739,18 @@
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control |= SECONDARY_EXEC_ENABLE_PML;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_ENABLE_PML);
return 0;
}
static void vmx_disable_pml(struct vcpu_vmx *vmx)
{
- u32 exec_control;
-
ASSERT(vmx->pml_pg);
__free_page(vmx->pml_pg);
vmx->pml_pg = NULL;
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_ENABLE_PML);
}
static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
@@ -7924,10 +8074,10 @@
* apicv
*/
if (!cpu_has_vmx_virtualize_x2apic_mode() ||
- !vmx_vm_has_apicv(vcpu->kvm))
+ !vmx_cpu_uses_apicv(vcpu))
return;
- if (!vm_need_tpr_shadow(vcpu->kvm))
+ if (!cpu_need_tpr_shadow(vcpu))
return;
sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
@@ -8029,9 +8179,10 @@
}
}
-static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu)
{
- if (!vmx_vm_has_apicv(vcpu->kvm))
+ u64 *eoi_exit_bitmap = vcpu->arch.eoi_exit_bitmap;
+ if (!vmx_cpu_uses_apicv(vcpu))
return;
vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
@@ -8530,7 +8681,7 @@
put_cpu();
if (err)
goto free_vmcs;
- if (vm_need_virtualize_apic_accesses(kvm)) {
+ if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
err = alloc_apic_access_page(kvm);
if (err)
goto free_vmcs;
@@ -8648,49 +8799,67 @@
return PT_PDPE_LEVEL;
}
+static void vmcs_set_secondary_exec_control(u32 new_ctl)
+{
+ /*
+ * These bits in the secondary execution controls field
+ * are dynamic, the others are mostly based on the hypervisor
+ * architecture and the guest's CPUID. Do not touch the
+ * dynamic bits.
+ */
+ u32 mask =
+ SECONDARY_EXEC_SHADOW_VMCS |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+
+ u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+ (new_ctl & ~mask) | (cur_ctl & mask));
+}
+
static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 exec_control;
+ u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx);
- vmx->rdtscp_enabled = false;
if (vmx_rdtscp_supported()) {
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- if (exec_control & SECONDARY_EXEC_RDTSCP) {
- best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
- if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
- vmx->rdtscp_enabled = true;
- else {
- exec_control &= ~SECONDARY_EXEC_RDTSCP;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
- exec_control);
- }
+ bool rdtscp_enabled = guest_cpuid_has_rdtscp(vcpu);
+ if (!rdtscp_enabled)
+ secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP;
+
+ if (nested) {
+ if (rdtscp_enabled)
+ vmx->nested.nested_vmx_secondary_ctls_high |=
+ SECONDARY_EXEC_RDTSCP;
+ else
+ vmx->nested.nested_vmx_secondary_ctls_high &=
+ ~SECONDARY_EXEC_RDTSCP;
}
- if (nested && !vmx->rdtscp_enabled)
- vmx->nested.nested_vmx_secondary_ctls_high &=
- ~SECONDARY_EXEC_RDTSCP;
}
/* Exposing INVPCID only when PCID is exposed */
best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
if (vmx_invpcid_supported() &&
- best && (best->ebx & bit(X86_FEATURE_INVPCID)) &&
- guest_cpuid_has_pcid(vcpu)) {
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control |= SECONDARY_EXEC_ENABLE_INVPCID;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
- exec_control);
- } else {
- if (cpu_has_secondary_exec_ctrls()) {
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
- exec_control);
- }
+ (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) ||
+ !guest_cpuid_has_pcid(vcpu))) {
+ secondary_exec_ctl &= ~SECONDARY_EXEC_ENABLE_INVPCID;
+
if (best)
best->ebx &= ~bit(X86_FEATURE_INVPCID);
}
+
+ vmcs_set_secondary_exec_control(secondary_exec_ctl);
+
+ if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) {
+ if (guest_cpuid_has_pcommit(vcpu))
+ vmx->nested.nested_vmx_secondary_ctls_high |=
+ SECONDARY_EXEC_PCOMMIT;
+ else
+ vmx->nested.nested_vmx_secondary_ctls_high &=
+ ~SECONDARY_EXEC_PCOMMIT;
+ }
}
static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -9298,13 +9467,13 @@
if (cpu_has_secondary_exec_ctrls()) {
exec_control = vmx_secondary_exec_control(vmx);
- if (!vmx->rdtscp_enabled)
- exec_control &= ~SECONDARY_EXEC_RDTSCP;
+
/* Take the following fields only from vmcs12 */
exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_RDTSCP |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
- SECONDARY_EXEC_APIC_REGISTER_VIRT);
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_PCOMMIT);
if (nested_cpu_has(vmcs12,
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
exec_control |= vmcs12->secondary_vm_exec_control;
@@ -9323,7 +9492,7 @@
vmcs_write64(APIC_ACCESS_ADDR,
page_to_phys(vmx->nested.apic_access_page));
} else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
- (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) {
+ cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
exec_control |=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
kvm_vcpu_reload_apic_access_page(vcpu);
@@ -10278,6 +10447,201 @@
kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
}
+/*
+ * This routine does the following things for vCPU which is going
+ * to be blocked if VT-d PI is enabled.
+ * - Store the vCPU to the wakeup list, so when interrupts happen
+ * we can find the right vCPU to wake up.
+ * - Change the Posted-interrupt descriptor as below:
+ * 'NDST' <-- vcpu->pre_pcpu
+ * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
+ * - If 'ON' is set during this process, which means at least one
+ * interrupt is posted for this vCPU, we cannot block it, in
+ * this case, return 1, otherwise, return 0.
+ *
+ */
+static int vmx_pre_block(struct kvm_vcpu *vcpu)
+{
+ unsigned long flags;
+ unsigned int dest;
+ struct pi_desc old, new;
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return 0;
+
+ vcpu->pre_pcpu = vcpu->cpu;
+ spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+ list_add_tail(&vcpu->blocked_vcpu_list,
+ &per_cpu(blocked_vcpu_on_cpu,
+ vcpu->pre_pcpu));
+ spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+
+ do {
+ old.control = new.control = pi_desc->control;
+
+ /*
+ * We should not block the vCPU if
+ * an interrupt is posted for it.
+ */
+ if (pi_test_on(pi_desc) == 1) {
+ spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+ list_del(&vcpu->blocked_vcpu_list);
+ spin_unlock_irqrestore(
+ &per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+ vcpu->pre_pcpu = -1;
+
+ return 1;
+ }
+
+ WARN((pi_desc->sn == 1),
+ "Warning: SN field of posted-interrupts "
+ "is set before blocking\n");
+
+ /*
+ * Since vCPU can be preempted during this process,
+ * vcpu->cpu could be different with pre_pcpu, we
+ * need to set pre_pcpu as the destination of wakeup
+ * notification event, then we can find the right vCPU
+ * to wakeup in wakeup handler if interrupts happen
+ * when the vCPU is in blocked state.
+ */
+ dest = cpu_physical_id(vcpu->pre_pcpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ /* set 'NV' to 'wakeup vector' */
+ new.nv = POSTED_INTR_WAKEUP_VECTOR;
+ } while (cmpxchg(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+ return 0;
+}
+
+static void vmx_post_block(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old, new;
+ unsigned int dest;
+ unsigned long flags;
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return;
+
+ do {
+ old.control = new.control = pi_desc->control;
+
+ dest = cpu_physical_id(vcpu->cpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ /* Allow posting non-urgent interrupts */
+ new.sn = 0;
+
+ /* set 'NV' to 'notification vector' */
+ new.nv = POSTED_INTR_VECTOR;
+ } while (cmpxchg(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+ if(vcpu->pre_pcpu != -1) {
+ spin_lock_irqsave(
+ &per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+ list_del(&vcpu->blocked_vcpu_list);
+ spin_unlock_irqrestore(
+ &per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+ vcpu->pre_pcpu = -1;
+ }
+}
+
+/*
+ * vmx_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ struct kvm_kernel_irq_routing_entry *e;
+ struct kvm_irq_routing_table *irq_rt;
+ struct kvm_lapic_irq irq;
+ struct kvm_vcpu *vcpu;
+ struct vcpu_data vcpu_info;
+ int idx, ret = -EINVAL;
+
+ if (!kvm_arch_has_assigned_device(kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return 0;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ BUG_ON(guest_irq >= irq_rt->nr_rt_entries);
+
+ hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+ if (e->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+ /*
+ * VT-d PI cannot support posting multicast/broadcast
+ * interrupts to a vCPU, we still use interrupt remapping
+ * for these kind of interrupts.
+ *
+ * For lowest-priority interrupts, we only support
+ * those with single CPU as the destination, e.g. user
+ * configures the interrupts via /proc/irq or uses
+ * irqbalance to make the interrupts single-CPU.
+ *
+ * We will support full lowest-priority interrupt later.
+ */
+
+ kvm_set_msi_irq(e, &irq);
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu))
+ continue;
+
+ vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
+ vcpu_info.vector = irq.vector;
+
+ trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi,
+ vcpu_info.vector, vcpu_info.pi_desc_addr, set);
+
+ if (set)
+ ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
+ else {
+ /* suppress notification event before unposting */
+ pi_set_sn(vcpu_to_pi_desc(vcpu));
+ ret = irq_set_vcpu_affinity(host_irq, NULL);
+ pi_clear_sn(vcpu_to_pi_desc(vcpu));
+ }
+
+ if (ret < 0) {
+ printk(KERN_INFO "%s: failed to update PI IRTE\n",
+ __func__);
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+ return ret;
+}
+
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -10347,7 +10711,7 @@
.update_cr8_intercept = update_cr8_intercept,
.set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
- .vm_has_apicv = vmx_vm_has_apicv,
+ .cpu_uses_apicv = vmx_cpu_uses_apicv,
.load_eoi_exitmap = vmx_load_eoi_exitmap,
.hwapic_irr_update = vmx_hwapic_irr_update,
.hwapic_isr_update = vmx_hwapic_isr_update,
@@ -10394,7 +10758,12 @@
.flush_log_dirty = vmx_flush_log_dirty,
.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
+ .pre_block = vmx_pre_block,
+ .post_block = vmx_post_block,
+
.pmu_ops = &intel_pmu_ops,
+
+ .update_pi_irte = vmx_update_pi_irte,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9a9a198..e28954d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -51,6 +51,8 @@
#include <linux/pci.h>
#include <linux/timekeeper_internal.h>
#include <linux/pvclock_gtod.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/irqbypass.h>
#include <trace/events/kvm.h>
#define CREATE_TRACE_POINTS
@@ -64,6 +66,7 @@
#include <asm/fpu/internal.h> /* Ugh! */
#include <asm/pvclock.h>
#include <asm/div64.h>
+#include <asm/irq_remapping.h>
#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32
@@ -788,7 +791,7 @@
{
if (cr8 & CR8_RESERVED_BITS)
return 1;
- if (irqchip_in_kernel(vcpu->kvm))
+ if (lapic_in_kernel(vcpu))
kvm_lapic_set_tpr(vcpu, cr8);
else
vcpu->arch.cr8 = cr8;
@@ -798,7 +801,7 @@
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
- if (irqchip_in_kernel(vcpu->kvm))
+ if (lapic_in_kernel(vcpu))
return kvm_lapic_get_cr8(vcpu);
else
return vcpu->arch.cr8;
@@ -952,6 +955,9 @@
HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
+ HV_X64_MSR_RESET,
+ HV_X64_MSR_VP_INDEX,
+ HV_X64_MSR_VP_RUNTIME,
HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
MSR_KVM_PV_EOI_EN,
@@ -2448,6 +2454,7 @@
case KVM_CAP_ENABLE_CAP_VM:
case KVM_CAP_DISABLE_QUIRKS:
case KVM_CAP_SET_BOOT_CPU_ID:
+ case KVM_CAP_SPLIT_IRQCHIP:
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
@@ -2661,12 +2668,24 @@
{
if (irq->irq >= KVM_NR_INTERRUPTS)
return -EINVAL;
- if (irqchip_in_kernel(vcpu->kvm))
+
+ if (!irqchip_in_kernel(vcpu->kvm)) {
+ kvm_queue_interrupt(vcpu, irq->irq, false);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return 0;
+ }
+
+ /*
+ * With in-kernel LAPIC, we only use this to inject EXTINT, so
+ * fail for in-kernel 8259.
+ */
+ if (pic_in_kernel(vcpu->kvm))
return -ENXIO;
- kvm_queue_interrupt(vcpu, irq->irq, false);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+ if (vcpu->arch.pending_external_vector != -1)
+ return -EEXIST;
+ vcpu->arch.pending_external_vector = irq->irq;
return 0;
}
@@ -3175,7 +3194,7 @@
struct kvm_vapic_addr va;
r = -EINVAL;
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!lapic_in_kernel(vcpu))
goto out;
r = -EFAULT;
if (copy_from_user(&va, argp, sizeof va))
@@ -3555,6 +3574,28 @@
kvm->arch.disabled_quirks = cap->args[0];
r = 0;
break;
+ case KVM_CAP_SPLIT_IRQCHIP: {
+ mutex_lock(&kvm->lock);
+ r = -EINVAL;
+ if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
+ goto split_irqchip_unlock;
+ r = -EEXIST;
+ if (irqchip_in_kernel(kvm))
+ goto split_irqchip_unlock;
+ if (atomic_read(&kvm->online_vcpus))
+ goto split_irqchip_unlock;
+ r = kvm_setup_empty_irq_routing(kvm);
+ if (r)
+ goto split_irqchip_unlock;
+ /* Pairs with irqchip_in_kernel. */
+ smp_wmb();
+ kvm->arch.irqchip_split = true;
+ kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
+ r = 0;
+split_irqchip_unlock:
+ mutex_unlock(&kvm->lock);
+ break;
+ }
default:
r = -EINVAL;
break;
@@ -3668,7 +3709,7 @@
}
r = -ENXIO;
- if (!irqchip_in_kernel(kvm))
+ if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
goto get_irqchip_out;
r = kvm_vm_ioctl_get_irqchip(kvm, chip);
if (r)
@@ -3692,7 +3733,7 @@
}
r = -ENXIO;
- if (!irqchip_in_kernel(kvm))
+ if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
goto set_irqchip_out;
r = kvm_vm_ioctl_set_irqchip(kvm, chip);
if (r)
@@ -5666,7 +5707,7 @@
int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
{
++vcpu->stat.halt_exits;
- if (irqchip_in_kernel(vcpu->kvm)) {
+ if (lapic_in_kernel(vcpu)) {
vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
return 1;
} else {
@@ -5773,9 +5814,15 @@
*/
static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
{
- return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
- vcpu->run->request_interrupt_window &&
- kvm_arch_interrupt_allowed(vcpu));
+ if (!vcpu->run->request_interrupt_window || pic_in_kernel(vcpu->kvm))
+ return false;
+
+ if (kvm_cpu_has_interrupt(vcpu))
+ return false;
+
+ return (irqchip_split(vcpu->kvm)
+ ? kvm_apic_accept_pic_intr(vcpu)
+ : kvm_arch_interrupt_allowed(vcpu));
}
static void post_kvm_run_save(struct kvm_vcpu *vcpu)
@@ -5786,13 +5833,17 @@
kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_run->ready_for_interrupt_injection = 1;
- else
+ if (!irqchip_in_kernel(vcpu->kvm))
kvm_run->ready_for_interrupt_injection =
kvm_arch_interrupt_allowed(vcpu) &&
!kvm_cpu_has_interrupt(vcpu) &&
!kvm_event_needs_reinjection(vcpu);
+ else if (!pic_in_kernel(vcpu->kvm))
+ kvm_run->ready_for_interrupt_injection =
+ kvm_apic_accept_pic_intr(vcpu) &&
+ !kvm_cpu_has_interrupt(vcpu);
+ else
+ kvm_run->ready_for_interrupt_injection = 1;
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -6143,18 +6194,16 @@
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
- u64 eoi_exit_bitmap[4];
- u32 tmr[8];
-
if (!kvm_apic_hw_enabled(vcpu->arch.apic))
return;
- memset(eoi_exit_bitmap, 0, 32);
- memset(tmr, 0, 32);
+ memset(vcpu->arch.eoi_exit_bitmap, 0, 256 / 8);
- kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr);
- kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
- kvm_apic_update_tmr(vcpu, tmr);
+ if (irqchip_split(vcpu->kvm))
+ kvm_scan_ioapic_routes(vcpu, vcpu->arch.eoi_exit_bitmap);
+ else
+ kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap);
+ kvm_x86_ops->load_eoi_exitmap(vcpu);
}
static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
@@ -6167,7 +6216,7 @@
{
struct page *page = NULL;
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!lapic_in_kernel(vcpu))
return;
if (!kvm_x86_ops->set_apic_access_page_addr)
@@ -6205,7 +6254,7 @@
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
- bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
+ bool req_int_win = !lapic_in_kernel(vcpu) &&
vcpu->run->request_interrupt_window;
bool req_immediate_exit = false;
@@ -6257,6 +6306,17 @@
kvm_pmu_handle_event(vcpu);
if (kvm_check_request(KVM_REQ_PMI, vcpu))
kvm_pmu_deliver_pmi(vcpu);
+ if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
+ BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
+ if (test_bit(vcpu->arch.pending_ioapic_eoi,
+ (void *) vcpu->arch.eoi_exit_bitmap)) {
+ vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
+ vcpu->run->eoi.vector =
+ vcpu->arch.pending_ioapic_eoi;
+ r = 0;
+ goto out;
+ }
+ }
if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
vcpu_scan_ioapic(vcpu);
if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
@@ -6267,6 +6327,26 @@
r = 0;
goto out;
}
+ if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
+ r = 0;
+ goto out;
+ }
+ }
+
+ /*
+ * KVM_REQ_EVENT is not set when posted interrupts are set by
+ * VT-d hardware, so we have to update RVI unconditionally.
+ */
+ if (kvm_lapic_enabled(vcpu)) {
+ /*
+ * Update architecture specific hints for APIC
+ * virtual interrupt delivery.
+ */
+ if (kvm_x86_ops->hwapic_irr_update)
+ kvm_x86_ops->hwapic_irr_update(vcpu,
+ kvm_lapic_find_highest_irr(vcpu));
}
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -6285,13 +6365,6 @@
kvm_x86_ops->enable_irq_window(vcpu);
if (kvm_lapic_enabled(vcpu)) {
- /*
- * Update architecture specific hints for APIC
- * virtual interrupt delivery.
- */
- if (kvm_x86_ops->hwapic_irr_update)
- kvm_x86_ops->hwapic_irr_update(vcpu,
- kvm_lapic_find_highest_irr(vcpu));
update_cr8_intercept(vcpu);
kvm_lapic_sync_to_vapic(vcpu);
}
@@ -6427,10 +6500,15 @@
static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
- if (!kvm_arch_vcpu_runnable(vcpu)) {
+ if (!kvm_arch_vcpu_runnable(vcpu) &&
+ (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+
+ if (kvm_x86_ops->post_block)
+ kvm_x86_ops->post_block(vcpu);
+
if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
return 1;
}
@@ -6467,10 +6545,12 @@
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
for (;;) {
- if (kvm_vcpu_running(vcpu))
+ if (kvm_vcpu_running(vcpu)) {
r = vcpu_enter_guest(vcpu);
- else
+ } else {
r = vcpu_block(kvm, vcpu);
+ }
+
if (r <= 0)
break;
@@ -6479,8 +6559,8 @@
kvm_inject_pending_timer_irqs(vcpu);
if (dm_request_for_irq_injection(vcpu)) {
- r = -EINTR;
- vcpu->run->exit_reason = KVM_EXIT_INTR;
+ r = 0;
+ vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
++vcpu->stat.request_irq_exits;
break;
}
@@ -6607,7 +6687,7 @@
}
/* re-sync apic's tpr */
- if (!irqchip_in_kernel(vcpu->kvm)) {
+ if (!lapic_in_kernel(vcpu)) {
if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
r = -EINVAL;
goto out;
@@ -7307,7 +7387,7 @@
bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
{
- return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
+ return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu);
}
struct static_key kvm_no_apic_vcpu __read_mostly;
@@ -7376,6 +7456,8 @@
kvm_async_pf_hash_reset(vcpu);
kvm_pmu_init(vcpu);
+ vcpu->arch.pending_external_vector = -1;
+
return 0;
fail_free_mce_banks:
@@ -7401,7 +7483,7 @@
kvm_mmu_destroy(vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
free_page((unsigned long)vcpu->arch.pio_data);
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!lapic_in_kernel(vcpu))
static_key_slow_dec(&kvm_no_apic_vcpu);
}
@@ -8028,7 +8110,59 @@
}
EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+ if (kvm_x86_ops->update_pi_irte) {
+ irqfd->producer = prod;
+ return kvm_x86_ops->update_pi_irte(irqfd->kvm,
+ prod->irq, irqfd->gsi, 1);
+ }
+
+ return -EINVAL;
+}
+
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ int ret;
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+ if (!kvm_x86_ops->update_pi_irte) {
+ WARN_ON(irqfd->producer != NULL);
+ return;
+ }
+
+ WARN_ON(irqfd->producer != prod);
+ irqfd->producer = NULL;
+
+ /*
+ * When producer of consumer is unregistered, we change back to
+ * remapped mode, so we can re-use the current implementation
+ * when the irq is masked/disabed or the consumer side (KVM
+ * int this case doesn't want to receive the interrupts.
+ */
+ ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+ if (ret)
+ printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
+ " fails: %d\n", irqfd->consumer.token, ret);
+}
+
+int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ if (!kvm_x86_ops->update_pi_irte)
+ return -EINVAL;
+
+ return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
+}
+
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
@@ -8043,3 +8177,4 @@
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
index 913455a..8adaaea 100644
--- a/drivers/iommu/irq_remapping.c
+++ b/drivers/iommu/irq_remapping.c
@@ -22,7 +22,7 @@
int disable_sourceid_checking;
int no_x2apic_optout;
-int disable_irq_post = 1;
+int disable_irq_post = 0;
static int disable_irq_remap;
static struct irq_remap_ops *remap_ops;
@@ -58,14 +58,18 @@
return -EINVAL;
while (*str) {
- if (!strncmp(str, "on", 2))
+ if (!strncmp(str, "on", 2)) {
disable_irq_remap = 0;
- else if (!strncmp(str, "off", 3))
+ disable_irq_post = 0;
+ } else if (!strncmp(str, "off", 3)) {
disable_irq_remap = 1;
- else if (!strncmp(str, "nosid", 5))
+ disable_irq_post = 1;
+ } else if (!strncmp(str, "nosid", 5))
disable_sourceid_checking = 1;
else if (!strncmp(str, "no_x2apic_optout", 16))
no_x2apic_optout = 1;
+ else if (!strncmp(str, "nopost", 6))
+ disable_irq_post = 1;
str += strcspn(str, ",");
while (*str == ',')
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 4540179..850d86c 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -33,3 +33,4 @@
source "drivers/vfio/pci/Kconfig"
source "drivers/vfio/platform/Kconfig"
+source "virt/lib/Kconfig"
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 579d83b..02912f1 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -2,6 +2,7 @@
tristate "VFIO support for PCI devices"
depends on VFIO && PCI && EVENTFD
select VFIO_VIRQFD
+ select IRQ_BYPASS_MANAGER
help
Support for the PCI VFIO bus driver. This is required to make
use of PCI drivers using the VFIO framework.
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 1f577b4..3b3ba15 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -319,6 +319,7 @@
if (vdev->ctx[vector].trigger) {
free_irq(irq, vdev->ctx[vector].trigger);
+ irq_bypass_unregister_producer(&vdev->ctx[vector].producer);
kfree(vdev->ctx[vector].name);
eventfd_ctx_put(vdev->ctx[vector].trigger);
vdev->ctx[vector].trigger = NULL;
@@ -360,6 +361,14 @@
return ret;
}
+ vdev->ctx[vector].producer.token = trigger;
+ vdev->ctx[vector].producer.irq = irq;
+ ret = irq_bypass_register_producer(&vdev->ctx[vector].producer);
+ if (unlikely(ret))
+ dev_info(&pdev->dev,
+ "irq bypass producer (token %p) registration fails: %d\n",
+ vdev->ctx[vector].producer.token, ret);
+
vdev->ctx[vector].trigger = trigger;
return 0;
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index ae0e1b4..0e7394f 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -13,6 +13,7 @@
#include <linux/mutex.h>
#include <linux/pci.h>
+#include <linux/irqbypass.h>
#ifndef VFIO_PCI_PRIVATE_H
#define VFIO_PCI_PRIVATE_H
@@ -29,6 +30,7 @@
struct virqfd *mask;
char *name;
bool masked;
+ struct irq_bypass_producer producer;
};
struct vfio_pci_device {
diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h
new file mode 100644
index 0000000..1551b5b
--- /dev/null
+++ b/include/linux/irqbypass.h
@@ -0,0 +1,90 @@
+/*
+ * IRQ offload/bypass manager
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ * Copyright (c) 2015 Linaro Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef IRQBYPASS_H
+#define IRQBYPASS_H
+
+#include <linux/list.h>
+
+struct irq_bypass_consumer;
+
+/*
+ * Theory of operation
+ *
+ * The IRQ bypass manager is a simple set of lists and callbacks that allows
+ * IRQ producers (ex. physical interrupt sources) to be matched to IRQ
+ * consumers (ex. virtualization hardware that allows IRQ bypass or offload)
+ * via a shared token (ex. eventfd_ctx). Producers and consumers register
+ * independently. When a token match is found, the optional @stop callback
+ * will be called for each participant. The pair will then be connected via
+ * the @add_* callbacks, and finally the optional @start callback will allow
+ * any final coordination. When either participant is unregistered, the
+ * process is repeated using the @del_* callbacks in place of the @add_*
+ * callbacks. Match tokens must be unique per producer/consumer, 1:N pairings
+ * are not supported.
+ */
+
+/**
+ * struct irq_bypass_producer - IRQ bypass producer definition
+ * @node: IRQ bypass manager private list management
+ * @token: opaque token to match between producer and consumer
+ * @irq: Linux IRQ number for the producer device
+ * @add_consumer: Connect the IRQ producer to an IRQ consumer (optional)
+ * @del_consumer: Disconnect the IRQ producer from an IRQ consumer (optional)
+ * @stop: Perform any quiesce operations necessary prior to add/del (optional)
+ * @start: Perform any startup operations necessary after add/del (optional)
+ *
+ * The IRQ bypass producer structure represents an interrupt source for
+ * participation in possible host bypass, for instance an interrupt vector
+ * for a physical device assigned to a VM.
+ */
+struct irq_bypass_producer {
+ struct list_head node;
+ void *token;
+ int irq;
+ int (*add_consumer)(struct irq_bypass_producer *,
+ struct irq_bypass_consumer *);
+ void (*del_consumer)(struct irq_bypass_producer *,
+ struct irq_bypass_consumer *);
+ void (*stop)(struct irq_bypass_producer *);
+ void (*start)(struct irq_bypass_producer *);
+};
+
+/**
+ * struct irq_bypass_consumer - IRQ bypass consumer definition
+ * @node: IRQ bypass manager private list management
+ * @token: opaque token to match between producer and consumer
+ * @add_producer: Connect the IRQ consumer to an IRQ producer
+ * @del_producer: Disconnect the IRQ consumer from an IRQ producer
+ * @stop: Perform any quiesce operations necessary prior to add/del (optional)
+ * @start: Perform any startup operations necessary after add/del (optional)
+ *
+ * The IRQ bypass consumer structure represents an interrupt sink for
+ * participation in possible host bypass, for instance a hypervisor may
+ * support offloads to allow bypassing the host entirely or offload
+ * portions of the interrupt handling to the VM.
+ */
+struct irq_bypass_consumer {
+ struct list_head node;
+ void *token;
+ int (*add_producer)(struct irq_bypass_consumer *,
+ struct irq_bypass_producer *);
+ void (*del_producer)(struct irq_bypass_consumer *,
+ struct irq_bypass_producer *);
+ void (*stop)(struct irq_bypass_consumer *);
+ void (*start)(struct irq_bypass_consumer *);
+};
+
+int irq_bypass_register_producer(struct irq_bypass_producer *);
+void irq_bypass_unregister_producer(struct irq_bypass_producer *);
+int irq_bypass_register_consumer(struct irq_bypass_consumer *);
+void irq_bypass_unregister_consumer(struct irq_bypass_consumer *);
+
+#endif /* IRQBYPASS_H */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1bef9e2..9596a2f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -24,6 +24,7 @@
#include <linux/err.h>
#include <linux/irqflags.h>
#include <linux/context_tracking.h>
+#include <linux/irqbypass.h>
#include <asm/signal.h>
#include <linux/kvm.h>
@@ -140,6 +141,8 @@
#define KVM_REQ_APIC_PAGE_RELOAD 25
#define KVM_REQ_SMI 26
#define KVM_REQ_HV_CRASH 27
+#define KVM_REQ_IOAPIC_EOI_EXIT 28
+#define KVM_REQ_HV_RESET 29
#define KVM_USERSPACE_IRQ_SOURCE_ID 0
#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1
@@ -231,6 +234,9 @@
unsigned long requests;
unsigned long guest_debug;
+ int pre_pcpu;
+ struct list_head blocked_vcpu_list;
+
struct mutex mutex;
struct kvm_run *run;
@@ -329,6 +335,18 @@
struct hlist_node link;
};
+#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
+struct kvm_irq_routing_table {
+ int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
+ u32 nr_rt_entries;
+ /*
+ * Array indexed by gsi. Each entry contains list of irq chips
+ * the gsi is connected to.
+ */
+ struct hlist_head map[0];
+};
+#endif
+
#ifndef KVM_PRIVATE_MEM_SLOTS
#define KVM_PRIVATE_MEM_SLOTS 0
#endif
@@ -455,10 +473,14 @@
#ifdef __KVM_HAVE_IOAPIC
void kvm_vcpu_request_scan_ioapic(struct kvm *kvm);
+void kvm_arch_irq_routing_update(struct kvm *kvm);
#else
static inline void kvm_vcpu_request_scan_ioapic(struct kvm *kvm)
{
}
+static inline void kvm_arch_irq_routing_update(struct kvm *kvm)
+{
+}
#endif
#ifdef CONFIG_HAVE_KVM_IRQFD
@@ -1002,6 +1024,7 @@
#endif
int kvm_setup_default_irq_routing(struct kvm *kvm);
+int kvm_setup_empty_irq_routing(struct kvm *kvm);
int kvm_set_irq_routing(struct kvm *kvm,
const struct kvm_irq_routing_entry *entries,
unsigned nr,
@@ -1144,5 +1167,15 @@
{
}
#endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
-#endif
+#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *,
+ struct irq_bypass_producer *);
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *,
+ struct irq_bypass_producer *);
+void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *);
+void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *);
+int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set);
+#endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */
+#endif
diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
new file mode 100644
index 0000000..0c1de05
--- /dev/null
+++ b/include/linux/kvm_irqfd.h
@@ -0,0 +1,71 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * irqfd: Allows an fd to be used to inject an interrupt to the guest
+ * Credit goes to Avi Kivity for the original idea.
+ */
+
+#ifndef __LINUX_KVM_IRQFD_H
+#define __LINUX_KVM_IRQFD_H
+
+#include <linux/kvm_host.h>
+#include <linux/poll.h>
+
+/*
+ * Resampling irqfds are a special variety of irqfds used to emulate
+ * level triggered interrupts. The interrupt is asserted on eventfd
+ * trigger. On acknowledgment through the irq ack notifier, the
+ * interrupt is de-asserted and userspace is notified through the
+ * resamplefd. All resamplers on the same gsi are de-asserted
+ * together, so we don't need to track the state of each individual
+ * user. We can also therefore share the same irq source ID.
+ */
+struct kvm_kernel_irqfd_resampler {
+ struct kvm *kvm;
+ /*
+ * List of resampling struct _irqfd objects sharing this gsi.
+ * RCU list modified under kvm->irqfds.resampler_lock
+ */
+ struct list_head list;
+ struct kvm_irq_ack_notifier notifier;
+ /*
+ * Entry in list of kvm->irqfd.resampler_list. Use for sharing
+ * resamplers among irqfds on the same gsi.
+ * Accessed and modified under kvm->irqfds.resampler_lock
+ */
+ struct list_head link;
+};
+
+struct kvm_kernel_irqfd {
+ /* Used for MSI fast-path */
+ struct kvm *kvm;
+ wait_queue_t wait;
+ /* Update side is protected by irqfds.lock */
+ struct kvm_kernel_irq_routing_entry irq_entry;
+ seqcount_t irq_entry_sc;
+ /* Used for level IRQ fast-path */
+ int gsi;
+ struct work_struct inject;
+ /* The resampler used by this irqfd (resampler-only) */
+ struct kvm_kernel_irqfd_resampler *resampler;
+ /* Eventfd notified on resample (resampler-only) */
+ struct eventfd_ctx *resamplefd;
+ /* Entry in list of irqfds for a resampler (resampler-only) */
+ struct list_head resampler_link;
+ /* Used for setup/shutdown */
+ struct eventfd_ctx *eventfd;
+ struct list_head list;
+ poll_table pt;
+ struct work_struct shutdown;
+ struct irq_bypass_consumer consumer;
+ struct irq_bypass_producer *producer;
+};
+
+#endif /* __LINUX_KVM_IRQFD_H */
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index a9256f0..03f3618 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -183,6 +183,7 @@
#define KVM_EXIT_EPR 23
#define KVM_EXIT_SYSTEM_EVENT 24
#define KVM_EXIT_S390_STSI 25
+#define KVM_EXIT_IOAPIC_EOI 26
/* For KVM_EXIT_INTERNAL_ERROR */
/* Emulate instruction failed. */
@@ -333,6 +334,10 @@
__u8 sel1;
__u16 sel2;
} s390_stsi;
+ /* KVM_EXIT_IOAPIC_EOI */
+ struct {
+ __u8 vector;
+ } eoi;
/* Fix the size of the union. */
char padding[256];
};
@@ -824,6 +829,8 @@
#define KVM_CAP_MULTI_ADDRESS_SPACE 118
#define KVM_CAP_GUEST_DEBUG_HW_BPS 119
#define KVM_CAP_GUEST_DEBUG_HW_WPS 120
+#define KVM_CAP_SPLIT_IRQCHIP 121
+#define KVM_CAP_IOEVENTFD_ANY_LENGTH 122
#ifdef KVM_CAP_IRQ_ROUTING
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8cbc3db..26a5446 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -444,6 +444,7 @@
*ut = p->utime;
*st = p->stime;
}
+EXPORT_SYMBOL_GPL(task_cputime_adjusted);
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
@@ -652,6 +653,7 @@
task_cputime(p, &cputime.utime, &cputime.stime);
cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
+EXPORT_SYMBOL_GPL(task_cputime_adjusted);
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
diff --git a/tools/lib/traceevent/plugin_kvm.c b/tools/lib/traceevent/plugin_kvm.c
index 88fe83d..18536f7 100644
--- a/tools/lib/traceevent/plugin_kvm.c
+++ b/tools/lib/traceevent/plugin_kvm.c
@@ -124,7 +124,10 @@
_ER(WBINVD, 54) \
_ER(XSETBV, 55) \
_ER(APIC_WRITE, 56) \
- _ER(INVPCID, 58)
+ _ER(INVPCID, 58) \
+ _ER(PML_FULL, 62) \
+ _ER(XSAVES, 63) \
+ _ER(XRSTORS, 64)
#define SVM_EXIT_REASONS \
_ER(EXIT_READ_CR0, 0x000) \
@@ -352,15 +355,18 @@
union kvm_mmu_page_role {
unsigned word;
struct {
- unsigned glevels:4;
unsigned level:4;
+ unsigned cr4_pae:1;
unsigned quadrant:2;
- unsigned pad_for_nice_hex_output:6;
unsigned direct:1;
unsigned access:3;
unsigned invalid:1;
- unsigned cr4_pge:1;
unsigned nxe:1;
+ unsigned cr0_wp:1;
+ unsigned smep_and_not_wp:1;
+ unsigned smap_and_not_wp:1;
+ unsigned pad_for_nice_hex_output:8;
+ unsigned smm:8;
};
};
@@ -385,15 +391,18 @@
if (pevent_is_file_bigendian(event->pevent) ==
pevent_is_host_bigendian(event->pevent)) {
- trace_seq_printf(s, "%u/%u q%u%s %s%s %spge %snxe",
+ trace_seq_printf(s, "%u q%u%s %s%s %spae %snxe %swp%s%s%s",
role.level,
- role.glevels,
role.quadrant,
role.direct ? " direct" : "",
access_str[role.access],
role.invalid ? " invalid" : "",
- role.cr4_pge ? "" : "!",
- role.nxe ? "" : "!");
+ role.cr4_pae ? "" : "!",
+ role.nxe ? "" : "!",
+ role.cr0_wp ? "" : "!",
+ role.smep_and_not_wp ? " smep" : "",
+ role.smap_and_not_wp ? " smap" : "",
+ role.smm ? " smm" : "");
} else
trace_seq_printf(s, "WORD: %08x", role.word);
diff --git a/virt/Makefile b/virt/Makefile
new file mode 100644
index 0000000..be78347
--- /dev/null
+++ b/virt/Makefile
@@ -0,0 +1 @@
+obj-y += lib/
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index e2c876d..9f8014d 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -47,3 +47,6 @@
config KVM_COMPAT
def_bool y
depends on COMPAT && !S390
+
+config HAVE_KVM_IRQ_BYPASS
+ bool
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 79db453..b637965 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -23,6 +23,7 @@
#include <linux/kvm_host.h>
#include <linux/kvm.h>
+#include <linux/kvm_irqfd.h>
#include <linux/workqueue.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
@@ -34,73 +35,20 @@
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/seqlock.h>
+#include <linux/irqbypass.h>
#include <trace/events/kvm.h>
#include <kvm/iodev.h>
#ifdef CONFIG_HAVE_KVM_IRQFD
-/*
- * --------------------------------------------------------------------
- * irqfd: Allows an fd to be used to inject an interrupt to the guest
- *
- * Credit goes to Avi Kivity for the original idea.
- * --------------------------------------------------------------------
- */
-
-/*
- * Resampling irqfds are a special variety of irqfds used to emulate
- * level triggered interrupts. The interrupt is asserted on eventfd
- * trigger. On acknowledgement through the irq ack notifier, the
- * interrupt is de-asserted and userspace is notified through the
- * resamplefd. All resamplers on the same gsi are de-asserted
- * together, so we don't need to track the state of each individual
- * user. We can also therefore share the same irq source ID.
- */
-struct _irqfd_resampler {
- struct kvm *kvm;
- /*
- * List of resampling struct _irqfd objects sharing this gsi.
- * RCU list modified under kvm->irqfds.resampler_lock
- */
- struct list_head list;
- struct kvm_irq_ack_notifier notifier;
- /*
- * Entry in list of kvm->irqfd.resampler_list. Use for sharing
- * resamplers among irqfds on the same gsi.
- * Accessed and modified under kvm->irqfds.resampler_lock
- */
- struct list_head link;
-};
-
-struct _irqfd {
- /* Used for MSI fast-path */
- struct kvm *kvm;
- wait_queue_t wait;
- /* Update side is protected by irqfds.lock */
- struct kvm_kernel_irq_routing_entry irq_entry;
- seqcount_t irq_entry_sc;
- /* Used for level IRQ fast-path */
- int gsi;
- struct work_struct inject;
- /* The resampler used by this irqfd (resampler-only) */
- struct _irqfd_resampler *resampler;
- /* Eventfd notified on resample (resampler-only) */
- struct eventfd_ctx *resamplefd;
- /* Entry in list of irqfds for a resampler (resampler-only) */
- struct list_head resampler_link;
- /* Used for setup/shutdown */
- struct eventfd_ctx *eventfd;
- struct list_head list;
- poll_table pt;
- struct work_struct shutdown;
-};
static struct workqueue_struct *irqfd_cleanup_wq;
static void
irqfd_inject(struct work_struct *work)
{
- struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(work, struct kvm_kernel_irqfd, inject);
struct kvm *kvm = irqfd->kvm;
if (!irqfd->resampler) {
@@ -121,12 +69,13 @@
static void
irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
{
- struct _irqfd_resampler *resampler;
+ struct kvm_kernel_irqfd_resampler *resampler;
struct kvm *kvm;
- struct _irqfd *irqfd;
+ struct kvm_kernel_irqfd *irqfd;
int idx;
- resampler = container_of(kian, struct _irqfd_resampler, notifier);
+ resampler = container_of(kian,
+ struct kvm_kernel_irqfd_resampler, notifier);
kvm = resampler->kvm;
kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
@@ -141,9 +90,9 @@
}
static void
-irqfd_resampler_shutdown(struct _irqfd *irqfd)
+irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
{
- struct _irqfd_resampler *resampler = irqfd->resampler;
+ struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler;
struct kvm *kvm = resampler->kvm;
mutex_lock(&kvm->irqfds.resampler_lock);
@@ -168,7 +117,8 @@
static void
irqfd_shutdown(struct work_struct *work)
{
- struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(work, struct kvm_kernel_irqfd, shutdown);
u64 cnt;
/*
@@ -191,6 +141,9 @@
/*
* It is now safe to release the object's resources
*/
+#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
+ irq_bypass_unregister_consumer(&irqfd->consumer);
+#endif
eventfd_ctx_put(irqfd->eventfd);
kfree(irqfd);
}
@@ -198,7 +151,7 @@
/* assumes kvm->irqfds.lock is held */
static bool
-irqfd_is_active(struct _irqfd *irqfd)
+irqfd_is_active(struct kvm_kernel_irqfd *irqfd)
{
return list_empty(&irqfd->list) ? false : true;
}
@@ -209,7 +162,7 @@
* assumes kvm->irqfds.lock is held
*/
static void
-irqfd_deactivate(struct _irqfd *irqfd)
+irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
{
BUG_ON(!irqfd_is_active(irqfd));
@@ -224,7 +177,8 @@
static int
irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
- struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(wait, struct kvm_kernel_irqfd, wait);
unsigned long flags = (unsigned long)key;
struct kvm_kernel_irq_routing_entry irq;
struct kvm *kvm = irqfd->kvm;
@@ -274,12 +228,13 @@
irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
poll_table *pt)
{
- struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt);
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(pt, struct kvm_kernel_irqfd, pt);
add_wait_queue(wqh, &irqfd->wait);
}
/* Must be called under irqfds.lock */
-static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd)
+static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
@@ -301,10 +256,29 @@
write_seqcount_end(&irqfd->irq_entry_sc);
}
+#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
+void __attribute__((weak)) kvm_arch_irq_bypass_stop(
+ struct irq_bypass_consumer *cons)
+{
+}
+
+void __attribute__((weak)) kvm_arch_irq_bypass_start(
+ struct irq_bypass_consumer *cons)
+{
+}
+
+int __attribute__((weak)) kvm_arch_update_irqfd_routing(
+ struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ return 0;
+}
+#endif
+
static int
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
- struct _irqfd *irqfd, *tmp;
+ struct kvm_kernel_irqfd *irqfd, *tmp;
struct fd f;
struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
int ret;
@@ -340,7 +314,7 @@
irqfd->eventfd = eventfd;
if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
- struct _irqfd_resampler *resampler;
+ struct kvm_kernel_irqfd_resampler *resampler;
resamplefd = eventfd_ctx_fdget(args->resamplefd);
if (IS_ERR(resamplefd)) {
@@ -428,6 +402,17 @@
* we might race against the POLLHUP
*/
fdput(f);
+#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
+ irqfd->consumer.token = (void *)irqfd->eventfd;
+ irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
+ irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
+ irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
+ irqfd->consumer.start = kvm_arch_irq_bypass_start;
+ ret = irq_bypass_register_consumer(&irqfd->consumer);
+ if (ret)
+ pr_info("irq bypass consumer (token %p) registration fails: %d\n",
+ irqfd->consumer.token, ret);
+#endif
return 0;
@@ -525,7 +510,7 @@
static int
kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
{
- struct _irqfd *irqfd, *tmp;
+ struct kvm_kernel_irqfd *irqfd, *tmp;
struct eventfd_ctx *eventfd;
eventfd = eventfd_ctx_fdget(args->fd);
@@ -581,7 +566,7 @@
void
kvm_irqfd_release(struct kvm *kvm)
{
- struct _irqfd *irqfd, *tmp;
+ struct kvm_kernel_irqfd *irqfd, *tmp;
spin_lock_irq(&kvm->irqfds.lock);
@@ -604,13 +589,23 @@
*/
void kvm_irq_routing_update(struct kvm *kvm)
{
- struct _irqfd *irqfd;
+ struct kvm_kernel_irqfd *irqfd;
spin_lock_irq(&kvm->irqfds.lock);
- list_for_each_entry(irqfd, &kvm->irqfds.items, list)
+ list_for_each_entry(irqfd, &kvm->irqfds.items, list) {
irqfd_update(kvm, irqfd);
+#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
+ if (irqfd->producer) {
+ int ret = kvm_arch_update_irqfd_routing(
+ irqfd->kvm, irqfd->producer->irq,
+ irqfd->gsi, 1);
+ WARN_ON(ret);
+ }
+#endif
+ }
+
spin_unlock_irq(&kvm->irqfds.lock);
}
@@ -914,9 +909,7 @@
return -EINVAL;
/* ioeventfd with no length can't be combined with DATAMATCH */
- if (!args->len &&
- args->flags & (KVM_IOEVENTFD_FLAG_PIO |
- KVM_IOEVENTFD_FLAG_DATAMATCH))
+ if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH))
return -EINVAL;
ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args);
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index d7ea8e2..716a1c4 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -31,16 +31,6 @@
#include <trace/events/kvm.h>
#include "irq.h"
-struct kvm_irq_routing_table {
- int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
- u32 nr_rt_entries;
- /*
- * Array indexed by gsi. Each entry contains list of irq chips
- * the gsi is connected to.
- */
- struct hlist_head map[0];
-};
-
int kvm_irq_map_gsi(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *entries, int gsi)
{
@@ -231,6 +221,8 @@
kvm_irq_routing_update(kvm);
mutex_unlock(&kvm->irq_lock);
+ kvm_arch_irq_routing_update(kvm);
+
synchronize_srcu_expedited(&kvm->irq_srcu);
new = old;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8db1d93..a75502c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -230,6 +230,9 @@
init_waitqueue_head(&vcpu->wq);
kvm_async_pf_vcpu_init(vcpu);
+ vcpu->pre_pcpu = -1;
+ INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
+
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!page) {
r = -ENOMEM;
@@ -2718,6 +2721,7 @@
case KVM_CAP_IRQFD:
case KVM_CAP_IRQFD_RESAMPLE:
#endif
+ case KVM_CAP_IOEVENTFD_ANY_LENGTH:
case KVM_CAP_CHECK_EXTENSION_VM:
return 1;
#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
@@ -3341,7 +3345,7 @@
if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
return -ENOSPC;
- new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) *
+ new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) *
sizeof(struct kvm_io_range)), GFP_KERNEL);
if (!new_bus)
return -ENOMEM;
@@ -3373,7 +3377,7 @@
if (r)
return r;
- new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) *
+ new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) *
sizeof(struct kvm_io_range)), GFP_KERNEL);
if (!new_bus)
return -ENOMEM;
diff --git a/virt/lib/Kconfig b/virt/lib/Kconfig
new file mode 100644
index 0000000..89a414f8
--- /dev/null
+++ b/virt/lib/Kconfig
@@ -0,0 +1,2 @@
+config IRQ_BYPASS_MANAGER
+ tristate
diff --git a/virt/lib/Makefile b/virt/lib/Makefile
new file mode 100644
index 0000000..901228d
--- /dev/null
+++ b/virt/lib/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_IRQ_BYPASS_MANAGER) += irqbypass.o
diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c
new file mode 100644
index 0000000..09a03b5
--- /dev/null
+++ b/virt/lib/irqbypass.c
@@ -0,0 +1,257 @@
+/*
+ * IRQ offload/bypass manager
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ * Copyright (c) 2015 Linaro Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Various virtualization hardware acceleration techniques allow bypassing or
+ * offloading interrupts received from devices around the host kernel. Posted
+ * Interrupts on Intel VT-d systems can allow interrupts to be received
+ * directly by a virtual machine. ARM IRQ Forwarding allows forwarded physical
+ * interrupts to be directly deactivated by the guest. This manager allows
+ * interrupt producers and consumers to find each other to enable this sort of
+ * bypass.
+ */
+
+#include <linux/irqbypass.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("IRQ bypass manager utility module");
+
+static LIST_HEAD(producers);
+static LIST_HEAD(consumers);
+static DEFINE_MUTEX(lock);
+
+/* @lock must be held when calling connect */
+static int __connect(struct irq_bypass_producer *prod,
+ struct irq_bypass_consumer *cons)
+{
+ int ret = 0;
+
+ if (prod->stop)
+ prod->stop(prod);
+ if (cons->stop)
+ cons->stop(cons);
+
+ if (prod->add_consumer)
+ ret = prod->add_consumer(prod, cons);
+
+ if (!ret) {
+ ret = cons->add_producer(cons, prod);
+ if (ret && prod->del_consumer)
+ prod->del_consumer(prod, cons);
+ }
+
+ if (cons->start)
+ cons->start(cons);
+ if (prod->start)
+ prod->start(prod);
+
+ return ret;
+}
+
+/* @lock must be held when calling disconnect */
+static void __disconnect(struct irq_bypass_producer *prod,
+ struct irq_bypass_consumer *cons)
+{
+ if (prod->stop)
+ prod->stop(prod);
+ if (cons->stop)
+ cons->stop(cons);
+
+ cons->del_producer(cons, prod);
+
+ if (prod->del_consumer)
+ prod->del_consumer(prod, cons);
+
+ if (cons->start)
+ cons->start(cons);
+ if (prod->start)
+ prod->start(prod);
+}
+
+/**
+ * irq_bypass_register_producer - register IRQ bypass producer
+ * @producer: pointer to producer structure
+ *
+ * Add the provided IRQ producer to the list of producers and connect
+ * with any matching token found on the IRQ consumers list.
+ */
+int irq_bypass_register_producer(struct irq_bypass_producer *producer)
+{
+ struct irq_bypass_producer *tmp;
+ struct irq_bypass_consumer *consumer;
+
+ might_sleep();
+
+ if (!try_module_get(THIS_MODULE))
+ return -ENODEV;
+
+ mutex_lock(&lock);
+
+ list_for_each_entry(tmp, &producers, node) {
+ if (tmp->token == producer->token) {
+ mutex_unlock(&lock);
+ module_put(THIS_MODULE);
+ return -EBUSY;
+ }
+ }
+
+ list_for_each_entry(consumer, &consumers, node) {
+ if (consumer->token == producer->token) {
+ int ret = __connect(producer, consumer);
+ if (ret) {
+ mutex_unlock(&lock);
+ module_put(THIS_MODULE);
+ return ret;
+ }
+ break;
+ }
+ }
+
+ list_add(&producer->node, &producers);
+
+ mutex_unlock(&lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_bypass_register_producer);
+
+/**
+ * irq_bypass_unregister_producer - unregister IRQ bypass producer
+ * @producer: pointer to producer structure
+ *
+ * Remove a previously registered IRQ producer from the list of producers
+ * and disconnect it from any connected IRQ consumer.
+ */
+void irq_bypass_unregister_producer(struct irq_bypass_producer *producer)
+{
+ struct irq_bypass_producer *tmp;
+ struct irq_bypass_consumer *consumer;
+
+ might_sleep();
+
+ if (!try_module_get(THIS_MODULE))
+ return; /* nothing in the list anyway */
+
+ mutex_lock(&lock);
+
+ list_for_each_entry(tmp, &producers, node) {
+ if (tmp->token != producer->token)
+ continue;
+
+ list_for_each_entry(consumer, &consumers, node) {
+ if (consumer->token == producer->token) {
+ __disconnect(producer, consumer);
+ break;
+ }
+ }
+
+ list_del(&producer->node);
+ module_put(THIS_MODULE);
+ break;
+ }
+
+ mutex_unlock(&lock);
+
+ module_put(THIS_MODULE);
+}
+EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer);
+
+/**
+ * irq_bypass_register_consumer - register IRQ bypass consumer
+ * @consumer: pointer to consumer structure
+ *
+ * Add the provided IRQ consumer to the list of consumers and connect
+ * with any matching token found on the IRQ producer list.
+ */
+int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer)
+{
+ struct irq_bypass_consumer *tmp;
+ struct irq_bypass_producer *producer;
+
+ if (!consumer->add_producer || !consumer->del_producer)
+ return -EINVAL;
+
+ might_sleep();
+
+ if (!try_module_get(THIS_MODULE))
+ return -ENODEV;
+
+ mutex_lock(&lock);
+
+ list_for_each_entry(tmp, &consumers, node) {
+ if (tmp->token == consumer->token) {
+ mutex_unlock(&lock);
+ module_put(THIS_MODULE);
+ return -EBUSY;
+ }
+ }
+
+ list_for_each_entry(producer, &producers, node) {
+ if (producer->token == consumer->token) {
+ int ret = __connect(producer, consumer);
+ if (ret) {
+ mutex_unlock(&lock);
+ module_put(THIS_MODULE);
+ return ret;
+ }
+ break;
+ }
+ }
+
+ list_add(&consumer->node, &consumers);
+
+ mutex_unlock(&lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_bypass_register_consumer);
+
+/**
+ * irq_bypass_unregister_consumer - unregister IRQ bypass consumer
+ * @consumer: pointer to consumer structure
+ *
+ * Remove a previously registered IRQ consumer from the list of consumers
+ * and disconnect it from any connected IRQ producer.
+ */
+void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer)
+{
+ struct irq_bypass_consumer *tmp;
+ struct irq_bypass_producer *producer;
+
+ might_sleep();
+
+ if (!try_module_get(THIS_MODULE))
+ return; /* nothing in the list anyway */
+
+ mutex_lock(&lock);
+
+ list_for_each_entry(tmp, &consumers, node) {
+ if (tmp->token != consumer->token)
+ continue;
+
+ list_for_each_entry(producer, &producers, node) {
+ if (producer->token == consumer->token) {
+ __disconnect(producer, consumer);
+ break;
+ }
+ }
+
+ list_del(&consumer->node);
+ module_put(THIS_MODULE);
+ break;
+ }
+
+ mutex_unlock(&lock);
+
+ module_put(THIS_MODULE);
+}
+EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer);