Merge tag 'kvmarm-5.10' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD
KVM/arm64 updates for Linux 5.10
- New page table code for both hypervisor and guest stage-2
- Introduction of a new EL2-private host context
- Allow EL2 to have its own private per-CPU variables
- Support of PMU event filtering
- Complete rework of the Spectre mitigation
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index d2b733d..425325f 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -4498,11 +4498,14 @@
- HYPERV_CPUID_ENLIGHTMENT_INFO
- HYPERV_CPUID_IMPLEMENT_LIMITS
- HYPERV_CPUID_NESTED_FEATURES
+ - HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS
+ - HYPERV_CPUID_SYNDBG_INTERFACE
+ - HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES
HYPERV_CPUID_NESTED_FEATURES leaf is only exposed when Enlightened VMCS was
enabled on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS).
-Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure
+Userspace invokes KVM_GET_SUPPORTED_HV_CPUID by passing a kvm_cpuid2 structure
with the 'nent' field indicating the number of entries in the variable-size
array 'entries'. If the number of entries is too low to describe all Hyper-V
feature leaves, an error (E2BIG) is returned. If the number is more or equal
@@ -4704,6 +4707,99 @@
Verify the integrity of the unpacked image. Only if this succeeds,
KVM is allowed to start protected VCPUs.
+4.126 KVM_X86_SET_MSR_FILTER
+----------------------------
+
+:Capability: KVM_X86_SET_MSR_FILTER
+:Architectures: x86
+:Type: vm ioctl
+:Parameters: struct kvm_msr_filter
+:Returns: 0 on success, < 0 on error
+
+::
+
+ struct kvm_msr_filter_range {
+ #define KVM_MSR_FILTER_READ (1 << 0)
+ #define KVM_MSR_FILTER_WRITE (1 << 1)
+ __u32 flags;
+ __u32 nmsrs; /* number of msrs in bitmap */
+ __u32 base; /* MSR index the bitmap starts at */
+ __u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */
+ };
+
+ #define KVM_MSR_FILTER_MAX_RANGES 16
+ struct kvm_msr_filter {
+ #define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0)
+ #define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0)
+ __u32 flags;
+ struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES];
+ };
+
+flags values for struct kvm_msr_filter_range:
+
+KVM_MSR_FILTER_READ
+
+ Filter read accesses to MSRs using the given bitmap. A 0 in the bitmap
+ indicates that a read should immediately fail, while a 1 indicates that
+ a read for a particular MSR should be handled regardless of the default
+ filter action.
+
+KVM_MSR_FILTER_WRITE
+
+ Filter write accesses to MSRs using the given bitmap. A 0 in the bitmap
+ indicates that a write should immediately fail, while a 1 indicates that
+ a write for a particular MSR should be handled regardless of the default
+ filter action.
+
+KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE
+
+ Filter both read and write accesses to MSRs using the given bitmap. A 0
+ in the bitmap indicates that both reads and writes should immediately fail,
+ while a 1 indicates that reads and writes for a particular MSR are not
+ filtered by this range.
+
+flags values for struct kvm_msr_filter:
+
+KVM_MSR_FILTER_DEFAULT_ALLOW
+
+ If no filter range matches an MSR index that is getting accessed, KVM will
+ fall back to allowing access to the MSR.
+
+KVM_MSR_FILTER_DEFAULT_DENY
+
+ If no filter range matches an MSR index that is getting accessed, KVM will
+ fall back to rejecting access to the MSR. In this mode, all MSRs that should
+ be processed by KVM need to explicitly be marked as allowed in the bitmaps.
+
+This ioctl allows user space to define up to 16 bitmaps of MSR ranges to
+specify whether a certain MSR access should be explicitly filtered for or not.
+
+If this ioctl has never been invoked, MSR accesses are not guarded and the
+old KVM in-kernel emulation behavior is fully preserved.
+
+As soon as the filtering is in place, every MSR access is processed through
+the filtering. If a bit is within one of the defined ranges, read and write
+accesses are guarded by the bitmap's value for the MSR index. If it is not
+defined in any range, whether MSR access is rejected is determined by the flags
+field in the kvm_msr_filter struct: KVM_MSR_FILTER_DEFAULT_ALLOW and
+KVM_MSR_FILTER_DEFAULT_DENY.
+
+Calling this ioctl with an empty set of ranges (all nmsrs == 0) disables MSR
+filtering. In that mode, KVM_MSR_FILTER_DEFAULT_DENY no longer has any effect.
+
+Each bitmap range specifies a range of MSRs to potentially allow access on.
+The range goes from MSR index [base .. base+nmsrs]. The flags field
+indicates whether reads, writes or both reads and writes are filtered
+by setting a 1 bit in the bitmap for the corresponding MSR index.
+
+If an MSR access is not permitted through the filtering, it generates a
+#GP inside the guest. When combined with KVM_CAP_X86_USER_SPACE_MSR, that
+allows user space to deflect and potentially handle various MSR accesses
+into user space.
+
+If a vCPU is in running state while this ioctl is invoked, the vCPU may
+experience inconsistent filtering behavior on MSR accesses.
+
5. The kvm_run structure
========================
@@ -4869,14 +4965,13 @@
.. note::
- For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR and
- KVM_EXIT_EPR the corresponding
-
-operations are complete (and guest state is consistent) only after userspace
-has re-entered the kernel with KVM_RUN. The kernel side will first finish
-incomplete operations and then check for pending signals. Userspace
-can re-enter the guest with an unmasked signal pending to complete
-pending operations.
+ For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR,
+ KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
+ operations are complete (and guest state is consistent) only after userspace
+ has re-entered the kernel with KVM_RUN. The kernel side will first finish
+ incomplete operations and then check for pending signals. Userspace
+ can re-enter the guest with an unmasked signal pending to complete
+ pending operations.
::
@@ -5165,6 +5260,44 @@
::
+ /* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */
+ struct {
+ __u8 error; /* user -> kernel */
+ __u8 pad[7];
+ __u32 reason; /* kernel -> user */
+ __u32 index; /* kernel -> user */
+ __u64 data; /* kernel <-> user */
+ } msr;
+
+Used on x86 systems. When the VM capability KVM_CAP_X86_USER_SPACE_MSR is
+enabled, MSR accesses to registers that would invoke a #GP by KVM kernel code
+will instead trigger a KVM_EXIT_X86_RDMSR exit for reads and KVM_EXIT_X86_WRMSR
+exit for writes.
+
+The "reason" field specifies why the MSR trap occurred. User space will only
+receive MSR exit traps when a particular reason was requested during through
+ENABLE_CAP. Currently valid exit reasons are:
+
+ KVM_MSR_EXIT_REASON_UNKNOWN - access to MSR that is unknown to KVM
+ KVM_MSR_EXIT_REASON_INVAL - access to invalid MSRs or reserved bits
+ KVM_MSR_EXIT_REASON_FILTER - access blocked by KVM_X86_SET_MSR_FILTER
+
+For KVM_EXIT_X86_RDMSR, the "index" field tells user space which MSR the guest
+wants to read. To respond to this request with a successful read, user space
+writes the respective data into the "data" field and must continue guest
+execution to ensure the read data is transferred into guest register state.
+
+If the RDMSR request was unsuccessful, user space indicates that with a "1" in
+the "error" field. This will inject a #GP into the guest when the VCPU is
+executed again.
+
+For KVM_EXIT_X86_WRMSR, the "index" field tells user space which MSR the guest
+wants to write. Once finished processing the event, user space must continue
+vCPU execution. If the MSR write was unsuccessful, user space also sets the
+"error" field to "1".
+
+::
+
/* Fix the size of the union. */
char padding[256];
};
@@ -5852,6 +5985,28 @@
the maximum halt time to specified on a per-VM basis, effectively overriding
the module parameter for the target VM.
+7.21 KVM_CAP_X86_USER_SPACE_MSR
+-------------------------------
+
+:Architectures: x86
+:Target: VM
+:Parameters: args[0] contains the mask of KVM_MSR_EXIT_REASON_* events to report
+:Returns: 0 on success; -1 on error
+
+This capability enables trapping of #GP invoking RDMSR and WRMSR instructions
+into user space.
+
+When a guest requests to read or write an MSR, KVM may not implement all MSRs
+that are relevant to a respective system. It also does not differentiate by
+CPU type.
+
+To allow more fine grained control over MSR handling, user space may enable
+this capability. With it enabled, MSR accesses that match the mask specified in
+args[0] and trigger a #GP event inside the guest by KVM will instead trigger
+KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications which user space
+can then handle to implement model specific MSR handling and/or user notifications
+to inform a user that an MSR was not handled.
+
8. Other capabilities.
======================
@@ -6173,3 +6328,48 @@
is supported, than the other should as well and vice versa. For arm64
see Documentation/virt/kvm/devices/vcpu.rst "KVM_ARM_VCPU_PVTIME_CTRL".
For x86 see Documentation/virt/kvm/msr.rst "MSR_KVM_STEAL_TIME".
+
+8.25 KVM_CAP_S390_DIAG318
+-------------------------
+
+:Architectures: s390
+
+This capability enables a guest to set information about its control program
+(i.e. guest kernel type and version). The information is helpful during
+system/firmware service events, providing additional data about the guest
+environments running on the machine.
+
+The information is associated with the DIAGNOSE 0x318 instruction, which sets
+an 8-byte value consisting of a one-byte Control Program Name Code (CPNC) and
+a 7-byte Control Program Version Code (CPVC). The CPNC determines what
+environment the control program is running in (e.g. Linux, z/VM...), and the
+CPVC is used for information specific to OS (e.g. Linux version, Linux
+distribution...)
+
+If this capability is available, then the CPNC and CPVC can be synchronized
+between KVM and userspace via the sync regs mechanism (KVM_SYNC_DIAG318).
+
+8.26 KVM_CAP_X86_USER_SPACE_MSR
+-------------------------------
+
+:Architectures: x86
+
+This capability indicates that KVM supports deflection of MSR reads and
+writes to user space. It can be enabled on a VM level. If enabled, MSR
+accesses that would usually trigger a #GP by KVM into the guest will
+instead get bounced to user space through the KVM_EXIT_X86_RDMSR and
+KVM_EXIT_X86_WRMSR exit notifications.
+
+8.25 KVM_X86_SET_MSR_FILTER
+---------------------------
+
+:Architectures: x86
+
+This capability indicates that KVM supports that accesses to user defined MSRs
+may be rejected. With this capability exposed, KVM exports new VM ioctl
+KVM_X86_SET_MSR_FILTER which user space can call to specify bitmaps of MSR
+ranges that KVM should reject access to.
+
+In combination with KVM_CAP_X86_USER_SPACE_MSR, this allows user space to
+trap and emulate MSRs that are outside of the scope of KVM as well as
+limit the attack surface on KVM's MSR emulation code.
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 96eccb1..5ef2669 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -298,15 +298,15 @@ static __always_inline int kvm_vcpu_dabt_get_rd(const struct kvm_vcpu *vcpu)
return (kvm_vcpu_get_esr(vcpu) & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT;
}
-static __always_inline bool kvm_vcpu_dabt_iss1tw(const struct kvm_vcpu *vcpu)
+static __always_inline bool kvm_vcpu_abt_iss1tw(const struct kvm_vcpu *vcpu)
{
return !!(kvm_vcpu_get_esr(vcpu) & ESR_ELx_S1PTW);
}
+/* Always check for S1PTW *before* using this. */
static __always_inline bool kvm_vcpu_dabt_iswrite(const struct kvm_vcpu *vcpu)
{
- return !!(kvm_vcpu_get_esr(vcpu) & ESR_ELx_WNR) ||
- kvm_vcpu_dabt_iss1tw(vcpu); /* AF/DBM update */
+ return kvm_vcpu_get_esr(vcpu) & ESR_ELx_WNR;
}
static inline bool kvm_vcpu_dabt_is_cm(const struct kvm_vcpu *vcpu)
@@ -335,6 +335,11 @@ static inline bool kvm_vcpu_trap_is_iabt(const struct kvm_vcpu *vcpu)
return kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_IABT_LOW;
}
+static inline bool kvm_vcpu_trap_is_exec_fault(const struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_trap_is_iabt(vcpu) && !kvm_vcpu_abt_iss1tw(vcpu);
+}
+
static __always_inline u8 kvm_vcpu_trap_get_fault(const struct kvm_vcpu *vcpu)
{
return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC;
@@ -372,6 +377,9 @@ static __always_inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu)
static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
{
+ if (kvm_vcpu_abt_iss1tw(vcpu))
+ return true;
+
if (kvm_vcpu_trap_is_iabt(vcpu))
return false;
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index eeac62b..313a8fa3 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -446,7 +446,7 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
kvm_vcpu_trap_get_fault_type(vcpu) == FSC_FAULT &&
kvm_vcpu_dabt_isvalid(vcpu) &&
!kvm_vcpu_abt_issea(vcpu) &&
- !kvm_vcpu_dabt_iss1tw(vcpu);
+ !kvm_vcpu_abt_iss1tw(vcpu);
if (valid) {
int ret = __vgic_v2_perform_cpuif_access(vcpu);
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index a816cb8..19aacc7 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -759,7 +759,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_pgtable *pgt;
write_fault = kvm_is_write_fault(vcpu);
- exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
+ exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
VM_BUG_ON(write_fault && exec_fault);
if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
@@ -984,7 +984,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
goto out;
}
- if (kvm_vcpu_dabt_iss1tw(vcpu)) {
+ if (kvm_vcpu_abt_iss1tw(vcpu)) {
kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
ret = 1;
goto out_unlock;
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 825d337..24f3d0f 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -341,7 +341,7 @@ struct kvm_mips_tlb {
#define KVM_MIPS_GUEST_TLB_SIZE 64
struct kvm_vcpu_arch {
void *guest_ebase;
- int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu);
+ int (*vcpu_run)(struct kvm_vcpu *vcpu);
/* Host registers preserved across guest mode execution */
unsigned long host_stack;
@@ -852,7 +852,7 @@ int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks);
/* Debug: dump vcpu state */
int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu);
-extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu);
+extern int kvm_mips_handle_exit(struct kvm_vcpu *vcpu);
/* Building of entry/exception code */
int kvm_mips_entry_setup(void);
diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c
index fd71694..832475b 100644
--- a/arch/mips/kvm/entry.c
+++ b/arch/mips/kvm/entry.c
@@ -205,7 +205,7 @@ static inline void build_set_exc_base(u32 **p, unsigned int reg)
* Assemble the start of the vcpu_run function to run a guest VCPU. The function
* conforms to the following prototype:
*
- * int vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu);
+ * int vcpu_run(struct kvm_vcpu *vcpu);
*
* The exit from the guest and return to the caller is handled by the code
* generated by kvm_mips_build_ret_to_host().
@@ -218,8 +218,7 @@ void *kvm_mips_build_vcpu_run(void *addr)
unsigned int i;
/*
- * A0: run
- * A1: vcpu
+ * A0: vcpu
*/
/* k0/k1 not being used in host kernel context */
@@ -238,10 +237,10 @@ void *kvm_mips_build_vcpu_run(void *addr)
kvm_mips_build_save_scratch(&p, V1, K1);
/* VCPU scratch register has pointer to vcpu */
- UASM_i_MTC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]);
+ UASM_i_MTC0(&p, A0, scratch_vcpu[0], scratch_vcpu[1]);
/* Offset into vcpu->arch */
- UASM_i_ADDIU(&p, K1, A1, offsetof(struct kvm_vcpu, arch));
+ UASM_i_ADDIU(&p, K1, A0, offsetof(struct kvm_vcpu, arch));
/*
* Save the host stack to VCPU, used for exception processing
@@ -645,10 +644,7 @@ void *kvm_mips_build_exit(void *addr)
/* Now that context has been saved, we can use other registers */
/* Restore vcpu */
- UASM_i_MFC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]);
-
- /* Restore run (vcpu->run) */
- UASM_i_LW(&p, S0, offsetof(struct kvm_vcpu, run), S1);
+ UASM_i_MFC0(&p, S0, scratch_vcpu[0], scratch_vcpu[1]);
/*
* Save Host level EPC, BadVaddr and Cause to VCPU, useful to process
@@ -810,7 +806,6 @@ void *kvm_mips_build_exit(void *addr)
* with this in the kernel
*/
uasm_i_move(&p, A0, S0);
- uasm_i_move(&p, A1, S1);
UASM_i_LA(&p, T9, (unsigned long)kvm_mips_handle_exit);
uasm_i_jalr(&p, RA, T9);
UASM_i_ADDIU(&p, SP, SP, -CALLFRAME_SIZ);
@@ -852,7 +847,7 @@ static void *kvm_mips_build_ret_from_exit(void *addr)
* guest, reload k1
*/
- uasm_i_move(&p, K1, S1);
+ uasm_i_move(&p, K1, S0);
UASM_i_ADDIU(&p, K1, K1, offsetof(struct kvm_vcpu, arch));
/*
@@ -886,8 +881,8 @@ static void *kvm_mips_build_ret_to_guest(void *addr)
{
u32 *p = addr;
- /* Put the saved pointer to vcpu (s1) back into the scratch register */
- UASM_i_MTC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]);
+ /* Put the saved pointer to vcpu (s0) back into the scratch register */
+ UASM_i_MTC0(&p, S0, scratch_vcpu[0], scratch_vcpu[1]);
/* Load up the Guest EBASE to minimize the window where BEV is set */
UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1);
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 7de85d2..3d6a7f5 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -137,6 +137,8 @@ extern void kvm_init_loongson_ipi(struct kvm *kvm);
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
switch (type) {
+ case KVM_VM_MIPS_AUTO:
+ break;
#ifdef CONFIG_KVM_MIPS_VZ
case KVM_VM_MIPS_VZ:
#else
@@ -1197,8 +1199,9 @@ static void kvm_mips_set_c0_status(void)
/*
* Return value is in the form (errcode<<2 | RESUME_FLAG_HOST | RESUME_FLAG_NV)
*/
-int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
+int kvm_mips_handle_exit(struct kvm_vcpu *vcpu)
{
+ struct kvm_run *run = vcpu->run;
u32 cause = vcpu->arch.host_cp0_cause;
u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
u32 __user *opc = (u32 __user *) vcpu->arch.pc;
diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c
index f8cba51..0788c00 100644
--- a/arch/mips/kvm/trap_emul.c
+++ b/arch/mips/kvm/trap_emul.c
@@ -1241,7 +1241,7 @@ static int kvm_trap_emul_vcpu_run(struct kvm_vcpu *vcpu)
*/
kvm_mips_suspend_mm(cpu);
- r = vcpu->arch.vcpu_run(vcpu->run, vcpu);
+ r = vcpu->arch.vcpu_run(vcpu);
/* We may have migrated while handling guest exits */
cpu = smp_processor_id();
diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c
index c299e5d..2ffbe92 100644
--- a/arch/mips/kvm/vz.c
+++ b/arch/mips/kvm/vz.c
@@ -3266,7 +3266,7 @@ static int kvm_vz_vcpu_run(struct kvm_vcpu *vcpu)
kvm_vz_vcpu_load_tlb(vcpu, cpu);
kvm_vz_vcpu_load_wired(vcpu);
- r = vcpu->arch.vcpu_run(vcpu->run, vcpu);
+ r = vcpu->arch.vcpu_run(vcpu);
kvm_vz_vcpu_save_wired(vcpu);
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 10ded83..d67a470 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -326,6 +326,7 @@ struct kvm_arch {
#endif
#ifdef CONFIG_KVM_XICS
struct kvmppc_xics *xics;
+ struct kvmppc_xics *xics_device;
struct kvmppc_xive *xive; /* Current XIVE device in use */
struct {
struct kvmppc_xive *native;
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 49db50d..44bf567 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -558,12 +558,12 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
@@ -879,13 +879,15 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
#ifdef CONFIG_KVM_XICS
/*
- * Free the XIVE devices which are not directly freed by the
+ * Free the XIVE and XICS devices which are not directly freed by the
* device 'release' method
*/
kfree(kvm->arch.xive_devices.native);
kvm->arch.xive_devices.native = NULL;
kfree(kvm->arch.xive_devices.xics_on_xive);
kvm->arch.xive_devices.xics_on_xive = NULL;
+ kfree(kvm->arch.xics_device);
+ kvm->arch.xics_device = NULL;
#endif /* CONFIG_KVM_XICS */
}
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 22a677b..bb35490 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -347,7 +347,7 @@ static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
return __radix_pte_update(ptep, clr, set);
}
-void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
+static void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 1a529df..8da93fd 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -283,7 +283,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
struct kvmppc_spapr_tce_table *siter;
struct mm_struct *mm = kvm->mm;
unsigned long npages, size = args->size;
- int ret = -ENOMEM;
+ int ret;
if (!args->size || args->page_shift < 12 || args->page_shift > 34 ||
(args->offset + args->size > (ULLONG_MAX >> args->page_shift)))
@@ -489,7 +489,7 @@ static long kvmppc_tce_iommu_unmap(struct kvm *kvm,
return ret;
}
-long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
+static long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
unsigned long entry, unsigned long ua,
enum dma_data_direction dir)
{
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index ac6ac19..470e7c5 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -237,7 +237,7 @@ static long iommu_tce_xchg_no_kill_rm(struct mm_struct *mm,
return ret;
}
-extern void iommu_tce_kill_rm(struct iommu_table *tbl,
+static void iommu_tce_kill_rm(struct iommu_table *tbl,
unsigned long entry, unsigned long pages)
{
if (tbl->it_ops->tce_kill)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 4ba06a2..490a0f6 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3442,9 +3442,19 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
unsigned long host_psscr = mfspr(SPRN_PSSCR);
unsigned long host_pidr = mfspr(SPRN_PID);
+ /*
+ * P8 and P9 suppress the HDEC exception when LPCR[HDICE] = 0,
+ * so set HDICE before writing HDEC.
+ */
+ mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr | LPCR_HDICE);
+ isync();
+
hdec = time_limit - mftb();
- if (hdec < 0)
+ if (hdec < 0) {
+ mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr);
+ isync();
return BOOK3S_INTERRUPT_HV_DECREMENTER;
+ }
mtspr(SPRN_HDEC, hdec);
if (vc->tb_offset) {
@@ -3558,7 +3568,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
* Virtual-mode guest entry for POWER9 and later when the host and
* guest are both using the radix MMU. The LPIDR has already been set.
*/
-int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
+static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
unsigned long lpcr)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
@@ -3572,7 +3582,7 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
dec = mfspr(SPRN_DEC);
tb = mftb();
- if (dec < 512)
+ if (dec < 0)
return BOOK3S_INTERRUPT_HV_DECREMENTER;
local_paca->kvm_hstate.dec_expires = dec + tb;
if (local_paca->kvm_hstate.dec_expires < time_limit)
@@ -5250,6 +5260,12 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp,
case KVM_PPC_ALLOCATE_HTAB: {
u32 htab_order;
+ /* If we're a nested hypervisor, we currently only support radix */
+ if (kvmhv_on_pseries()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+
r = -EFAULT;
if (get_user(htab_order, (u32 __user *)argp))
break;
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
index 59822cb..327417d 100644
--- a/arch/powerpc/kvm/book3s_hv_interrupts.S
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -58,13 +58,16 @@
/*
* Put whatever is in the decrementer into the
* hypervisor decrementer.
+ * Because of a hardware deviation in P8 and P9,
+ * we need to set LPCR[HDICE] before writing HDEC.
*/
-BEGIN_FTR_SECTION
ld r5, HSTATE_KVM_VCORE(r13)
ld r6, VCORE_KVM(r5)
ld r9, KVM_HOST_LPCR(r6)
- andis. r9, r9, LPCR_LD@h
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+ ori r8, r9, LPCR_HDICE
+ mtspr SPRN_LPCR, r8
+ isync
+ andis. r0, r9, LPCR_LD@h
mfspr r8,SPRN_DEC
mftb r7
BEGIN_FTR_SECTION
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index 6822d23..33b5854 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -569,7 +569,7 @@ static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp)
kvmhv_set_nested_ptbl(gp);
}
-struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
+static struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
{
struct kvm_nested_guest *gp;
long shadow_lpid;
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 4d7e561..c2c9c73 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -764,7 +764,7 @@ int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
return ics_rm_eoi(vcpu, irq);
}
-unsigned long eoi_rc;
+static unsigned long eoi_rc;
static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
{
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 88fac22..b1fefa6 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -569,7 +569,7 @@ static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr)
#endif
}
-void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr)
+static void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr)
{
u32 host_pvr;
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 381bf8d..5fee5a1 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -1334,47 +1334,97 @@ static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
return -ENXIO;
}
-static void kvmppc_xics_free(struct kvm_device *dev)
+/*
+ * Called when device fd is closed. kvm->lock is held.
+ */
+static void kvmppc_xics_release(struct kvm_device *dev)
{
struct kvmppc_xics *xics = dev->private;
int i;
struct kvm *kvm = xics->kvm;
+ struct kvm_vcpu *vcpu;
+
+ pr_devel("Releasing xics device\n");
+
+ /*
+ * Since this is the device release function, we know that
+ * userspace does not have any open fd referring to the
+ * device. Therefore there can not be any of the device
+ * attribute set/get functions being executed concurrently,
+ * and similarly, the connect_vcpu and set/clr_mapped
+ * functions also cannot be being executed.
+ */
debugfs_remove(xics->dentry);
+ /*
+ * We should clean up the vCPU interrupt presenters first.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ /*
+ * Take vcpu->mutex to ensure that no one_reg get/set ioctl
+ * (i.e. kvmppc_xics_[gs]et_icp) can be done concurrently.
+ * Holding the vcpu->mutex also means that execution is
+ * excluded for the vcpu until the ICP was freed. When the vcpu
+ * can execute again, vcpu->arch.icp and vcpu->arch.irq_type
+ * have been cleared and the vcpu will not be going into the
+ * XICS code anymore.
+ */
+ mutex_lock(&vcpu->mutex);
+ kvmppc_xics_free_icp(vcpu);
+ mutex_unlock(&vcpu->mutex);
+ }
+
if (kvm)
kvm->arch.xics = NULL;
- for (i = 0; i <= xics->max_icsid; i++)
+ for (i = 0; i <= xics->max_icsid; i++) {
kfree(xics->ics[i]);
- kfree(xics);
+ xics->ics[i] = NULL;
+ }
+ /*
+ * A reference of the kvmppc_xics pointer is now kept under
+ * the xics_device pointer of the machine for reuse. It is
+ * freed when the VM is destroyed for now until we fix all the
+ * execution paths.
+ */
kfree(dev);
}
+static struct kvmppc_xics *kvmppc_xics_get_device(struct kvm *kvm)
+{
+ struct kvmppc_xics **kvm_xics_device = &kvm->arch.xics_device;
+ struct kvmppc_xics *xics = *kvm_xics_device;
+
+ if (!xics) {
+ xics = kzalloc(sizeof(*xics), GFP_KERNEL);
+ *kvm_xics_device = xics;
+ } else {
+ memset(xics, 0, sizeof(*xics));
+ }
+
+ return xics;
+}
+
static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
{
struct kvmppc_xics *xics;
struct kvm *kvm = dev->kvm;
- int ret = 0;
- xics = kzalloc(sizeof(*xics), GFP_KERNEL);
+ pr_devel("Creating xics for partition\n");
+
+ /* Already there ? */
+ if (kvm->arch.xics)
+ return -EEXIST;
+
+ xics = kvmppc_xics_get_device(kvm);
if (!xics)
return -ENOMEM;
dev->private = xics;
xics->dev = dev;
xics->kvm = kvm;
-
- /* Already there ? */
- if (kvm->arch.xics)
- ret = -EEXIST;
- else
- kvm->arch.xics = xics;
-
- if (ret) {
- kfree(xics);
- return ret;
- }
+ kvm->arch.xics = xics;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
if (cpu_has_feature(CPU_FTR_ARCH_206) &&
@@ -1399,7 +1449,7 @@ struct kvm_device_ops kvm_xics_ops = {
.name = "kvm-xics",
.create = kvmppc_xics_create,
.init = kvmppc_xics_init,
- .destroy = kvmppc_xics_free,
+ .release = kvmppc_xics_release,
.set_attr = xics_set_attr,
.get_attr = xics_get_attr,
.has_attr = xics_has_attr,
@@ -1415,7 +1465,7 @@ int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
return -EPERM;
if (xics->kvm != vcpu->kvm)
return -EPERM;
- if (vcpu->arch.irq_type)
+ if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
return -EBUSY;
r = kvmppc_xics_create_icp(vcpu, xcpu);
diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
index bdea91d..d0c2db0 100644
--- a/arch/powerpc/kvm/book3s_xive_native.c
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -1227,17 +1227,7 @@ static int xive_native_debug_show(struct seq_file *m, void *private)
return 0;
}
-static int xive_native_debug_open(struct inode *inode, struct file *file)
-{
- return single_open(file, xive_native_debug_show, inode->i_private);
-}
-
-static const struct file_operations xive_native_debug_fops = {
- .open = xive_native_debug_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(xive_native_debug);
static void xive_native_debugfs_init(struct kvmppc_xive *xive)
{
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 3e1c9f0..b1abcb8 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -1747,12 +1747,12 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
@@ -1773,7 +1773,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index dde7cb3..877970d 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -36,6 +36,10 @@
#define STATIC
#include <linux/decompress/mm.h>
+#define _SETUP
+#include <asm/setup.h> /* For COMMAND_LINE_SIZE */
+#undef _SETUP
+
#ifdef CONFIG_X86_5LEVEL
unsigned int __pgtable_l5_enabled;
unsigned int pgdir_shift __ro_after_init = 39;
@@ -87,8 +91,11 @@ static unsigned long get_boot_seed(void)
static bool memmap_too_large;
-/* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */
-static unsigned long long mem_limit = ULLONG_MAX;
+/*
+ * Store memory limit: MAXMEM on 64-bit and KERNEL_IMAGE_SIZE on 32-bit.
+ * It may be reduced by "mem=nn[KMG]" or "memmap=nn[KMG]" command line options.
+ */
+static u64 mem_limit;
/* Number of immovable memory regions */
static int num_immovable_mem;
@@ -131,8 +138,7 @@ enum parse_mode {
};
static int
-parse_memmap(char *p, unsigned long long *start, unsigned long long *size,
- enum parse_mode mode)
+parse_memmap(char *p, u64 *start, u64 *size, enum parse_mode mode)
{
char *oldp;
@@ -162,7 +168,7 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size,
*/
*size = 0;
} else {
- unsigned long long flags;
+ u64 flags;
/*
* efi_fake_mem=nn@ss:attr the attr specifies
@@ -201,7 +207,7 @@ static void mem_avoid_memmap(enum parse_mode mode, char *str)
while (str && (i < MAX_MEMMAP_REGIONS)) {
int rc;
- unsigned long long start, size;
+ u64 start, size;
char *k = strchr(str, ',');
if (k)
@@ -214,7 +220,7 @@ static void mem_avoid_memmap(enum parse_mode mode, char *str)
if (start == 0) {
/* Store the specified memory limit if size > 0 */
- if (size > 0)
+ if (size > 0 && size < mem_limit)
mem_limit = size;
continue;
@@ -261,15 +267,15 @@ static void parse_gb_huge_pages(char *param, char *val)
static void handle_mem_options(void)
{
char *args = (char *)get_cmd_line_ptr();
- size_t len = strlen((char *)args);
+ size_t len;
char *tmp_cmdline;
char *param, *val;
u64 mem_size;
- if (!strstr(args, "memmap=") && !strstr(args, "mem=") &&
- !strstr(args, "hugepages"))
+ if (!args)
return;
+ len = strnlen(args, COMMAND_LINE_SIZE-1);
tmp_cmdline = malloc(len + 1);
if (!tmp_cmdline)
error("Failed to allocate space for tmp_cmdline");
@@ -284,14 +290,12 @@ static void handle_mem_options(void)
while (*args) {
args = next_arg(args, ¶m, &val);
/* Stop at -- */
- if (!val && strcmp(param, "--") == 0) {
- warn("Only '--' specified in cmdline");
- goto out;
- }
+ if (!val && strcmp(param, "--") == 0)
+ break;
if (!strcmp(param, "memmap")) {
mem_avoid_memmap(PARSE_MEMMAP, val);
- } else if (strstr(param, "hugepages")) {
+ } else if (IS_ENABLED(CONFIG_X86_64) && strstr(param, "hugepages")) {
parse_gb_huge_pages(param, val);
} else if (!strcmp(param, "mem")) {
char *p = val;
@@ -300,21 +304,23 @@ static void handle_mem_options(void)
continue;
mem_size = memparse(p, &p);
if (mem_size == 0)
- goto out;
+ break;
- mem_limit = mem_size;
+ if (mem_size < mem_limit)
+ mem_limit = mem_size;
} else if (!strcmp(param, "efi_fake_mem")) {
mem_avoid_memmap(PARSE_EFI, val);
}
}
-out:
free(tmp_cmdline);
return;
}
/*
- * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T).
+ * In theory, KASLR can put the kernel anywhere in the range of [16M, MAXMEM)
+ * on 64-bit, and [16M, KERNEL_IMAGE_SIZE) on 32-bit.
+ *
* The mem_avoid array is used to store the ranges that need to be avoided
* when KASLR searches for an appropriate random address. We must avoid any
* regions that are unsafe to overlap with during decompression, and other
@@ -392,8 +398,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
{
unsigned long init_size = boot_params->hdr.init_size;
u64 initrd_start, initrd_size;
- u64 cmd_line, cmd_line_size;
- char *ptr;
+ unsigned long cmd_line, cmd_line_size;
/*
* Avoid the region that is unsafe to overlap during
@@ -414,16 +419,15 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
/* No need to set mapping for initrd, it will be handled in VO. */
/* Avoid kernel command line. */
- cmd_line = (u64)boot_params->ext_cmd_line_ptr << 32;
- cmd_line |= boot_params->hdr.cmd_line_ptr;
+ cmd_line = get_cmd_line_ptr();
/* Calculate size of cmd_line. */
- ptr = (char *)(unsigned long)cmd_line;
- for (cmd_line_size = 0; ptr[cmd_line_size++];)
- ;
- mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line;
- mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size;
- add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start,
- mem_avoid[MEM_AVOID_CMDLINE].size);
+ if (cmd_line) {
+ cmd_line_size = strnlen((char *)cmd_line, COMMAND_LINE_SIZE-1) + 1;
+ mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line;
+ mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size;
+ add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start,
+ mem_avoid[MEM_AVOID_CMDLINE].size);
+ }
/* Avoid boot parameters. */
mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params;
@@ -454,7 +458,7 @@ static bool mem_avoid_overlap(struct mem_vector *img,
{
int i;
struct setup_data *ptr;
- unsigned long earliest = img->start + img->size;
+ u64 earliest = img->start + img->size;
bool is_overlapping = false;
for (i = 0; i < MEM_AVOID_MAX; i++) {
@@ -499,18 +503,16 @@ static bool mem_avoid_overlap(struct mem_vector *img,
}
struct slot_area {
- unsigned long addr;
- int num;
+ u64 addr;
+ unsigned long num;
};
#define MAX_SLOT_AREA 100
static struct slot_area slot_areas[MAX_SLOT_AREA];
-
+static unsigned int slot_area_index;
static unsigned long slot_max;
-static unsigned long slot_area_index;
-
static void store_slot_info(struct mem_vector *region, unsigned long image_size)
{
struct slot_area slot_area;
@@ -519,13 +521,10 @@ static void store_slot_info(struct mem_vector *region, unsigned long image_size)
return;
slot_area.addr = region->start;
- slot_area.num = (region->size - image_size) /
- CONFIG_PHYSICAL_ALIGN + 1;
+ slot_area.num = 1 + (region->size - image_size) / CONFIG_PHYSICAL_ALIGN;
- if (slot_area.num > 0) {
- slot_areas[slot_area_index++] = slot_area;
- slot_max += slot_area.num;
- }
+ slot_areas[slot_area_index++] = slot_area;
+ slot_max += slot_area.num;
}
/*
@@ -535,57 +534,53 @@ static void store_slot_info(struct mem_vector *region, unsigned long image_size)
static void
process_gb_huge_pages(struct mem_vector *region, unsigned long image_size)
{
- unsigned long addr, size = 0;
+ u64 pud_start, pud_end;
+ unsigned long gb_huge_pages;
struct mem_vector tmp;
- int i = 0;
- if (!max_gb_huge_pages) {
+ if (!IS_ENABLED(CONFIG_X86_64) || !max_gb_huge_pages) {
store_slot_info(region, image_size);
return;
}
- addr = ALIGN(region->start, PUD_SIZE);
- /* Did we raise the address above the passed in memory entry? */
- if (addr < region->start + region->size)
- size = region->size - (addr - region->start);
-
- /* Check how many 1GB huge pages can be filtered out: */
- while (size > PUD_SIZE && max_gb_huge_pages) {
- size -= PUD_SIZE;
- max_gb_huge_pages--;
- i++;
- }
+ /* Are there any 1GB pages in the region? */
+ pud_start = ALIGN(region->start, PUD_SIZE);
+ pud_end = ALIGN_DOWN(region->start + region->size, PUD_SIZE);
/* No good 1GB huge pages found: */
- if (!i) {
+ if (pud_start >= pud_end) {
store_slot_info(region, image_size);
return;
}
- /*
- * Skip those 'i'*1GB good huge pages, and continue checking and
- * processing the remaining head or tail part of the passed region
- * if available.
- */
-
- if (addr >= region->start + image_size) {
+ /* Check if the head part of the region is usable. */
+ if (pud_start >= region->start + image_size) {
tmp.start = region->start;
- tmp.size = addr - region->start;
+ tmp.size = pud_start - region->start;
store_slot_info(&tmp, image_size);
}
- size = region->size - (addr - region->start) - i * PUD_SIZE;
- if (size >= image_size) {
- tmp.start = addr + i * PUD_SIZE;
- tmp.size = size;
+ /* Skip the good 1GB pages. */
+ gb_huge_pages = (pud_end - pud_start) >> PUD_SHIFT;
+ if (gb_huge_pages > max_gb_huge_pages) {
+ pud_end = pud_start + (max_gb_huge_pages << PUD_SHIFT);
+ max_gb_huge_pages = 0;
+ } else {
+ max_gb_huge_pages -= gb_huge_pages;
+ }
+
+ /* Check if the tail part of the region is usable. */
+ if (region->start + region->size >= pud_end + image_size) {
+ tmp.start = pud_end;
+ tmp.size = region->start + region->size - pud_end;
store_slot_info(&tmp, image_size);
}
}
-static unsigned long slots_fetch_random(void)
+static u64 slots_fetch_random(void)
{
unsigned long slot;
- int i;
+ unsigned int i;
/* Handle case of no slots stored. */
if (slot_max == 0)
@@ -598,7 +593,7 @@ static unsigned long slots_fetch_random(void)
slot -= slot_areas[i].num;
continue;
}
- return slot_areas[i].addr + slot * CONFIG_PHYSICAL_ALIGN;
+ return slot_areas[i].addr + ((u64)slot * CONFIG_PHYSICAL_ALIGN);
}
if (i == slot_area_index)
@@ -611,49 +606,23 @@ static void __process_mem_region(struct mem_vector *entry,
unsigned long image_size)
{
struct mem_vector region, overlap;
- unsigned long start_orig, end;
- struct mem_vector cur_entry;
+ u64 region_end;
- /* On 32-bit, ignore entries entirely above our maximum. */
- if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE)
- return;
-
- /* Ignore entries entirely below our minimum. */
- if (entry->start + entry->size < minimum)
- return;
-
- /* Ignore entries above memory limit */
- end = min(entry->size + entry->start, mem_limit);
- if (entry->start >= end)
- return;
- cur_entry.start = entry->start;
- cur_entry.size = end - entry->start;
-
- region.start = cur_entry.start;
- region.size = cur_entry.size;
+ /* Enforce minimum and memory limit. */
+ region.start = max_t(u64, entry->start, minimum);
+ region_end = min(entry->start + entry->size, mem_limit);
/* Give up if slot area array is full. */
while (slot_area_index < MAX_SLOT_AREA) {
- start_orig = region.start;
-
- /* Potentially raise address to minimum location. */
- if (region.start < minimum)
- region.start = minimum;
-
/* Potentially raise address to meet alignment needs. */
region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
/* Did we raise the address above the passed in memory entry? */
- if (region.start > cur_entry.start + cur_entry.size)
+ if (region.start > region_end)
return;
/* Reduce size by any delta from the original address. */
- region.size -= region.start - start_orig;
-
- /* On 32-bit, reduce region size to fit within max size. */
- if (IS_ENABLED(CONFIG_X86_32) &&
- region.start + region.size > KERNEL_IMAGE_SIZE)
- region.size = KERNEL_IMAGE_SIZE - region.start;
+ region.size = region_end - region.start;
/* Return if region can't contain decompressed kernel */
if (region.size < image_size)
@@ -666,27 +635,19 @@ static void __process_mem_region(struct mem_vector *entry,
}
/* Store beginning of region if holds at least image_size. */
- if (overlap.start > region.start + image_size) {
- struct mem_vector beginning;
-
- beginning.start = region.start;
- beginning.size = overlap.start - region.start;
- process_gb_huge_pages(&beginning, image_size);
+ if (overlap.start >= region.start + image_size) {
+ region.size = overlap.start - region.start;
+ process_gb_huge_pages(®ion, image_size);
}
- /* Return if overlap extends to or past end of region. */
- if (overlap.start + overlap.size >= region.start + region.size)
- return;
-
/* Clip off the overlapping region and start over. */
- region.size -= overlap.start - region.start + overlap.size;
region.start = overlap.start + overlap.size;
}
}
static bool process_mem_region(struct mem_vector *region,
- unsigned long long minimum,
- unsigned long long image_size)
+ unsigned long minimum,
+ unsigned long image_size)
{
int i;
/*
@@ -709,7 +670,7 @@ static bool process_mem_region(struct mem_vector *region,
* immovable memory and @region.
*/
for (i = 0; i < num_immovable_mem; i++) {
- unsigned long long start, end, entry_end, region_end;
+ u64 start, end, entry_end, region_end;
struct mem_vector entry;
if (!mem_overlaps(region, &immovable_mem[i]))
@@ -736,8 +697,8 @@ static bool process_mem_region(struct mem_vector *region,
#ifdef CONFIG_EFI
/*
- * Returns true if mirror region found (and must have been processed
- * for slots adding)
+ * Returns true if we processed the EFI memmap, which we prefer over the E820
+ * table if it is available.
*/
static bool
process_efi_entries(unsigned long minimum, unsigned long image_size)
@@ -839,20 +800,30 @@ static void process_e820_entries(unsigned long minimum,
static unsigned long find_random_phys_addr(unsigned long minimum,
unsigned long image_size)
{
+ u64 phys_addr;
+
+ /* Bail out early if it's impossible to succeed. */
+ if (minimum + image_size > mem_limit)
+ return 0;
+
/* Check if we had too many memmaps. */
if (memmap_too_large) {
debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n");
return 0;
}
- /* Make sure minimum is aligned. */
- minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
+ if (!process_efi_entries(minimum, image_size))
+ process_e820_entries(minimum, image_size);
- if (process_efi_entries(minimum, image_size))
- return slots_fetch_random();
+ phys_addr = slots_fetch_random();
- process_e820_entries(minimum, image_size);
- return slots_fetch_random();
+ /* Perform a final check to make sure the address is in range. */
+ if (phys_addr < minimum || phys_addr + image_size > mem_limit) {
+ warn("Invalid physical address chosen!\n");
+ return 0;
+ }
+
+ return (unsigned long)phys_addr;
}
static unsigned long find_random_virt_addr(unsigned long minimum,
@@ -860,18 +831,12 @@ static unsigned long find_random_virt_addr(unsigned long minimum,
{
unsigned long slots, random_addr;
- /* Make sure minimum is aligned. */
- minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
- /* Align image_size for easy slot calculations. */
- image_size = ALIGN(image_size, CONFIG_PHYSICAL_ALIGN);
-
/*
* There are how many CONFIG_PHYSICAL_ALIGN-sized slots
* that can hold image_size within the range of minimum to
* KERNEL_IMAGE_SIZE?
*/
- slots = (KERNEL_IMAGE_SIZE - minimum - image_size) /
- CONFIG_PHYSICAL_ALIGN + 1;
+ slots = 1 + (KERNEL_IMAGE_SIZE - minimum - image_size) / CONFIG_PHYSICAL_ALIGN;
random_addr = kaslr_get_random_long("Virtual") % slots;
@@ -908,6 +873,11 @@ void choose_random_location(unsigned long input,
/* Prepare to add new identity pagetables on demand. */
initialize_identity_maps();
+ if (IS_ENABLED(CONFIG_X86_32))
+ mem_limit = KERNEL_IMAGE_SIZE;
+ else
+ mem_limit = MAXMEM;
+
/* Record the various known unsafe memory ranges. */
mem_avoid_init(input, input_size, *output);
@@ -917,6 +887,8 @@ void choose_random_location(unsigned long input,
* location:
*/
min_addr = min(*output, 512UL << 20);
+ /* Make sure minimum is aligned. */
+ min_addr = ALIGN(min_addr, CONFIG_PHYSICAL_ALIGN);
/* Walk available memory entries to find a random address. */
random_addr = find_random_phys_addr(min_addr, output_size);
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 726e264..3efce27 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -70,8 +70,8 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize);
int cmdline_find_option_bool(const char *option);
struct mem_vector {
- unsigned long long start;
- unsigned long long size;
+ u64 start;
+ u64 size;
};
#if CONFIG_RANDOMIZE_BASE
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 2901d5d..83fc9d3 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -368,6 +368,7 @@
#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */
#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */
#define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */
+#define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */
#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
#define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */
#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 0a460f2..21a8b52 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -602,9 +602,7 @@ static inline u64 xgetbv(u32 index)
{
u32 eax, edx;
- asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */
- : "=a" (eax), "=d" (edx)
- : "c" (index));
+ asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index));
return eax + ((u64)edx << 32);
}
@@ -613,8 +611,7 @@ static inline void xsetbv(u32 index, u64 value)
u32 eax = value;
u32 edx = value >> 32;
- asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */
- : : "a" (eax), "d" (edx), "c" (index));
+ asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index));
}
#endif /* _ASM_X86_FPU_INTERNAL_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5303dbc..d0f77235 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -80,13 +80,14 @@
#define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21)
#define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22)
#define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23)
-#define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24)
+#define KVM_REQ_GET_NESTED_STATE_PAGES KVM_ARCH_REQ(24)
#define KVM_REQ_APICV_UPDATE \
KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26)
#define KVM_REQ_HV_TLB_FLUSH \
KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_APF_READY KVM_ARCH_REQ(28)
+#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29)
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -860,6 +861,13 @@ struct kvm_hv {
struct kvm_hv_syndbg hv_syndbg;
};
+struct msr_bitmap_range {
+ u32 flags;
+ u32 nmsrs;
+ u32 base;
+ unsigned long *bitmap;
+};
+
enum kvm_irqchip_mode {
KVM_IRQCHIP_NONE,
KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */
@@ -961,6 +969,15 @@ struct kvm_arch {
bool guest_can_read_msr_platform_info;
bool exception_payload_enabled;
+ /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */
+ u32 user_space_msr_mask;
+
+ struct {
+ u8 count;
+ bool default_allow:1;
+ struct msr_bitmap_range ranges[16];
+ } msr_filter;
+
struct kvm_pmu_event_filter *pmu_event_filter;
struct task_struct *nx_lpage_recovery_thread;
};
@@ -1143,7 +1160,12 @@ struct kvm_x86_ops {
/* Returns actual tsc_offset set in active VMCS */
u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
- void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
+ /*
+ * Retrieve somewhat arbitrary exit information. Intended to be used
+ * only from within tracepoints to avoid VMREADs when tracing is off.
+ */
+ void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
+ u32 *exit_int_info, u32 *exit_int_info_err_code);
int (*check_intercept)(struct kvm_vcpu *vcpu,
struct x86_instruction_info *info,
@@ -1221,12 +1243,13 @@ struct kvm_x86_ops {
int (*get_msr_feature)(struct kvm_msr_entry *entry);
- bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu);
+ bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, void *insn, int insn_len);
bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
void (*migrate_timers)(struct kvm_vcpu *vcpu);
+ void (*msr_filter_changed)(struct kvm_vcpu *vcpu);
};
struct kvm_x86_nested_ops {
@@ -1238,7 +1261,7 @@ struct kvm_x86_nested_ops {
int (*set_state)(struct kvm_vcpu *vcpu,
struct kvm_nested_state __user *user_kvm_nested_state,
struct kvm_nested_state *kvm_state);
- bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
+ bool (*get_nested_state_pages)(struct kvm_vcpu *vcpu);
int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
int (*enable_evmcs)(struct kvm_vcpu *vcpu,
@@ -1612,8 +1635,8 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
unsigned long ipi_bitmap_high, u32 min,
unsigned long icr, int op_64_bit);
-void kvm_define_shared_msr(unsigned index, u32 msr);
-int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
+void kvm_define_user_return_msr(unsigned index, u32 msr);
+int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc);
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 59a3e13..5999b0b 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -234,6 +234,12 @@ static inline void clwb(volatile void *__p)
#define nop() asm volatile ("nop")
+static inline void serialize(void)
+{
+ /* Instruction opcode for SERIALIZE; supported in binutils >= 2.35. */
+ asm volatile(".byte 0xf, 0x1, 0xe8" ::: "memory");
+}
+
#endif /* __KERNEL__ */
#endif /* _ASM_X86_SPECIAL_INSNS_H */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 8a1f538..71d630b 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -3,10 +3,54 @@
#define __SVM_H
#include <uapi/asm/svm.h>
+#include <uapi/asm/kvm.h>
+/*
+ * 32-bit intercept words in the VMCB Control Area, starting
+ * at Byte offset 000h.
+ */
+
+enum intercept_words {
+ INTERCEPT_CR = 0,
+ INTERCEPT_DR,
+ INTERCEPT_EXCEPTION,
+ INTERCEPT_WORD3,
+ INTERCEPT_WORD4,
+ INTERCEPT_WORD5,
+ MAX_INTERCEPT,
+};
enum {
- INTERCEPT_INTR,
+ /* Byte offset 000h (word 0) */
+ INTERCEPT_CR0_READ = 0,
+ INTERCEPT_CR3_READ = 3,
+ INTERCEPT_CR4_READ = 4,
+ INTERCEPT_CR8_READ = 8,
+ INTERCEPT_CR0_WRITE = 16,
+ INTERCEPT_CR3_WRITE = 16 + 3,
+ INTERCEPT_CR4_WRITE = 16 + 4,
+ INTERCEPT_CR8_WRITE = 16 + 8,
+ /* Byte offset 004h (word 1) */
+ INTERCEPT_DR0_READ = 32,
+ INTERCEPT_DR1_READ,
+ INTERCEPT_DR2_READ,
+ INTERCEPT_DR3_READ,
+ INTERCEPT_DR4_READ,
+ INTERCEPT_DR5_READ,
+ INTERCEPT_DR6_READ,
+ INTERCEPT_DR7_READ,
+ INTERCEPT_DR0_WRITE = 48,
+ INTERCEPT_DR1_WRITE,
+ INTERCEPT_DR2_WRITE,
+ INTERCEPT_DR3_WRITE,
+ INTERCEPT_DR4_WRITE,
+ INTERCEPT_DR5_WRITE,
+ INTERCEPT_DR6_WRITE,
+ INTERCEPT_DR7_WRITE,
+ /* Byte offset 008h (word 2) */
+ INTERCEPT_EXCEPTION_OFFSET = 64,
+ /* Byte offset 00Ch (word 3) */
+ INTERCEPT_INTR = 96,
INTERCEPT_NMI,
INTERCEPT_SMI,
INTERCEPT_INIT,
@@ -38,7 +82,8 @@ enum {
INTERCEPT_TASK_SWITCH,
INTERCEPT_FERR_FREEZE,
INTERCEPT_SHUTDOWN,
- INTERCEPT_VMRUN,
+ /* Byte offset 010h (word 4) */
+ INTERCEPT_VMRUN = 128,
INTERCEPT_VMMCALL,
INTERCEPT_VMLOAD,
INTERCEPT_VMSAVE,
@@ -53,15 +98,18 @@ enum {
INTERCEPT_MWAIT_COND,
INTERCEPT_XSETBV,
INTERCEPT_RDPRU,
+ /* Byte offset 014h (word 5) */
+ INTERCEPT_INVLPGB = 160,
+ INTERCEPT_INVLPGB_ILLEGAL,
+ INTERCEPT_INVPCID,
+ INTERCEPT_MCOMMIT,
+ INTERCEPT_TLBSYNC,
};
struct __attribute__ ((__packed__)) vmcb_control_area {
- u32 intercept_cr;
- u32 intercept_dr;
- u32 intercept_exceptions;
- u64 intercept;
- u8 reserved_1[40];
+ u32 intercepts[MAX_INTERCEPT];
+ u32 reserved_1[15 - MAX_INTERCEPT];
u16 pause_filter_thresh;
u16 pause_filter_count;
u64 iopm_base_pa;
@@ -150,14 +198,14 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define SVM_NESTED_CTL_NP_ENABLE BIT(0)
#define SVM_NESTED_CTL_SEV_ENABLE BIT(1)
-struct __attribute__ ((__packed__)) vmcb_seg {
+struct vmcb_seg {
u16 selector;
u16 attrib;
u32 limit;
u64 base;
-};
+} __packed;
-struct __attribute__ ((__packed__)) vmcb_save_area {
+struct vmcb_save_area {
struct vmcb_seg es;
struct vmcb_seg cs;
struct vmcb_seg ss;
@@ -200,20 +248,67 @@ struct __attribute__ ((__packed__)) vmcb_save_area {
u64 br_to;
u64 last_excp_from;
u64 last_excp_to;
-};
+ /*
+ * The following part of the save area is valid only for
+ * SEV-ES guests when referenced through the GHCB.
+ */
+ u8 reserved_7[104];
+ u64 reserved_8; /* rax already available at 0x01f8 */
+ u64 rcx;
+ u64 rdx;
+ u64 rbx;
+ u64 reserved_9; /* rsp already available at 0x01d8 */
+ u64 rbp;
+ u64 rsi;
+ u64 rdi;
+ u64 r8;
+ u64 r9;
+ u64 r10;
+ u64 r11;
+ u64 r12;
+ u64 r13;
+ u64 r14;
+ u64 r15;
+ u8 reserved_10[16];
+ u64 sw_exit_code;
+ u64 sw_exit_info_1;
+ u64 sw_exit_info_2;
+ u64 sw_scratch;
+ u8 reserved_11[56];
+ u64 xcr0;
+ u8 valid_bitmap[16];
+ u64 x87_state_gpa;
+} __packed;
+
+struct ghcb {
+ struct vmcb_save_area save;
+ u8 reserved_save[2048 - sizeof(struct vmcb_save_area)];
+
+ u8 shared_buffer[2032];
+
+ u8 reserved_1[10];
+ u16 protocol_version; /* negotiated SEV-ES/GHCB protocol version */
+ u32 ghcb_usage;
+} __packed;
+
+
+#define EXPECTED_VMCB_SAVE_AREA_SIZE 1032
+#define EXPECTED_VMCB_CONTROL_AREA_SIZE 256
+#define EXPECTED_GHCB_SIZE PAGE_SIZE
static inline void __unused_size_checks(void)
{
- BUILD_BUG_ON(sizeof(struct vmcb_save_area) != 0x298);
- BUILD_BUG_ON(sizeof(struct vmcb_control_area) != 256);
+ BUILD_BUG_ON(sizeof(struct vmcb_save_area) != EXPECTED_VMCB_SAVE_AREA_SIZE);
+ BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE);
+ BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE);
}
-struct __attribute__ ((__packed__)) vmcb {
+struct vmcb {
struct vmcb_control_area control;
u8 reserved_control[1024 - sizeof(struct vmcb_control_area)];
struct vmcb_save_area save;
-};
+} __packed;
#define SVM_CPUID_FUNC 0x8000000a
@@ -240,32 +335,6 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
#define SVM_SELECTOR_CODE_MASK (1 << 3)
-#define INTERCEPT_CR0_READ 0
-#define INTERCEPT_CR3_READ 3
-#define INTERCEPT_CR4_READ 4
-#define INTERCEPT_CR8_READ 8
-#define INTERCEPT_CR0_WRITE (16 + 0)
-#define INTERCEPT_CR3_WRITE (16 + 3)
-#define INTERCEPT_CR4_WRITE (16 + 4)
-#define INTERCEPT_CR8_WRITE (16 + 8)
-
-#define INTERCEPT_DR0_READ 0
-#define INTERCEPT_DR1_READ 1
-#define INTERCEPT_DR2_READ 2
-#define INTERCEPT_DR3_READ 3
-#define INTERCEPT_DR4_READ 4
-#define INTERCEPT_DR5_READ 5
-#define INTERCEPT_DR6_READ 6
-#define INTERCEPT_DR7_READ 7
-#define INTERCEPT_DR0_WRITE (16 + 0)
-#define INTERCEPT_DR1_WRITE (16 + 1)
-#define INTERCEPT_DR2_WRITE (16 + 2)
-#define INTERCEPT_DR3_WRITE (16 + 3)
-#define INTERCEPT_DR4_WRITE (16 + 4)
-#define INTERCEPT_DR5_WRITE (16 + 5)
-#define INTERCEPT_DR6_WRITE (16 + 6)
-#define INTERCEPT_DR7_WRITE (16 + 7)
-
#define SVM_EVTINJ_VEC_MASK 0xff
#define SVM_EVTINJ_TYPE_SHIFT 8
@@ -298,4 +367,47 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)
+/* GHCB Accessor functions */
+
+#define GHCB_BITMAP_IDX(field) \
+ (offsetof(struct vmcb_save_area, field) / sizeof(u64))
+
+#define DEFINE_GHCB_ACCESSORS(field) \
+ static inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb) \
+ { \
+ return test_bit(GHCB_BITMAP_IDX(field), \
+ (unsigned long *)&ghcb->save.valid_bitmap); \
+ } \
+ \
+ static inline void ghcb_set_##field(struct ghcb *ghcb, u64 value) \
+ { \
+ __set_bit(GHCB_BITMAP_IDX(field), \
+ (unsigned long *)&ghcb->save.valid_bitmap); \
+ ghcb->save.field = value; \
+ }
+
+DEFINE_GHCB_ACCESSORS(cpl)
+DEFINE_GHCB_ACCESSORS(rip)
+DEFINE_GHCB_ACCESSORS(rsp)
+DEFINE_GHCB_ACCESSORS(rax)
+DEFINE_GHCB_ACCESSORS(rcx)
+DEFINE_GHCB_ACCESSORS(rdx)
+DEFINE_GHCB_ACCESSORS(rbx)
+DEFINE_GHCB_ACCESSORS(rbp)
+DEFINE_GHCB_ACCESSORS(rsi)
+DEFINE_GHCB_ACCESSORS(rdi)
+DEFINE_GHCB_ACCESSORS(r8)
+DEFINE_GHCB_ACCESSORS(r9)
+DEFINE_GHCB_ACCESSORS(r10)
+DEFINE_GHCB_ACCESSORS(r11)
+DEFINE_GHCB_ACCESSORS(r12)
+DEFINE_GHCB_ACCESSORS(r13)
+DEFINE_GHCB_ACCESSORS(r14)
+DEFINE_GHCB_ACCESSORS(r15)
+DEFINE_GHCB_ACCESSORS(sw_exit_code)
+DEFINE_GHCB_ACCESSORS(sw_exit_info_1)
+DEFINE_GHCB_ACCESSORS(sw_exit_info_2)
+DEFINE_GHCB_ACCESSORS(sw_scratch)
+DEFINE_GHCB_ACCESSORS(xcr0)
+
#endif
diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h
index fdb5b35..0fd4a9d 100644
--- a/arch/x86/include/asm/sync_core.h
+++ b/arch/x86/include/asm/sync_core.h
@@ -5,6 +5,7 @@
#include <linux/preempt.h>
#include <asm/processor.h>
#include <asm/cpufeature.h>
+#include <asm/special_insns.h>
#ifdef CONFIG_X86_32
static inline void iret_to_self(void)
@@ -46,22 +47,34 @@ static inline void iret_to_self(void)
*
* b) Text was modified on a different CPU, may subsequently be
* executed on this CPU, and you want to make sure the new version
- * gets executed. This generally means you're calling this in a IPI.
+ * gets executed. This generally means you're calling this in an IPI.
*
* If you're calling this for a different reason, you're probably doing
* it wrong.
+ *
+ * Like all of Linux's memory ordering operations, this is a
+ * compiler barrier as well.
*/
static inline void sync_core(void)
{
/*
- * There are quite a few ways to do this. IRET-to-self is nice
- * because it works on every CPU, at any CPL (so it's compatible
- * with paravirtualization), and it never exits to a hypervisor.
- * The only down sides are that it's a bit slow (it seems to be
- * a bit more than 2x slower than the fastest options) and that
- * it unmasks NMIs. The "push %cs" is needed because, in
- * paravirtual environments, __KERNEL_CS may not be a valid CS
- * value when we do IRET directly.
+ * The SERIALIZE instruction is the most straightforward way to
+ * do this, but it is not universally available.
+ */
+ if (static_cpu_has(X86_FEATURE_SERIALIZE)) {
+ serialize();
+ return;
+ }
+
+ /*
+ * For all other processors, there are quite a few ways to do this.
+ * IRET-to-self is nice because it works on every CPU, at any CPL
+ * (so it's compatible with paravirtualization), and it never exits
+ * to a hypervisor. The only downsides are that it's a bit slow
+ * (it seems to be a bit more than 2x slower than the fastest
+ * options) and that it unmasks NMIs. The "push %cs" is needed,
+ * because in paravirtual environments __KERNEL_CS may not be a
+ * valid CS value when we do IRET directly.
*
* In case NMI unmasking or performance ever becomes a problem,
* the next best option appears to be MOV-to-CR2 and an
@@ -71,9 +84,6 @@ static inline void sync_core(void)
* CPUID is the conventional way, but it's nasty: it doesn't
* exist on some 486-like CPUs, and it usually exits to a
* hypervisor.
- *
- * Like all of Linux's memory ordering operations, this is a
- * compiler barrier as well.
*/
iret_to_self();
}
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index cd7de4b..f8ba528 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -52,7 +52,7 @@
#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES VMCS_CONTROL_BIT(VIRT_APIC_ACCESSES)
#define SECONDARY_EXEC_ENABLE_EPT VMCS_CONTROL_BIT(EPT)
#define SECONDARY_EXEC_DESC VMCS_CONTROL_BIT(DESC_EXITING)
-#define SECONDARY_EXEC_RDTSCP VMCS_CONTROL_BIT(RDTSCP)
+#define SECONDARY_EXEC_ENABLE_RDTSCP VMCS_CONTROL_BIT(RDTSCP)
#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE VMCS_CONTROL_BIT(VIRTUAL_X2APIC)
#define SECONDARY_EXEC_ENABLE_VPID VMCS_CONTROL_BIT(VPID)
#define SECONDARY_EXEC_WBINVD_EXITING VMCS_CONTROL_BIT(WBINVD_EXITING)
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 0780f97..89e5f3d 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -192,6 +192,26 @@ struct kvm_msr_list {
__u32 indices[0];
};
+/* Maximum size of any access bitmap in bytes */
+#define KVM_MSR_FILTER_MAX_BITMAP_SIZE 0x600
+
+/* for KVM_X86_SET_MSR_FILTER */
+struct kvm_msr_filter_range {
+#define KVM_MSR_FILTER_READ (1 << 0)
+#define KVM_MSR_FILTER_WRITE (1 << 1)
+ __u32 flags;
+ __u32 nmsrs; /* number of msrs in bitmap */
+ __u32 base; /* MSR index the bitmap starts at */
+ __u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */
+};
+
+#define KVM_MSR_FILTER_MAX_RANGES 16
+struct kvm_msr_filter {
+#define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0)
+#define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0)
+ __u32 flags;
+ struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES];
+};
struct kvm_cpuid_entry {
__u32 function;
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index 2e8a30f..522d42d 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -76,6 +76,7 @@
#define SVM_EXIT_MWAIT_COND 0x08c
#define SVM_EXIT_XSETBV 0x08d
#define SVM_EXIT_RDPRU 0x08e
+#define SVM_EXIT_INVPCID 0x0a2
#define SVM_EXIT_NPF 0x400
#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401
#define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402
@@ -171,6 +172,7 @@
{ SVM_EXIT_MONITOR, "monitor" }, \
{ SVM_EXIT_MWAIT, "mwait" }, \
{ SVM_EXIT_XSETBV, "xsetbv" }, \
+ { SVM_EXIT_INVPCID, "invpcid" }, \
{ SVM_EXIT_NPF, "npf" }, \
{ SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \
{ SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" }, \
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 08320b0..42c6e0d 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -270,9 +270,8 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
{
struct pt_regs *old_regs = set_irq_regs(regs);
u32 token;
- irqentry_state_t state;
- state = irqentry_enter(regs);
+ ack_APIC_irq();
inc_irq_stat(irq_hv_callback_count);
@@ -283,7 +282,6 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1);
}
- irqentry_exit(regs, state);
set_irq_regs(old_regs);
}
@@ -954,7 +952,7 @@ void arch_haltpoll_disable(unsigned int cpu)
if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
return;
- /* Enable guest halt poll disables host halt poll */
+ /* Disable guest halt poll enables host halt poll */
smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1);
}
EXPORT_SYMBOL_GPL(arch_haltpoll_disable);
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
index 8d5cbe1..2c304fd 100644
--- a/arch/x86/kernel/umip.c
+++ b/arch/x86/kernel/umip.c
@@ -45,11 +45,12 @@
* value that, lies close to the top of the kernel memory. The limit for the GDT
* and the IDT are set to zero.
*
- * Given that SLDT and STR are not commonly used in programs that run on WineHQ
- * or DOSEMU2, they are not emulated.
- *
- * The instruction smsw is emulated to return the value that the register CR0
+ * The instruction SMSW is emulated to return the value that the register CR0
* has at boot time as set in the head_32.
+ * SLDT and STR are emulated to return the values that the kernel programmatically
+ * assigns:
+ * - SLDT returns (GDT_ENTRY_LDT * 8) if an LDT has been set, 0 if not.
+ * - STR returns (GDT_ENTRY_TSS * 8).
*
* Emulation is provided for both 32-bit and 64-bit processes.
*
@@ -244,16 +245,34 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst,
*data_size += UMIP_GDT_IDT_LIMIT_SIZE;
memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE);
- } else if (umip_inst == UMIP_INST_SMSW) {
- unsigned long dummy_value = CR0_STATE;
+ } else if (umip_inst == UMIP_INST_SMSW || umip_inst == UMIP_INST_SLDT ||
+ umip_inst == UMIP_INST_STR) {
+ unsigned long dummy_value;
+
+ if (umip_inst == UMIP_INST_SMSW) {
+ dummy_value = CR0_STATE;
+ } else if (umip_inst == UMIP_INST_STR) {
+ dummy_value = GDT_ENTRY_TSS * 8;
+ } else if (umip_inst == UMIP_INST_SLDT) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ down_read(¤t->mm->context.ldt_usr_sem);
+ if (current->mm->context.ldt)
+ dummy_value = GDT_ENTRY_LDT * 8;
+ else
+ dummy_value = 0;
+ up_read(¤t->mm->context.ldt_usr_sem);
+#else
+ dummy_value = 0;
+#endif
+ }
/*
- * Even though the CR0 register has 4 bytes, the number
+ * For these 3 instructions, the number
* of bytes to be copied in the result buffer is determined
* by whether the operand is a register or a memory location.
* If operand is a register, return as many bytes as the operand
* size. If operand is memory, return only the two least
- * siginificant bytes of CR0.
+ * siginificant bytes.
*/
if (X86_MODRM_MOD(insn->modrm.value) == 3)
*data_size = insn->opnd_bytes;
@@ -261,7 +280,6 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst,
*data_size = 2;
memcpy(data, &dummy_value, *data_size);
- /* STR and SLDT are not emulated */
} else {
return -EINVAL;
}
@@ -383,10 +401,6 @@ bool fixup_umip_exception(struct pt_regs *regs)
umip_pr_warn(regs, "%s instruction cannot be used by applications.\n",
umip_insns[umip_inst]);
- /* Do not emulate (spoof) SLDT or STR. */
- if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT)
- return false;
-
umip_pr_warn(regs, "For now, expensive software emulation returns the result.\n");
if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size,
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 4a3081e..7f86a14 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -17,7 +17,8 @@
i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o
-kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o
+kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
+ vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 3fd6eec..37c3668 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -186,7 +186,6 @@ int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
not_found:
return 36;
}
-EXPORT_SYMBOL_GPL(cpuid_query_maxphyaddr);
/* when an old userspace process fills a new kernel module */
int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
@@ -371,7 +370,7 @@ void kvm_set_cpu_caps(void)
F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
- F(SERIALIZE)
+ F(SERIALIZE) | F(TSXLDTRK)
);
/* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 3a923ae..1d2c4f2 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -34,6 +34,11 @@ static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
return vcpu->arch.maxphyaddr;
}
+static inline bool kvm_vcpu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu)));
+}
+
struct cpuid_reg {
u32 function;
u32 index;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5299ef5..0cc0db5 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2505,9 +2505,14 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
*reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4);
val = GET_SMSTATE(u32, smstate, 0x7fcc);
- ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
+
+ if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1))
+ return X86EMUL_UNHANDLEABLE;
+
val = GET_SMSTATE(u32, smstate, 0x7fc8);
- ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
+
+ if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1))
+ return X86EMUL_UNHANDLEABLE;
selector = GET_SMSTATE(u32, smstate, 0x7fc4);
set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f64));
@@ -2560,16 +2565,23 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED;
val = GET_SMSTATE(u32, smstate, 0x7f68);
- ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
+
+ if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1))
+ return X86EMUL_UNHANDLEABLE;
+
val = GET_SMSTATE(u32, smstate, 0x7f60);
- ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
+
+ if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1))
+ return X86EMUL_UNHANDLEABLE;
cr0 = GET_SMSTATE(u64, smstate, 0x7f58);
cr3 = GET_SMSTATE(u64, smstate, 0x7f50);
cr4 = GET_SMSTATE(u64, smstate, 0x7f48);
ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7f00));
val = GET_SMSTATE(u64, smstate, 0x7ed0);
- ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA);
+
+ if (ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA))
+ return X86EMUL_UNHANDLEABLE;
selector = GET_SMSTATE(u32, smstate, 0x7e90);
rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e92) << 8);
@@ -3594,7 +3606,7 @@ static int em_rdpid(struct x86_emulate_ctxt *ctxt)
u64 tsc_aux = 0;
if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux))
- return emulate_gp(ctxt, 0);
+ return emulate_ud(ctxt);
ctxt->dst.val = tsc_aux;
return X86EMUL_CONTINUE;
}
@@ -3689,11 +3701,18 @@ static int em_dr_write(struct x86_emulate_ctxt *ctxt)
static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
{
+ u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX);
u64 msr_data;
+ int r;
msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX)
| ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
- if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data))
+ r = ctxt->ops->set_msr(ctxt, msr_index, msr_data);
+
+ if (r == X86EMUL_IO_NEEDED)
+ return r;
+
+ if (r)
return emulate_gp(ctxt, 0);
return X86EMUL_CONTINUE;
@@ -3701,9 +3720,16 @@ static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
{
+ u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX);
u64 msr_data;
+ int r;
- if (ctxt->ops->get_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &msr_data))
+ r = ctxt->ops->get_msr(ctxt, msr_index, &msr_data);
+
+ if (r == X86EMUL_IO_NEEDED)
+ return r;
+
+ if (r)
return emulate_gp(ctxt, 0);
*reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data;
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 1d33056..67a4f60 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -633,6 +633,11 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
{
union hv_stimer_config new_config = {.as_uint64 = config},
old_config = {.as_uint64 = stimer->config.as_uint64};
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+
+ if (!synic->active && !host)
+ return 1;
trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id,
stimer->index, config, host);
@@ -652,6 +657,12 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
bool host)
{
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+
+ if (!synic->active && !host)
+ return 1;
+
trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id,
stimer->index, count, host);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 35cca2e..105e785 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -310,6 +310,12 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
}
+static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val)
+{
+ kvm_lapic_set_reg(apic, APIC_DFR, val);
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+}
+
static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
{
return ((id >> 4) << 16) | (1 << (id & 0xf));
@@ -488,6 +494,12 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
}
}
+void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec)
+{
+ apic_clear_irr(vec, vcpu->arch.apic);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_clear_irr);
+
static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
{
struct kvm_vcpu *vcpu;
@@ -1576,9 +1588,6 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
struct kvm_lapic *apic = vcpu->arch.apic;
u64 guest_tsc, tsc_deadline;
- if (apic->lapic_timer.expired_tscdeadline == 0)
- return;
-
tsc_deadline = apic->lapic_timer.expired_tscdeadline;
apic->lapic_timer.expired_tscdeadline = 0;
guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
@@ -1593,7 +1602,10 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
{
- if (lapic_timer_int_injected(vcpu))
+ if (lapic_in_kernel(vcpu) &&
+ vcpu->arch.apic->lapic_timer.expired_tscdeadline &&
+ vcpu->arch.apic->lapic_timer.timer_advance_ns &&
+ lapic_timer_int_injected(vcpu))
__kvm_wait_lapic_expire(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire);
@@ -1629,14 +1641,15 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn)
}
if (kvm_use_posted_timer_interrupt(apic->vcpu)) {
- if (apic->lapic_timer.timer_advance_ns)
- __kvm_wait_lapic_expire(vcpu);
+ kvm_wait_lapic_expire(vcpu);
kvm_apic_inject_pending_timer_irqs(apic);
return;
}
atomic_inc(&apic->lapic_timer.pending);
- kvm_set_pending_timer(vcpu);
+ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+ if (from_timer_fn)
+ kvm_vcpu_kick(vcpu);
}
static void start_sw_tscdeadline(struct kvm_lapic *apic)
@@ -1984,10 +1997,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
break;
case APIC_DFR:
- if (!apic_x2apic_mode(apic)) {
- kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
- atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
- } else
+ if (!apic_x2apic_mode(apic))
+ kvm_apic_set_dfr(apic, val | 0x0FFFFFFF);
+ else
ret = 1;
break;
@@ -2183,8 +2195,7 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!lapic_in_kernel(vcpu) ||
- !apic_lvtt_tscdeadline(apic))
+ if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic))
return 0;
return apic->lapic_timer.tscdeadline;
@@ -2194,8 +2205,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!kvm_apic_present(vcpu) || apic_lvtt_oneshot(apic) ||
- apic_lvtt_period(apic))
+ if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic))
return;
hrtimer_cancel(&apic->lapic_timer.timer);
@@ -2303,7 +2313,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
- kvm_lapic_set_reg(apic, APIC_DFR, 0xffffffffU);
+ kvm_apic_set_dfr(apic, 0xffffffffU);
apic_set_spiv(apic, 0xff);
kvm_lapic_set_reg(apic, APIC_TASKPRI, 0);
if (!apic_x2apic_mode(apic))
@@ -2461,6 +2471,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
__apic_update_ppr(apic, &ppr);
return apic_has_interrupt_for_ppr(apic, ppr);
}
+EXPORT_SYMBOL_GPL(kvm_apic_has_interrupt);
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
{
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 754f29b..4fb86e3 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -89,6 +89,7 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int shorthand, unsigned int dest, int dest_mode);
int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
+void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec);
bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr);
bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr);
void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 5efc608..9c4a9c8 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -155,11 +155,6 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu)
return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
}
-static inline bool kvm_mmu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
- return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu)));
-}
-
/*
* Check if a given access (described through the I/D, W/R and U/S bits of a
* page fault error code pfec) causes a permission fault with the given PTE
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 43fdb0c..32e0e5c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -198,17 +198,20 @@ module_param(dbg, bool, 0644);
#define PTE_LIST_EXT 3
/*
- * Return values of handle_mmio_page_fault and mmu.page_fault:
+ * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault().
+ *
* RET_PF_RETRY: let CPU fault again on the address.
* RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
- *
- * For handle_mmio_page_fault only:
* RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
+ * RET_PF_FIXED: The faulting entry has been fixed.
+ * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
*/
enum {
RET_PF_RETRY = 0,
- RET_PF_EMULATE = 1,
- RET_PF_INVALID = 2,
+ RET_PF_EMULATE,
+ RET_PF_INVALID,
+ RET_PF_FIXED,
+ RET_PF_SPURIOUS,
};
struct pte_list_desc {
@@ -521,7 +524,7 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
struct x86_exception *exception)
{
/* Check if guest physical address doesn't exceed guest maximum */
- if (kvm_mmu_is_illegal_gpa(vcpu, gpa)) {
+ if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) {
exception->error_code |= PFERR_RSVD_MASK;
return UNMAPPED_GVA;
}
@@ -2469,7 +2472,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
}
if (sp->unsync_children)
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
__clear_sp_write_flooding_count(sp);
@@ -2615,8 +2618,9 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
}
}
-static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
- u64 *spte)
+/* Returns the number of zapped non-leaf child shadow pages. */
+static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+ u64 *spte, struct list_head *invalid_list)
{
u64 pte;
struct kvm_mmu_page *child;
@@ -2630,23 +2634,34 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
} else {
child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, spte);
+
+ /*
+ * Recursively zap nested TDP SPs, parentless SPs are
+ * unlikely to be used again in the near future. This
+ * avoids retaining a large number of stale nested SPs.
+ */
+ if (tdp_enabled && invalid_list &&
+ child->role.guest_mode && !child->parent_ptes.val)
+ return kvm_mmu_prepare_zap_page(kvm, child,
+ invalid_list);
}
- return true;
- }
-
- if (is_mmio_spte(pte))
+ } else if (is_mmio_spte(pte)) {
mmu_spte_clear_no_track(spte);
-
- return false;
+ }
+ return 0;
}
-static void kvm_mmu_page_unlink_children(struct kvm *kvm,
- struct kvm_mmu_page *sp)
+static int kvm_mmu_page_unlink_children(struct kvm *kvm,
+ struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
{
+ int zapped = 0;
unsigned i;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
- mmu_page_zap_pte(kvm, sp, sp->spt + i);
+ zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);
+
+ return zapped;
}
static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -2692,7 +2707,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
trace_kvm_mmu_prepare_zap_page(sp);
++kvm->stat.mmu_shadow_zapped;
*nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
- kvm_mmu_page_unlink_children(kvm, sp);
+ *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
kvm_mmu_unlink_parents(kvm, sp);
/* Zapping children means active_mmu_pages has become unstable. */
@@ -2970,6 +2985,7 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
/* Bits which may be returned by set_spte() */
#define SET_SPTE_WRITE_PROTECTED_PT BIT(0)
#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
+#define SET_SPTE_SPURIOUS BIT(2)
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned int pte_access, int level,
@@ -3058,20 +3074,22 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte = mark_spte_for_access_track(spte);
set_pte:
- if (mmu_spte_update(sptep, spte))
+ if (*sptep == spte)
+ ret |= SET_SPTE_SPURIOUS;
+ else if (mmu_spte_update(sptep, spte))
ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
return ret;
}
static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned int pte_access, int write_fault, int level,
+ unsigned int pte_access, bool write_fault, int level,
gfn_t gfn, kvm_pfn_t pfn, bool speculative,
bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
int set_spte_ret;
- int ret = RET_PF_RETRY;
+ int ret = RET_PF_FIXED;
bool flush = false;
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
@@ -3113,6 +3131,15 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (unlikely(is_mmio_spte(*sptep)))
ret = RET_PF_EMULATE;
+ /*
+ * The fault is fully spurious if and only if the new SPTE and old SPTE
+ * are identical, and emulation is not required.
+ */
+ if ((set_spte_ret & SET_SPTE_SPURIOUS) && ret == RET_PF_FIXED) {
+ WARN_ON_ONCE(!was_rmapped);
+ return RET_PF_SPURIOUS;
+ }
+
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
trace_kvm_mmu_set_spte(level, gfn, sptep);
if (!was_rmapped && is_large_pte(*sptep))
@@ -3161,7 +3188,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
return -1;
for (i = 0; i < ret; i++, gfn++, start++) {
- mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
+ mmu_set_spte(vcpu, start, access, false, sp->role.level, gfn,
page_to_pfn(pages[i]), true, true);
put_page(pages[i]);
}
@@ -3240,7 +3267,8 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn,
}
static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
- int max_level, kvm_pfn_t *pfnp)
+ int max_level, kvm_pfn_t *pfnp,
+ bool huge_page_disallowed, int *req_level)
{
struct kvm_memory_slot *slot;
struct kvm_lpage_info *linfo;
@@ -3248,6 +3276,8 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
kvm_pfn_t mask;
int level;
+ *req_level = PG_LEVEL_4K;
+
if (unlikely(max_level == PG_LEVEL_4K))
return PG_LEVEL_4K;
@@ -3272,7 +3302,14 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
if (level == PG_LEVEL_4K)
return level;
- level = min(level, max_level);
+ *req_level = level = min(level, max_level);
+
+ /*
+ * Enforce the iTLB multihit workaround after capturing the requested
+ * level, which will be used to do precise, accurate accounting.
+ */
+ if (huge_page_disallowed)
+ return PG_LEVEL_4K;
/*
* mmu_notifier_retry() was successful and mmu_lock is held, so
@@ -3292,7 +3329,6 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
u64 spte = *it.sptep;
if (it.level == level && level > PG_LEVEL_4K &&
- is_nx_huge_page_enabled() &&
is_shadow_present_pte(spte) &&
!is_large_pte(spte)) {
/*
@@ -3308,20 +3344,25 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
}
}
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
int map_writable, int max_level, kvm_pfn_t pfn,
- bool prefault, bool account_disallowed_nx_lpage)
+ bool prefault, bool is_tdp)
{
+ bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
+ bool write = error_code & PFERR_WRITE_MASK;
+ bool exec = error_code & PFERR_FETCH_MASK;
+ bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
struct kvm_shadow_walk_iterator it;
struct kvm_mmu_page *sp;
- int level, ret;
+ int level, req_level, ret;
gfn_t gfn = gpa >> PAGE_SHIFT;
gfn_t base_gfn = gfn;
if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
return RET_PF_RETRY;
- level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn);
+ level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
+ huge_page_disallowed, &req_level);
trace_kvm_mmu_spte_requested(gpa, level, pfn);
for_each_shadow_entry(vcpu, gpa, it) {
@@ -3329,7 +3370,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
* We cannot overwrite existing page tables with an NX
* large page, as the leaf could be executable.
*/
- disallowed_hugepage_adjust(it, gfn, &pfn, &level);
+ if (nx_huge_page_workaround_enabled)
+ disallowed_hugepage_adjust(it, gfn, &pfn, &level);
base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
if (it.level == level)
@@ -3341,7 +3383,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
it.level - 1, true, ACC_ALL);
link_shadow_page(vcpu, it.sptep, sp);
- if (account_disallowed_nx_lpage)
+ if (is_tdp && huge_page_disallowed &&
+ req_level >= it.level)
account_huge_nx_page(vcpu->kvm, sp);
}
}
@@ -3349,6 +3392,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
write, level, base_gfn, pfn, prefault,
map_writable);
+ if (ret == RET_PF_SPURIOUS)
+ return ret;
+
direct_pte_prefetch(vcpu, it.sptep);
++vcpu->stat.pf_fixed;
return ret;
@@ -3479,21 +3525,19 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte)
}
/*
- * Return value:
- * - true: let the vcpu to access on the same address again.
- * - false: let the real page fault path to fix it.
+ * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
*/
-static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- u32 error_code)
+static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ u32 error_code)
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
- bool fault_handled = false;
+ int ret = RET_PF_INVALID;
u64 spte = 0ull;
uint retry_count = 0;
if (!page_fault_can_be_fast(error_code))
- return false;
+ return ret;
walk_shadow_page_lockless_begin(vcpu);
@@ -3519,7 +3563,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* they are always ACC_ALL.
*/
if (is_access_allowed(error_code, spte)) {
- fault_handled = true;
+ ret = RET_PF_SPURIOUS;
break;
}
@@ -3562,11 +3606,11 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* since the gfn is not stable for indirect shadow page. See
* Documentation/virt/kvm/locking.rst to get more detail.
*/
- fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
- iterator.sptep, spte,
- new_spte);
- if (fault_handled)
+ if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte,
+ new_spte)) {
+ ret = RET_PF_FIXED;
break;
+ }
if (++retry_count > 4) {
printk_once(KERN_WARNING
@@ -3577,10 +3621,10 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
} while (true);
trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
- spte, fault_handled);
+ spte, ret);
walk_shadow_page_lockless_end(vcpu);
- return fault_handled;
+ return ret;
}
static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
@@ -3603,6 +3647,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
ulong roots_to_free)
{
+ struct kvm *kvm = vcpu->kvm;
int i;
LIST_HEAD(invalid_list);
bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
@@ -3620,22 +3665,21 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return;
}
- spin_lock(&vcpu->kvm->mmu_lock);
+ spin_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
- mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
+ mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa,
&invalid_list);
if (free_active_root) {
if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
(mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
- mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
- &invalid_list);
+ mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list);
} else {
for (i = 0; i < 4; ++i)
if (mmu->pae_root[i] != 0)
- mmu_free_root_page(vcpu->kvm,
+ mmu_free_root_page(kvm,
&mmu->pae_root[i],
&invalid_list);
mmu->root_hpa = INVALID_PAGE;
@@ -3643,8 +3687,8 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
mmu->root_pgd = 0;
}
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ spin_unlock(&kvm->mmu_lock);
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
@@ -4080,8 +4124,6 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
bool prefault, int max_level, bool is_tdp)
{
bool write = error_code & PFERR_WRITE_MASK;
- bool exec = error_code & PFERR_FETCH_MASK;
- bool lpage_disallowed = exec && is_nx_huge_page_enabled();
bool map_writable;
gfn_t gfn = gpa >> PAGE_SHIFT;
@@ -4092,16 +4134,14 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
if (page_fault_handle_page_track(vcpu, error_code, gfn))
return RET_PF_EMULATE;
- if (fast_page_fault(vcpu, gpa, error_code))
- return RET_PF_RETRY;
+ r = fast_page_fault(vcpu, gpa, error_code);
+ if (r != RET_PF_INVALID)
+ return r;
r = mmu_topup_memory_caches(vcpu, false);
if (r)
return r;
- if (lpage_disallowed)
- max_level = PG_LEVEL_4K;
-
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
@@ -4118,8 +4158,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
r = make_mmu_pages_available(vcpu);
if (r)
goto out_unlock;
- r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn,
- prefault, is_tdp && lpage_disallowed);
+ r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn,
+ prefault, is_tdp);
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -5400,7 +5440,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
entry = *spte;
- mmu_page_zap_pte(vcpu->kvm, sp, spte);
+ mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
if (gentry &&
!((sp->role.word ^ base_role) & ~role_ign.word) &&
rmap_can_add(vcpu))
@@ -5450,13 +5490,14 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
if (r == RET_PF_INVALID) {
r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
lower_32_bits(error_code), false);
- WARN_ON(r == RET_PF_INVALID);
+ if (WARN_ON_ONCE(r == RET_PF_INVALID))
+ return -EIO;
}
- if (r == RET_PF_RETRY)
- return 1;
if (r < 0)
return r;
+ if (r != RET_PF_EMULATE)
+ return 1;
/*
* Before emulating the instruction, check if the error code
@@ -5485,18 +5526,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
emulate:
- /*
- * On AMD platforms, under certain conditions insn_len may be zero on #NPF.
- * This can happen if a guest gets a page-fault on data access but the HW
- * table walker is not able to read the instruction page (e.g instruction
- * page is not present in memory). In those cases we simply restart the
- * guest, with the exception of AMD Erratum 1096 which is unrecoverable.
- */
- if (unlikely(insn && !insn_len)) {
- if (!kvm_x86_ops.need_emulation_on_page_fault(vcpu))
- return 1;
- }
-
return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
insn_len);
}
@@ -5682,11 +5711,17 @@ static void free_mmu_pages(struct kvm_mmu *mmu)
free_page((unsigned long)mmu->lm_root);
}
-static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
{
struct page *page;
int i;
+ mmu->root_hpa = INVALID_PAGE;
+ mmu->root_pgd = 0;
+ mmu->translate_gpa = translate_gpa;
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+
/*
* When using PAE paging, the four PDPTEs are treated as 'root' pages,
* while the PDP table is a per-vCPU construct that's allocated at MMU
@@ -5712,7 +5747,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
- uint i;
int ret;
vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
@@ -5726,25 +5760,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
vcpu->arch.mmu = &vcpu->arch.root_mmu;
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
- vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
- vcpu->arch.root_mmu.root_pgd = 0;
- vcpu->arch.root_mmu.translate_gpa = translate_gpa;
- for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
-
- vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
- vcpu->arch.guest_mmu.root_pgd = 0;
- vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
- for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
-
vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
- ret = alloc_mmu_pages(vcpu, &vcpu->arch.guest_mmu);
+ ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
if (ret)
return ret;
- ret = alloc_mmu_pages(vcpu, &vcpu->arch.root_mmu);
+ ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
if (ret)
goto fail_allocate_root;
@@ -6357,7 +6379,10 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
- while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
+ for ( ; to_zap; --to_zap) {
+ if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
+ break;
+
/*
* We use a separate list instead of just using active_mmu_pages
* because the number of lpage_disallowed pages is expected to
@@ -6370,12 +6395,12 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
WARN_ON_ONCE(sp->lpage_disallowed);
- if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
kvm_mmu_commit_zap_page(kvm, &invalid_list);
- if (to_zap)
- cond_resched_lock(&kvm->mmu_lock);
+ cond_resched_lock(&kvm->mmu_lock);
}
}
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, rcu_idx);
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index 9d15bc0..2080f9c 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -244,14 +244,11 @@ TRACE_EVENT(
__entry->access)
);
-#define __spte_satisfied(__spte) \
- (__entry->retry && is_writable_pte(__entry->__spte))
-
TRACE_EVENT(
fast_page_fault,
TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code,
- u64 *sptep, u64 old_spte, bool retry),
- TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, retry),
+ u64 *sptep, u64 old_spte, int ret),
+ TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, ret),
TP_STRUCT__entry(
__field(int, vcpu_id)
@@ -260,7 +257,7 @@ TRACE_EVENT(
__field(u64 *, sptep)
__field(u64, old_spte)
__field(u64, new_spte)
- __field(bool, retry)
+ __field(int, ret)
),
TP_fast_assign(
@@ -270,7 +267,7 @@ TRACE_EVENT(
__entry->sptep = sptep;
__entry->old_spte = old_spte;
__entry->new_spte = *sptep;
- __entry->retry = retry;
+ __entry->ret = ret;
),
TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx"
@@ -278,7 +275,7 @@ TRACE_EVENT(
__entry->cr2_or_gpa, __print_flags(__entry->error_code, "|",
kvm_mmu_trace_pferr_flags), __entry->sptep,
__entry->old_spte, __entry->new_spte,
- __spte_satisfied(old_spte), __spte_satisfied(new_spte)
+ __entry->ret == RET_PF_SPURIOUS, __entry->ret == RET_PF_FIXED
)
);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 4dd6b1e..9a1a15f 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -550,7 +550,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
* we call mmu_set_spte() with host_writable = true because
* pte_prefetch_gfn_to_pfn always gets a writable pfn.
*/
- mmu_set_spte(vcpu, spte, pte_access, 0, PG_LEVEL_4K, gfn, pfn,
+ mmu_set_spte(vcpu, spte, pte_access, false, PG_LEVEL_4K, gfn, pfn,
true, true);
kvm_release_pfn_clean(pfn);
@@ -625,15 +625,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
* emulate this operation, return 1 to indicate this case.
*/
static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
- struct guest_walker *gw,
- int write_fault, int max_level,
- kvm_pfn_t pfn, bool map_writable, bool prefault,
- bool lpage_disallowed)
+ struct guest_walker *gw, u32 error_code,
+ int max_level, kvm_pfn_t pfn, bool map_writable,
+ bool prefault)
{
+ bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
+ bool write_fault = error_code & PFERR_WRITE_MASK;
+ bool exec = error_code & PFERR_FETCH_MASK;
+ bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
unsigned direct_access, access = gw->pt_access;
- int top_level, hlevel, ret;
+ int top_level, level, req_level, ret;
gfn_t base_gfn = gw->gfn;
direct_access = gw->pte_access;
@@ -679,7 +682,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
link_shadow_page(vcpu, it.sptep, sp);
}
- hlevel = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn);
+ level = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn,
+ huge_page_disallowed, &req_level);
trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
@@ -690,10 +694,11 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
* We cannot overwrite existing page tables with an NX
* large page, as the leaf could be executable.
*/
- disallowed_hugepage_adjust(it, gw->gfn, &pfn, &hlevel);
+ if (nx_huge_page_workaround_enabled)
+ disallowed_hugepage_adjust(it, gw->gfn, &pfn, &level);
base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
- if (it.level == hlevel)
+ if (it.level == level)
break;
validate_direct_spte(vcpu, it.sptep, direct_access);
@@ -704,13 +709,16 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
it.level - 1, true, direct_access);
link_shadow_page(vcpu, it.sptep, sp);
- if (lpage_disallowed)
+ if (huge_page_disallowed && req_level >= it.level)
account_huge_nx_page(vcpu->kvm, sp);
}
}
ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
it.level, base_gfn, pfn, prefault, map_writable);
+ if (ret == RET_PF_SPURIOUS)
+ return ret;
+
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
++vcpu->stat.pf_fixed;
return ret;
@@ -738,7 +746,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
*/
static bool
FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
- struct guest_walker *walker, int user_fault,
+ struct guest_walker *walker, bool user_fault,
bool *write_fault_to_shadow_pgtable)
{
int level;
@@ -776,15 +784,13 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
bool prefault)
{
- int write_fault = error_code & PFERR_WRITE_MASK;
- int user_fault = error_code & PFERR_USER_MASK;
+ bool write_fault = error_code & PFERR_WRITE_MASK;
+ bool user_fault = error_code & PFERR_USER_MASK;
struct guest_walker walker;
int r;
kvm_pfn_t pfn;
unsigned long mmu_seq;
bool map_writable, is_self_change_mapping;
- bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
- is_nx_huge_page_enabled();
int max_level;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
@@ -825,7 +831,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
&walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
- if (lpage_disallowed || is_self_change_mapping)
+ if (is_self_change_mapping)
max_level = PG_LEVEL_4K;
else
max_level = walker.level;
@@ -869,8 +875,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
r = make_mmu_pages_available(vcpu);
if (r)
goto out_unlock;
- r = FNAME(fetch)(vcpu, addr, &walker, write_fault, max_level, pfn,
- map_writable, prefault, lpage_disallowed);
+ r = FNAME(fetch)(vcpu, addr, &walker, error_code, max_level, pfn,
+ map_writable, prefault);
kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
out_unlock:
@@ -895,6 +901,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
+ u64 old_spte;
int level;
u64 *sptep;
@@ -917,7 +924,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
sptep = iterator.sptep;
sp = sptep_to_sp(sptep);
- if (is_last_spte(*sptep, level)) {
+ old_spte = *sptep;
+ if (is_last_spte(old_spte, level)) {
pt_element_t gpte;
gpa_t pte_gpa;
@@ -927,7 +935,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
pte_gpa = FNAME(get_level1_sp_gpa)(sp);
pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
- if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
+ mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL);
+ if (is_shadow_present_pte(old_spte))
kvm_flush_remote_tlbs_with_address(vcpu->kvm,
sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index ac830cd..f73f84d 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -153,20 +153,18 @@ int avic_vm_init(struct kvm *kvm)
return 0;
/* Allocating physical APIC ID table (4KB) */
- p_page = alloc_page(GFP_KERNEL_ACCOUNT);
+ p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!p_page)
goto free_avic;
kvm_svm->avic_physical_id_table_page = p_page;
- clear_page(page_address(p_page));
/* Allocating logical APIC ID table (4KB) */
- l_page = alloc_page(GFP_KERNEL_ACCOUNT);
+ l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!l_page)
goto free_avic;
kvm_svm->avic_logical_id_table_page = l_page;
- clear_page(page_address(l_page));
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
again:
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index fb68467..ba50ff6 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -98,6 +98,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
void recalc_intercepts(struct vcpu_svm *svm)
{
struct vmcb_control_area *c, *h, *g;
+ unsigned int i;
vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
@@ -108,42 +109,37 @@ void recalc_intercepts(struct vcpu_svm *svm)
h = &svm->nested.hsave->control;
g = &svm->nested.ctl;
- svm->nested.host_intercept_exceptions = h->intercept_exceptions;
-
- c->intercept_cr = h->intercept_cr;
- c->intercept_dr = h->intercept_dr;
- c->intercept_exceptions = h->intercept_exceptions;
- c->intercept = h->intercept;
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ c->intercepts[i] = h->intercepts[i];
if (g->int_ctl & V_INTR_MASKING_MASK) {
/* We only want the cr8 intercept bits of L1 */
- c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ);
- c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE);
+ vmcb_clr_intercept(c, INTERCEPT_CR8_READ);
+ vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
/*
* Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
* affect any interrupt we may want to inject; therefore,
* interrupt window vmexits are irrelevant to L0.
*/
- c->intercept &= ~(1ULL << INTERCEPT_VINTR);
+ vmcb_clr_intercept(c, INTERCEPT_VINTR);
}
/* We don't want to see VMMCALLs from a nested guest */
- c->intercept &= ~(1ULL << INTERCEPT_VMMCALL);
+ vmcb_clr_intercept(c, INTERCEPT_VMMCALL);
- c->intercept_cr |= g->intercept_cr;
- c->intercept_dr |= g->intercept_dr;
- c->intercept_exceptions |= g->intercept_exceptions;
- c->intercept |= g->intercept;
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ c->intercepts[i] |= g->intercepts[i];
}
static void copy_vmcb_control_area(struct vmcb_control_area *dst,
struct vmcb_control_area *from)
{
- dst->intercept_cr = from->intercept_cr;
- dst->intercept_dr = from->intercept_dr;
- dst->intercept_exceptions = from->intercept_exceptions;
- dst->intercept = from->intercept;
+ unsigned int i;
+
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ dst->intercepts[i] = from->intercepts[i];
+
dst->iopm_base_pa = from->iopm_base_pa;
dst->msrpm_base_pa = from->msrpm_base_pa;
dst->tsc_offset = from->tsc_offset;
@@ -176,7 +172,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
*/
int i;
- if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+ if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
return true;
for (i = 0; i < MSRPM_OFFSETS; i++) {
@@ -200,9 +196,23 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
return true;
}
+static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ if (!nested_svm_vmrun_msrpm(svm)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return false;
+ }
+
+ return true;
+}
+
static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
{
- if ((control->intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
+ if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0)
return false;
if (control->asid == 0)
@@ -215,41 +225,39 @@ static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
return true;
}
-static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb)
+static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12)
{
- bool nested_vmcb_lma;
- if ((vmcb->save.efer & EFER_SVME) == 0)
+ bool vmcb12_lma;
+
+ if ((vmcb12->save.efer & EFER_SVME) == 0)
return false;
- if (((vmcb->save.cr0 & X86_CR0_CD) == 0) &&
- (vmcb->save.cr0 & X86_CR0_NW))
+ if (((vmcb12->save.cr0 & X86_CR0_CD) == 0) && (vmcb12->save.cr0 & X86_CR0_NW))
return false;
- if (!kvm_dr6_valid(vmcb->save.dr6) || !kvm_dr7_valid(vmcb->save.dr7))
+ if (!kvm_dr6_valid(vmcb12->save.dr6) || !kvm_dr7_valid(vmcb12->save.dr7))
return false;
- nested_vmcb_lma =
- (vmcb->save.efer & EFER_LME) &&
- (vmcb->save.cr0 & X86_CR0_PG);
+ vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG);
- if (!nested_vmcb_lma) {
- if (vmcb->save.cr4 & X86_CR4_PAE) {
- if (vmcb->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK)
+ if (!vmcb12_lma) {
+ if (vmcb12->save.cr4 & X86_CR4_PAE) {
+ if (vmcb12->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK)
return false;
} else {
- if (vmcb->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK)
+ if (vmcb12->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK)
return false;
}
} else {
- if (!(vmcb->save.cr4 & X86_CR4_PAE) ||
- !(vmcb->save.cr0 & X86_CR0_PE) ||
- (vmcb->save.cr3 & MSR_CR3_LONG_RESERVED_MASK))
+ if (!(vmcb12->save.cr4 & X86_CR4_PAE) ||
+ !(vmcb12->save.cr0 & X86_CR0_PE) ||
+ (vmcb12->save.cr3 & MSR_CR3_LONG_MBZ_MASK))
return false;
}
- if (kvm_valid_cr4(&svm->vcpu, vmcb->save.cr4))
+ if (kvm_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
return false;
- return nested_vmcb_check_controls(&vmcb->control);
+ return nested_vmcb_check_controls(&vmcb12->control);
}
static void load_nested_vmcb_control(struct vcpu_svm *svm,
@@ -296,7 +304,7 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm)
* EXIT_INT_INFO.
*/
static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
- struct vmcb *nested_vmcb)
+ struct vmcb *vmcb12)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
u32 exit_int_info = 0;
@@ -308,7 +316,7 @@ static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
if (vcpu->arch.exception.has_error_code) {
exit_int_info |= SVM_EVTINJ_VALID_ERR;
- nested_vmcb->control.exit_int_info_err =
+ vmcb12->control.exit_int_info_err =
vcpu->arch.exception.error_code;
}
@@ -325,7 +333,7 @@ static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
exit_int_info |= SVM_EVTINJ_TYPE_INTR;
}
- nested_vmcb->control.exit_int_info = exit_int_info;
+ vmcb12->control.exit_int_info = exit_int_info;
}
static inline bool nested_npt_enabled(struct vcpu_svm *svm)
@@ -364,31 +372,31 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
return 0;
}
-static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb)
+static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
{
/* Load the nested guest state */
- svm->vmcb->save.es = nested_vmcb->save.es;
- svm->vmcb->save.cs = nested_vmcb->save.cs;
- svm->vmcb->save.ss = nested_vmcb->save.ss;
- svm->vmcb->save.ds = nested_vmcb->save.ds;
- svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
- svm->vmcb->save.idtr = nested_vmcb->save.idtr;
- kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
- svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
- svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
- svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
- svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
- kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax);
- kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp);
- kvm_rip_write(&svm->vcpu, nested_vmcb->save.rip);
+ svm->vmcb->save.es = vmcb12->save.es;
+ svm->vmcb->save.cs = vmcb12->save.cs;
+ svm->vmcb->save.ss = vmcb12->save.ss;
+ svm->vmcb->save.ds = vmcb12->save.ds;
+ svm->vmcb->save.gdtr = vmcb12->save.gdtr;
+ svm->vmcb->save.idtr = vmcb12->save.idtr;
+ kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags);
+ svm_set_efer(&svm->vcpu, vmcb12->save.efer);
+ svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
+ svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
+ svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+ kvm_rax_write(&svm->vcpu, vmcb12->save.rax);
+ kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp);
+ kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
/* In case we don't even reach vcpu_run, the fields are not updated */
- svm->vmcb->save.rax = nested_vmcb->save.rax;
- svm->vmcb->save.rsp = nested_vmcb->save.rsp;
- svm->vmcb->save.rip = nested_vmcb->save.rip;
- svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
- svm->vcpu.arch.dr6 = nested_vmcb->save.dr6;
- svm->vmcb->save.cpl = nested_vmcb->save.cpl;
+ svm->vmcb->save.rax = vmcb12->save.rax;
+ svm->vmcb->save.rsp = vmcb12->save.rsp;
+ svm->vmcb->save.rip = vmcb12->save.rip;
+ svm->vmcb->save.dr7 = vmcb12->save.dr7;
+ svm->vcpu.arch.dr6 = vmcb12->save.dr6;
+ svm->vmcb->save.cpl = vmcb12->save.cpl;
}
static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
@@ -426,17 +434,17 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
vmcb_mark_all_dirty(svm->vmcb);
}
-int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
- struct vmcb *nested_vmcb)
+int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
+ struct vmcb *vmcb12)
{
int ret;
- svm->nested.vmcb = vmcb_gpa;
- load_nested_vmcb_control(svm, &nested_vmcb->control);
- nested_prepare_vmcb_save(svm, nested_vmcb);
+ svm->nested.vmcb12_gpa = vmcb12_gpa;
+ load_nested_vmcb_control(svm, &vmcb12->control);
+ nested_prepare_vmcb_save(svm, vmcb12);
nested_prepare_vmcb_control(svm);
- ret = nested_svm_load_cr3(&svm->vcpu, nested_vmcb->save.cr3,
+ ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
nested_npt_enabled(svm));
if (ret)
return ret;
@@ -449,19 +457,19 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
int nested_svm_vmrun(struct vcpu_svm *svm)
{
int ret;
- struct vmcb *nested_vmcb;
+ struct vmcb *vmcb12;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
struct kvm_host_map map;
- u64 vmcb_gpa;
+ u64 vmcb12_gpa;
if (is_smm(&svm->vcpu)) {
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
}
- vmcb_gpa = svm->vmcb->save.rax;
- ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map);
+ vmcb12_gpa = svm->vmcb->save.rax;
+ ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb12_gpa), &map);
if (ret == -EINVAL) {
kvm_inject_gp(&svm->vcpu, 0);
return 1;
@@ -471,26 +479,28 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
ret = kvm_skip_emulated_instruction(&svm->vcpu);
- nested_vmcb = map.hva;
+ vmcb12 = map.hva;
- if (!nested_vmcb_checks(svm, nested_vmcb)) {
- nested_vmcb->control.exit_code = SVM_EXIT_ERR;
- nested_vmcb->control.exit_code_hi = 0;
- nested_vmcb->control.exit_info_1 = 0;
- nested_vmcb->control.exit_info_2 = 0;
+ if (!nested_vmcb_checks(svm, vmcb12)) {
+ vmcb12->control.exit_code = SVM_EXIT_ERR;
+ vmcb12->control.exit_code_hi = 0;
+ vmcb12->control.exit_info_1 = 0;
+ vmcb12->control.exit_info_2 = 0;
goto out;
}
- trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
- nested_vmcb->save.rip,
- nested_vmcb->control.int_ctl,
- nested_vmcb->control.event_inj,
- nested_vmcb->control.nested_ctl);
+ trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
+ vmcb12->save.rip,
+ vmcb12->control.int_ctl,
+ vmcb12->control.event_inj,
+ vmcb12->control.nested_ctl);
- trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
- nested_vmcb->control.intercept_cr >> 16,
- nested_vmcb->control.intercept_exceptions,
- nested_vmcb->control.intercept);
+ trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
+ vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
+ vmcb12->control.intercepts[INTERCEPT_EXCEPTION],
+ vmcb12->control.intercepts[INTERCEPT_WORD3],
+ vmcb12->control.intercepts[INTERCEPT_WORD4],
+ vmcb12->control.intercepts[INTERCEPT_WORD5]);
/* Clear internal status */
kvm_clear_exception_queue(&svm->vcpu);
@@ -522,7 +532,7 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
svm->nested.nested_run_pending = 1;
- if (enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb))
+ if (enter_svm_guest_mode(svm, vmcb12_gpa, vmcb12))
goto out_exit_err;
if (nested_svm_vmrun_msrpm(svm))
@@ -563,75 +573,77 @@ void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
int nested_svm_vmexit(struct vcpu_svm *svm)
{
int rc;
- struct vmcb *nested_vmcb;
+ struct vmcb *vmcb12;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
struct kvm_host_map map;
- rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map);
+ rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
if (rc) {
if (rc == -EINVAL)
kvm_inject_gp(&svm->vcpu, 0);
return 1;
}
- nested_vmcb = map.hva;
+ vmcb12 = map.hva;
/* Exit Guest-Mode */
leave_guest_mode(&svm->vcpu);
- svm->nested.vmcb = 0;
+ svm->nested.vmcb12_gpa = 0;
WARN_ON_ONCE(svm->nested.nested_run_pending);
/* in case we halted in L2 */
svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;
/* Give the current vmcb to the guest */
- svm_set_gif(svm, false);
- nested_vmcb->save.es = vmcb->save.es;
- nested_vmcb->save.cs = vmcb->save.cs;
- nested_vmcb->save.ss = vmcb->save.ss;
- nested_vmcb->save.ds = vmcb->save.ds;
- nested_vmcb->save.gdtr = vmcb->save.gdtr;
- nested_vmcb->save.idtr = vmcb->save.idtr;
- nested_vmcb->save.efer = svm->vcpu.arch.efer;
- nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
- nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
- nested_vmcb->save.cr2 = vmcb->save.cr2;
- nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
- nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
- nested_vmcb->save.rip = kvm_rip_read(&svm->vcpu);
- nested_vmcb->save.rsp = kvm_rsp_read(&svm->vcpu);
- nested_vmcb->save.rax = kvm_rax_read(&svm->vcpu);
- nested_vmcb->save.dr7 = vmcb->save.dr7;
- nested_vmcb->save.dr6 = svm->vcpu.arch.dr6;
- nested_vmcb->save.cpl = vmcb->save.cpl;
+ vmcb12->save.es = vmcb->save.es;
+ vmcb12->save.cs = vmcb->save.cs;
+ vmcb12->save.ss = vmcb->save.ss;
+ vmcb12->save.ds = vmcb->save.ds;
+ vmcb12->save.gdtr = vmcb->save.gdtr;
+ vmcb12->save.idtr = vmcb->save.idtr;
+ vmcb12->save.efer = svm->vcpu.arch.efer;
+ vmcb12->save.cr0 = kvm_read_cr0(&svm->vcpu);
+ vmcb12->save.cr3 = kvm_read_cr3(&svm->vcpu);
+ vmcb12->save.cr2 = vmcb->save.cr2;
+ vmcb12->save.cr4 = svm->vcpu.arch.cr4;
+ vmcb12->save.rflags = kvm_get_rflags(&svm->vcpu);
+ vmcb12->save.rip = kvm_rip_read(&svm->vcpu);
+ vmcb12->save.rsp = kvm_rsp_read(&svm->vcpu);
+ vmcb12->save.rax = kvm_rax_read(&svm->vcpu);
+ vmcb12->save.dr7 = vmcb->save.dr7;
+ vmcb12->save.dr6 = svm->vcpu.arch.dr6;
+ vmcb12->save.cpl = vmcb->save.cpl;
- nested_vmcb->control.int_state = vmcb->control.int_state;
- nested_vmcb->control.exit_code = vmcb->control.exit_code;
- nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi;
- nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1;
- nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
+ vmcb12->control.int_state = vmcb->control.int_state;
+ vmcb12->control.exit_code = vmcb->control.exit_code;
+ vmcb12->control.exit_code_hi = vmcb->control.exit_code_hi;
+ vmcb12->control.exit_info_1 = vmcb->control.exit_info_1;
+ vmcb12->control.exit_info_2 = vmcb->control.exit_info_2;
- if (nested_vmcb->control.exit_code != SVM_EXIT_ERR)
- nested_vmcb_save_pending_event(svm, nested_vmcb);
+ if (vmcb12->control.exit_code != SVM_EXIT_ERR)
+ nested_vmcb_save_pending_event(svm, vmcb12);
if (svm->nrips_enabled)
- nested_vmcb->control.next_rip = vmcb->control.next_rip;
+ vmcb12->control.next_rip = vmcb->control.next_rip;
- nested_vmcb->control.int_ctl = svm->nested.ctl.int_ctl;
- nested_vmcb->control.tlb_ctl = svm->nested.ctl.tlb_ctl;
- nested_vmcb->control.event_inj = svm->nested.ctl.event_inj;
- nested_vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err;
+ vmcb12->control.int_ctl = svm->nested.ctl.int_ctl;
+ vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl;
+ vmcb12->control.event_inj = svm->nested.ctl.event_inj;
+ vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;
- nested_vmcb->control.pause_filter_count =
+ vmcb12->control.pause_filter_count =
svm->vmcb->control.pause_filter_count;
- nested_vmcb->control.pause_filter_thresh =
+ vmcb12->control.pause_filter_thresh =
svm->vmcb->control.pause_filter_thresh;
/* Restore the original control entries */
copy_vmcb_control_area(&vmcb->control, &hsave->control);
+ /* On vmexit the GIF is set to false */
+ svm_set_gif(svm, false);
+
svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
svm->vcpu.arch.l1_tsc_offset;
@@ -657,11 +669,11 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb_mark_all_dirty(svm->vmcb);
- trace_kvm_nested_vmexit_inject(nested_vmcb->control.exit_code,
- nested_vmcb->control.exit_info_1,
- nested_vmcb->control.exit_info_2,
- nested_vmcb->control.exit_int_info,
- nested_vmcb->control.exit_int_info_err,
+ trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
+ vmcb12->control.exit_info_1,
+ vmcb12->control.exit_info_2,
+ vmcb12->control.exit_int_info,
+ vmcb12->control.exit_int_info_err,
KVM_ISA_SVM);
kvm_vcpu_unmap(&svm->vcpu, &map, true);
@@ -700,6 +712,8 @@ void svm_leave_nested(struct vcpu_svm *svm)
copy_vmcb_control_area(&vmcb->control, &hsave->control);
nested_svm_uninit_mmu_context(&svm->vcpu);
}
+
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
}
static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
@@ -707,7 +721,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
u32 offset, msr, value;
int write, mask;
- if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+ if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
return NESTED_EXIT_HOST;
msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
@@ -734,7 +748,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
u8 start_bit;
u64 gpa;
- if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
+ if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT)))
return NESTED_EXIT_HOST;
port = svm->vmcb->control.exit_info_1 >> 16;
@@ -765,14 +779,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
vmexit = nested_svm_intercept_ioio(svm);
break;
case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
- u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
- if (svm->nested.ctl.intercept_cr & bit)
+ if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
vmexit = NESTED_EXIT_DONE;
break;
}
case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
- u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
- if (svm->nested.ctl.intercept_dr & bit)
+ if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
vmexit = NESTED_EXIT_DONE;
break;
}
@@ -790,8 +802,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
break;
}
default: {
- u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
- if (svm->nested.ctl.intercept & exit_bits)
+ if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
vmexit = NESTED_EXIT_DONE;
}
}
@@ -831,7 +842,7 @@ static bool nested_exit_on_exception(struct vcpu_svm *svm)
{
unsigned int nr = svm->vcpu.arch.exception.nr;
- return (svm->nested.ctl.intercept_exceptions & (1 << nr));
+ return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(nr));
}
static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
@@ -899,7 +910,7 @@ static void nested_svm_intr(struct vcpu_svm *svm)
static inline bool nested_exit_on_init(struct vcpu_svm *svm)
{
- return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_INIT));
+ return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
}
static void nested_svm_init(struct vcpu_svm *svm)
@@ -980,7 +991,8 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
- if (get_host_vmcb(svm)->control.intercept_exceptions & excp_bits)
+ if (get_host_vmcb(svm)->control.intercepts[INTERCEPT_EXCEPTION] &
+ excp_bits)
return NESTED_EXIT_HOST;
else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
svm->vcpu.arch.apf.host_apf_flags)
@@ -1018,7 +1030,7 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu,
/* First fill in the header and copy it out. */
if (is_guest_mode(vcpu)) {
- kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb;
+ kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb12_gpa;
kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE;
kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
@@ -1060,10 +1072,14 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
struct vmcb *hsave = svm->nested.hsave;
struct vmcb __user *user_vmcb = (struct vmcb __user *)
&user_kvm_nested_state->data.svm[0];
- struct vmcb_control_area ctl;
- struct vmcb_save_area save;
+ struct vmcb_control_area *ctl;
+ struct vmcb_save_area *save;
+ int ret;
u32 cr0;
+ BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) >
+ KVM_STATE_NESTED_SVM_VMCB_SIZE);
+
if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM)
return -EINVAL;
@@ -1088,20 +1104,30 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) {
svm_leave_nested(svm);
- goto out_set_gif;
+ svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
+ return 0;
}
if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa))
return -EINVAL;
if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE)
return -EINVAL;
- if (copy_from_user(&ctl, &user_vmcb->control, sizeof(ctl)))
- return -EFAULT;
- if (copy_from_user(&save, &user_vmcb->save, sizeof(save)))
- return -EFAULT;
- if (!nested_vmcb_check_controls(&ctl))
- return -EINVAL;
+ ret = -ENOMEM;
+ ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);
+ save = kzalloc(sizeof(*save), GFP_KERNEL);
+ if (!ctl || !save)
+ goto out_free;
+
+ ret = -EFAULT;
+ if (copy_from_user(ctl, &user_vmcb->control, sizeof(*ctl)))
+ goto out_free;
+ if (copy_from_user(save, &user_vmcb->save, sizeof(*save)))
+ goto out_free;
+
+ ret = -EINVAL;
+ if (!nested_vmcb_check_controls(ctl))
+ goto out_free;
/*
* Processor state contains L2 state. Check that it is
@@ -1109,15 +1135,15 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
*/
cr0 = kvm_read_cr0(vcpu);
if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
- return -EINVAL;
+ goto out_free;
/*
* Validate host state saved from before VMRUN (see
* nested_svm_check_permissions).
* TODO: validate reserved bits for all saved state.
*/
- if (!(save.cr0 & X86_CR0_PG))
- return -EINVAL;
+ if (!(save->cr0 & X86_CR0_PG))
+ goto out_free;
/*
* All checks done, we can enter guest mode. L1 control fields
@@ -1126,19 +1152,24 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
* contains saved L1 state.
*/
copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
- hsave->save = save;
+ hsave->save = *save;
- svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa;
- load_nested_vmcb_control(svm, &ctl);
+ svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
+ load_nested_vmcb_control(svm, ctl);
nested_prepare_vmcb_control(svm);
-out_set_gif:
- svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
- return 0;
+ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+ ret = 0;
+out_free:
+ kfree(save);
+ kfree(ctl);
+
+ return ret;
}
struct kvm_x86_nested_ops svm_nested_ops = {
.check_events = svm_check_nested_events,
+ .get_nested_state_pages = svm_get_nested_state_pages,
.get_state = svm_get_nested_state,
.set_state = svm_set_nested_state,
};
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 402dc42..65e15c2 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -446,10 +446,8 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
}
/*
- * The LAUNCH_UPDATE command will perform in-place encryption of the
- * memory content (i.e it will write the same memory region with C=1).
- * It's possible that the cache may contain the data with C=0, i.e.,
- * unencrypted so invalidate it first.
+ * Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in
+ * place; the cache may contain the data that was written unencrypted.
*/
sev_clflush_pages(inpages, npages);
@@ -805,10 +803,9 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
}
/*
- * The DBG_{DE,EN}CRYPT commands will perform {dec,en}cryption of the
- * memory content (i.e it will write the same memory region with C=1).
- * It's possible that the cache may contain the data with C=0, i.e.,
- * unencrypted so invalidate it first.
+ * Flush (on non-coherent CPUs) before DBG_{DE,EN}CRYPT read or modify
+ * the pages; flush the destination too so that future accesses do not
+ * see stale data.
*/
sev_clflush_pages(src_p, 1);
sev_clflush_pages(dst_p, 1);
@@ -856,7 +853,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
struct kvm_sev_launch_secret params;
struct page **pages;
void *blob, *hdr;
- unsigned long n;
+ unsigned long n, i;
int ret, offset;
if (!sev_guest(kvm))
@@ -870,6 +867,12 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
return PTR_ERR(pages);
/*
+ * Flush (on non-coherent CPUs) before LAUNCH_SECRET encrypts pages in
+ * place; the cache may contain the data that was written unencrypted.
+ */
+ sev_clflush_pages(pages, n);
+
+ /*
* The secret must be copied into contiguous memory region, lets verify
* that userspace memory pages are contiguous before we issue command.
*/
@@ -914,6 +917,11 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
e_free:
kfree(data);
e_unpin_memory:
+ /* content of memory is updated, mark pages dirty */
+ for (i = 0; i < n; i++) {
+ set_page_dirty_lock(pages[i]);
+ mark_page_accessed(pages[i]);
+ }
sev_unpin_memory(kvm, pages, n);
return ret;
}
@@ -1106,6 +1114,7 @@ void sev_vm_destroy(struct kvm *kvm)
list_for_each_safe(pos, q, head) {
__unregister_enc_region_locked(kvm,
list_entry(pos, struct enc_region, list));
+ cond_resched();
}
}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 0194336..4f401fc 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -91,7 +91,7 @@ static DEFINE_PER_CPU(u64, current_tsc_ratio);
static const struct svm_direct_access_msrs {
u32 index; /* Index of the MSR */
bool always; /* True if intercept is always on */
-} direct_access_msrs[] = {
+} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
{ .index = MSR_STAR, .always = true },
{ .index = MSR_IA32_SYSENTER_CS, .always = true },
#ifdef CONFIG_X86_64
@@ -553,18 +553,44 @@ static int svm_cpu_init(int cpu)
}
-static bool valid_msr_intercept(u32 index)
+static int direct_access_msr_slot(u32 msr)
{
- int i;
+ u32 i;
for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
- if (direct_access_msrs[i].index == index)
- return true;
+ if (direct_access_msrs[i].index == msr)
+ return i;
- return false;
+ return -ENOENT;
}
-static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
+static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
+ int write)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int slot = direct_access_msr_slot(msr);
+
+ if (slot == -ENOENT)
+ return;
+
+ /* Set the shadow bitmaps to the desired intercept states */
+ if (read)
+ set_bit(slot, svm->shadow_msr_intercept.read);
+ else
+ clear_bit(slot, svm->shadow_msr_intercept.read);
+
+ if (write)
+ set_bit(slot, svm->shadow_msr_intercept.write);
+ else
+ clear_bit(slot, svm->shadow_msr_intercept.write);
+}
+
+static bool valid_msr_intercept(u32 index)
+{
+ return direct_access_msr_slot(index) != -ENOENT;
+}
+
+static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
{
u8 bit_write;
unsigned long tmp;
@@ -583,8 +609,8 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
return !!test_bit(bit_write, &tmp);
}
-static void set_msr_interception(u32 *msrpm, unsigned msr,
- int read, int write)
+static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
+ u32 msr, int read, int write)
{
u8 bit_read, bit_write;
unsigned long tmp;
@@ -596,6 +622,13 @@ static void set_msr_interception(u32 *msrpm, unsigned msr,
*/
WARN_ON(!valid_msr_intercept(msr));
+ /* Enforce non allowed MSRs to trap */
+ if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
+ read = 0;
+
+ if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
+ write = 0;
+
offset = svm_msrpm_offset(msr);
bit_read = 2 * (msr & 0x0f);
bit_write = 2 * (msr & 0x0f) + 1;
@@ -609,17 +642,59 @@ static void set_msr_interception(u32 *msrpm, unsigned msr,
msrpm[offset] = tmp;
}
-static void svm_vcpu_init_msrpm(u32 *msrpm)
+static void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
+ int read, int write)
+{
+ set_shadow_msr_intercept(vcpu, msr, read, write);
+ set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
+}
+
+static u32 *svm_vcpu_alloc_msrpm(void)
+{
+ struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
+ u32 *msrpm;
+
+ if (!pages)
+ return NULL;
+
+ msrpm = page_address(pages);
+ memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+
+ return msrpm;
+}
+
+static void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
{
int i;
- memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
-
for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
if (!direct_access_msrs[i].always)
continue;
+ set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
+ }
+}
- set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
+static void svm_vcpu_free_msrpm(u32 *msrpm)
+{
+ __free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER);
+}
+
+static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 i;
+
+ /*
+ * Set intercept permissions for all direct access MSRs again. They
+ * will automatically get filtered through the MSR filter, so we are
+ * back in sync after this.
+ */
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+ u32 msr = direct_access_msrs[i].index;
+ u32 read = test_bit(i, svm->shadow_msr_intercept.read);
+ u32 write = test_bit(i, svm->shadow_msr_intercept.write);
+
+ set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
}
}
@@ -666,26 +741,26 @@ static void init_msrpm_offsets(void)
}
}
-static void svm_enable_lbrv(struct vcpu_svm *svm)
+static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
- u32 *msrpm = svm->msrpm;
+ struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
- set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
}
-static void svm_disable_lbrv(struct vcpu_svm *svm)
+static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
{
- u32 *msrpm = svm->msrpm;
+ struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
- set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
- set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
- set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
- set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
}
void disable_nmi_singlestep(struct vcpu_svm *svm)
@@ -813,6 +888,9 @@ static __init void svm_set_cpu_caps(void)
if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
boot_cpu_has(X86_FEATURE_AMD_SSBD))
kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
+
+ /* Enable INVPCID feature */
+ kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
}
static __init int svm_hardware_setup(void)
@@ -985,6 +1063,21 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
return svm->vmcb->control.tsc_offset;
}
+static void svm_check_invpcid(struct vcpu_svm *svm)
+{
+ /*
+ * Intercept INVPCID instruction only if shadow page table is
+ * enabled. Interception is not required with nested page table
+ * enabled.
+ */
+ if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
+ if (!npt_enabled)
+ svm_set_intercept(svm, INTERCEPT_INVPCID);
+ else
+ svm_clr_intercept(svm, INTERCEPT_INVPCID);
+ }
+}
+
static void init_vmcb(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
@@ -992,14 +1085,14 @@ static void init_vmcb(struct vcpu_svm *svm)
svm->vcpu.arch.hflags = 0;
- set_cr_intercept(svm, INTERCEPT_CR0_READ);
- set_cr_intercept(svm, INTERCEPT_CR3_READ);
- set_cr_intercept(svm, INTERCEPT_CR4_READ);
- set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
- set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
- set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
+ svm_set_intercept(svm, INTERCEPT_CR0_READ);
+ svm_set_intercept(svm, INTERCEPT_CR3_READ);
+ svm_set_intercept(svm, INTERCEPT_CR4_READ);
+ svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
+ svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
+ svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
if (!kvm_vcpu_apicv_active(&svm->vcpu))
- set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+ svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
set_dr_intercepts(svm);
@@ -1094,15 +1187,15 @@ static void init_vmcb(struct vcpu_svm *svm)
control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
svm_clr_intercept(svm, INTERCEPT_INVLPG);
clr_exception_intercept(svm, PF_VECTOR);
- clr_cr_intercept(svm, INTERCEPT_CR3_READ);
- clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
+ svm_clr_intercept(svm, INTERCEPT_CR3_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
save->g_pat = svm->vcpu.arch.pat;
save->cr3 = 0;
save->cr4 = 0;
}
svm->asid_generation = 0;
- svm->nested.vmcb = 0;
+ svm->nested.vmcb12_gpa = 0;
svm->vcpu.arch.hflags = 0;
if (!kvm_pause_in_guest(svm->vcpu.kvm)) {
@@ -1114,6 +1207,8 @@ static void init_vmcb(struct vcpu_svm *svm)
svm_clr_intercept(svm, INTERCEPT_PAUSE);
}
+ svm_check_invpcid(svm);
+
if (kvm_vcpu_apicv_active(&svm->vcpu))
avic_init_vmcb(svm);
@@ -1171,35 +1266,25 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
static int svm_create_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm;
- struct page *page;
- struct page *msrpm_pages;
+ struct page *vmcb_page;
struct page *hsave_page;
- struct page *nested_msrpm_pages;
int err;
BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
svm = to_svm(vcpu);
err = -ENOMEM;
- page = alloc_page(GFP_KERNEL_ACCOUNT);
- if (!page)
+ vmcb_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!vmcb_page)
goto out;
- msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
- if (!msrpm_pages)
- goto free_page1;
-
- nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
- if (!nested_msrpm_pages)
- goto free_page2;
-
- hsave_page = alloc_page(GFP_KERNEL_ACCOUNT);
+ hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!hsave_page)
- goto free_page3;
+ goto error_free_vmcb_page;
err = avic_init_vcpu(svm);
if (err)
- goto free_page4;
+ goto error_free_hsave_page;
/* We initialize this flag to true to make sure that the is_running
* bit would be set the first time the vcpu is loaded.
@@ -1208,17 +1293,22 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
svm->avic_is_running = true;
svm->nested.hsave = page_address(hsave_page);
- clear_page(svm->nested.hsave);
- svm->msrpm = page_address(msrpm_pages);
- svm_vcpu_init_msrpm(svm->msrpm);
+ svm->msrpm = svm_vcpu_alloc_msrpm();
+ if (!svm->msrpm)
+ goto error_free_hsave_page;
- svm->nested.msrpm = page_address(nested_msrpm_pages);
- svm_vcpu_init_msrpm(svm->nested.msrpm);
+ svm_vcpu_init_msrpm(vcpu, svm->msrpm);
- svm->vmcb = page_address(page);
- clear_page(svm->vmcb);
- svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
+ svm->nested.msrpm = svm_vcpu_alloc_msrpm();
+ if (!svm->nested.msrpm)
+ goto error_free_msrpm;
+
+ /* We only need the L1 pass-through MSR state, so leave vcpu as NULL */
+ svm_vcpu_init_msrpm(vcpu, svm->nested.msrpm);
+
+ svm->vmcb = page_address(vmcb_page);
+ svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT);
svm->asid_generation = 0;
init_vmcb(svm);
@@ -1227,14 +1317,12 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
return 0;
-free_page4:
+error_free_msrpm:
+ svm_vcpu_free_msrpm(svm->msrpm);
+error_free_hsave_page:
__free_page(hsave_page);
-free_page3:
- __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
-free_page2:
- __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
-free_page1:
- __free_page(page);
+error_free_vmcb_page:
+ __free_page(vmcb_page);
out:
return err;
}
@@ -1549,11 +1637,11 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
vmcb_mark_dirty(svm->vmcb, VMCB_CR);
if (gcr0 == *hcr0) {
- clr_cr_intercept(svm, INTERCEPT_CR0_READ);
- clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
+ svm_clr_intercept(svm, INTERCEPT_CR0_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
} else {
- set_cr_intercept(svm, INTERCEPT_CR0_READ);
- set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
+ svm_set_intercept(svm, INTERCEPT_CR0_READ);
+ svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
}
}
@@ -2218,12 +2306,9 @@ static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
{
unsigned long cr0 = svm->vcpu.arch.cr0;
bool ret = false;
- u64 intercept;
-
- intercept = svm->nested.ctl.intercept;
if (!is_guest_mode(&svm->vcpu) ||
- (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
+ (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
return false;
cr0 &= ~SVM_CR0_SELECTIVE_MASK;
@@ -2261,6 +2346,7 @@ static int cr_interception(struct vcpu_svm *svm)
if (cr >= 16) { /* mov to cr */
cr -= 16;
val = kvm_register_read(&svm->vcpu, reg);
+ trace_kvm_cr_write(cr, val);
switch (cr) {
case 0:
if (!check_selective_cr0_intercepted(svm, val))
@@ -2306,6 +2392,7 @@ static int cr_interception(struct vcpu_svm *svm)
return 1;
}
kvm_register_write(&svm->vcpu, reg, val);
+ trace_kvm_cr_read(cr, val);
}
return kvm_complete_insn_gp(&svm->vcpu, err);
}
@@ -2556,7 +2643,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
* We update the L1 MSR bit as well since it will end up
* touching the MSR anyway now.
*/
- set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
break;
case MSR_IA32_PRED_CMD:
if (!msr->host_initiated &&
@@ -2571,7 +2658,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
break;
wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
- set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
break;
case MSR_AMD64_VIRT_SPEC_CTRL:
if (!msr->host_initiated &&
@@ -2635,9 +2722,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
svm->vmcb->save.dbgctl = data;
vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
if (data & (1ULL<<0))
- svm_enable_lbrv(svm);
+ svm_enable_lbrv(vcpu);
else
- svm_disable_lbrv(svm);
+ svm_disable_lbrv(vcpu);
break;
case MSR_VM_HSAVE_PA:
svm->nested.hsave_msr = data;
@@ -2733,6 +2820,33 @@ static int mwait_interception(struct vcpu_svm *svm)
return nop_interception(svm);
}
+static int invpcid_interception(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ unsigned long type;
+ gva_t gva;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ /*
+ * For an INVPCID intercept:
+ * EXITINFO1 provides the linear address of the memory operand.
+ * EXITINFO2 provides the contents of the register operand.
+ */
+ type = svm->vmcb->control.exit_info_2;
+ gva = svm->vmcb->control.exit_info_1;
+
+ if (type > 3) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ return kvm_handle_invpcid(vcpu, type, gva);
+}
+
static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_READ_CR0] = cr_interception,
[SVM_EXIT_READ_CR3] = cr_interception,
@@ -2795,6 +2909,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_MWAIT] = mwait_interception,
[SVM_EXIT_XSETBV] = xsetbv_interception,
[SVM_EXIT_RDPRU] = rdpru_interception,
+ [SVM_EXIT_INVPCID] = invpcid_interception,
[SVM_EXIT_NPF] = npf_interception,
[SVM_EXIT_RSM] = rsm_interception,
[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
@@ -2813,12 +2928,14 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
}
pr_err("VMCB Control Area:\n");
- pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
- pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
- pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
- pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
- pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
- pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
+ pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
+ pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
+ pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
+ pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
+ pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
+ pr_err("%-20s%08x %08x\n", "intercepts:",
+ control->intercepts[INTERCEPT_WORD3],
+ control->intercepts[INTERCEPT_WORD4]);
pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
pr_err("%-20s%d\n", "pause filter threshold:",
control->pause_filter_thresh);
@@ -2917,12 +3034,19 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
"excp_to:", save->last_excp_to);
}
-static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
+ u32 *intr_info, u32 *error_code)
{
struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
*info1 = control->exit_info_1;
*info2 = control->exit_info_2;
+ *intr_info = control->exit_int_info;
+ if ((*intr_info & SVM_EXITINTINFO_VALID) &&
+ (*intr_info & SVM_EXITINTINFO_VALID_ERR))
+ *error_code = control->exit_int_info_err;
+ else
+ *error_code = 0;
}
static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
@@ -2933,22 +3057,15 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
- if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
+ if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
vcpu->arch.cr0 = svm->vmcb->save.cr0;
if (npt_enabled)
vcpu->arch.cr3 = svm->vmcb->save.cr3;
- svm_complete_interrupts(svm);
-
if (is_guest_mode(vcpu)) {
int vmexit;
- trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
- svm->vmcb->control.exit_info_1,
- svm->vmcb->control.exit_info_2,
- svm->vmcb->control.exit_int_info,
- svm->vmcb->control.exit_int_info_err,
- KVM_ISA_SVM);
+ trace_kvm_nested_vmexit(exit_code, vcpu, KVM_ISA_SVM);
vmexit = nested_svm_exit_special(svm);
@@ -3058,13 +3175,13 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
if (nested_svm_virtualize_tpr(vcpu))
return;
- clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+ svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
if (irr == -1)
return;
if (tpr >= irr)
- set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+ svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
}
bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
@@ -3252,7 +3369,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
if (nested_svm_virtualize_tpr(vcpu))
return;
- if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
+ if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
kvm_set_cr8(vcpu, cr8);
}
@@ -3349,8 +3466,7 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
- if (!is_guest_mode(vcpu) &&
- to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
+ if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
to_svm(vcpu)->vmcb->control.exit_info_1)
return handle_fastpath_set_msr_irqoff(vcpu);
@@ -3415,7 +3531,6 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
{
- fastpath_t exit_fastpath;
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
@@ -3456,9 +3571,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
clgi();
kvm_load_guest_xsave_state(vcpu);
- if (lapic_in_kernel(vcpu) &&
- vcpu->arch.apic->lapic_timer.timer_advance_ns)
- kvm_wait_lapic_expire(vcpu);
+ kvm_wait_lapic_expire(vcpu);
/*
* If this vCPU has touched SPEC_CTRL, restore the guest's value if
@@ -3504,7 +3617,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
stgi();
/* Any pending NMI will happen here */
- exit_fastpath = svm_exit_handlers_fastpath(vcpu);
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_after_interrupt(&svm->vcpu);
@@ -3518,6 +3630,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
}
svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+ vmcb_mark_all_clean(svm->vmcb);
/* if exit due to PF check for async PF */
if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
@@ -3537,8 +3650,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
SVM_EXIT_EXCP_BASE + MC_VECTOR))
svm_handle_mce(svm);
- vmcb_mark_all_clean(svm->vmcb);
- return exit_fastpath;
+ svm_complete_interrupts(svm);
+
+ if (is_guest_mode(vcpu))
+ return EXIT_FASTPATH_NONE;
+
+ return svm_exit_handlers_fastpath(vcpu);
}
static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root,
@@ -3624,6 +3741,9 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
+ /* Check again if INVPCID interception if required */
+ svm_check_invpcid(svm);
+
if (!kvm_vcpu_apicv_active(vcpu))
return;
@@ -3738,7 +3858,6 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
break;
case SVM_EXIT_WRITE_CR0: {
unsigned long cr0, val;
- u64 intercept;
if (info->intercept == x86_intercept_cr_write)
icpt_info.exit_code += info->modrm_reg;
@@ -3747,9 +3866,8 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
info->intercept == x86_intercept_clts)
break;
- intercept = svm->nested.ctl.intercept;
-
- if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
+ if (!(vmcb_is_intercept(&svm->nested.ctl,
+ INTERCEPT_SELECTIVE_CR0)))
break;
cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
@@ -3884,7 +4002,7 @@ static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
/* FED8h - SVM Guest */
put_smstate(u64, smstate, 0x7ed8, 1);
/* FEE0h - SVM Guest VMCB Physical Address */
- put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb);
+ put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);
svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
@@ -3900,21 +4018,28 @@ static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *nested_vmcb;
struct kvm_host_map map;
- u64 guest;
- u64 vmcb;
int ret = 0;
- guest = GET_SMSTATE(u64, smstate, 0x7ed8);
- vmcb = GET_SMSTATE(u64, smstate, 0x7ee0);
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) {
+ u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
+ u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8);
+ u64 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
- if (guest) {
- if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL)
- return 1;
- nested_vmcb = map.hva;
- ret = enter_svm_guest_mode(svm, vmcb, nested_vmcb);
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ if (guest) {
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+ return 1;
+
+ if (!(saved_efer & EFER_SVME))
+ return 1;
+
+ if (kvm_vcpu_map(&svm->vcpu,
+ gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
+ return 1;
+
+ ret = enter_svm_guest_mode(svm, vmcb12_gpa, map.hva);
+ kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ }
}
return ret;
@@ -3933,19 +4058,10 @@ static void enable_smi_window(struct kvm_vcpu *vcpu)
}
}
-static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
+static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
{
- unsigned long cr4 = kvm_read_cr4(vcpu);
- bool smep = cr4 & X86_CR4_SMEP;
- bool smap = cr4 & X86_CR4_SMAP;
- bool is_user = svm_get_cpl(vcpu) == 3;
-
- /*
- * If RIP is invalid, go ahead with emulation which will cause an
- * internal error exit.
- */
- if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT))
- return true;
+ bool smep, smap, is_user;
+ unsigned long cr4;
/*
* Detect and workaround Errata 1096 Fam_17h_00_0Fh.
@@ -3987,6 +4103,20 @@ static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
* instruction pointer so we will not able to workaround it. Lets
* print the error and request to kill the guest.
*/
+ if (likely(!insn || insn_len))
+ return true;
+
+ /*
+ * If RIP is invalid, go ahead with emulation which will cause an
+ * internal error exit.
+ */
+ if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT))
+ return true;
+
+ cr4 = kvm_read_cr4(vcpu);
+ smep = cr4 & X86_CR4_SMEP;
+ smap = cr4 & X86_CR4_SMAP;
+ is_user = svm_get_cpl(vcpu) == 3;
if (smap && (!smep || is_user)) {
if (!sev_guest(vcpu->kvm))
return true;
@@ -4010,7 +4140,7 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
* if an INIT signal is pending.
*/
return !gif_set(svm) ||
- (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT));
+ (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT));
}
static void svm_vm_destroy(struct kvm *kvm)
@@ -4148,9 +4278,11 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.mem_enc_reg_region = svm_register_enc_region,
.mem_enc_unreg_region = svm_unregister_enc_region,
- .need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
+ .can_emulate_instruction = svm_can_emulate_instruction,
.apic_init_signal_blocked = svm_apic_init_signal_blocked,
+
+ .msr_filter_changed = svm_msr_filter_changed,
};
static struct kvm_x86_init_ops svm_init_ops __initdata = {
@@ -4164,6 +4296,8 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = {
static int __init svm_init(void)
{
+ __unused_size_checks();
+
return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
__alignof__(struct vcpu_svm), THIS_MODULE);
}
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index a798e17..a7f9974 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -31,6 +31,7 @@ static const u32 host_save_user_msrs[] = {
#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
+#define MAX_DIRECT_ACCESS_MSRS 15
#define MSRPM_OFFSETS 16
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
@@ -85,8 +86,7 @@ struct svm_nested_state {
struct vmcb *hsave;
u64 hsave_msr;
u64 vm_cr_msr;
- u64 vmcb;
- u32 host_intercept_exceptions;
+ u64 vmcb12_gpa;
/* These are the merged vectors */
u32 *msrpm;
@@ -158,6 +158,12 @@ struct vcpu_svm {
*/
struct list_head ir_list;
spinlock_t ir_list_lock;
+
+ /* Save desired MSR intercept (read: pass-through) state */
+ struct {
+ DECLARE_BITMAP(read, MAX_DIRECT_ACCESS_MSRS);
+ DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS);
+ } shadow_msr_intercept;
};
struct svm_cpu_data {
@@ -214,51 +220,44 @@ static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
return svm->vmcb;
}
-static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
+static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept_cr |= (1U << bit);
-
- recalc_intercepts(svm);
+ WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+ __set_bit(bit, (unsigned long *)&control->intercepts);
}
-static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
+static inline void vmcb_clr_intercept(struct vmcb_control_area *control, u32 bit)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- vmcb->control.intercept_cr &= ~(1U << bit);
-
- recalc_intercepts(svm);
+ WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+ __clear_bit(bit, (unsigned long *)&control->intercepts);
}
-static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
+static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit)
{
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- return vmcb->control.intercept_cr & (1U << bit);
+ WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+ return test_bit(bit, (unsigned long *)&control->intercepts);
}
static inline void set_dr_intercepts(struct vcpu_svm *svm)
{
struct vmcb *vmcb = get_host_vmcb(svm);
- vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
- | (1 << INTERCEPT_DR1_READ)
- | (1 << INTERCEPT_DR2_READ)
- | (1 << INTERCEPT_DR3_READ)
- | (1 << INTERCEPT_DR4_READ)
- | (1 << INTERCEPT_DR5_READ)
- | (1 << INTERCEPT_DR6_READ)
- | (1 << INTERCEPT_DR7_READ)
- | (1 << INTERCEPT_DR0_WRITE)
- | (1 << INTERCEPT_DR1_WRITE)
- | (1 << INTERCEPT_DR2_WRITE)
- | (1 << INTERCEPT_DR3_WRITE)
- | (1 << INTERCEPT_DR4_WRITE)
- | (1 << INTERCEPT_DR5_WRITE)
- | (1 << INTERCEPT_DR6_WRITE)
- | (1 << INTERCEPT_DR7_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
recalc_intercepts(svm);
}
@@ -267,25 +266,27 @@ static inline void clr_dr_intercepts(struct vcpu_svm *svm)
{
struct vmcb *vmcb = get_host_vmcb(svm);
- vmcb->control.intercept_dr = 0;
+ vmcb->control.intercepts[INTERCEPT_DR] = 0;
recalc_intercepts(svm);
}
-static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
+static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
{
struct vmcb *vmcb = get_host_vmcb(svm);
- vmcb->control.intercept_exceptions |= (1U << bit);
+ WARN_ON_ONCE(bit >= 32);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
recalc_intercepts(svm);
}
-static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
+static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
{
struct vmcb *vmcb = get_host_vmcb(svm);
- vmcb->control.intercept_exceptions &= ~(1U << bit);
+ WARN_ON_ONCE(bit >= 32);
+ vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
recalc_intercepts(svm);
}
@@ -294,7 +295,7 @@ static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
{
struct vmcb *vmcb = get_host_vmcb(svm);
- vmcb->control.intercept |= (1ULL << bit);
+ vmcb_set_intercept(&vmcb->control, bit);
recalc_intercepts(svm);
}
@@ -303,14 +304,14 @@ static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit)
{
struct vmcb *vmcb = get_host_vmcb(svm);
- vmcb->control.intercept &= ~(1ULL << bit);
+ vmcb_clr_intercept(&vmcb->control, bit);
recalc_intercepts(svm);
}
static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
{
- return (svm->vmcb->control.intercept & (1ULL << bit)) != 0;
+ return vmcb_is_intercept(&svm->vmcb->control, bit);
}
static inline bool vgif_enabled(struct vcpu_svm *svm)
@@ -345,7 +346,7 @@ static inline bool gif_set(struct vcpu_svm *svm)
/* svm.c */
#define MSR_CR3_LEGACY_RESERVED_MASK 0xfe7U
#define MSR_CR3_LEGACY_PAE_RESERVED_MASK 0x7U
-#define MSR_CR3_LONG_RESERVED_MASK 0xfff0000000000fe7U
+#define MSR_CR3_LONG_MBZ_MASK 0xfff0000000000000U
#define MSR_INVALID 0xffffffffU
u32 svm_msrpm_offset(u32 msr);
@@ -374,17 +375,17 @@ static inline bool nested_svm_virtualize_tpr(struct kvm_vcpu *vcpu)
static inline bool nested_exit_on_smi(struct vcpu_svm *svm)
{
- return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_SMI));
+ return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SMI);
}
static inline bool nested_exit_on_intr(struct vcpu_svm *svm)
{
- return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_INTR));
+ return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INTR);
}
static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
{
- return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_NMI));
+ return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
}
int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index b66432b..aef960f 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -15,18 +15,20 @@
* Tracepoint for guest mode entry.
*/
TRACE_EVENT(kvm_entry,
- TP_PROTO(unsigned int vcpu_id),
- TP_ARGS(vcpu_id),
+ TP_PROTO(struct kvm_vcpu *vcpu),
+ TP_ARGS(vcpu),
TP_STRUCT__entry(
__field( unsigned int, vcpu_id )
+ __field( unsigned long, rip )
),
TP_fast_assign(
- __entry->vcpu_id = vcpu_id;
+ __entry->vcpu_id = vcpu->vcpu_id;
+ __entry->rip = kvm_rip_read(vcpu);
),
- TP_printk("vcpu %u", __entry->vcpu_id)
+ TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip)
);
/*
@@ -233,36 +235,45 @@ TRACE_EVENT(kvm_apic,
(isa == KVM_ISA_VMX) ? \
__print_flags(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : ""
+#define TRACE_EVENT_KVM_EXIT(name) \
+TRACE_EVENT(name, \
+ TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), \
+ TP_ARGS(exit_reason, vcpu, isa), \
+ \
+ TP_STRUCT__entry( \
+ __field( unsigned int, exit_reason ) \
+ __field( unsigned long, guest_rip ) \
+ __field( u32, isa ) \
+ __field( u64, info1 ) \
+ __field( u64, info2 ) \
+ __field( u32, intr_info ) \
+ __field( u32, error_code ) \
+ __field( unsigned int, vcpu_id ) \
+ ), \
+ \
+ TP_fast_assign( \
+ __entry->exit_reason = exit_reason; \
+ __entry->guest_rip = kvm_rip_read(vcpu); \
+ __entry->isa = isa; \
+ __entry->vcpu_id = vcpu->vcpu_id; \
+ kvm_x86_ops.get_exit_info(vcpu, &__entry->info1, \
+ &__entry->info2, \
+ &__entry->intr_info, \
+ &__entry->error_code); \
+ ), \
+ \
+ TP_printk("vcpu %u reason %s%s%s rip 0x%lx info1 0x%016llx " \
+ "info2 0x%016llx intr_info 0x%08x error_code 0x%08x", \
+ __entry->vcpu_id, \
+ kvm_print_exit_reason(__entry->exit_reason, __entry->isa), \
+ __entry->guest_rip, __entry->info1, __entry->info2, \
+ __entry->intr_info, __entry->error_code) \
+)
+
/*
* Tracepoint for kvm guest exit:
*/
-TRACE_EVENT(kvm_exit,
- TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa),
- TP_ARGS(exit_reason, vcpu, isa),
-
- TP_STRUCT__entry(
- __field( unsigned int, exit_reason )
- __field( unsigned long, guest_rip )
- __field( u32, isa )
- __field( u64, info1 )
- __field( u64, info2 )
- __field( unsigned int, vcpu_id )
- ),
-
- TP_fast_assign(
- __entry->exit_reason = exit_reason;
- __entry->guest_rip = kvm_rip_read(vcpu);
- __entry->isa = isa;
- __entry->vcpu_id = vcpu->vcpu_id;
- kvm_x86_ops.get_exit_info(vcpu, &__entry->info1,
- &__entry->info2);
- ),
-
- TP_printk("vcpu %u reason %s%s%s rip 0x%lx info %llx %llx",
- __entry->vcpu_id,
- kvm_print_exit_reason(__entry->exit_reason, __entry->isa),
- __entry->guest_rip, __entry->info1, __entry->info2)
-);
+TRACE_EVENT_KVM_EXIT(kvm_exit);
/*
* Tracepoint for kvm interrupt injection:
@@ -544,63 +555,38 @@ TRACE_EVENT(kvm_nested_vmrun,
);
TRACE_EVENT(kvm_nested_intercepts,
- TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept),
- TP_ARGS(cr_read, cr_write, exceptions, intercept),
+ TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions,
+ __u32 intercept1, __u32 intercept2, __u32 intercept3),
+ TP_ARGS(cr_read, cr_write, exceptions, intercept1,
+ intercept2, intercept3),
TP_STRUCT__entry(
__field( __u16, cr_read )
__field( __u16, cr_write )
__field( __u32, exceptions )
- __field( __u64, intercept )
+ __field( __u32, intercept1 )
+ __field( __u32, intercept2 )
+ __field( __u32, intercept3 )
),
TP_fast_assign(
__entry->cr_read = cr_read;
__entry->cr_write = cr_write;
__entry->exceptions = exceptions;
- __entry->intercept = intercept;
+ __entry->intercept1 = intercept1;
+ __entry->intercept2 = intercept2;
+ __entry->intercept3 = intercept3;
),
- TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx",
- __entry->cr_read, __entry->cr_write, __entry->exceptions,
- __entry->intercept)
+ TP_printk("cr_read: %04x cr_write: %04x excp: %08x "
+ "intercepts: %08x %08x %08x",
+ __entry->cr_read, __entry->cr_write, __entry->exceptions,
+ __entry->intercept1, __entry->intercept2, __entry->intercept3)
);
/*
* Tracepoint for #VMEXIT while nested
*/
-TRACE_EVENT(kvm_nested_vmexit,
- TP_PROTO(__u64 rip, __u32 exit_code,
- __u64 exit_info1, __u64 exit_info2,
- __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
- TP_ARGS(rip, exit_code, exit_info1, exit_info2,
- exit_int_info, exit_int_info_err, isa),
-
- TP_STRUCT__entry(
- __field( __u64, rip )
- __field( __u32, exit_code )
- __field( __u64, exit_info1 )
- __field( __u64, exit_info2 )
- __field( __u32, exit_int_info )
- __field( __u32, exit_int_info_err )
- __field( __u32, isa )
- ),
-
- TP_fast_assign(
- __entry->rip = rip;
- __entry->exit_code = exit_code;
- __entry->exit_info1 = exit_info1;
- __entry->exit_info2 = exit_info2;
- __entry->exit_int_info = exit_int_info;
- __entry->exit_int_info_err = exit_int_info_err;
- __entry->isa = isa;
- ),
- TP_printk("rip: 0x%016llx reason: %s%s%s ext_inf1: 0x%016llx "
- "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
- __entry->rip,
- kvm_print_exit_reason(__entry->exit_code, __entry->isa),
- __entry->exit_info1, __entry->exit_info2,
- __entry->exit_int_info, __entry->exit_int_info_err)
-);
+TRACE_EVENT_KVM_EXIT(kvm_nested_vmexit);
/*
* Tracepoint for #VMEXIT reinjected to the guest
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 4bbd8b4..3a18614 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -151,7 +151,7 @@ static inline bool vmx_umip_emulated(void)
static inline bool cpu_has_vmx_rdtscp(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_RDTSCP;
+ SECONDARY_EXEC_ENABLE_RDTSCP;
}
static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
@@ -196,7 +196,7 @@ static inline bool cpu_has_vmx_ple(void)
SECONDARY_EXEC_PAUSE_LOOP_EXITING;
}
-static inline bool vmx_rdrand_supported(void)
+static inline bool cpu_has_vmx_rdrand(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_RDRAND_EXITING;
@@ -233,7 +233,7 @@ static inline bool cpu_has_vmx_encls_vmexit(void)
SECONDARY_EXEC_ENCLS_EXITING;
}
-static inline bool vmx_rdseed_supported(void)
+static inline bool cpu_has_vmx_rdseed(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_RDSEED_EXITING;
@@ -244,13 +244,13 @@ static inline bool cpu_has_vmx_pml(void)
return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
}
-static inline bool vmx_xsaves_supported(void)
+static inline bool cpu_has_vmx_xsaves(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_XSAVES;
}
-static inline bool vmx_waitpkg_supported(void)
+static inline bool cpu_has_vmx_waitpkg(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 23b58c2..6eca8a7 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -233,6 +233,44 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
vmx->nested.hv_evmcs = NULL;
}
+static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
+ struct loaded_vmcs *prev)
+{
+ struct vmcs_host_state *dest, *src;
+
+ if (unlikely(!vmx->guest_state_loaded))
+ return;
+
+ src = &prev->host_state;
+ dest = &vmx->loaded_vmcs->host_state;
+
+ vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
+ dest->ldt_sel = src->ldt_sel;
+#ifdef CONFIG_X86_64
+ dest->ds_sel = src->ds_sel;
+ dest->es_sel = src->es_sel;
+#endif
+}
+
+static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct loaded_vmcs *prev;
+ int cpu;
+
+ if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
+ return;
+
+ cpu = get_cpu();
+ prev = vmx->loaded_vmcs;
+ vmx->loaded_vmcs = vmcs;
+ vmx_vcpu_load_vmcs(vcpu, cpu, prev);
+ vmx_sync_vmcs_host_state(vmx, prev);
+ put_cpu();
+
+ vmx_register_cache_reset(vcpu);
+}
+
/*
* Free whatever needs to be freed from vmx->nested when L1 goes down, or
* just stops using VMX.
@@ -241,10 +279,13 @@ static void free_nested(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+
if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
return;
- kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
vmx->nested.vmxon = false;
vmx->nested.smm.vmxon = false;
@@ -277,44 +318,6 @@ static void free_nested(struct kvm_vcpu *vcpu)
free_loaded_vmcs(&vmx->nested.vmcs02);
}
-static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
- struct loaded_vmcs *prev)
-{
- struct vmcs_host_state *dest, *src;
-
- if (unlikely(!vmx->guest_state_loaded))
- return;
-
- src = &prev->host_state;
- dest = &vmx->loaded_vmcs->host_state;
-
- vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
- dest->ldt_sel = src->ldt_sel;
-#ifdef CONFIG_X86_64
- dest->ds_sel = src->ds_sel;
- dest->es_sel = src->es_sel;
-#endif
-}
-
-static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct loaded_vmcs *prev;
- int cpu;
-
- if (vmx->loaded_vmcs == vmcs)
- return;
-
- cpu = get_cpu();
- prev = vmx->loaded_vmcs;
- vmx->loaded_vmcs = vmcs;
- vmx_vcpu_load_vmcs(vcpu, cpu, prev);
- vmx_sync_vmcs_host_state(vmx, prev);
- put_cpu();
-
- vmx_register_cache_reset(vcpu);
-}
-
/*
* Ensure that the current vmcs of the logical processor is the
* vmcs01 of the vcpu before calling free_nested().
@@ -323,8 +326,6 @@ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
vcpu_load(vcpu);
vmx_leave_nested(vcpu);
- vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
- free_nested(vcpu);
vcpu_put(vcpu);
}
@@ -938,11 +939,11 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
* VM-exit in L0, use the more accurate value.
*/
if (msr_index == MSR_IA32_TSC) {
- int index = vmx_find_msr_index(&vmx->msr_autostore.guest,
- MSR_IA32_TSC);
+ int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
+ MSR_IA32_TSC);
- if (index >= 0) {
- u64 val = vmx->msr_autostore.guest.val[index].value;
+ if (i >= 0) {
+ u64 val = vmx->msr_autostore.guest.val[i].value;
*data = kvm_read_l1_tsc(vcpu, val);
return true;
@@ -1031,16 +1032,16 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
bool in_vmcs12_store_list;
- int msr_autostore_index;
+ int msr_autostore_slot;
bool in_autostore_list;
int last;
- msr_autostore_index = vmx_find_msr_index(autostore, msr_index);
- in_autostore_list = msr_autostore_index >= 0;
+ msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
+ in_autostore_list = msr_autostore_slot >= 0;
in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
if (in_vmcs12_store_list && !in_autostore_list) {
- if (autostore->nr == NR_LOADSTORE_MSRS) {
+ if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
/*
* Emulated VMEntry does not fail here. Instead a less
* accurate value will be returned by
@@ -1057,7 +1058,7 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
autostore->val[last].index = msr_index;
} else if (!in_vmcs12_store_list && in_autostore_list) {
last = --autostore->nr;
- autostore->val[msr_autostore_index] = autostore->val[last];
+ autostore->val[msr_autostore_slot] = autostore->val[last];
}
}
@@ -2286,7 +2287,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
/* Take the following fields only from vmcs12 */
exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_ENABLE_INVPCID |
- SECONDARY_EXEC_RDTSCP |
+ SECONDARY_EXEC_ENABLE_RDTSCP |
SECONDARY_EXEC_XSAVES |
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
@@ -2314,6 +2315,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
vmcs_write16(GUEST_INTR_STATUS,
vmcs12->guest_intr_status);
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
+ exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+
secondary_exec_controls_set(vmx, exec_control);
}
@@ -2408,6 +2412,8 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
+
+ vmx->segment_cache.bitmask = 0;
}
if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
@@ -2571,7 +2577,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* which means L1 attempted VMEntry to L2 with invalid state.
* Fail the VMEntry.
*/
- if (vmx->emulation_required) {
+ if (CC(!vmx_guest_state_valid(vcpu))) {
*entry_failure_code = ENTRY_FAIL_DEFAULT;
return -EINVAL;
}
@@ -3344,8 +3350,10 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
prepare_vmcs02_early(vmx, vmcs12);
if (from_vmentry) {
- if (unlikely(!nested_get_vmcs12_pages(vcpu)))
+ if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
+ }
if (nested_vmx_check_vmentry_hw(vcpu)) {
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
@@ -3387,7 +3395,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
* to nested_get_vmcs12_pages before the next VM-entry. The MSRs
* have already been set at vmentry time and should not be reset.
*/
- kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
+ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
}
/*
@@ -3468,11 +3476,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
if (evmptrld_status == EVMPTRLD_ERROR) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
- } else if (evmptrld_status == EVMPTRLD_VMFAIL) {
+ } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) {
return nested_vmx_failInvalid(vcpu);
}
- if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
+ if (CC(!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull))
return nested_vmx_failInvalid(vcpu);
vmcs12 = get_vmcs12(vcpu);
@@ -3483,7 +3491,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
* rather than RFLAGS.ZF, and no error number is stored to the
* VM-instruction error field.
*/
- if (vmcs12->hdr.shadow_vmcs)
+ if (CC(vmcs12->hdr.shadow_vmcs))
return nested_vmx_failInvalid(vcpu);
if (vmx->nested.hv_evmcs) {
@@ -3504,10 +3512,10 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
* for misconfigurations which will anyway be caught by the processor
* when using the merged vmcs02.
*/
- if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
+ if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
- if (vmcs12->launch_state == launch)
+ if (CC(vmcs12->launch_state == launch))
return nested_vmx_fail(vcpu,
launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
: VMXERR_VMRESUME_NONLAUNCHED_VMCS);
@@ -3528,6 +3536,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
if (unlikely(status != NVMX_VMENTRY_SUCCESS))
goto vmentry_failed;
+ /* Emulate processing of posted interrupts on VM-Enter. */
+ if (nested_cpu_has_posted_intr(vmcs12) &&
+ kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) {
+ vmx->nested.pi_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv);
+ }
+
/* Hide L1D cache contents from the nested guest. */
vmx->vcpu.arch.l1tf_flush_l1d = true;
@@ -4257,7 +4273,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
{
- struct shared_msr_entry *efer_msr;
+ struct vmx_uret_msr *efer_msr;
unsigned int i;
if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
@@ -4271,7 +4287,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
return vmx->msr_autoload.guest.val[i].value;
}
- efer_msr = find_msr_entry(vmx, MSR_EFER);
+ efer_msr = vmx_find_uret_msr(vmx, MSR_EFER);
if (efer_msr)
return efer_msr->data;
@@ -4404,6 +4420,14 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
kvm_vcpu_flush_tlb_current(vcpu);
+ /*
+ * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
+ * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are
+ * up-to-date before switching to L1.
+ */
+ if (enable_ept && is_pae_paging(vcpu))
+ vmx_ept_load_pdptrs(vcpu);
+
leave_guest_mode(vcpu);
if (nested_cpu_has_preemption_timer(vmcs12))
@@ -4668,7 +4692,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
vmx->nested.msrs.entry_ctls_high &=
~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
vmx->nested.msrs.exit_ctls_high &=
- ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
}
}
@@ -4688,7 +4712,7 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,
r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e);
if (r != X86EMUL_CONTINUE) {
- *ret = vmx_handle_memory_failure(vcpu, r, &e);
+ *ret = kvm_handle_memory_failure(vcpu, r, &e);
return -EINVAL;
}
@@ -4752,7 +4776,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
if (vmx_pt_mode_is_host_guest()) {
vmx->pt_desc.guest.ctl = 0;
- pt_update_intercept_for_msr(vmx);
+ pt_update_intercept_for_msr(vcpu);
}
return 0;
@@ -4995,7 +5019,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
/* _system ok, nested_vmx_check_permission has verified cpl=0 */
r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e);
if (r != X86EMUL_CONTINUE)
- return vmx_handle_memory_failure(vcpu, r, &e);
+ return kvm_handle_memory_failure(vcpu, r, &e);
}
return nested_vmx_succeed(vcpu);
@@ -5068,7 +5092,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
return 1;
r = kvm_read_guest_virt(vcpu, gva, &value, len, &e);
if (r != X86EMUL_CONTINUE)
- return vmx_handle_memory_failure(vcpu, r, &e);
+ return kvm_handle_memory_failure(vcpu, r, &e);
}
field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf));
@@ -5230,7 +5254,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr,
sizeof(gpa_t), &e);
if (r != X86EMUL_CONTINUE)
- return vmx_handle_memory_failure(vcpu, r, &e);
+ return kvm_handle_memory_failure(vcpu, r, &e);
return nested_vmx_succeed(vcpu);
}
@@ -5283,7 +5307,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
return 1;
r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
if (r != X86EMUL_CONTINUE)
- return vmx_handle_memory_failure(vcpu, r, &e);
+ return kvm_handle_memory_failure(vcpu, r, &e);
/*
* Nested EPT roots are always held through guest_mmu,
@@ -5365,7 +5389,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
return 1;
r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
if (r != X86EMUL_CONTINUE)
- return vmx_handle_memory_failure(vcpu, r, &e);
+ return kvm_handle_memory_failure(vcpu, r, &e);
if (operand.vpid >> 16)
return nested_vmx_fail(vcpu,
@@ -5910,13 +5934,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
goto reflect_vmexit;
}
- exit_intr_info = vmx_get_intr_info(vcpu);
- exit_qual = vmx_get_exit_qual(vcpu);
-
- trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, exit_qual,
- vmx->idt_vectoring_info, exit_intr_info,
- vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
- KVM_ISA_VMX);
+ trace_kvm_nested_vmexit(exit_reason, vcpu, KVM_ISA_VMX);
/* If L0 (KVM) wants the exit, it trumps L1's desires. */
if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
@@ -5932,14 +5950,14 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
* need to be synthesized by querying the in-kernel LAPIC, but external
* interrupts are never reflected to L1 so it's a non-issue.
*/
- if ((exit_intr_info &
- (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
- (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
+ exit_intr_info = vmx_get_intr_info(vcpu);
+ if (is_exception_with_error_code(exit_intr_info)) {
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
vmcs12->vm_exit_intr_error_code =
vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
}
+ exit_qual = vmx_get_exit_qual(vcpu);
reflect_vmexit:
nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual);
@@ -6174,7 +6192,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
* restored yet. EVMCS will be mapped from
* nested_get_vmcs12_pages().
*/
- kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
+ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
} else {
return -EINVAL;
}
@@ -6310,7 +6328,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
#ifdef CONFIG_X86_64
VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
- VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+ VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
+ VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
msrs->exit_ctls_high |=
VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
@@ -6329,7 +6348,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
#ifdef CONFIG_X86_64
VM_ENTRY_IA32E_MODE |
#endif
- VM_ENTRY_LOAD_IA32_PAT;
+ VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
msrs->entry_ctls_high |=
(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
@@ -6383,7 +6403,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
msrs->secondary_ctls_low = 0;
msrs->secondary_ctls_high &=
SECONDARY_EXEC_DESC |
- SECONDARY_EXEC_RDTSCP |
+ SECONDARY_EXEC_ENABLE_RDTSCP |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
@@ -6553,7 +6573,7 @@ struct kvm_x86_nested_ops vmx_nested_ops = {
.hv_timer_pending = nested_vmx_preemption_timer_pending,
.get_state = vmx_get_nested_state,
.set_state = vmx_set_nested_state,
- .get_vmcs12_pages = nested_get_vmcs12_pages,
+ .get_nested_state_pages = nested_get_vmcs12_pages,
.write_log_dirty = nested_vmx_write_pml_buffer,
.enable_evmcs = nested_enable_evmcs,
.get_evmcs_version = nested_get_evmcs_version,
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
new file mode 100644
index 0000000..e4e7adf
--- /dev/null
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kvm_host.h>
+
+#include <asm/irq_remapping.h>
+#include <asm/cpu.h>
+
+#include "lapic.h"
+#include "posted_intr.h"
+#include "trace.h"
+#include "vmx.h"
+
+/*
+ * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
+ * can find which vCPU should be waken up.
+ */
+static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
+static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
+
+static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
+{
+ return &(to_vmx(vcpu)->pi_desc);
+}
+
+void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old, new;
+ unsigned int dest;
+
+ /*
+ * In case of hot-plug or hot-unplug, we may have to undo
+ * vmx_vcpu_pi_put even if there is no assigned device. And we
+ * always keep PI.NDST up to date for simplicity: it makes the
+ * code easier, and CPU migration is not a fast path.
+ */
+ if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
+ return;
+
+ /*
+ * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
+ * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
+ * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
+ * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
+ * correctly.
+ */
+ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
+ pi_clear_sn(pi_desc);
+ goto after_clear_sn;
+ }
+
+ /* The full case. */
+ do {
+ old.control = new.control = pi_desc->control;
+
+ dest = cpu_physical_id(cpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ new.sn = 0;
+ } while (cmpxchg64(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+after_clear_sn:
+
+ /*
+ * Clear SN before reading the bitmap. The VT-d firmware
+ * writes the bitmap and reads SN atomically (5.2.3 in the
+ * spec), so it doesn't really have a memory barrier that
+ * pairs with this, but we cannot do that and we need one.
+ */
+ smp_mb__after_atomic();
+
+ if (!pi_is_pir_empty(pi_desc))
+ pi_set_on(pi_desc);
+}
+
+void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP) ||
+ !kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ /* Set SN when the vCPU is preempted */
+ if (vcpu->preempted)
+ pi_set_sn(pi_desc);
+}
+
+static void __pi_post_block(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old, new;
+ unsigned int dest;
+
+ do {
+ old.control = new.control = pi_desc->control;
+ WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
+ "Wakeup handler not enabled while the VCPU is blocked\n");
+
+ dest = cpu_physical_id(vcpu->cpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ /* set 'NV' to 'notification vector' */
+ new.nv = POSTED_INTR_VECTOR;
+ } while (cmpxchg64(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+ if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
+ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ list_del(&vcpu->blocked_vcpu_list);
+ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ vcpu->pre_pcpu = -1;
+ }
+}
+
+/*
+ * This routine does the following things for vCPU which is going
+ * to be blocked if VT-d PI is enabled.
+ * - Store the vCPU to the wakeup list, so when interrupts happen
+ * we can find the right vCPU to wake up.
+ * - Change the Posted-interrupt descriptor as below:
+ * 'NDST' <-- vcpu->pre_pcpu
+ * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
+ * - If 'ON' is set during this process, which means at least one
+ * interrupt is posted for this vCPU, we cannot block it, in
+ * this case, return 1, otherwise, return 0.
+ *
+ */
+int pi_pre_block(struct kvm_vcpu *vcpu)
+{
+ unsigned int dest;
+ struct pi_desc old, new;
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP) ||
+ !kvm_vcpu_apicv_active(vcpu))
+ return 0;
+
+ WARN_ON(irqs_disabled());
+ local_irq_disable();
+ if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
+ vcpu->pre_pcpu = vcpu->cpu;
+ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ list_add_tail(&vcpu->blocked_vcpu_list,
+ &per_cpu(blocked_vcpu_on_cpu,
+ vcpu->pre_pcpu));
+ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ }
+
+ do {
+ old.control = new.control = pi_desc->control;
+
+ WARN((pi_desc->sn == 1),
+ "Warning: SN field of posted-interrupts "
+ "is set before blocking\n");
+
+ /*
+ * Since vCPU can be preempted during this process,
+ * vcpu->cpu could be different with pre_pcpu, we
+ * need to set pre_pcpu as the destination of wakeup
+ * notification event, then we can find the right vCPU
+ * to wakeup in wakeup handler if interrupts happen
+ * when the vCPU is in blocked state.
+ */
+ dest = cpu_physical_id(vcpu->pre_pcpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ /* set 'NV' to 'wakeup vector' */
+ new.nv = POSTED_INTR_WAKEUP_VECTOR;
+ } while (cmpxchg64(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+ /* We should not block the vCPU if an interrupt is posted for it. */
+ if (pi_test_on(pi_desc) == 1)
+ __pi_post_block(vcpu);
+
+ local_irq_enable();
+ return (vcpu->pre_pcpu == -1);
+}
+
+void pi_post_block(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->pre_pcpu == -1)
+ return;
+
+ WARN_ON(irqs_disabled());
+ local_irq_disable();
+ __pi_post_block(vcpu);
+ local_irq_enable();
+}
+
+/*
+ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
+ */
+void pi_wakeup_handler(void)
+{
+ struct kvm_vcpu *vcpu;
+ int cpu = smp_processor_id();
+
+ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
+ blocked_vcpu_list) {
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (pi_test_on(pi_desc) == 1)
+ kvm_vcpu_kick(vcpu);
+ }
+ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+}
+
+void __init pi_init(int cpu)
+{
+ INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
+ spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+}
+
+bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ return pi_test_on(pi_desc) ||
+ (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
+}
+
+
+/*
+ * pi_update_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
+ bool set)
+{
+ struct kvm_kernel_irq_routing_entry *e;
+ struct kvm_irq_routing_table *irq_rt;
+ struct kvm_lapic_irq irq;
+ struct kvm_vcpu *vcpu;
+ struct vcpu_data vcpu_info;
+ int idx, ret = 0;
+
+ if (!kvm_arch_has_assigned_device(kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP) ||
+ !kvm_vcpu_apicv_active(kvm->vcpus[0]))
+ return 0;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ if (guest_irq >= irq_rt->nr_rt_entries ||
+ hlist_empty(&irq_rt->map[guest_irq])) {
+ pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
+ guest_irq, irq_rt->nr_rt_entries);
+ goto out;
+ }
+
+ hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+ if (e->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+ /*
+ * VT-d PI cannot support posting multicast/broadcast
+ * interrupts to a vCPU, we still use interrupt remapping
+ * for these kind of interrupts.
+ *
+ * For lowest-priority interrupts, we only support
+ * those with single CPU as the destination, e.g. user
+ * configures the interrupts via /proc/irq or uses
+ * irqbalance to make the interrupts single-CPU.
+ *
+ * We will support full lowest-priority interrupt later.
+ *
+ * In addition, we can only inject generic interrupts using
+ * the PI mechanism, refuse to route others through it.
+ */
+
+ kvm_set_msi_irq(kvm, e, &irq);
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
+ !kvm_irq_is_postable(&irq)) {
+ /*
+ * Make sure the IRTE is in remapped mode if
+ * we don't handle it in posted mode.
+ */
+ ret = irq_set_vcpu_affinity(host_irq, NULL);
+ if (ret < 0) {
+ printk(KERN_INFO
+ "failed to back to remapped mode, irq: %u\n",
+ host_irq);
+ goto out;
+ }
+
+ continue;
+ }
+
+ vcpu_info.pi_desc_addr = __pa(&to_vmx(vcpu)->pi_desc);
+ vcpu_info.vector = irq.vector;
+
+ trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
+ vcpu_info.vector, vcpu_info.pi_desc_addr, set);
+
+ if (set)
+ ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
+ else
+ ret = irq_set_vcpu_affinity(host_irq, NULL);
+
+ if (ret < 0) {
+ printk(KERN_INFO "%s: failed to update PI IRTE\n",
+ __func__);
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+ return ret;
+}
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
new file mode 100644
index 0000000..e53b97f
--- /dev/null
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_POSTED_INTR_H
+#define __KVM_X86_VMX_POSTED_INTR_H
+
+#define POSTED_INTR_ON 0
+#define POSTED_INTR_SN 1
+
+/* Posted-Interrupt Descriptor */
+struct pi_desc {
+ u32 pir[8]; /* Posted interrupt requested */
+ union {
+ struct {
+ /* bit 256 - Outstanding Notification */
+ u16 on : 1,
+ /* bit 257 - Suppress Notification */
+ sn : 1,
+ /* bit 271:258 - Reserved */
+ rsvd_1 : 14;
+ /* bit 279:272 - Notification Vector */
+ u8 nv;
+ /* bit 287:280 - Reserved */
+ u8 rsvd_2;
+ /* bit 319:288 - Notification Destination */
+ u32 ndst;
+ };
+ u64 control;
+ };
+ u32 rsvd[6];
+} __aligned(64);
+
+static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
+{
+ return test_and_set_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
+{
+ return test_and_clear_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
+{
+ return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
+}
+
+static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
+{
+ return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
+}
+
+static inline void pi_set_sn(struct pi_desc *pi_desc)
+{
+ set_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_set_on(struct pi_desc *pi_desc)
+{
+ set_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_clear_on(struct pi_desc *pi_desc)
+{
+ clear_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_clear_sn(struct pi_desc *pi_desc)
+{
+ clear_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_on(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_sn(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
+void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
+int pi_pre_block(struct kvm_vcpu *vcpu);
+void pi_post_block(struct kvm_vcpu *vcpu);
+void pi_wakeup_handler(void);
+void __init pi_init(int cpu);
+bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
+int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
+ bool set);
+
+#endif /* __KVM_X86_VMX_POSTED_INTR_H */
\ No newline at end of file
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 7a3675f..1472c6c 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -138,6 +138,13 @@ static inline bool is_external_intr(u32 intr_info)
return is_intr_type(intr_info, INTR_TYPE_EXT_INTR);
}
+static inline bool is_exception_with_error_code(u32 intr_info)
+{
+ const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK;
+
+ return (intr_info & mask) == mask;
+}
+
enum vmcs_field_width {
VMCS_FIELD_WIDTH_U16 = 0,
VMCS_FIELD_WIDTH_U64 = 1,
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 799db08..90ad7a6 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -4,6 +4,7 @@
#include <asm/bitsperlong.h>
#include <asm/kvm_vcpu_regs.h>
#include <asm/nospec-branch.h>
+#include <asm/segment.h>
#define WORD_SIZE (BITS_PER_LONG / 8)
@@ -294,3 +295,36 @@
ret
SYM_FUNC_END(vmread_error_trampoline)
+
+SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
+ /*
+ * Unconditionally create a stack frame, getting the correct RSP on the
+ * stack (for x86-64) would take two instructions anyways, and RBP can
+ * be used to restore RSP to make objtool happy (see below).
+ */
+ push %_ASM_BP
+ mov %_ASM_SP, %_ASM_BP
+
+#ifdef CONFIG_X86_64
+ /*
+ * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
+ * creating the synthetic interrupt stack frame for the IRQ/NMI.
+ */
+ and $-16, %rsp
+ push $__KERNEL_DS
+ push %rbp
+#endif
+ pushf
+ push $__KERNEL_CS
+ CALL_NOSPEC _ASM_ARG1
+
+ /*
+ * "Restore" RSP from RBP, even though IRET has already unwound RSP to
+ * the correct value. objtool doesn't know the callee will IRET and,
+ * without the explicit restore, thinks the stack is getting walloped.
+ * Using an unwind hint is problematic due to x86-64's dynamic alignment.
+ */
+ mov %_ASM_BP, %_ASM_SP
+ pop %_ASM_BP
+ ret
+SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 819c185..4797ec9 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -56,7 +56,6 @@
#include "lapic.h"
#include "mmu.h"
#include "nested.h"
-#include "ops.h"
#include "pmu.h"
#include "trace.h"
#include "vmcs.h"
@@ -146,8 +145,25 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
RTIT_STATUS_BYTECNT))
-#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \
- (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f)
+/*
+ * List of MSRs that can be directly passed to the guest.
+ * In addition to these x2apic and PT MSRs are handled specially.
+ */
+static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
+ MSR_IA32_SPEC_CTRL,
+ MSR_IA32_PRED_CMD,
+ MSR_IA32_TSC,
+ MSR_FS_BASE,
+ MSR_GS_BASE,
+ MSR_KERNEL_GS_BASE,
+ MSR_IA32_SYSENTER_CS,
+ MSR_IA32_SYSENTER_ESP,
+ MSR_IA32_SYSENTER_EIP,
+ MSR_CORE_C1_RES,
+ MSR_CORE_C3_RESIDENCY,
+ MSR_CORE_C6_RESIDENCY,
+ MSR_CORE_C7_RESIDENCY,
+};
/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
@@ -341,9 +357,8 @@ static const struct kernel_param_ops vmentry_l1d_flush_ops = {
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
-static bool guest_state_valid(struct kvm_vcpu *vcpu);
static u32 vmx_segment_access_rights(struct kvm_segment *var);
-static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
u32 msr, int type);
void vmx_vmexit(void);
@@ -398,13 +413,6 @@ DEFINE_PER_CPU(struct vmcs *, current_vmcs);
*/
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
-/*
- * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
- * can find which vCPU should be waken up.
- */
-static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
-static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
-
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
@@ -447,9 +455,9 @@ static unsigned long host_idt_base;
* will emulate SYSCALL in legacy mode if the vendor string in guest
* CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
* support this emulation, IA32_STAR must always be included in
- * vmx_msr_index[], even in i386 builds.
+ * vmx_uret_msrs_list[], even in i386 builds.
*/
-const u32 vmx_msr_index[] = {
+static const u32 vmx_uret_msrs_list[] = {
#ifdef CONFIG_X86_64
MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
@@ -623,36 +631,71 @@ static inline bool report_flexpriority(void)
return flexpriority_enabled;
}
-static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
+static int possible_passthrough_msr_slot(u32 msr)
+{
+ u32 i;
+
+ for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++)
+ if (vmx_possible_passthrough_msrs[i] == msr)
+ return i;
+
+ return -ENOENT;
+}
+
+static bool is_valid_passthrough_msr(u32 msr)
+{
+ bool r;
+
+ switch (msr) {
+ case 0x800 ... 0x8ff:
+ /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
+ return true;
+ case MSR_IA32_RTIT_STATUS:
+ case MSR_IA32_RTIT_OUTPUT_BASE:
+ case MSR_IA32_RTIT_OUTPUT_MASK:
+ case MSR_IA32_RTIT_CR3_MATCH:
+ case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
+ /* PT MSRs. These are handled in pt_update_intercept_for_msr() */
+ return true;
+ }
+
+ r = possible_passthrough_msr_slot(msr) != -ENOENT;
+
+ WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
+
+ return r;
+}
+
+static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
int i;
- for (i = 0; i < vmx->nmsrs; ++i)
- if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
+ for (i = 0; i < vmx->nr_uret_msrs; ++i)
+ if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr)
return i;
return -1;
}
-struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
+struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
int i;
- i = __find_msr_index(vmx, msr);
+ i = __vmx_find_uret_msr(vmx, msr);
if (i >= 0)
- return &vmx->guest_msrs[i];
+ return &vmx->guest_uret_msrs[i];
return NULL;
}
-static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr, u64 data)
+static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
+ struct vmx_uret_msr *msr, u64 data)
{
int ret = 0;
u64 old_msr_data = msr->data;
msr->data = data;
- if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
+ if (msr - vmx->guest_uret_msrs < vmx->nr_active_uret_msrs) {
preempt_disable();
- ret = kvm_set_shared_msr(msr->index, msr->data,
- msr->mask);
+ ret = kvm_set_user_return_msr(msr->slot, msr->data, msr->mask);
preempt_enable();
if (ret)
msr->data = old_msr_data;
@@ -825,7 +868,7 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
vm_exit_controls_clearbit(vmx, exit);
}
-int vmx_find_msr_index(struct vmx_msrs *m, u32 msr)
+int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
{
unsigned int i;
@@ -859,7 +902,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
}
break;
}
- i = vmx_find_msr_index(&m->guest, msr);
+ i = vmx_find_loadstore_msr_slot(&m->guest, msr);
if (i < 0)
goto skip_guest;
--m->guest.nr;
@@ -867,7 +910,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
skip_guest:
- i = vmx_find_msr_index(&m->host, msr);
+ i = vmx_find_loadstore_msr_slot(&m->host, msr);
if (i < 0)
return;
@@ -926,12 +969,12 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
}
- i = vmx_find_msr_index(&m->guest, msr);
+ i = vmx_find_loadstore_msr_slot(&m->guest, msr);
if (!entry_only)
- j = vmx_find_msr_index(&m->host, msr);
+ j = vmx_find_loadstore_msr_slot(&m->host, msr);
- if ((i < 0 && m->guest.nr == NR_LOADSTORE_MSRS) ||
- (j < 0 && m->host.nr == NR_LOADSTORE_MSRS)) {
+ if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
+ (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
printk_once(KERN_WARNING "Not enough msr switch entries. "
"Can't add msr %x\n", msr);
return;
@@ -954,10 +997,11 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
m->host.val[j].value = host_val;
}
-static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
+static bool update_transition_efer(struct vcpu_vmx *vmx)
{
u64 guest_efer = vmx->vcpu.arch.efer;
u64 ignore_bits = 0;
+ int i;
/* Shadow paging assumes NX to be available. */
if (!enable_ept)
@@ -989,17 +1033,21 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
else
clear_atomic_switch_msr(vmx, MSR_EFER);
return false;
- } else {
- clear_atomic_switch_msr(vmx, MSR_EFER);
-
- guest_efer &= ~ignore_bits;
- guest_efer |= host_efer & ignore_bits;
-
- vmx->guest_msrs[efer_offset].data = guest_efer;
- vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
-
- return true;
}
+
+ i = __vmx_find_uret_msr(vmx, MSR_EFER);
+ if (i < 0)
+ return false;
+
+ clear_atomic_switch_msr(vmx, MSR_EFER);
+
+ guest_efer &= ~ignore_bits;
+ guest_efer |= host_efer & ignore_bits;
+
+ vmx->guest_uret_msrs[i].data = guest_efer;
+ vmx->guest_uret_msrs[i].mask = ~ignore_bits;
+
+ return true;
}
#ifdef CONFIG_X86_32
@@ -1037,6 +1085,12 @@ static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
!(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
}
+static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
+{
+ /* The base must be 128-byte aligned and a legal physical address. */
+ return !kvm_vcpu_is_illegal_gpa(vcpu, base) && !(base & 0x7f);
+}
+
static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
{
u32 i;
@@ -1141,12 +1195,12 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
* when guest state is loaded. This happens when guest transitions
* to/from long-mode by setting MSR_EFER.LMA.
*/
- if (!vmx->guest_msrs_ready) {
- vmx->guest_msrs_ready = true;
- for (i = 0; i < vmx->save_nmsrs; ++i)
- kvm_set_shared_msr(vmx->guest_msrs[i].index,
- vmx->guest_msrs[i].data,
- vmx->guest_msrs[i].mask);
+ if (!vmx->guest_uret_msrs_loaded) {
+ vmx->guest_uret_msrs_loaded = true;
+ for (i = 0; i < vmx->nr_active_uret_msrs; ++i)
+ kvm_set_user_return_msr(vmx->guest_uret_msrs[i].slot,
+ vmx->guest_uret_msrs[i].data,
+ vmx->guest_uret_msrs[i].mask);
}
@@ -1230,7 +1284,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
#endif
load_fixmap_gdt(raw_smp_processor_id());
vmx->guest_state_loaded = false;
- vmx->guest_msrs_ready = false;
+ vmx->guest_uret_msrs_loaded = false;
}
#ifdef CONFIG_X86_64
@@ -1253,62 +1307,6 @@ static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
}
#endif
-static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
-{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- struct pi_desc old, new;
- unsigned int dest;
-
- /*
- * In case of hot-plug or hot-unplug, we may have to undo
- * vmx_vcpu_pi_put even if there is no assigned device. And we
- * always keep PI.NDST up to date for simplicity: it makes the
- * code easier, and CPU migration is not a fast path.
- */
- if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
- return;
-
- /*
- * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
- * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
- * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
- * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
- * correctly.
- */
- if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
- pi_clear_sn(pi_desc);
- goto after_clear_sn;
- }
-
- /* The full case. */
- do {
- old.control = new.control = pi_desc->control;
-
- dest = cpu_physical_id(cpu);
-
- if (x2apic_enabled())
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
-
- new.sn = 0;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
-
-after_clear_sn:
-
- /*
- * Clear SN before reading the bitmap. The VT-d firmware
- * writes the bitmap and reads SN atomically (5.2.3 in the
- * spec), so it doesn't really have a memory barrier that
- * pairs with this, but we cannot do that and we need one.
- */
- smp_mb__after_atomic();
-
- if (!pi_is_pir_empty(pi_desc))
- pi_set_on(pi_desc);
-}
-
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
struct loaded_vmcs *buddy)
{
@@ -1392,20 +1390,6 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
vmx->host_debugctlmsr = get_debugctlmsr();
}
-static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
-{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
- return;
-
- /* Set SN when the vCPU is preempted */
- if (vcpu->preempted)
- pi_set_sn(pi_desc);
-}
-
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
vmx_vcpu_pi_put(vcpu);
@@ -1415,7 +1399,7 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
static bool emulation_required(struct kvm_vcpu *vcpu)
{
- return emulate_invalid_guest_state && !guest_state_valid(vcpu);
+ return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
}
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
@@ -1441,7 +1425,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long old_rflags;
- if (enable_unrestricted_guest) {
+ if (is_unrestricted_guest(vcpu)) {
kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
vmx->rflags = rflags;
vmcs_writel(GUEST_RFLAGS, rflags);
@@ -1561,6 +1545,11 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
return 0;
}
+static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
+{
+ return true;
+}
+
static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
unsigned long rip, orig_rip;
@@ -1599,33 +1588,6 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
}
/*
- * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
- * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value
- * indicates whether exit to userspace is needed.
- */
-int vmx_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
- struct x86_exception *e)
-{
- if (r == X86EMUL_PROPAGATE_FAULT) {
- kvm_inject_emulated_page_fault(vcpu, e);
- return 1;
- }
-
- /*
- * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
- * while handling a VMX instruction KVM could've handled the request
- * correctly by exiting to userspace and performing I/O but there
- * doesn't seem to be a real use-case behind such requests, just return
- * KVM_EXIT_INTERNAL_ERROR for now.
- */
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
-
- return 0;
-}
-
-/*
* Recognizes a pending MTF VM-exit and records the nested state for later
* delivery.
*/
@@ -1708,16 +1670,19 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
vmx_clear_hlt(vcpu);
}
-/*
- * Swap MSR entry in host/guest MSR entry array.
- */
-static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
+static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr)
{
- struct shared_msr_entry tmp;
+ struct vmx_uret_msr tmp;
+ int from, to;
- tmp = vmx->guest_msrs[to];
- vmx->guest_msrs[to] = vmx->guest_msrs[from];
- vmx->guest_msrs[from] = tmp;
+ from = __vmx_find_uret_msr(vmx, msr);
+ if (from < 0)
+ return;
+ to = vmx->nr_active_uret_msrs++;
+
+ tmp = vmx->guest_uret_msrs[to];
+ vmx->guest_uret_msrs[to] = vmx->guest_uret_msrs[from];
+ vmx->guest_uret_msrs[from] = tmp;
}
/*
@@ -1727,38 +1692,26 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
*/
static void setup_msrs(struct vcpu_vmx *vmx)
{
- int save_nmsrs, index;
-
- save_nmsrs = 0;
+ vmx->guest_uret_msrs_loaded = false;
+ vmx->nr_active_uret_msrs = 0;
#ifdef CONFIG_X86_64
/*
* The SYSCALL MSRs are only needed on long mode guests, and only
* when EFER.SCE is set.
*/
if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
- index = __find_msr_index(vmx, MSR_STAR);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_LSTAR);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
+ vmx_setup_uret_msr(vmx, MSR_STAR);
+ vmx_setup_uret_msr(vmx, MSR_LSTAR);
+ vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK);
}
#endif
- index = __find_msr_index(vmx, MSR_EFER);
- if (index >= 0 && update_transition_efer(vmx, index))
- move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_TSC_AUX);
- if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
- move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_IA32_TSX_CTRL);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
+ if (update_transition_efer(vmx))
+ vmx_setup_uret_msr(vmx, MSR_EFER);
- vmx->save_nmsrs = save_nmsrs;
- vmx->guest_msrs_ready = false;
+ if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
+ vmx_setup_uret_msr(vmx, MSR_TSC_AUX);
+
+ vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL);
if (cpu_has_vmx_msr_bitmap())
vmx_update_msr_bitmap(&vmx->vcpu);
@@ -1828,7 +1781,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct shared_msr_entry *msr;
+ struct vmx_uret_msr *msr;
u32 index;
switch (msr_info->index) {
@@ -1849,7 +1802,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!msr_info->host_initiated &&
!(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
return 1;
- goto find_shared_msr;
+ goto find_uret_msr;
case MSR_IA32_UMWAIT_CONTROL:
if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
return 1;
@@ -1956,10 +1909,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
- goto find_shared_msr;
+ goto find_uret_msr;
default:
- find_shared_msr:
- msr = find_msr_entry(vmx, msr_info->index);
+ find_uret_msr:
+ msr = vmx_find_uret_msr(vmx, msr_info->index);
if (msr) {
msr_info->data = msr->data;
break;
@@ -1988,7 +1941,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct shared_msr_entry *msr;
+ struct vmx_uret_msr *msr;
int ret = 0;
u32 msr_index = msr_info->index;
u64 data = msr_info->data;
@@ -2082,7 +2035,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
* in the merging. We update the vmcs01 here for L1 as well
* since it will end up touching the MSR anyway now.
*/
- vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
+ vmx_disable_intercept_for_msr(vcpu,
MSR_IA32_SPEC_CTRL,
MSR_TYPE_RW);
break;
@@ -2092,7 +2045,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
return 1;
- goto find_shared_msr;
+ goto find_uret_msr;
case MSR_IA32_PRED_CMD:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
@@ -2118,8 +2071,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
* vmcs02.msr_bitmap here since it gets completely overwritten
* in the merging.
*/
- vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
- MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W);
break;
case MSR_IA32_CR_PAT:
if (!kvm_pat_valid(data))
@@ -2169,7 +2121,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
vmcs_write64(GUEST_IA32_RTIT_CTL, data);
vmx->pt_desc.guest.ctl = data;
- pt_update_intercept_for_msr(vmx);
+ pt_update_intercept_for_msr(vcpu);
break;
case MSR_IA32_RTIT_STATUS:
if (!pt_can_write_msr(vmx))
@@ -2194,7 +2146,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_single_range_output))
return 1;
- if (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK)
+ if (!pt_output_base_valid(vcpu, data))
return 1;
vmx->pt_desc.guest.output_base = data;
break;
@@ -2229,13 +2181,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
/* Check reserved bit, higher 32 bits should be zero */
if ((data >> 32) != 0)
return 1;
- goto find_shared_msr;
+ goto find_uret_msr;
default:
- find_shared_msr:
- msr = find_msr_entry(vmx, msr_index);
+ find_uret_msr:
+ msr = vmx_find_uret_msr(vmx, msr_index);
if (msr)
- ret = vmx_set_guest_msr(vmx, msr, data);
+ ret = vmx_set_guest_uret_msr(vmx, msr, data);
else
ret = kvm_set_msr_common(vcpu, msr_info);
}
@@ -2267,7 +2219,8 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
break;
case VCPU_EXREG_CR3:
- if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
+ if (is_unrestricted_guest(vcpu) ||
+ (enable_ept && is_paging(vcpu)))
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
break;
case VCPU_EXREG_CR4:
@@ -2448,7 +2401,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
SECONDARY_EXEC_UNRESTRICTED_GUEST |
SECONDARY_EXEC_PAUSE_LOOP_EXITING |
SECONDARY_EXEC_DESC |
- SECONDARY_EXEC_RDTSCP |
+ SECONDARY_EXEC_ENABLE_RDTSCP |
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
@@ -2865,7 +2818,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
+ struct vmx_uret_msr *msr = vmx_find_uret_msr(vmx, MSR_EFER);
if (!msr)
return;
@@ -2971,7 +2924,7 @@ static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
vpid_sync_context(to_vmx(vcpu)->vpid);
}
-static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
+void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
@@ -3033,7 +2986,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
unsigned long hw_cr0;
hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
- if (enable_unrestricted_guest)
+ if (is_unrestricted_guest(vcpu))
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
else {
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
@@ -3054,7 +3007,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
}
#endif
- if (enable_ept && !enable_unrestricted_guest)
+ if (enable_ept && !is_unrestricted_guest(vcpu))
ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
vmcs_writel(CR0_READ_SHADOW, cr0);
@@ -3114,7 +3067,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
guest_cr3 = vcpu->arch.cr3;
else /* vmcs01.GUEST_CR3 is already up-to-date. */
update_guest_cr3 = false;
- ept_load_pdptrs(vcpu);
+ vmx_ept_load_pdptrs(vcpu);
} else {
guest_cr3 = pgd;
}
@@ -3134,7 +3087,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
unsigned long hw_cr4;
hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
- if (enable_unrestricted_guest)
+ if (is_unrestricted_guest(vcpu))
hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
else if (vmx->rmode.vm86_active)
hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
@@ -3169,7 +3122,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
vcpu->arch.cr4 = cr4;
kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
- if (!enable_unrestricted_guest) {
+ if (!is_unrestricted_guest(vcpu)) {
if (enable_ept) {
if (!is_paging(vcpu)) {
hw_cr4 &= ~X86_CR4_PAE;
@@ -3309,7 +3262,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
* tree. Newer qemu binaries with that qemu fix would not need this
* kvm hack.
*/
- if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
+ if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
var->type |= 0x1; /* Accessed */
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
@@ -3498,11 +3451,8 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
* not.
* We assume that registers are always usable
*/
-static bool guest_state_valid(struct kvm_vcpu *vcpu)
+bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
{
- if (enable_unrestricted_guest)
- return true;
-
/* real mode guest state checks */
if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
@@ -3688,10 +3638,51 @@ void free_vpid(int vpid)
spin_unlock(&vmx_vpid_lock);
}
-static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+static void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
+{
+ int f = sizeof(unsigned long);
+
+ if (msr <= 0x1fff)
+ __clear_bit(msr, msr_bitmap + 0x000 / f);
+ else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+ __clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
+}
+
+static void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
+{
+ int f = sizeof(unsigned long);
+
+ if (msr <= 0x1fff)
+ __clear_bit(msr, msr_bitmap + 0x800 / f);
+ else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+ __clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
+}
+
+static void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
+{
+ int f = sizeof(unsigned long);
+
+ if (msr <= 0x1fff)
+ __set_bit(msr, msr_bitmap + 0x000 / f);
+ else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+ __set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
+}
+
+static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
+{
+ int f = sizeof(unsigned long);
+
+ if (msr <= 0x1fff)
+ __set_bit(msr, msr_bitmap + 0x800 / f);
+ else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+ __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
+}
+
+static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
u32 msr, int type)
{
- int f = sizeof(unsigned long);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
if (!cpu_has_vmx_msr_bitmap())
return;
@@ -3700,36 +3691,44 @@ static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bit
evmcs_touch_msr_bitmap();
/*
- * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
- * have the write-low and read-high bitmap offsets the wrong way round.
- * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
- */
- if (msr <= 0x1fff) {
- if (type & MSR_TYPE_R)
- /* read-low */
- __clear_bit(msr, msr_bitmap + 0x000 / f);
+ * Mark the desired intercept state in shadow bitmap, this is needed
+ * for resync when the MSR filters change.
+ */
+ if (is_valid_passthrough_msr(msr)) {
+ int idx = possible_passthrough_msr_slot(msr);
- if (type & MSR_TYPE_W)
- /* write-low */
- __clear_bit(msr, msr_bitmap + 0x800 / f);
-
- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
- msr &= 0x1fff;
- if (type & MSR_TYPE_R)
- /* read-high */
- __clear_bit(msr, msr_bitmap + 0x400 / f);
-
- if (type & MSR_TYPE_W)
- /* write-high */
- __clear_bit(msr, msr_bitmap + 0xc00 / f);
-
+ if (idx != -ENOENT) {
+ if (type & MSR_TYPE_R)
+ clear_bit(idx, vmx->shadow_msr_intercept.read);
+ if (type & MSR_TYPE_W)
+ clear_bit(idx, vmx->shadow_msr_intercept.write);
+ }
}
+
+ if ((type & MSR_TYPE_R) &&
+ !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
+ vmx_set_msr_bitmap_read(msr_bitmap, msr);
+ type &= ~MSR_TYPE_R;
+ }
+
+ if ((type & MSR_TYPE_W) &&
+ !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
+ vmx_set_msr_bitmap_write(msr_bitmap, msr);
+ type &= ~MSR_TYPE_W;
+ }
+
+ if (type & MSR_TYPE_R)
+ vmx_clear_msr_bitmap_read(msr_bitmap, msr);
+
+ if (type & MSR_TYPE_W)
+ vmx_clear_msr_bitmap_write(msr_bitmap, msr);
}
-static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
u32 msr, int type)
{
- int f = sizeof(unsigned long);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
if (!cpu_has_vmx_msr_bitmap())
return;
@@ -3738,39 +3737,34 @@ static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitm
evmcs_touch_msr_bitmap();
/*
- * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
- * have the write-low and read-high bitmap offsets the wrong way round.
- * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
- */
- if (msr <= 0x1fff) {
- if (type & MSR_TYPE_R)
- /* read-low */
- __set_bit(msr, msr_bitmap + 0x000 / f);
+ * Mark the desired intercept state in shadow bitmap, this is needed
+ * for resync when the MSR filter changes.
+ */
+ if (is_valid_passthrough_msr(msr)) {
+ int idx = possible_passthrough_msr_slot(msr);
- if (type & MSR_TYPE_W)
- /* write-low */
- __set_bit(msr, msr_bitmap + 0x800 / f);
-
- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
- msr &= 0x1fff;
- if (type & MSR_TYPE_R)
- /* read-high */
- __set_bit(msr, msr_bitmap + 0x400 / f);
-
- if (type & MSR_TYPE_W)
- /* write-high */
- __set_bit(msr, msr_bitmap + 0xc00 / f);
-
+ if (idx != -ENOENT) {
+ if (type & MSR_TYPE_R)
+ set_bit(idx, vmx->shadow_msr_intercept.read);
+ if (type & MSR_TYPE_W)
+ set_bit(idx, vmx->shadow_msr_intercept.write);
+ }
}
+
+ if (type & MSR_TYPE_R)
+ vmx_set_msr_bitmap_read(msr_bitmap, msr);
+
+ if (type & MSR_TYPE_W)
+ vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
-static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
- u32 msr, int type, bool value)
+static __always_inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type, bool value)
{
if (value)
- vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
+ vmx_enable_intercept_for_msr(vcpu, msr, type);
else
- vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
+ vmx_disable_intercept_for_msr(vcpu, msr, type);
}
static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
@@ -3788,15 +3782,15 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
return mode;
}
-static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
- u8 mode)
+static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode)
{
int msr;
- for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
- unsigned word = msr / BITS_PER_LONG;
- msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
- msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
+ for (msr = 0x800; msr <= 0x8ff; msr++) {
+ bool apicv = !!(mode & MSR_BITMAP_MODE_X2APIC_APICV);
+
+ vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, !apicv);
+ vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, true);
}
if (mode & MSR_BITMAP_MODE_X2APIC) {
@@ -3804,11 +3798,11 @@ static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
* TPR reads and writes can be virtualized even if virtual interrupt
* delivery is not in use.
*/
- vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
- vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
- vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
- vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
+ vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
}
}
}
@@ -3816,7 +3810,6 @@ static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
u8 mode = vmx_msr_bitmap_mode(vcpu);
u8 changed = mode ^ vmx->msr_bitmap_mode;
@@ -3824,30 +3817,24 @@ void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
return;
if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
- vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
+ vmx_update_msr_bitmap_x2apic(vcpu, mode);
vmx->msr_bitmap_mode = mode;
}
-void pt_update_intercept_for_msr(struct vcpu_vmx *vmx)
+void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
{
- unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
u32 i;
- vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS,
- MSR_TYPE_RW, flag);
- vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE,
- MSR_TYPE_RW, flag);
- vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK,
- MSR_TYPE_RW, flag);
- vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH,
- MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
for (i = 0; i < vmx->pt_desc.addr_range; i++) {
- vmx_set_intercept_for_msr(msr_bitmap,
- MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
- vmx_set_intercept_for_msr(msr_bitmap,
- MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
}
}
@@ -3871,6 +3858,29 @@ static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
return ((rvi & 0xf0) > (vppr & 0xf0));
}
+static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 i;
+
+ /*
+ * Set intercept permissions for all potentially passed through MSRs
+ * again. They will automatically get filtered through the MSR filter,
+ * so we are back in sync after this.
+ */
+ for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
+ u32 msr = vmx_possible_passthrough_msrs[i];
+ bool read = test_bit(i, vmx->shadow_msr_intercept.read);
+ bool write = test_bit(i, vmx->shadow_msr_intercept.write);
+
+ vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read);
+ vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write);
+ }
+
+ pt_update_intercept_for_msr(vcpu);
+ vmx_update_msr_bitmap_x2apic(vcpu, vmx_msr_bitmap_mode(vcpu));
+}
+
static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
bool nested)
{
@@ -4099,6 +4109,61 @@ u32 vmx_exec_control(struct vcpu_vmx *vmx)
return exec_control;
}
+/*
+ * Adjust a single secondary execution control bit to intercept/allow an
+ * instruction in the guest. This is usually done based on whether or not a
+ * feature has been exposed to the guest in order to correctly emulate faults.
+ */
+static inline void
+vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
+ u32 control, bool enabled, bool exiting)
+{
+ /*
+ * If the control is for an opt-in feature, clear the control if the
+ * feature is not exposed to the guest, i.e. not enabled. If the
+ * control is opt-out, i.e. an exiting control, clear the control if
+ * the feature _is_ exposed to the guest, i.e. exiting/interception is
+ * disabled for the associated instruction. Note, the caller is
+ * responsible presetting exec_control to set all supported bits.
+ */
+ if (enabled == exiting)
+ *exec_control &= ~control;
+
+ /*
+ * Update the nested MSR settings so that a nested VMM can/can't set
+ * controls for features that are/aren't exposed to the guest.
+ */
+ if (nested) {
+ if (enabled)
+ vmx->nested.msrs.secondary_ctls_high |= control;
+ else
+ vmx->nested.msrs.secondary_ctls_high &= ~control;
+ }
+}
+
+/*
+ * Wrapper macro for the common case of adjusting a secondary execution control
+ * based on a single guest CPUID bit, with a dedicated feature bit. This also
+ * verifies that the control is actually supported by KVM and hardware.
+ */
+#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
+({ \
+ bool __enabled; \
+ \
+ if (cpu_has_vmx_##name()) { \
+ __enabled = guest_cpuid_has(&(vmx)->vcpu, \
+ X86_FEATURE_##feat_name); \
+ vmx_adjust_secondary_exec_control(vmx, exec_control, \
+ SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \
+ } \
+})
+
+/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
+#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
+ vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)
+
+#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
+ vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
{
@@ -4139,7 +4204,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
if (!enable_pml)
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
- if (vmx_xsaves_supported()) {
+ if (cpu_has_vmx_xsaves()) {
/* Exposing XSAVES only when XSAVE is exposed */
bool xsaves_enabled =
boot_cpu_has(X86_FEATURE_XSAVE) &&
@@ -4148,101 +4213,29 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
vcpu->arch.xsaves_enabled = xsaves_enabled;
- if (!xsaves_enabled)
- exec_control &= ~SECONDARY_EXEC_XSAVES;
-
- if (nested) {
- if (xsaves_enabled)
- vmx->nested.msrs.secondary_ctls_high |=
- SECONDARY_EXEC_XSAVES;
- else
- vmx->nested.msrs.secondary_ctls_high &=
- ~SECONDARY_EXEC_XSAVES;
- }
+ vmx_adjust_secondary_exec_control(vmx, &exec_control,
+ SECONDARY_EXEC_XSAVES,
+ xsaves_enabled, false);
}
- if (cpu_has_vmx_rdtscp()) {
- bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
- if (!rdtscp_enabled)
- exec_control &= ~SECONDARY_EXEC_RDTSCP;
+ vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP);
- if (nested) {
- if (rdtscp_enabled)
- vmx->nested.msrs.secondary_ctls_high |=
- SECONDARY_EXEC_RDTSCP;
- else
- vmx->nested.msrs.secondary_ctls_high &=
- ~SECONDARY_EXEC_RDTSCP;
- }
- }
+ /*
+ * Expose INVPCID if and only if PCID is also exposed to the guest.
+ * INVPCID takes a #UD when it's disabled in the VMCS, but a #GP or #PF
+ * if CR4.PCIDE=0. Enumerating CPUID.INVPCID=1 would lead to incorrect
+ * behavior from the guest perspective (it would expect #GP or #PF).
+ */
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
+ guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
+ vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
- if (cpu_has_vmx_invpcid()) {
- /* Exposing INVPCID only when PCID is exposed */
- bool invpcid_enabled =
- guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
- guest_cpuid_has(vcpu, X86_FEATURE_PCID);
- if (!invpcid_enabled) {
- exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
- guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
- }
+ vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
+ vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
- if (nested) {
- if (invpcid_enabled)
- vmx->nested.msrs.secondary_ctls_high |=
- SECONDARY_EXEC_ENABLE_INVPCID;
- else
- vmx->nested.msrs.secondary_ctls_high &=
- ~SECONDARY_EXEC_ENABLE_INVPCID;
- }
- }
-
- if (vmx_rdrand_supported()) {
- bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
- if (rdrand_enabled)
- exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
-
- if (nested) {
- if (rdrand_enabled)
- vmx->nested.msrs.secondary_ctls_high |=
- SECONDARY_EXEC_RDRAND_EXITING;
- else
- vmx->nested.msrs.secondary_ctls_high &=
- ~SECONDARY_EXEC_RDRAND_EXITING;
- }
- }
-
- if (vmx_rdseed_supported()) {
- bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
- if (rdseed_enabled)
- exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
-
- if (nested) {
- if (rdseed_enabled)
- vmx->nested.msrs.secondary_ctls_high |=
- SECONDARY_EXEC_RDSEED_EXITING;
- else
- vmx->nested.msrs.secondary_ctls_high &=
- ~SECONDARY_EXEC_RDSEED_EXITING;
- }
- }
-
- if (vmx_waitpkg_supported()) {
- bool waitpkg_enabled =
- guest_cpuid_has(vcpu, X86_FEATURE_WAITPKG);
-
- if (!waitpkg_enabled)
- exec_control &= ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
-
- if (nested) {
- if (waitpkg_enabled)
- vmx->nested.msrs.secondary_ctls_high |=
- SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
- else
- vmx->nested.msrs.secondary_ctls_high &=
- ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
- }
- }
+ vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
+ ENABLE_USR_WAIT_PAUSE, false);
vmx->secondary_exec_control = exec_control;
}
@@ -4335,7 +4328,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
if (vmx->vpid != 0)
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
- if (vmx_xsaves_supported())
+ if (cpu_has_vmx_xsaves())
vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
if (enable_pml) {
@@ -5148,7 +5141,8 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
static int handle_invd(struct kvm_vcpu *vcpu)
{
- return kvm_emulate_instruction(vcpu, 0);
+ /* Treat an INVD instruction as a NOP and just skip it. */
+ return kvm_skip_emulated_instruction(vcpu);
}
static int handle_invlpg(struct kvm_vcpu *vcpu)
@@ -5331,7 +5325,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
* would also use advanced VM-exit information for EPT violations to
* reconstruct the page fault error code.
*/
- if (unlikely(kvm_mmu_is_illegal_gpa(vcpu, gpa)))
+ if (unlikely(kvm_vcpu_is_illegal_gpa(vcpu, gpa)))
return kvm_emulate_instruction(vcpu, 0);
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
@@ -5442,25 +5436,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
}
}
-/*
- * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
- */
-static void wakeup_handler(void)
-{
- struct kvm_vcpu *vcpu;
- int cpu = smp_processor_id();
-
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
- list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
- blocked_vcpu_list) {
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
- if (pi_test_on(pi_desc) == 1)
- kvm_vcpu_kick(vcpu);
- }
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
-}
-
static void vmx_enable_tdp(void)
{
kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
@@ -5524,16 +5499,11 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
{
u32 vmx_instruction_info;
unsigned long type;
- bool pcid_enabled;
gva_t gva;
- struct x86_exception e;
- unsigned i;
- unsigned long roots_to_free = 0;
struct {
u64 pcid;
u64 gla;
} operand;
- int r;
if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
kvm_queue_exception(vcpu, UD_VECTOR);
@@ -5556,68 +5526,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
sizeof(operand), &gva))
return 1;
- r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
- if (r != X86EMUL_CONTINUE)
- return vmx_handle_memory_failure(vcpu, r, &e);
-
- if (operand.pcid >> 12 != 0) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
-
- pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
-
- switch (type) {
- case INVPCID_TYPE_INDIV_ADDR:
- if ((!pcid_enabled && (operand.pcid != 0)) ||
- is_noncanonical_address(operand.gla, vcpu)) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
- return kvm_skip_emulated_instruction(vcpu);
-
- case INVPCID_TYPE_SINGLE_CTXT:
- if (!pcid_enabled && (operand.pcid != 0)) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
-
- if (kvm_get_active_pcid(vcpu) == operand.pcid) {
- kvm_mmu_sync_roots(vcpu);
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
- }
-
- for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
- == operand.pcid)
- roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
-
- kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
- /*
- * If neither the current cr3 nor any of the prev_roots use the
- * given PCID, then nothing needs to be done here because a
- * resync will happen anyway before switching to any other CR3.
- */
-
- return kvm_skip_emulated_instruction(vcpu);
-
- case INVPCID_TYPE_ALL_NON_GLOBAL:
- /*
- * Currently, KVM doesn't mark global entries in the shadow
- * page tables, so a non-global flush just degenerates to a
- * global flush. If needed, we could optimize this later by
- * keeping track of global entries in shadow page tables.
- */
-
- fallthrough;
- case INVPCID_TYPE_ALL_INCL_GLOBAL:
- kvm_mmu_unload(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
-
- default:
- BUG(); /* We have already checked above that type <= 3 */
- }
+ return kvm_handle_invpcid(vcpu, type, gva);
}
static int handle_pml_full(struct kvm_vcpu *vcpu)
@@ -5746,10 +5655,24 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
static const int kvm_vmx_max_exit_handlers =
ARRAY_SIZE(kvm_vmx_exit_handlers);
-static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
+ u32 *intr_info, u32 *error_code)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
*info1 = vmx_get_exit_qual(vcpu);
- *info2 = vmx_get_intr_info(vcpu);
+ if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
+ *info2 = vmx->idt_vectoring_info;
+ *intr_info = vmx_get_intr_info(vcpu);
+ if (is_exception_with_error_code(*intr_info))
+ *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ else
+ *error_code = 0;
+ } else {
+ *info2 = 0;
+ *intr_info = 0;
+ *error_code = 0;
+ }
}
static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
@@ -6054,6 +5977,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
exit_reason != EXIT_REASON_EPT_VIOLATION &&
exit_reason != EXIT_REASON_PML_FULL &&
+ exit_reason != EXIT_REASON_APIC_ACCESS &&
exit_reason != EXIT_REASON_TASK_SWITCH)) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
@@ -6382,14 +6306,6 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
return max_irr;
}
-static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
-{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
- return pi_test_on(pi_desc) ||
- (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
-}
-
static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
if (!kvm_vcpu_apicv_active(vcpu))
@@ -6409,70 +6325,43 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
}
+void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
+
+static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
+{
+ unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
+ gate_desc *desc = (gate_desc *)host_idt_base + vector;
+
+ kvm_before_interrupt(vcpu);
+ vmx_do_interrupt_nmi_irqoff(gate_offset(desc));
+ kvm_after_interrupt(vcpu);
+}
+
static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
{
u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
/* if exit due to PF check for async PF */
- if (is_page_fault(intr_info)) {
+ if (is_page_fault(intr_info))
vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
/* Handle machine checks before interrupts are enabled */
- } else if (is_machine_check(intr_info)) {
+ else if (is_machine_check(intr_info))
kvm_machine_check();
/* We need to handle NMIs before interrupts are enabled */
- } else if (is_nmi(intr_info)) {
- kvm_before_interrupt(&vmx->vcpu);
- asm("int $2");
- kvm_after_interrupt(&vmx->vcpu);
- }
+ else if (is_nmi(intr_info))
+ handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info);
}
static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
{
- unsigned int vector;
- unsigned long entry;
-#ifdef CONFIG_X86_64
- unsigned long tmp;
-#endif
- gate_desc *desc;
u32 intr_info = vmx_get_intr_info(vcpu);
if (WARN_ONCE(!is_external_intr(intr_info),
"KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
- vector = intr_info & INTR_INFO_VECTOR_MASK;
- desc = (gate_desc *)host_idt_base + vector;
- entry = gate_offset(desc);
-
- kvm_before_interrupt(vcpu);
-
- asm volatile(
-#ifdef CONFIG_X86_64
- "mov %%rsp, %[sp]\n\t"
- "and $-16, %%rsp\n\t"
- "push %[ss]\n\t"
- "push %[sp]\n\t"
-#endif
- "pushf\n\t"
- "push %[cs]\n\t"
- CALL_NOSPEC
- :
-#ifdef CONFIG_X86_64
- [sp]"=&r"(tmp),
-#endif
- ASM_CALL_CONSTRAINT
- :
- [thunk_target]"r"(entry),
-#ifdef CONFIG_X86_64
- [ss]"i"(__KERNEL_DS),
-#endif
- [cs]"i"(__KERNEL_CS)
- );
-
- kvm_after_interrupt(vcpu);
+ handle_interrupt_nmi_irqoff(vcpu, intr_info);
}
-STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
@@ -6799,9 +6688,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (enable_preemption_timer)
vmx_update_hv_timer(vcpu);
- if (lapic_in_kernel(vcpu) &&
- vcpu->arch.apic->lapic_timer.timer_advance_ns)
- kvm_wait_lapic_expire(vcpu);
+ kvm_wait_lapic_expire(vcpu);
/*
* If this vCPU has touched SPEC_CTRL, restore the guest's value if
@@ -6945,20 +6832,20 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
goto free_vpid;
}
- BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != NR_SHARED_MSRS);
+ BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
- for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
- u32 index = vmx_msr_index[i];
+ for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) {
+ u32 index = vmx_uret_msrs_list[i];
u32 data_low, data_high;
- int j = vmx->nmsrs;
+ int j = vmx->nr_uret_msrs;
if (rdmsr_safe(index, &data_low, &data_high) < 0)
continue;
if (wrmsr_safe(index, data_low, data_high) < 0)
continue;
- vmx->guest_msrs[j].index = i;
- vmx->guest_msrs[j].data = 0;
+ vmx->guest_uret_msrs[j].slot = i;
+ vmx->guest_uret_msrs[j].data = 0;
switch (index) {
case MSR_IA32_TSX_CTRL:
/*
@@ -6966,32 +6853,36 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
* let's avoid changing CPUID bits under the host
* kernel's feet.
*/
- vmx->guest_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
+ vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
break;
default:
- vmx->guest_msrs[j].mask = -1ull;
+ vmx->guest_uret_msrs[j].mask = -1ull;
break;
}
- ++vmx->nmsrs;
+ ++vmx->nr_uret_msrs;
}
err = alloc_loaded_vmcs(&vmx->vmcs01);
if (err < 0)
goto free_pml;
+ /* The MSR bitmap starts with all ones */
+ bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
+ bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
+
msr_bitmap = vmx->vmcs01.msr_bitmap;
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
if (kvm_cstate_in_guest(vcpu->kvm)) {
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
}
vmx->msr_bitmap_mode = 0;
@@ -7015,8 +6906,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
}
if (nested)
- nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
- vmx_capability.ept);
+ memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
else
memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
@@ -7336,11 +7226,11 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
update_intel_pt_cfg(vcpu);
if (boot_cpu_has(X86_FEATURE_RTM)) {
- struct shared_msr_entry *msr;
- msr = find_msr_entry(vmx, MSR_IA32_TSX_CTRL);
+ struct vmx_uret_msr *msr;
+ msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
if (msr) {
bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
- vmx_set_guest_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
+ vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
}
}
}
@@ -7366,14 +7256,14 @@ static __init void vmx_set_cpu_caps(void)
/* CPUID 0xD.1 */
supported_xss = 0;
- if (!vmx_xsaves_supported())
+ if (!cpu_has_vmx_xsaves())
kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
/* CPUID 0x80000001 */
if (!cpu_has_vmx_rdtscp())
kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
- if (vmx_waitpkg_supported())
+ if (cpu_has_vmx_waitpkg())
kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
}
@@ -7429,7 +7319,7 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
* Because it is marked as EmulateOnUD, we need to intercept it here.
*/
case x86_intercept_rdtscp:
- if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
exception->vector = UD_VECTOR;
exception->error_code_valid = false;
return X86EMUL_PROPAGATE_FAULT;
@@ -7561,107 +7451,6 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
}
-static void __pi_post_block(struct kvm_vcpu *vcpu)
-{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- struct pi_desc old, new;
- unsigned int dest;
-
- do {
- old.control = new.control = pi_desc->control;
- WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
- "Wakeup handler not enabled while the VCPU is blocked\n");
-
- dest = cpu_physical_id(vcpu->cpu);
-
- if (x2apic_enabled())
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
-
- /* set 'NV' to 'notification vector' */
- new.nv = POSTED_INTR_VECTOR;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
-
- if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- list_del(&vcpu->blocked_vcpu_list);
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- vcpu->pre_pcpu = -1;
- }
-}
-
-/*
- * This routine does the following things for vCPU which is going
- * to be blocked if VT-d PI is enabled.
- * - Store the vCPU to the wakeup list, so when interrupts happen
- * we can find the right vCPU to wake up.
- * - Change the Posted-interrupt descriptor as below:
- * 'NDST' <-- vcpu->pre_pcpu
- * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
- * - If 'ON' is set during this process, which means at least one
- * interrupt is posted for this vCPU, we cannot block it, in
- * this case, return 1, otherwise, return 0.
- *
- */
-static int pi_pre_block(struct kvm_vcpu *vcpu)
-{
- unsigned int dest;
- struct pi_desc old, new;
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
- return 0;
-
- WARN_ON(irqs_disabled());
- local_irq_disable();
- if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
- vcpu->pre_pcpu = vcpu->cpu;
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- list_add_tail(&vcpu->blocked_vcpu_list,
- &per_cpu(blocked_vcpu_on_cpu,
- vcpu->pre_pcpu));
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- }
-
- do {
- old.control = new.control = pi_desc->control;
-
- WARN((pi_desc->sn == 1),
- "Warning: SN field of posted-interrupts "
- "is set before blocking\n");
-
- /*
- * Since vCPU can be preempted during this process,
- * vcpu->cpu could be different with pre_pcpu, we
- * need to set pre_pcpu as the destination of wakeup
- * notification event, then we can find the right vCPU
- * to wakeup in wakeup handler if interrupts happen
- * when the vCPU is in blocked state.
- */
- dest = cpu_physical_id(vcpu->pre_pcpu);
-
- if (x2apic_enabled())
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
-
- /* set 'NV' to 'wakeup vector' */
- new.nv = POSTED_INTR_WAKEUP_VECTOR;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
-
- /* We should not block the vCPU if an interrupt is posted for it. */
- if (pi_test_on(pi_desc) == 1)
- __pi_post_block(vcpu);
-
- local_irq_enable();
- return (vcpu->pre_pcpu == -1);
-}
-
static int vmx_pre_block(struct kvm_vcpu *vcpu)
{
if (pi_pre_block(vcpu))
@@ -7673,17 +7462,6 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
return 0;
}
-static void pi_post_block(struct kvm_vcpu *vcpu)
-{
- if (vcpu->pre_pcpu == -1)
- return;
-
- WARN_ON(irqs_disabled());
- local_irq_disable();
- __pi_post_block(vcpu);
- local_irq_enable();
-}
-
static void vmx_post_block(struct kvm_vcpu *vcpu)
{
if (kvm_x86_ops.set_hv_timer)
@@ -7692,100 +7470,6 @@ static void vmx_post_block(struct kvm_vcpu *vcpu)
pi_post_block(vcpu);
}
-/*
- * vmx_update_pi_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
- */
-static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
-{
- struct kvm_kernel_irq_routing_entry *e;
- struct kvm_irq_routing_table *irq_rt;
- struct kvm_lapic_irq irq;
- struct kvm_vcpu *vcpu;
- struct vcpu_data vcpu_info;
- int idx, ret = 0;
-
- if (!kvm_arch_has_assigned_device(kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(kvm->vcpus[0]))
- return 0;
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
- if (guest_irq >= irq_rt->nr_rt_entries ||
- hlist_empty(&irq_rt->map[guest_irq])) {
- pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
- guest_irq, irq_rt->nr_rt_entries);
- goto out;
- }
-
- hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
- if (e->type != KVM_IRQ_ROUTING_MSI)
- continue;
- /*
- * VT-d PI cannot support posting multicast/broadcast
- * interrupts to a vCPU, we still use interrupt remapping
- * for these kind of interrupts.
- *
- * For lowest-priority interrupts, we only support
- * those with single CPU as the destination, e.g. user
- * configures the interrupts via /proc/irq or uses
- * irqbalance to make the interrupts single-CPU.
- *
- * We will support full lowest-priority interrupt later.
- *
- * In addition, we can only inject generic interrupts using
- * the PI mechanism, refuse to route others through it.
- */
-
- kvm_set_msi_irq(kvm, e, &irq);
- if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
- !kvm_irq_is_postable(&irq)) {
- /*
- * Make sure the IRTE is in remapped mode if
- * we don't handle it in posted mode.
- */
- ret = irq_set_vcpu_affinity(host_irq, NULL);
- if (ret < 0) {
- printk(KERN_INFO
- "failed to back to remapped mode, irq: %u\n",
- host_irq);
- goto out;
- }
-
- continue;
- }
-
- vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
- vcpu_info.vector = irq.vector;
-
- trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
- vcpu_info.vector, vcpu_info.pi_desc_addr, set);
-
- if (set)
- ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
- else
- ret = irq_set_vcpu_affinity(host_irq, NULL);
-
- if (ret < 0) {
- printk(KERN_INFO "%s: failed to update PI IRTE\n",
- __func__);
- goto out;
- }
- }
-
- ret = 0;
-out:
- srcu_read_unlock(&kvm->irq_srcu, idx);
- return ret;
-}
-
static void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.mcg_cap & MCG_LMCE_P)
@@ -7843,11 +7527,6 @@ static void enable_smi_window(struct kvm_vcpu *vcpu)
/* RSM will cause a vmexit anyway. */
}
-static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
-{
- return false;
-}
-
static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
return to_vmx(vcpu)->nested.vmxon;
@@ -7954,7 +7633,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
.sync_pir_to_irr = vmx_sync_pir_to_irr,
.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
- .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt,
+ .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
.set_tss_addr = vmx_set_tss_addr,
.set_identity_map_addr = vmx_set_identity_map_addr,
@@ -7988,7 +7667,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.pmu_ops = &intel_pmu_ops,
.nested_ops = &vmx_nested_ops,
- .update_pi_irte = vmx_update_pi_irte,
+ .update_pi_irte = pi_update_irte,
#ifdef CONFIG_X86_64
.set_hv_timer = vmx_set_hv_timer,
@@ -8002,9 +7681,11 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.pre_leave_smm = vmx_pre_leave_smm,
.enable_smi_window = enable_smi_window,
- .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
+ .can_emulate_instruction = vmx_can_emulate_instruction,
.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
.migrate_timers = vmx_migrate_timers,
+
+ .msr_filter_changed = vmx_msr_filter_changed,
};
static __init int hardware_setup(void)
@@ -8016,8 +7697,8 @@ static __init int hardware_setup(void)
store_idt(&dt);
host_idt_base = dt.address;
- for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
- kvm_define_shared_msr(i, vmx_msr_index[i]);
+ for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
+ kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]);
if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
return -EIO;
@@ -8154,7 +7835,7 @@ static __init int hardware_setup(void)
vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
}
- kvm_set_posted_intr_wakeup_handler(wakeup_handler);
+ kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
kvm_mce_cap_supported |= MCG_LMCE_P;
@@ -8293,8 +7974,8 @@ static int __init vmx_init(void)
for_each_possible_cpu(cpu) {
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
- INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
- spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+
+ pi_init(cpu);
}
#ifdef CONFIG_KEXEC_CORE
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 26175a4..5961cb8 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -9,8 +9,9 @@
#include "capabilities.h"
#include "kvm_cache_regs.h"
-#include "ops.h"
+#include "posted_intr.h"
#include "vmcs.h"
+#include "vmx_ops.h"
#include "cpuid.h"
extern const u32 vmx_msr_index[];
@@ -22,20 +23,20 @@ extern const u32 vmx_msr_index[];
#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
#ifdef CONFIG_X86_64
-#define NR_SHARED_MSRS 7
+#define MAX_NR_USER_RETURN_MSRS 7
#else
-#define NR_SHARED_MSRS 4
+#define MAX_NR_USER_RETURN_MSRS 4
#endif
-#define NR_LOADSTORE_MSRS 8
+#define MAX_NR_LOADSTORE_MSRS 8
struct vmx_msrs {
unsigned int nr;
- struct vmx_msr_entry val[NR_LOADSTORE_MSRS];
+ struct vmx_msr_entry val[MAX_NR_LOADSTORE_MSRS];
};
-struct shared_msr_entry {
- unsigned index;
+struct vmx_uret_msr {
+ unsigned int slot; /* The MSR's slot in kvm_user_return_msrs. */
u64 data;
u64 mask;
};
@@ -49,29 +50,6 @@ enum segment_cache_field {
SEG_FIELD_NR = 4
};
-/* Posted-Interrupt Descriptor */
-struct pi_desc {
- u32 pir[8]; /* Posted interrupt requested */
- union {
- struct {
- /* bit 256 - Outstanding Notification */
- u16 on : 1,
- /* bit 257 - Suppress Notification */
- sn : 1,
- /* bit 271:258 - Reserved */
- rsvd_1 : 14;
- /* bit 279:272 - Notification Vector */
- u8 nv;
- /* bit 287:280 - Reserved */
- u8 rsvd_2;
- /* bit 319:288 - Notification Destination */
- u32 ndst;
- };
- u64 control;
- };
- u32 rsvd[6];
-} __aligned(64);
-
#define RTIT_ADDR_RANGE 4
struct pt_ctx {
@@ -218,10 +196,10 @@ struct vcpu_vmx {
u32 idt_vectoring_info;
ulong rflags;
- struct shared_msr_entry guest_msrs[NR_SHARED_MSRS];
- int nmsrs;
- int save_nmsrs;
- bool guest_msrs_ready;
+ struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS];
+ int nr_uret_msrs;
+ int nr_active_uret_msrs;
+ bool guest_uret_msrs_loaded;
#ifdef CONFIG_X86_64
u64 msr_host_kernel_gs_base;
u64 msr_guest_kernel_gs_base;
@@ -301,6 +279,13 @@ struct vcpu_vmx {
u64 ept_pointer;
struct pt_desc pt_desc;
+
+ /* Save desired MSR intercept (read: pass-through) state */
+#define MAX_POSSIBLE_PASSTHROUGH_MSRS 13
+ struct {
+ DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
+ DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
+ } shadow_msr_intercept;
};
enum ept_pointers_status {
@@ -343,6 +328,7 @@ void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
int root_level);
+
void update_exception_bitmap(struct kvm_vcpu *vcpu);
void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
@@ -350,73 +336,11 @@ bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
-struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr);
-void pt_update_intercept_for_msr(struct vcpu_vmx *vmx);
+struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr);
+void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu);
void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp);
-int vmx_find_msr_index(struct vmx_msrs *m, u32 msr);
-int vmx_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
- struct x86_exception *e);
-
-#define POSTED_INTR_ON 0
-#define POSTED_INTR_SN 1
-
-static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
-{
- return test_and_set_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
-{
- return test_and_clear_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
-{
- return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
-}
-
-static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
-{
- return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
-}
-
-static inline void pi_set_sn(struct pi_desc *pi_desc)
-{
- set_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_set_on(struct pi_desc *pi_desc)
-{
- set_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_clear_on(struct pi_desc *pi_desc)
-{
- clear_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_clear_sn(struct pi_desc *pi_desc)
-{
- clear_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline int pi_test_on(struct pi_desc *pi_desc)
-{
- return test_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline int pi_test_sn(struct pi_desc *pi_desc)
-{
- return test_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
+int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr);
+void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu);
static inline u8 vmx_get_rvi(void)
{
@@ -498,11 +422,6 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
return container_of(vcpu, struct vcpu_vmx, vcpu);
}
-static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
-{
- return &(to_vmx(vcpu)->pi_desc);
-}
-
static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -554,6 +473,19 @@ static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
return !enable_ept || cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits;
}
+static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu)
+{
+ return enable_unrestricted_guest && (!is_guest_mode(vcpu) ||
+ (secondary_exec_controls_get(to_vmx(vcpu)) &
+ SECONDARY_EXEC_UNRESTRICTED_GUEST));
+}
+
+bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu);
+static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu)
+{
+ return is_unrestricted_guest(vcpu) || __vmx_guest_state_valid(vcpu);
+}
+
void dump_vmcs(void);
#endif /* __KVM_X86_VMX_H */
diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/vmx_ops.h
similarity index 100%
rename from arch/x86/kvm/vmx/ops.h
rename to arch/x86/kvm/vmx/vmx_ops.h
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7527022..c4015a4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -71,6 +71,7 @@
#include <asm/irq_remapping.h>
#include <asm/mshyperv.h>
#include <asm/hypervisor.h>
+#include <asm/tlbflush.h>
#include <asm/intel_pt.h>
#include <asm/emulate_prefix.h>
#include <clocksource/hyperv_timer.h>
@@ -161,24 +162,29 @@ module_param(force_emulation_prefix, bool, S_IRUGO);
int __read_mostly pi_inject_timer = -1;
module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
-#define KVM_NR_SHARED_MSRS 16
+/*
+ * Restoring the host value for MSRs that are only consumed when running in
+ * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
+ * returns to userspace, i.e. the kernel can run with the guest's value.
+ */
+#define KVM_MAX_NR_USER_RETURN_MSRS 16
-struct kvm_shared_msrs_global {
+struct kvm_user_return_msrs_global {
int nr;
- u32 msrs[KVM_NR_SHARED_MSRS];
+ u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
};
-struct kvm_shared_msrs {
+struct kvm_user_return_msrs {
struct user_return_notifier urn;
bool registered;
- struct kvm_shared_msr_values {
+ struct kvm_user_return_msr_values {
u64 host;
u64 curr;
- } values[KVM_NR_SHARED_MSRS];
+ } values[KVM_MAX_NR_USER_RETURN_MSRS];
};
-static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
-static struct kvm_shared_msrs __percpu *shared_msrs;
+static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global;
+static struct kvm_user_return_msrs __percpu *user_return_msrs;
#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
@@ -266,7 +272,7 @@ static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
} else {
vcpu_debug_ratelimited(vcpu, "unhandled %s: 0x%x data 0x%llx\n",
op, msr, data);
- return 1;
+ return -ENOENT;
}
}
@@ -293,9 +299,9 @@ static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
static void kvm_on_user_return(struct user_return_notifier *urn)
{
unsigned slot;
- struct kvm_shared_msrs *locals
- = container_of(urn, struct kvm_shared_msrs, urn);
- struct kvm_shared_msr_values *values;
+ struct kvm_user_return_msrs *msrs
+ = container_of(urn, struct kvm_user_return_msrs, urn);
+ struct kvm_user_return_msr_values *values;
unsigned long flags;
/*
@@ -303,73 +309,73 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
* interrupted and executed through kvm_arch_hardware_disable()
*/
local_irq_save(flags);
- if (locals->registered) {
- locals->registered = false;
+ if (msrs->registered) {
+ msrs->registered = false;
user_return_notifier_unregister(urn);
}
local_irq_restore(flags);
- for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
- values = &locals->values[slot];
+ for (slot = 0; slot < user_return_msrs_global.nr; ++slot) {
+ values = &msrs->values[slot];
if (values->host != values->curr) {
- wrmsrl(shared_msrs_global.msrs[slot], values->host);
+ wrmsrl(user_return_msrs_global.msrs[slot], values->host);
values->curr = values->host;
}
}
}
-void kvm_define_shared_msr(unsigned slot, u32 msr)
+void kvm_define_user_return_msr(unsigned slot, u32 msr)
{
- BUG_ON(slot >= KVM_NR_SHARED_MSRS);
- shared_msrs_global.msrs[slot] = msr;
- if (slot >= shared_msrs_global.nr)
- shared_msrs_global.nr = slot + 1;
+ BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS);
+ user_return_msrs_global.msrs[slot] = msr;
+ if (slot >= user_return_msrs_global.nr)
+ user_return_msrs_global.nr = slot + 1;
}
-EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
+EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);
-static void kvm_shared_msr_cpu_online(void)
+static void kvm_user_return_msr_cpu_online(void)
{
unsigned int cpu = smp_processor_id();
- struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+ struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
u64 value;
int i;
- for (i = 0; i < shared_msrs_global.nr; ++i) {
- rdmsrl_safe(shared_msrs_global.msrs[i], &value);
- smsr->values[i].host = value;
- smsr->values[i].curr = value;
+ for (i = 0; i < user_return_msrs_global.nr; ++i) {
+ rdmsrl_safe(user_return_msrs_global.msrs[i], &value);
+ msrs->values[i].host = value;
+ msrs->values[i].curr = value;
}
}
-int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
+int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
{
unsigned int cpu = smp_processor_id();
- struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+ struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
int err;
- value = (value & mask) | (smsr->values[slot].host & ~mask);
- if (value == smsr->values[slot].curr)
+ value = (value & mask) | (msrs->values[slot].host & ~mask);
+ if (value == msrs->values[slot].curr)
return 0;
- err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
+ err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value);
if (err)
return 1;
- smsr->values[slot].curr = value;
- if (!smsr->registered) {
- smsr->urn.on_user_return = kvm_on_user_return;
- user_return_notifier_register(&smsr->urn);
- smsr->registered = true;
+ msrs->values[slot].curr = value;
+ if (!msrs->registered) {
+ msrs->urn.on_user_return = kvm_on_user_return;
+ user_return_notifier_register(&msrs->urn);
+ msrs->registered = true;
}
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
+EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
static void drop_user_return_notifiers(void)
{
unsigned int cpu = smp_processor_id();
- struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+ struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
- if (smsr->registered)
- kvm_on_user_return(&smsr->urn);
+ if (msrs->registered)
+ kvm_on_user_return(&msrs->urn);
}
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
@@ -1482,6 +1488,40 @@ void kvm_enable_efer_bits(u64 mask)
}
EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
+bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges;
+ u32 count = kvm->arch.msr_filter.count;
+ u32 i;
+ bool r = kvm->arch.msr_filter.default_allow;
+ int idx;
+
+ /* MSR filtering not set up, allow everything */
+ if (!count)
+ return true;
+
+ /* Prevent collision with set_msr_filter */
+ idx = srcu_read_lock(&kvm->srcu);
+
+ for (i = 0; i < count; i++) {
+ u32 start = ranges[i].base;
+ u32 end = start + ranges[i].nmsrs;
+ u32 flags = ranges[i].flags;
+ unsigned long *bitmap = ranges[i].bitmap;
+
+ if ((index >= start) && (index < end) && (flags & type)) {
+ r = !!test_bit(index - start, bitmap);
+ break;
+ }
+ }
+
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_msr_allowed);
+
/*
* Write @data into the MSR specified by @index. Select MSR specific fault
* checks are bypassed if @host_initiated is %true.
@@ -1493,6 +1533,9 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
{
struct msr_data msr;
+ if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
+ return -EPERM;
+
switch (index) {
case MSR_FS_BASE:
case MSR_GS_BASE:
@@ -1549,6 +1592,9 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
struct msr_data msr;
int ret;
+ if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
+ return -EPERM;
+
msr.index = index;
msr.host_initiated = host_initiated;
@@ -1584,12 +1630,91 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
}
EXPORT_SYMBOL_GPL(kvm_set_msr);
+static int complete_emulated_msr(struct kvm_vcpu *vcpu, bool is_read)
+{
+ if (vcpu->run->msr.error) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ } else if (is_read) {
+ kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
+ kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
+ }
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
+{
+ return complete_emulated_msr(vcpu, true);
+}
+
+static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
+{
+ return complete_emulated_msr(vcpu, false);
+}
+
+static u64 kvm_msr_reason(int r)
+{
+ switch (r) {
+ case -ENOENT:
+ return KVM_MSR_EXIT_REASON_UNKNOWN;
+ case -EPERM:
+ return KVM_MSR_EXIT_REASON_FILTER;
+ default:
+ return KVM_MSR_EXIT_REASON_INVAL;
+ }
+}
+
+static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
+ u32 exit_reason, u64 data,
+ int (*completion)(struct kvm_vcpu *vcpu),
+ int r)
+{
+ u64 msr_reason = kvm_msr_reason(r);
+
+ /* Check if the user wanted to know about this MSR fault */
+ if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
+ return 0;
+
+ vcpu->run->exit_reason = exit_reason;
+ vcpu->run->msr.error = 0;
+ memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
+ vcpu->run->msr.reason = msr_reason;
+ vcpu->run->msr.index = index;
+ vcpu->run->msr.data = data;
+ vcpu->arch.complete_userspace_io = completion;
+
+ return 1;
+}
+
+static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r)
+{
+ return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0,
+ complete_emulated_rdmsr, r);
+}
+
+static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r)
+{
+ return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data,
+ complete_emulated_wrmsr, r);
+}
+
int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
{
u32 ecx = kvm_rcx_read(vcpu);
u64 data;
+ int r;
- if (kvm_get_msr(vcpu, ecx, &data)) {
+ r = kvm_get_msr(vcpu, ecx, &data);
+
+ /* MSR read failed? See if we should ask user space */
+ if (r && kvm_get_msr_user_space(vcpu, ecx, r)) {
+ /* Bounce to user space */
+ return 0;
+ }
+
+ /* MSR read failed? Inject a #GP */
+ if (r) {
trace_kvm_msr_read_ex(ecx);
kvm_inject_gp(vcpu, 0);
return 1;
@@ -1607,8 +1732,18 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
{
u32 ecx = kvm_rcx_read(vcpu);
u64 data = kvm_read_edx_eax(vcpu);
+ int r;
- if (kvm_set_msr(vcpu, ecx, data)) {
+ r = kvm_set_msr(vcpu, ecx, data);
+
+ /* MSR write failed? See if we should ask user space */
+ if (r && kvm_set_msr_user_space(vcpu, ecx, data, r)) {
+ /* Bounce to user space */
+ return 0;
+ }
+
+ /* MSR write failed? Inject a #GP */
+ if (r) {
trace_kvm_msr_write_ex(ecx, data);
kvm_inject_gp(vcpu, 0);
return 1;
@@ -1774,12 +1909,6 @@ static s64 get_kvmclock_base_ns(void)
}
#endif
-void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
-{
- kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
- kvm_vcpu_kick(vcpu);
-}
-
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
int version;
@@ -1978,12 +2107,6 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
#endif
}
-static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
-{
- u64 curr_offset = vcpu->arch.l1_tsc_offset;
- vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
-}
-
/*
* Multiply tsc by a fixed point number represented by ratio.
*
@@ -2045,14 +2168,13 @@ static inline bool kvm_check_tsc_unstable(void)
return check_tsc_unstable();
}
-void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
+static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
{
struct kvm *kvm = vcpu->kvm;
u64 offset, ns, elapsed;
unsigned long flags;
bool matched;
bool already_matched;
- u64 data = msr->data;
bool synchronizing = false;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
@@ -2061,7 +2183,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
elapsed = ns - kvm->arch.last_tsc_nsec;
if (vcpu->arch.virtual_tsc_khz) {
- if (data == 0 && msr->host_initiated) {
+ if (data == 0) {
/*
* detection of vcpu initialization -- need to sync
* with other vCPUs. This particularly helps to keep
@@ -2131,9 +2253,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
- if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
- update_ia32_tsc_adjust_msr(vcpu, offset);
-
kvm_vcpu_write_tsc_offset(vcpu, offset);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
@@ -2148,8 +2267,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
}
-EXPORT_SYMBOL_GPL(kvm_write_tsc);
-
static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
s64 adjustment)
{
@@ -2731,7 +2848,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
return 1;
if (!lapic_in_kernel(vcpu))
- return 1;
+ return data ? 1 : 0;
vcpu->arch.apf.msr_en_val = data;
@@ -2944,7 +3061,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.msr_ia32_power_ctl = data;
break;
case MSR_IA32_TSC:
- kvm_write_tsc(vcpu, msr_info);
+ if (msr_info->host_initiated) {
+ kvm_synchronize_tsc(vcpu, data);
+ } else {
+ u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
+ adjust_tsc_offset_guest(vcpu, adj);
+ vcpu->arch.ia32_tsc_adjust_msr += adj;
+ }
break;
case MSR_IA32_XSS:
if (!msr_info->host_initiated &&
@@ -3221,9 +3344,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_POWER_CTL:
msr_info->data = vcpu->arch.msr_ia32_power_ctl;
break;
- case MSR_IA32_TSC:
- msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
+ case MSR_IA32_TSC: {
+ /*
+ * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
+ * even when not intercepted. AMD manual doesn't explicitly
+ * state this but appears to behave the same.
+ *
+ * Unconditionally return L1's TSC offset on userspace reads
+ * so that userspace reads and writes always operate on L1's
+ * offset, e.g. to ensure deterministic behavior for migration.
+ */
+ u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
+ vcpu->arch.tsc_offset;
+
+ msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset;
break;
+ }
case MSR_MTRRcap:
case 0x200 ... 0x2ff:
return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
@@ -3513,6 +3649,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_EXCEPTION_PAYLOAD:
case KVM_CAP_SET_GUEST_DEBUG:
case KVM_CAP_LAST_CPU:
+ case KVM_CAP_X86_USER_SPACE_MSR:
+ case KVM_CAP_X86_MSR_FILTER:
r = 1;
break;
case KVM_CAP_SYNC_REGS:
@@ -5033,6 +5171,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
kvm->arch.exception_payload_enabled = cap->args[0];
r = 0;
break;
+ case KVM_CAP_X86_USER_SPACE_MSR:
+ kvm->arch.user_space_msr_mask = cap->args[0];
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
@@ -5040,6 +5182,103 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
return r;
}
+static void kvm_clear_msr_filter(struct kvm *kvm)
+{
+ u32 i;
+ u32 count = kvm->arch.msr_filter.count;
+ struct msr_bitmap_range ranges[16];
+
+ mutex_lock(&kvm->lock);
+ kvm->arch.msr_filter.count = 0;
+ memcpy(ranges, kvm->arch.msr_filter.ranges, count * sizeof(ranges[0]));
+ mutex_unlock(&kvm->lock);
+ synchronize_srcu(&kvm->srcu);
+
+ for (i = 0; i < count; i++)
+ kfree(ranges[i].bitmap);
+}
+
+static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user_range)
+{
+ struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges;
+ struct msr_bitmap_range range;
+ unsigned long *bitmap = NULL;
+ size_t bitmap_size;
+ int r;
+
+ if (!user_range->nmsrs)
+ return 0;
+
+ bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
+ if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
+ return -EINVAL;
+
+ bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size);
+ if (IS_ERR(bitmap))
+ return PTR_ERR(bitmap);
+
+ range = (struct msr_bitmap_range) {
+ .flags = user_range->flags,
+ .base = user_range->base,
+ .nmsrs = user_range->nmsrs,
+ .bitmap = bitmap,
+ };
+
+ if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) {
+ r = -EINVAL;
+ goto err;
+ }
+
+ if (!range.flags) {
+ r = -EINVAL;
+ goto err;
+ }
+
+ /* Everything ok, add this range identifier to our global pool */
+ ranges[kvm->arch.msr_filter.count] = range;
+ /* Make sure we filled the array before we tell anyone to walk it */
+ smp_wmb();
+ kvm->arch.msr_filter.count++;
+
+ return 0;
+err:
+ kfree(bitmap);
+ return r;
+}
+
+static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_msr_filter __user *user_msr_filter = argp;
+ struct kvm_msr_filter filter;
+ bool default_allow;
+ int r = 0;
+ u32 i;
+
+ if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
+ return -EFAULT;
+
+ kvm_clear_msr_filter(kvm);
+
+ default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY);
+ kvm->arch.msr_filter.default_allow = default_allow;
+
+ /*
+ * Protect from concurrent calls to this function that could trigger
+ * a TOCTOU violation on kvm->arch.msr_filter.count.
+ */
+ mutex_lock(&kvm->lock);
+ for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
+ r = kvm_add_msr_filter(kvm, &filter.ranges[i]);
+ if (r)
+ break;
+ }
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
+ mutex_unlock(&kvm->lock);
+
+ return r;
+}
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -5346,6 +5585,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
case KVM_SET_PMU_EVENT_FILTER:
r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
break;
+ case KVM_X86_SET_MSR_FILTER:
+ r = kvm_vm_ioctl_set_msr_filter(kvm, argp);
+ break;
default:
r = -ENOTTY;
}
@@ -5707,6 +5949,9 @@ int handle_ud(struct kvm_vcpu *vcpu)
char sig[5]; /* ud2; .ascii "kvm" */
struct x86_exception e;
+ if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, NULL, 0)))
+ return 1;
+
if (force_emulation_prefix &&
kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
sig, sizeof(sig), &e) == 0 &&
@@ -6362,13 +6607,33 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
u32 msr_index, u64 *pdata)
{
- return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ int r;
+
+ r = kvm_get_msr(vcpu, msr_index, pdata);
+
+ if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) {
+ /* Bounce to user space */
+ return X86EMUL_IO_NEEDED;
+ }
+
+ return r;
}
static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
u32 msr_index, u64 data)
{
- return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ int r;
+
+ r = kvm_set_msr(vcpu, msr_index, data);
+
+ if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) {
+ /* Bounce to user space */
+ return X86EMUL_IO_NEEDED;
+ }
+
+ return r;
}
static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
@@ -6912,7 +7177,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int r;
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
bool writeback = true;
- bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
+ bool write_fault_to_spt;
+
+ if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
+ return 1;
vcpu->arch.l1tf_flush_l1d = true;
@@ -6920,6 +7188,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* Clear write_fault_to_shadow_pgtable here to ensure it is
* never reused.
*/
+ write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
vcpu->arch.write_fault_to_shadow_pgtable = false;
kvm_clear_exception_queue(vcpu);
@@ -7514,9 +7783,9 @@ int kvm_arch_init(void *opaque)
goto out_free_x86_fpu_cache;
}
- shared_msrs = alloc_percpu(struct kvm_shared_msrs);
- if (!shared_msrs) {
- printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
+ user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
+ if (!user_return_msrs) {
+ printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
goto out_free_x86_emulator_cache;
}
@@ -7549,7 +7818,7 @@ int kvm_arch_init(void *opaque)
return 0;
out_free_percpu:
- free_percpu(shared_msrs);
+ free_percpu(user_return_msrs);
out_free_x86_emulator_cache:
kmem_cache_destroy(x86_emulator_cache);
out_free_x86_fpu_cache:
@@ -7576,7 +7845,7 @@ void kvm_arch_exit(void)
#endif
kvm_x86_ops.hardware_enable = NULL;
kvm_mmu_module_exit();
- free_percpu(shared_msrs);
+ free_percpu(user_return_msrs);
kmem_cache_destroy(x86_fpu_cache);
}
@@ -8365,8 +8634,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
bool req_immediate_exit = false;
if (kvm_request_pending(vcpu)) {
- if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) {
- if (unlikely(!kvm_x86_ops.nested_ops->get_vmcs12_pages(vcpu))) {
+ if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
+ if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
r = 0;
goto out;
}
@@ -8473,6 +8742,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_vcpu_update_apicv(vcpu);
if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
kvm_check_async_pf_completion(vcpu);
+ if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
+ kvm_x86_ops.msr_filter_changed(vcpu);
}
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -8548,7 +8819,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_x86_ops.request_immediate_exit(vcpu);
}
- trace_kvm_entry(vcpu->vcpu_id);
+ trace_kvm_entry(vcpu);
fpregs_assert_state_consistent();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
@@ -9562,7 +9833,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
- struct msr_data msr;
struct kvm *kvm = vcpu->kvm;
kvm_hv_vcpu_postcreate(vcpu);
@@ -9570,10 +9840,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
if (mutex_lock_killable(&vcpu->mutex))
return;
vcpu_load(vcpu);
- msr.data = 0x0;
- msr.index = MSR_IA32_TSC;
- msr.host_initiated = true;
- kvm_write_tsc(vcpu, &msr);
+ kvm_synchronize_tsc(vcpu, 0);
vcpu_put(vcpu);
/* poll control enabled by default */
@@ -9707,7 +9974,7 @@ int kvm_arch_hardware_enable(void)
u64 max_tsc = 0;
bool stable, backwards_tsc = false;
- kvm_shared_msr_cpu_online();
+ kvm_user_return_msr_cpu_online();
ret = kvm_x86_ops.hardware_enable();
if (ret != 0)
return ret;
@@ -10025,6 +10292,8 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm)
void kvm_arch_destroy_vm(struct kvm *kvm)
{
+ u32 i;
+
if (current->mm == kvm->mm) {
/*
* Free memory regions allocated on behalf of userspace,
@@ -10041,6 +10310,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
}
if (kvm_x86_ops.vm_destroy)
kvm_x86_ops.vm_destroy(kvm);
+ for (i = 0; i < kvm->arch.msr_filter.count; i++)
+ kfree(kvm->arch.msr_filter.ranges[i].bitmap);
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
kvm_free_vcpus(kvm);
@@ -10771,6 +11042,111 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c
}
EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
+/*
+ * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
+ * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value
+ * indicates whether exit to userspace is needed.
+ */
+int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
+ struct x86_exception *e)
+{
+ if (r == X86EMUL_PROPAGATE_FAULT) {
+ kvm_inject_emulated_page_fault(vcpu, e);
+ return 1;
+ }
+
+ /*
+ * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
+ * while handling a VMX instruction KVM could've handled the request
+ * correctly by exiting to userspace and performing I/O but there
+ * doesn't seem to be a real use-case behind such requests, just return
+ * KVM_EXIT_INTERNAL_ERROR for now.
+ */
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_handle_memory_failure);
+
+int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
+{
+ bool pcid_enabled;
+ struct x86_exception e;
+ unsigned i;
+ unsigned long roots_to_free = 0;
+ struct {
+ u64 pcid;
+ u64 gla;
+ } operand;
+ int r;
+
+ r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
+ if (r != X86EMUL_CONTINUE)
+ return kvm_handle_memory_failure(vcpu, r, &e);
+
+ if (operand.pcid >> 12 != 0) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
+
+ switch (type) {
+ case INVPCID_TYPE_INDIV_ADDR:
+ if ((!pcid_enabled && (operand.pcid != 0)) ||
+ is_noncanonical_address(operand.gla, vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
+ return kvm_skip_emulated_instruction(vcpu);
+
+ case INVPCID_TYPE_SINGLE_CTXT:
+ if (!pcid_enabled && (operand.pcid != 0)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ if (kvm_get_active_pcid(vcpu) == operand.pcid) {
+ kvm_mmu_sync_roots(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ }
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
+ == operand.pcid)
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+ kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
+ /*
+ * If neither the current cr3 nor any of the prev_roots use the
+ * given PCID, then nothing needs to be done here because a
+ * resync will happen anyway before switching to any other CR3.
+ */
+
+ return kvm_skip_emulated_instruction(vcpu);
+
+ case INVPCID_TYPE_ALL_NON_GLOBAL:
+ /*
+ * Currently, KVM doesn't mark global entries in the shadow
+ * page tables, so a non-global flush just degenerates to a
+ * global flush. If needed, we could optimize this later by
+ * keeping track of global entries in shadow page tables.
+ */
+
+ fallthrough;
+ case INVPCID_TYPE_ALL_INCL_GLOBAL:
+ kvm_mmu_unload(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+
+ default:
+ BUG(); /* We have already checked above that type <= 3 */
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
+
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 995ab69..3900ab0 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -246,7 +246,6 @@ static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu)
return is_smm(vcpu) || kvm_x86_ops.apic_init_signal_blocked(vcpu);
}
-void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
@@ -372,6 +371,10 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
int kvm_spec_ctrl_test_value(u64 value);
int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu);
+int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
+ struct x86_exception *e);
+int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva);
+bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type);
#define KVM_MSR_RET_INVALID 2
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 9417a34..26cfb0f 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -17,7 +17,7 @@
ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL), \
ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH), ERSN(EPR),\
ERSN(SYSTEM_EVENT), ERSN(S390_STSI), ERSN(IOAPIC_EOI), \
- ERSN(HYPERV), ERSN(ARM_NISV)
+ ERSN(HYPERV), ERSN(ARM_NISV), ERSN(X86_RDMSR), ERSN(X86_WRMSR)
TRACE_EVENT(kvm_userspace_exit,
TP_PROTO(__u32 reason, int errno),
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 3d80234..58f43aa 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -248,6 +248,8 @@ struct kvm_hyperv_exit {
#define KVM_EXIT_IOAPIC_EOI 26
#define KVM_EXIT_HYPERV 27
#define KVM_EXIT_ARM_NISV 28
+#define KVM_EXIT_X86_RDMSR 29
+#define KVM_EXIT_X86_WRMSR 30
/* For KVM_EXIT_INTERNAL_ERROR */
/* Emulate instruction failed. */
@@ -413,6 +415,17 @@ struct kvm_run {
__u64 esr_iss;
__u64 fault_ipa;
} arm_nisv;
+ /* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */
+ struct {
+ __u8 error; /* user -> kernel */
+ __u8 pad[7];
+#define KVM_MSR_EXIT_REASON_INVAL (1 << 0)
+#define KVM_MSR_EXIT_REASON_UNKNOWN (1 << 1)
+#define KVM_MSR_EXIT_REASON_FILTER (1 << 2)
+ __u32 reason; /* kernel -> user */
+ __u32 index; /* kernel -> user */
+ __u64 data; /* kernel <-> user */
+ } msr;
/* Fix the size of the union. */
char padding[256];
};
@@ -790,9 +803,10 @@ struct kvm_ppc_resize_hpt {
#define KVM_VM_PPC_HV 1
#define KVM_VM_PPC_PR 2
-/* on MIPS, 0 forces trap & emulate, 1 forces VZ ASE */
-#define KVM_VM_MIPS_TE 0
+/* on MIPS, 0 indicates auto, 1 forces VZ ASE, 2 forces trap & emulate */
+#define KVM_VM_MIPS_AUTO 0
#define KVM_VM_MIPS_VZ 1
+#define KVM_VM_MIPS_TE 2
#define KVM_S390_SIE_PAGE_OFFSET 1
@@ -1036,6 +1050,8 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_SMALLER_MAXPHYADDR 185
#define KVM_CAP_S390_DIAG318 186
#define KVM_CAP_STEAL_TIME 187
+#define KVM_CAP_X86_USER_SPACE_MSR 188
+#define KVM_CAP_X86_MSR_FILTER 189
#ifdef KVM_CAP_IRQ_ROUTING
@@ -1537,6 +1553,9 @@ struct kvm_pv_cmd {
/* Available with KVM_CAP_S390_PROTECTED */
#define KVM_S390_PV_COMMAND _IOWR(KVMIO, 0xc5, struct kvm_pv_cmd)
+/* Available with KVM_CAP_X86_MSR_FILTER */
+#define KVM_X86_SET_MSR_FILTER _IOW(KVMIO, 0xc6, struct kvm_msr_filter)
+
/* Secure Encrypted Virtualization command */
enum sev_cmd_id {
/* Guest initialization commands */
diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index 4527871..307ceaa 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -11,6 +11,7 @@
/x86_64/set_sregs_test
/x86_64/smm_test
/x86_64/state_test
+/x86_64/user_msr_test
/x86_64/vmx_preemption_timer_test
/x86_64/svm_vmcall_test
/x86_64/sync_regs_test
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 4a16658..7ebe71f 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -55,6 +55,8 @@
TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test
TEST_GEN_PROGS_x86_64 += x86_64/debug_regs
+TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test
+TEST_GEN_PROGS_x86_64 += x86_64/user_msr_test
TEST_GEN_PROGS_x86_64 += clear_dirty_log_test
TEST_GEN_PROGS_x86_64 += demand_paging_test
TEST_GEN_PROGS_x86_64 += dirty_log_test
diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h
index 16fa21e..54d624d 100644
--- a/tools/testing/selftests/kvm/include/x86_64/vmx.h
+++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h
@@ -48,7 +48,7 @@
#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
#define SECONDARY_EXEC_DESC 0x00000004
-#define SECONDARY_EXEC_RDTSCP 0x00000008
+#define SECONDARY_EXEC_ENABLE_RDTSCP 0x00000008
#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010
#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
diff --git a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c
new file mode 100644
index 0000000..f8e7611
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Tests for MSR_IA32_TSC and MSR_IA32_TSC_ADJUST.
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+#include <stdio.h>
+#include <string.h>
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 0
+
+#define UNITY (1ull << 30)
+#define HOST_ADJUST (UNITY * 64)
+#define GUEST_STEP (UNITY * 4)
+#define ROUND(x) ((x + UNITY / 2) & -UNITY)
+#define rounded_rdmsr(x) ROUND(rdmsr(x))
+#define rounded_host_rdmsr(x) ROUND(vcpu_get_msr(vm, 0, x))
+
+#define GUEST_ASSERT_EQ(a, b) do { \
+ __typeof(a) _a = (a); \
+ __typeof(b) _b = (b); \
+ if (_a != _b) \
+ ucall(UCALL_ABORT, 4, \
+ "Failed guest assert: " \
+ #a " == " #b, __LINE__, _a, _b); \
+ } while(0)
+
+static void guest_code(void)
+{
+ u64 val = 0;
+
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Guest: writes to MSR_IA32_TSC affect both MSRs. */
+ val = 1ull * GUEST_STEP;
+ wrmsr(MSR_IA32_TSC, val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Guest: writes to MSR_IA32_TSC_ADJUST affect both MSRs. */
+ GUEST_SYNC(2);
+ val = 2ull * GUEST_STEP;
+ wrmsr(MSR_IA32_TSC_ADJUST, val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Host: setting the TSC offset. */
+ GUEST_SYNC(3);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Guest: writes to MSR_IA32_TSC_ADJUST do not destroy the
+ * host-side offset and affect both MSRs.
+ */
+ GUEST_SYNC(4);
+ val = 3ull * GUEST_STEP;
+ wrmsr(MSR_IA32_TSC_ADJUST, val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Guest: writes to MSR_IA32_TSC affect both MSRs, so the host-side
+ * offset is now visible in MSR_IA32_TSC_ADJUST.
+ */
+ GUEST_SYNC(5);
+ val = 4ull * GUEST_STEP;
+ wrmsr(MSR_IA32_TSC, val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val - HOST_ADJUST);
+
+ GUEST_DONE();
+}
+
+static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage)
+{
+ struct ucall uc;
+
+ vcpu_args_set(vm, vcpuid, 1, vcpuid);
+
+ vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL);
+
+ switch (get_ucall(vm, vcpuid, &uc)) {
+ case UCALL_SYNC:
+ TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+ uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx",
+ stage + 1, (ulong)uc.args[1]);
+ return;
+ case UCALL_DONE:
+ return;
+ case UCALL_ABORT:
+ TEST_ASSERT(false, "%s at %s:%ld\n" \
+ "\tvalues: %#lx, %#lx", (const char *)uc.args[0],
+ __FILE__, uc.args[1], uc.args[2], uc.args[3]);
+ default:
+ TEST_ASSERT(false, "Unexpected exit: %s",
+ exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason));
+ }
+}
+
+int main(void)
+{
+ struct kvm_vm *vm;
+ uint64_t val;
+
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+ val = 0;
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Guest: writes to MSR_IA32_TSC affect both MSRs. */
+ run_vcpu(vm, VCPU_ID, 1);
+ val = 1ull * GUEST_STEP;
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Guest: writes to MSR_IA32_TSC_ADJUST affect both MSRs. */
+ run_vcpu(vm, VCPU_ID, 2);
+ val = 2ull * GUEST_STEP;
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Host: writes to MSR_IA32_TSC set the host-side offset
+ * and therefore do not change MSR_IA32_TSC_ADJUST.
+ */
+ vcpu_set_msr(vm, 0, MSR_IA32_TSC, HOST_ADJUST + val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+ run_vcpu(vm, VCPU_ID, 3);
+
+ /* Host: writes to MSR_IA32_TSC_ADJUST do not modify the TSC. */
+ vcpu_set_msr(vm, 0, MSR_IA32_TSC_ADJUST, UNITY * 123456);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ ASSERT_EQ(vcpu_get_msr(vm, 0, MSR_IA32_TSC_ADJUST), UNITY * 123456);
+
+ /* Restore previous value. */
+ vcpu_set_msr(vm, 0, MSR_IA32_TSC_ADJUST, val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Guest: writes to MSR_IA32_TSC_ADJUST do not destroy the
+ * host-side offset and affect both MSRs.
+ */
+ run_vcpu(vm, VCPU_ID, 4);
+ val = 3ull * GUEST_STEP;
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Guest: writes to MSR_IA32_TSC affect both MSRs, so the host-side
+ * offset is now visible in MSR_IA32_TSC_ADJUST.
+ */
+ run_vcpu(vm, VCPU_ID, 5);
+ val = 4ull * GUEST_STEP;
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val - HOST_ADJUST);
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/user_msr_test.c b/tools/testing/selftests/kvm/x86_64/user_msr_test.c
new file mode 100644
index 0000000..cbe1b08
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/user_msr_test.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tests for KVM_CAP_X86_USER_SPACE_MSR and KVM_X86_SET_MSR_FILTER
+ *
+ * Copyright (C) 2020, Amazon Inc.
+ *
+ * This is a functional test to verify that we can deflect MSR events
+ * into user space.
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 5
+
+static u32 msr_reads, msr_writes;
+
+static u8 bitmap_00000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_00000000_write[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_40000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_c0000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_c0000000_read[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_deadbeef[1] = { 0x1 };
+
+static void deny_msr(uint8_t *bitmap, u32 msr)
+{
+ u32 idx = msr & (KVM_MSR_FILTER_MAX_BITMAP_SIZE - 1);
+
+ bitmap[idx / 8] &= ~(1 << (idx % 8));
+}
+
+static void prepare_bitmaps(void)
+{
+ memset(bitmap_00000000, 0xff, sizeof(bitmap_00000000));
+ memset(bitmap_00000000_write, 0xff, sizeof(bitmap_00000000_write));
+ memset(bitmap_40000000, 0xff, sizeof(bitmap_40000000));
+ memset(bitmap_c0000000, 0xff, sizeof(bitmap_c0000000));
+ memset(bitmap_c0000000_read, 0xff, sizeof(bitmap_c0000000_read));
+
+ deny_msr(bitmap_00000000_write, MSR_IA32_POWER_CTL);
+ deny_msr(bitmap_c0000000_read, MSR_SYSCALL_MASK);
+ deny_msr(bitmap_c0000000_read, MSR_GS_BASE);
+}
+
+struct kvm_msr_filter filter = {
+ .flags = KVM_MSR_FILTER_DEFAULT_DENY,
+ .ranges = {
+ {
+ .flags = KVM_MSR_FILTER_READ,
+ .base = 0x00000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_00000000,
+ }, {
+ .flags = KVM_MSR_FILTER_WRITE,
+ .base = 0x00000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_00000000_write,
+ }, {
+ .flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE,
+ .base = 0x40000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_40000000,
+ }, {
+ .flags = KVM_MSR_FILTER_READ,
+ .base = 0xc0000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_c0000000_read,
+ }, {
+ .flags = KVM_MSR_FILTER_WRITE,
+ .base = 0xc0000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_c0000000,
+ }, {
+ .flags = KVM_MSR_FILTER_WRITE | KVM_MSR_FILTER_READ,
+ .base = 0xdeadbeef,
+ .nmsrs = 1,
+ .bitmap = bitmap_deadbeef,
+ },
+ },
+};
+
+struct kvm_msr_filter no_filter = {
+ .flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
+};
+
+static void guest_msr_calls(bool trapped)
+{
+ /* This goes into the in-kernel emulation */
+ wrmsr(MSR_SYSCALL_MASK, 0);
+
+ if (trapped) {
+ /* This goes into user space emulation */
+ GUEST_ASSERT(rdmsr(MSR_SYSCALL_MASK) == MSR_SYSCALL_MASK);
+ GUEST_ASSERT(rdmsr(MSR_GS_BASE) == MSR_GS_BASE);
+ } else {
+ GUEST_ASSERT(rdmsr(MSR_SYSCALL_MASK) != MSR_SYSCALL_MASK);
+ GUEST_ASSERT(rdmsr(MSR_GS_BASE) != MSR_GS_BASE);
+ }
+
+ /* If trapped == true, this goes into user space emulation */
+ wrmsr(MSR_IA32_POWER_CTL, 0x1234);
+
+ /* This goes into the in-kernel emulation */
+ rdmsr(MSR_IA32_POWER_CTL);
+
+ /* Invalid MSR, should always be handled by user space exit */
+ GUEST_ASSERT(rdmsr(0xdeadbeef) == 0xdeadbeef);
+ wrmsr(0xdeadbeef, 0x1234);
+}
+
+static void guest_code(void)
+{
+ guest_msr_calls(true);
+
+ /*
+ * Disable msr filtering, so that the kernel
+ * handles everything in the next round
+ */
+ GUEST_SYNC(0);
+
+ guest_msr_calls(false);
+
+ GUEST_DONE();
+}
+
+static int handle_ucall(struct kvm_vm *vm)
+{
+ struct ucall uc;
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_ABORT:
+ TEST_FAIL("Guest assertion not met");
+ break;
+ case UCALL_SYNC:
+ vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &no_filter);
+ break;
+ case UCALL_DONE:
+ return 1;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+
+ return 0;
+}
+
+static void handle_rdmsr(struct kvm_run *run)
+{
+ run->msr.data = run->msr.index;
+ msr_reads++;
+
+ if (run->msr.index == MSR_SYSCALL_MASK ||
+ run->msr.index == MSR_GS_BASE) {
+ TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER,
+ "MSR read trap w/o access fault");
+ }
+
+ if (run->msr.index == 0xdeadbeef) {
+ TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_UNKNOWN,
+ "MSR deadbeef read trap w/o inval fault");
+ }
+}
+
+static void handle_wrmsr(struct kvm_run *run)
+{
+ /* ignore */
+ msr_writes++;
+
+ if (run->msr.index == MSR_IA32_POWER_CTL) {
+ TEST_ASSERT(run->msr.data == 0x1234,
+ "MSR data for MSR_IA32_POWER_CTL incorrect");
+ TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER,
+ "MSR_IA32_POWER_CTL trap w/o access fault");
+ }
+
+ if (run->msr.index == 0xdeadbeef) {
+ TEST_ASSERT(run->msr.data == 0x1234,
+ "MSR data for deadbeef incorrect");
+ TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_UNKNOWN,
+ "deadbeef trap w/o inval fault");
+ }
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_enable_cap cap = {
+ .cap = KVM_CAP_X86_USER_SPACE_MSR,
+ .args[0] = KVM_MSR_EXIT_REASON_INVAL |
+ KVM_MSR_EXIT_REASON_UNKNOWN |
+ KVM_MSR_EXIT_REASON_FILTER,
+ };
+ struct kvm_vm *vm;
+ struct kvm_run *run;
+ int rc;
+
+ /* Tell stdout not to buffer its content */
+ setbuf(stdout, NULL);
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ run = vcpu_state(vm, VCPU_ID);
+
+ rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR);
+ TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available");
+ vm_enable_cap(vm, &cap);
+
+ rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER);
+ TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available");
+
+ prepare_bitmaps();
+ vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter);
+
+ while (1) {
+ rc = _vcpu_run(vm, VCPU_ID);
+
+ TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc);
+
+ switch (run->exit_reason) {
+ case KVM_EXIT_X86_RDMSR:
+ handle_rdmsr(run);
+ break;
+ case KVM_EXIT_X86_WRMSR:
+ handle_wrmsr(run);
+ break;
+ case KVM_EXIT_IO:
+ if (handle_ucall(vm))
+ goto done;
+ break;
+ }
+
+ }
+
+done:
+ TEST_ASSERT(msr_reads == 4, "Handled 4 rdmsr in user space");
+ TEST_ASSERT(msr_writes == 3, "Handled 3 wrmsr in user space");
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index d6408bb..c2323c2 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -853,15 +853,17 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
struct eventfd_ctx *eventfd;
struct kvm_io_bus *bus;
int ret = -ENOENT;
+ bool wildcard;
eventfd = eventfd_ctx_fdget(args->fd);
if (IS_ERR(eventfd))
return PTR_ERR(eventfd);
+ wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
+
mutex_lock(&kvm->slots_lock);
list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
- bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
if (p->bus_idx != bus_idx ||
p->eventfd != eventfd ||
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 67cd0b8..68edd25 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4332,7 +4332,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
struct kvm_io_device *dev)
{
- int i;
+ int i, j;
struct kvm_io_bus *new_bus, *bus;
bus = kvm_get_bus(kvm, bus_idx);
@@ -4349,17 +4349,20 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
GFP_KERNEL_ACCOUNT);
- if (!new_bus) {
+ if (new_bus) {
+ memcpy(new_bus, bus, struct_size(bus, range, i));
+ new_bus->dev_count--;
+ memcpy(new_bus->range + i, bus->range + i + 1,
+ flex_array_size(new_bus, range, new_bus->dev_count - i));
+ } else {
pr_err("kvm: failed to shrink bus, removing it completely\n");
- goto broken;
+ for (j = 0; j < bus->dev_count; j++) {
+ if (j == i)
+ continue;
+ kvm_iodevice_destructor(bus->range[j].dev);
+ }
}
- memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
- new_bus->dev_count--;
- memcpy(new_bus->range + i, bus->range + i + 1,
- (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
-
-broken:
rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
synchronize_srcu_expedited(&kvm->srcu);
kfree(bus);