KVM: nVMX: Make nested control MSRs per-cpu
To enable nested apicv support, we need per-cpu vmx
control MSRs:
1. If in-kernel irqchip is enabled, we can enable nested
posted interrupt, we should set posted intr bit in
the nested_vmx_pinbased_ctls_high.
2. If in-kernel irqchip is disabled, we can not enable
nested posted interrupt, the posted intr bit
in the nested_vmx_pinbased_ctls_high will be cleared.
Since there would be different settings about in-kernel
irqchip between VMs, different nested control MSRs
are needed.
Signed-off-by: Wincy Van <fanwenyi0529@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 25984e7..a17bbb8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -411,6 +411,23 @@
/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
u64 vmcs01_debugctl;
+
+ u32 nested_vmx_procbased_ctls_low;
+ u32 nested_vmx_procbased_ctls_high;
+ u32 nested_vmx_true_procbased_ctls_low;
+ u32 nested_vmx_secondary_ctls_low;
+ u32 nested_vmx_secondary_ctls_high;
+ u32 nested_vmx_pinbased_ctls_low;
+ u32 nested_vmx_pinbased_ctls_high;
+ u32 nested_vmx_exit_ctls_low;
+ u32 nested_vmx_exit_ctls_high;
+ u32 nested_vmx_true_exit_ctls_low;
+ u32 nested_vmx_entry_ctls_low;
+ u32 nested_vmx_entry_ctls_high;
+ u32 nested_vmx_true_entry_ctls_low;
+ u32 nested_vmx_misc_low;
+ u32 nested_vmx_misc_high;
+ u32 nested_vmx_ept_caps;
};
#define POSTED_INTR_ON 0
@@ -2292,20 +2309,8 @@
* if the corresponding bit in the (32-bit) control field *must* be on, and a
* bit in the high half is on if the corresponding bit in the control field
* may be on. See also vmx_control_verify().
- * TODO: allow these variables to be modified (downgraded) by module options
- * or other means.
*/
-static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
-static u32 nested_vmx_true_procbased_ctls_low;
-static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
-static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
-static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
-static u32 nested_vmx_true_exit_ctls_low;
-static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
-static u32 nested_vmx_true_entry_ctls_low;
-static u32 nested_vmx_misc_low, nested_vmx_misc_high;
-static u32 nested_vmx_ept_caps;
-static __init void nested_vmx_setup_ctls_msrs(void)
+static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
{
/*
* Note that as a general rule, the high half of the MSRs (bits in
@@ -2324,57 +2329,71 @@
/* pin-based controls */
rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
- nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
- nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
- nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
- PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
- nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+ vmx->nested.nested_vmx_pinbased_ctls_low,
+ vmx->nested.nested_vmx_pinbased_ctls_high);
+ vmx->nested.nested_vmx_pinbased_ctls_low |=
+ PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+ vmx->nested.nested_vmx_pinbased_ctls_high &=
+ PIN_BASED_EXT_INTR_MASK |
+ PIN_BASED_NMI_EXITING |
+ PIN_BASED_VIRTUAL_NMIS;
+ vmx->nested.nested_vmx_pinbased_ctls_high |=
+ PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;
/* exit controls */
rdmsr(MSR_IA32_VMX_EXIT_CTLS,
- nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
- nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+ vmx->nested.nested_vmx_exit_ctls_low,
+ vmx->nested.nested_vmx_exit_ctls_high);
+ vmx->nested.nested_vmx_exit_ctls_low =
+ VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
- nested_vmx_exit_ctls_high &=
+ vmx->nested.nested_vmx_exit_ctls_high &=
#ifdef CONFIG_X86_64
VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
- nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+ vmx->nested.nested_vmx_exit_ctls_high |=
+ VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
if (vmx_mpx_supported())
- nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
+ vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
/* We support free control of debug control saving. */
- nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low &
+ vmx->nested.nested_vmx_true_exit_ctls_low =
+ vmx->nested.nested_vmx_exit_ctls_low &
~VM_EXIT_SAVE_DEBUG_CONTROLS;
/* entry controls */
rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
- nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
- nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
- nested_vmx_entry_ctls_high &=
+ vmx->nested.nested_vmx_entry_ctls_low,
+ vmx->nested.nested_vmx_entry_ctls_high);
+ vmx->nested.nested_vmx_entry_ctls_low =
+ VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
+ vmx->nested.nested_vmx_entry_ctls_high &=
#ifdef CONFIG_X86_64
VM_ENTRY_IA32E_MODE |
#endif
VM_ENTRY_LOAD_IA32_PAT;
- nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
- VM_ENTRY_LOAD_IA32_EFER);
+ vmx->nested.nested_vmx_entry_ctls_high |=
+ (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
if (vmx_mpx_supported())
- nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
+ vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
/* We support free control of debug control loading. */
- nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low &
+ vmx->nested.nested_vmx_true_entry_ctls_low =
+ vmx->nested.nested_vmx_entry_ctls_low &
~VM_ENTRY_LOAD_DEBUG_CONTROLS;
/* cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
- nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
- nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
- nested_vmx_procbased_ctls_high &=
+ vmx->nested.nested_vmx_procbased_ctls_low,
+ vmx->nested.nested_vmx_procbased_ctls_high);
+ vmx->nested.nested_vmx_procbased_ctls_low =
+ CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+ vmx->nested.nested_vmx_procbased_ctls_high &=
CPU_BASED_VIRTUAL_INTR_PENDING |
CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
@@ -2394,18 +2413,21 @@
* can use it to avoid exits to L1 - even when L0 runs L2
* without MSR bitmaps.
*/
- nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+ vmx->nested.nested_vmx_procbased_ctls_high |=
+ CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
CPU_BASED_USE_MSR_BITMAPS;
/* We support free control of CR3 access interception. */
- nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low &
+ vmx->nested.nested_vmx_true_procbased_ctls_low =
+ vmx->nested.nested_vmx_procbased_ctls_low &
~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
/* secondary cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
- nested_vmx_secondary_ctls_low = 0;
- nested_vmx_secondary_ctls_high &=
+ vmx->nested.nested_vmx_secondary_ctls_low,
+ vmx->nested.nested_vmx_secondary_ctls_high);
+ vmx->nested.nested_vmx_secondary_ctls_low = 0;
+ vmx->nested.nested_vmx_secondary_ctls_high &=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_WBINVD_EXITING |
@@ -2413,27 +2435,31 @@
if (enable_ept) {
/* nested EPT: emulate EPT also to L1 */
- nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT |
+ vmx->nested.nested_vmx_secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_EPT |
SECONDARY_EXEC_UNRESTRICTED_GUEST;
- nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
+ vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
VMX_EPT_INVEPT_BIT;
- nested_vmx_ept_caps &= vmx_capability.ept;
+ vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
/*
* For nested guests, we don't do anything specific
* for single context invalidation. Hence, only advertise
* support for global context invalidation.
*/
- nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
+ vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
} else
- nested_vmx_ept_caps = 0;
+ vmx->nested.nested_vmx_ept_caps = 0;
/* miscellaneous data */
- rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
- nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
- nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
+ rdmsr(MSR_IA32_VMX_MISC,
+ vmx->nested.nested_vmx_misc_low,
+ vmx->nested.nested_vmx_misc_high);
+ vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
+ vmx->nested.nested_vmx_misc_low |=
+ VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
VMX_MISC_ACTIVITY_HLT;
- nested_vmx_misc_high = 0;
+ vmx->nested.nested_vmx_misc_high = 0;
}
static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
@@ -2452,6 +2478,8 @@
/* Returns 0 on success, non-0 otherwise. */
static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
switch (msr_index) {
case MSR_IA32_VMX_BASIC:
/*
@@ -2466,36 +2494,44 @@
break;
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
case MSR_IA32_VMX_PINBASED_CTLS:
- *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low,
- nested_vmx_pinbased_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_pinbased_ctls_low,
+ vmx->nested.nested_vmx_pinbased_ctls_high);
break;
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
- *pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low,
- nested_vmx_procbased_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_true_procbased_ctls_low,
+ vmx->nested.nested_vmx_procbased_ctls_high);
break;
case MSR_IA32_VMX_PROCBASED_CTLS:
- *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
- nested_vmx_procbased_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_procbased_ctls_low,
+ vmx->nested.nested_vmx_procbased_ctls_high);
break;
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
- *pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low,
- nested_vmx_exit_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_true_exit_ctls_low,
+ vmx->nested.nested_vmx_exit_ctls_high);
break;
case MSR_IA32_VMX_EXIT_CTLS:
- *pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
- nested_vmx_exit_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_exit_ctls_low,
+ vmx->nested.nested_vmx_exit_ctls_high);
break;
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
- *pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low,
- nested_vmx_entry_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_true_entry_ctls_low,
+ vmx->nested.nested_vmx_entry_ctls_high);
break;
case MSR_IA32_VMX_ENTRY_CTLS:
- *pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
- nested_vmx_entry_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_entry_ctls_low,
+ vmx->nested.nested_vmx_entry_ctls_high);
break;
case MSR_IA32_VMX_MISC:
- *pdata = vmx_control_msr(nested_vmx_misc_low,
- nested_vmx_misc_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_misc_low,
+ vmx->nested.nested_vmx_misc_high);
break;
/*
* These MSRs specify bits which the guest must keep fixed (on or off)
@@ -2520,12 +2556,13 @@
*pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
break;
case MSR_IA32_VMX_PROCBASED_CTLS2:
- *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
- nested_vmx_secondary_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_secondary_ctls_low,
+ vmx->nested.nested_vmx_secondary_ctls_high);
break;
case MSR_IA32_VMX_EPT_VPID_CAP:
/* Currently, no nested vpid support */
- *pdata = nested_vmx_ept_caps;
+ *pdata = vmx->nested.nested_vmx_ept_caps;
break;
default:
return 1;
@@ -5045,11 +5082,12 @@
hypercall[2] = 0xc1;
}
-static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val)
+static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
unsigned long always_on = VMXON_CR0_ALWAYSON;
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- if (nested_vmx_secondary_ctls_high &
+ if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high &
SECONDARY_EXEC_UNRESTRICTED_GUEST &&
nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
always_on &= ~(X86_CR0_PE | X86_CR0_PG);
@@ -5074,7 +5112,7 @@
val = (val & ~vmcs12->cr0_guest_host_mask) |
(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
- if (!nested_cr0_valid(vmcs12, val))
+ if (!nested_cr0_valid(vcpu, val))
return 1;
if (kvm_set_cr0(vcpu, val))
@@ -5969,9 +6007,6 @@
kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
}
- if (nested)
- nested_vmx_setup_ctls_msrs();
-
vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
@@ -6989,6 +7024,7 @@
/* Emulate the INVEPT instruction */
static int handle_invept(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 vmx_instruction_info, types;
unsigned long type;
gva_t gva;
@@ -6997,8 +7033,9 @@
u64 eptp, gpa;
} operand;
- if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
- !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+ if (!(vmx->nested.nested_vmx_secondary_ctls_high &
+ SECONDARY_EXEC_ENABLE_EPT) ||
+ !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
@@ -7014,7 +7051,7 @@
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
- types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
+ types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
if (!(types & (1UL << type))) {
nested_vmx_failValid(vcpu,
@@ -8254,6 +8291,9 @@
goto free_vmcs;
}
+ if (nested)
+ nested_vmx_setup_ctls_msrs(vmx);
+
vmx->nested.current_vmptr = -1ull;
vmx->nested.current_vmcs12 = NULL;
@@ -8412,7 +8452,8 @@
{
WARN_ON(mmu_is_nested(vcpu));
kvm_init_shadow_ept_mmu(vcpu,
- nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT);
+ to_vmx(vcpu)->nested.nested_vmx_ept_caps &
+ VMX_EPT_EXECUTE_ONLY_BIT);
vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
@@ -9123,18 +9164,20 @@
}
if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
- nested_vmx_true_procbased_ctls_low,
- nested_vmx_procbased_ctls_high) ||
+ vmx->nested.nested_vmx_true_procbased_ctls_low,
+ vmx->nested.nested_vmx_procbased_ctls_high) ||
!vmx_control_verify(vmcs12->secondary_vm_exec_control,
- nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) ||
+ vmx->nested.nested_vmx_secondary_ctls_low,
+ vmx->nested.nested_vmx_secondary_ctls_high) ||
!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
- nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) ||
+ vmx->nested.nested_vmx_pinbased_ctls_low,
+ vmx->nested.nested_vmx_pinbased_ctls_high) ||
!vmx_control_verify(vmcs12->vm_exit_controls,
- nested_vmx_true_exit_ctls_low,
- nested_vmx_exit_ctls_high) ||
+ vmx->nested.nested_vmx_true_exit_ctls_low,
+ vmx->nested.nested_vmx_exit_ctls_high) ||
!vmx_control_verify(vmcs12->vm_entry_controls,
- nested_vmx_true_entry_ctls_low,
- nested_vmx_entry_ctls_high))
+ vmx->nested.nested_vmx_true_entry_ctls_low,
+ vmx->nested.nested_vmx_entry_ctls_high))
{
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
@@ -9147,7 +9190,7 @@
return 1;
}
- if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) ||
+ if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) ||
((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
nested_vmx_entry_failure(vcpu, vmcs12,
EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);