// SPDX-License-Identifier: GPL-2.0

#include <linux/frame.h>
#include <linux/percpu.h>

#include <asm/debugreg.h>
#include <asm/mmu_context.h>

#include "cpuid.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "trace.h"
#include "x86.h"

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);

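/*
 * CC() wraps a VM-Enter consistency check: it evaluates the check and, if it
 * fails, records the stringified expression via the kvm_nested_vmenter_failed
 * tracepoint before handing the result back to the caller.
 */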
#define CC(consistency_check)						\
({									\
	bool failed = (consistency_check);				\
	if (failed)							\
		trace_kvm_nested_vmenter_failed(#consistency_check, 0);	\
	failed;								\
})

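/*
 * SET_MSR_OR_WARN() writes an MSR via kvm_set_msr() and, if the write fails,
 * emits a ratelimited warning identifying the caller, MSR index and value.
 * The result of the write is handed back to the caller.
 */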
#define SET_MSR_OR_WARN(vcpu, idx, data)			\
({								\
	bool failed = kvm_set_msr(vcpu, idx, data);		\
	if (failed)						\
		pr_warn_ratelimited(				\
			"%s cannot write MSR (0x%x, 0x%llx)\n",	\
			__func__, idx, data);			\
	failed;							\
})

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])

struct shadow_vmcs_field {
	u16 encoding;
	u16 offset;
};
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static void init_vmcs_shadow_fields(void)
{
	int i, j;

	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;

	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		/*
		 * PML and the preemption timer can be emulated, but the
		 * processor cannot vmwrite to fields that don't exist
		 * on bare metal.
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
		return nested_vmx_failInvalid(vcpu);

	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force a shadow sync because
	 * VM_INSTRUCTION_ERROR is not shadowed
	 */
	return kvm_skip_emulated_instruction(vcpu);
}

static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: not to reset guest simply here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
}

static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}

static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, -1ull);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!vmx->nested.hv_evmcs)
		return;

	kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
	vmx->nested.hv_evmcs_vmptr = -1ull;
	vmx->nested.hv_evmcs = NULL;
}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);

	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = -1ull;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;
	/* Unpin physical memory we referred to in the vmcs02 */
	if (vmx->nested.apic_access_page) {
		kvm_release_page_clean(vmx->nested.apic_access_page);
		vmx->nested.apic_access_page = NULL;
	}
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
	vmx->nested.pi_desc = NULL;

	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
}

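/*
 * Sync the host state that was established for the previously loaded VMCS
 * into the newly loaded VMCS so that its host-state fields and cached
 * selectors stay consistent with what is currently loaded in hardware.
 */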
static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{
	struct vmcs_host_state *dest, *src;

	if (unlikely(!vmx->guest_state_loaded))
		return;

	src = &prev->host_state;
	dest = &vmx->loaded_vmcs->host_state;

	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
	dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
	dest->ds_sel = src->ds_sel;
	dest->es_sel = src->es_sel;
#endif
}

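/*
 * Switch the vCPU to a different loaded VMCS (e.g. vmcs01 <-> vmcs02),
 * reloading the per-CPU VMCS state and carrying the host state over from
 * the VMCS being switched away from.
 */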
static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (vmx->loaded_vmcs == vmcs)
		return;

	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu);
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	vmx_segment_cache_clear(vmx);
}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	vmx_leave_nested(vcpu);
	vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
	free_nested(vcpu);
	vcpu_put(vcpu);
}

static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 exit_reason;
	unsigned long exit_qualification = vcpu->arch.exit_qualification;

	if (vmx->nested.pml_full) {
		exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;
		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
	} else if (fault->error_code & PFERR_RSVD_MASK)
		exit_reason = EXIT_REASON_EPT_MISCONFIG;
	else
		exit_reason = EXIT_REASON_EPT_VIOLATION;

	nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}

static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	kvm_init_shadow_ept_mmu(vcpu,
			to_vmx(vcpu)->nested.msrs.ept_caps &
			VMX_EPT_EXECUTE_ONLY_BIT,
			nested_ept_ad_enabled(vcpu),
			nested_ept_get_cr3(vcpu));
	vcpu->arch.mmu->set_cr3 = vmx_set_cr3;
	vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;

	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}

static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{
	bool inequality, bit;

	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
	inequality =
		(error_code & vmcs12->page_fault_error_code_mask) !=
		 vmcs12->page_fault_error_code_match;
	return inequality ^ bit;
}


/*
 * KVM wants to inject page-faults which it got to the guest. This function
 * checks whether in a nested guest, we need to inject them to L1 or L2.
 */
static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned int nr = vcpu->arch.exception.nr;
	bool has_payload = vcpu->arch.exception.has_payload;
	unsigned long payload = vcpu->arch.exception.payload;

	if (nr == PF_VECTOR) {
		if (vcpu->arch.exception.nested_apf) {
			*exit_qual = vcpu->arch.apf.nested_apf_token;
			return 1;
		}
		if (nested_vmx_is_page_fault_vmexit(vmcs12,
						    vcpu->arch.exception.error_code)) {
			*exit_qual = has_payload ? payload : vcpu->arch.cr2;
			return 1;
		}
	} else if (vmcs12->exception_bitmap & (1u << nr)) {
		if (nr == DB_VECTOR) {
			if (!has_payload) {
				payload = vcpu->arch.dr6;
				payload &= ~(DR6_FIXED_1 | DR6_BT);
				payload ^= DR6_RTM;
			}
			*exit_qual = payload;
		} else
			*exit_qual = 0;
		return 1;
	}

	return 0;
}


static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	WARN_ON(!is_guest_mode(vcpu));

	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
	    !to_vmx(vcpu)->nested.nested_run_pending) {
		vmcs12->vm_exit_intr_error_code = fault->error_code;
		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
				  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
				  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
				  fault->address);
	} else {
		kvm_inject_page_fault(vcpu, fault);
	}
}

static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
{
	return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
}

static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
	    CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
		return -EINVAL;

	return 0;
}

/*
 * Check if MSR is intercepted for L01 MSR bitmap.
 */
static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
{
	unsigned long *msr_bitmap;
	int f = sizeof(unsigned long);

	if (!cpu_has_vmx_msr_bitmap())
		return true;

	msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;

	if (msr <= 0x1fff) {
		return !!test_bit(msr, msr_bitmap + 0x800 / f);
	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		return !!test_bit(msr, msr_bitmap + 0xc00 / f);
	}

	return true;
}

/*
 * If an MSR is allowed by L0, we should check whether it is allowed by L1.
 * The corresponding bit will be cleared unless both L0 and L1 allow it.
 */
static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
						 unsigned long *msr_bitmap_nested,
						 u32 msr, int type)
{
	int f = sizeof(unsigned long);

	/*
	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
	 * have the write-low and read-high bitmap offsets the wrong way round.
	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
	 */
	if (msr <= 0x1fff) {
		if (type & MSR_TYPE_R &&
		   !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
			/* read-low */
			__clear_bit(msr, msr_bitmap_nested + 0x000 / f);

		if (type & MSR_TYPE_W &&
		   !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
			/* write-low */
			__clear_bit(msr, msr_bitmap_nested + 0x800 / f);

	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		if (type & MSR_TYPE_R &&
		   !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
			/* read-high */
			__clear_bit(msr, msr_bitmap_nested + 0x400 / f);

		if (type & MSR_TYPE_W &&
		   !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
			/* write-high */
			__clear_bit(msr, msr_bitmap_nested + 0xc00 / f);

	}
}

static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) {
	int msr;

	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;

		msr_bitmap[word] = ~0;
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}
}

/*
 * Merge L0's and L1's MSR bitmap, return false to indicate that
 * we do not use the hardware.
 */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	int msr;
	unsigned long *msr_bitmap_l1;
	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
	struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;

	/* Nothing to do if the MSR bitmap is not in use. */
	if (!cpu_has_vmx_msr_bitmap() ||
	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return false;

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
		return false;

	msr_bitmap_l1 = (unsigned long *)map->hva;

	/*
	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
	 * the x2APIC MSR range and selectively disable them below.
	 */
	enable_x2apic_msr_intercepts(msr_bitmap_l0);

	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
			/*
			 * L0 need not intercept reads for MSRs between 0x800
			 * and 0x8ff, it just lets the processor take the value
			 * from the virtual-APIC page; take those 256 bits
			 * directly from the L1 bitmap.
			 */
			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
				unsigned word = msr / BITS_PER_LONG;

				msr_bitmap_l0[word] = msr_bitmap_l1[word];
			}
		}

		nested_vmx_disable_intercept_for_msr(
			msr_bitmap_l1, msr_bitmap_l0,
			X2APIC_MSR(APIC_TASKPRI),
			MSR_TYPE_R | MSR_TYPE_W);

		if (nested_cpu_has_vid(vmcs12)) {
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_EOI),
				MSR_TYPE_W);
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_SELF_IPI),
				MSR_TYPE_W);
		}
	}

	/* KVM unconditionally exposes the FS/GS base MSRs to L1. */
	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_FS_BASE, MSR_TYPE_RW);

	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_GS_BASE, MSR_TYPE_RW);

	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_KERNEL_GS_BASE, MSR_TYPE_RW);

	/*
	 * Checking the L0->L1 bitmap is trying to verify two things:
	 *
	 * 1. L0 gave permission to L1 to actually passthrough the MSR. This
	 *    ensures that we do not accidentally generate an L02 MSR bitmap
	 *    from the L12 MSR bitmap that is too permissive.
	 * 2. That L1 or L2s have actually used the MSR. This avoids
	 *    unnecessary merging of the bitmap if the MSR is unused. This
	 *    works properly because we only update the L01 MSR bitmap lazily.
	 *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
	 *    updated to reflect this when L1 (or its L2s) actually write to
	 *    the MSR.
	 */
	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_SPEC_CTRL,
					MSR_TYPE_R | MSR_TYPE_W);

	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_PRED_CMD,
					MSR_TYPE_W);

	kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);

	return true;
}

static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct kvm_host_map map;
	struct vmcs12 *shadow;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	shadow = get_shadow_vmcs12(vcpu);

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
		return;

	memcpy(shadow, map.hva, VMCS12_SIZE);
	kvm_vcpu_unmap(vcpu, &map, false);
}

static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
			get_shadow_vmcs12(vcpu), VMCS12_SIZE);
}

/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->vm_exit_controls &
		VM_EXIT_ACK_INTR_ON_EXIT;
}

static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
{
	return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
}

static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
						  struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
		return -EINVAL;
	else
		return 0;
}

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
	 */
	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
		return -EINVAL;

	/*
	 * bits 15:8 should be zero in posted_intr_nv,
	 * the descriptor address has been already checked
	 * in nested_get_vmcs12_pages.
	 *
	 * bits 5:0 of posted_intr_desc_addr should be zero.
	 */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	    (CC(!nested_cpu_has_vid(vmcs12)) ||
	     CC(!nested_exit_intr_ack_set(vcpu)) ||
	     CC((vmcs12->posted_intr_nv & 0xff00)) ||
	     CC((vmcs12->posted_intr_desc_addr & 0x3f)) ||
	     CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))))
		return -EINVAL;

	/* tpr shadow is needed by all apicv features. */
	if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       u32 count, u64 addr)
{
	int maxphyaddr;

	if (count == 0)
		return 0;
	maxphyaddr = cpuid_maxphyaddr(vcpu);
	if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
	    (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr)
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
						     struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_load_count,
					   vmcs12->vm_exit_msr_load_addr)) ||
	    CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_store_count,
					   vmcs12->vm_exit_msr_store_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
						      struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_entry_msr_load_count,
					   vmcs12->vm_entry_msr_load_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	if (CC(!nested_cpu_has_ept(vmcs12)) ||
	    CC(!page_address_valid(vcpu, vmcs12->pml_address)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
							struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
							 struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{
	/* x2APIC MSR accesses are not allowed */
	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
		return -EINVAL;
	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
	    CC(e->index == MSR_IA32_UCODE_REV))
		return -EINVAL;
	if (CC(e->reserved != 0))
		return -EINVAL;
	return 0;
}

static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_FS_BASE) ||
	    CC(e->index == MSR_GS_BASE) ||
	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
				      struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

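/*
 * Maximum number of MSRs that L1 may place in a single VM-Entry/VM-Exit MSR
 * load/store list, derived from the MSR-list size advertised in the emulated
 * IA32_VMX_MISC MSR.
 */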
static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}

/*
 * Load guest's/host's MSRs at nested entry/exit.
 * Return 0 for success, the entry index for failure.
 *
 * One of the failure modes for MSR load/store is when a list exceeds the
 * virtual hardware's capacity. To maintain compatibility with hardware as
 * much as possible, process all valid entries before failing rather than
 * precheck for a capacity violation.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_set_msr(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	return i + 1;
}

static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
					    u32 msr_index,
					    u64 *data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If the L0 hypervisor stored a more accurate value for the TSC that
	 * does not include the time taken for emulation of the L2->L1
	 * VM-exit in L0, use the more accurate value.
	 */
	if (msr_index == MSR_IA32_TSC) {
		int index = vmx_find_msr_index(&vmx->msr_autostore.guest,
					       MSR_IA32_TSC);

		if (index >= 0) {
			u64 val = vmx->msr_autostore.guest.val[index].value;

			*data = kvm_read_l1_tsc(vcpu, val);
			return true;
		}
	}

	if (kvm_get_msr(vcpu, msr_index, data)) {
		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
				     msr_index);
		return false;
	}
	return true;
}

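/*
 * Read the index and reserved fields of an entry in a guest-provided MSR list
 * and sanity check them; the value field is not needed by the callers and is
 * deliberately not read.
 */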
static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
				     struct vmx_msr_entry *e)
{
	if (kvm_vcpu_read_guest(vcpu,
				gpa + i * sizeof(*e),
				e, 2 * sizeof(u32))) {
		pr_debug_ratelimited(
			"%s cannot read MSR entry (%u, 0x%08llx)\n",
			__func__, i, gpa + i * sizeof(*e));
		return false;
	}
	if (nested_vmx_store_msr_check(vcpu, e)) {
		pr_debug_ratelimited(
			"%s check failed (%u, 0x%x, 0x%x)\n",
			__func__, i, e->index, e->reserved);
		return false;
	}
	return true;
}

static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u64 data;
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			return -EINVAL;

		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return -EINVAL;

		if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
			return -EINVAL;

		if (kvm_vcpu_write_guest(vcpu,
					 gpa + i * sizeof(e) +
					     offsetof(struct vmx_msr_entry, value),
					 &data, sizeof(data))) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, data);
			return -EINVAL;
		}
	}
	return 0;
}

static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 count = vmcs12->vm_exit_msr_store_count;
	u64 gpa = vmcs12->vm_exit_msr_store_addr;
	struct vmx_msr_entry e;
	u32 i;

	for (i = 0; i < count; i++) {
		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return false;

		if (e.index == msr_index)
			return true;
	}
	return false;
}

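/*
 * Add or remove an MSR in the vmcs02 VM-Exit MSR-store area (the "autostore"
 * list) so that it mirrors whether the MSR appears in the VM-Exit MSR-store
 * list provided by L1 in vmcs12.
 */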
static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
					   u32 msr_index)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
	bool in_vmcs12_store_list;
	int msr_autostore_index;
	bool in_autostore_list;
	int last;

	msr_autostore_index = vmx_find_msr_index(autostore, msr_index);
	in_autostore_list = msr_autostore_index >= 0;
	in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);

	if (in_vmcs12_store_list && !in_autostore_list) {
		if (autostore->nr == NR_LOADSTORE_MSRS) {
			/*
			 * Emulated VMEntry does not fail here. Instead a less
			 * accurate value will be returned by
			 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
			 * instead of reading the value from the vmcs02 VMExit
			 * MSR-store area.
			 */
			pr_warn_ratelimited(
				"Not enough msr entries in msr_autostore. Can't add msr %x\n",
				msr_index);
			return;
		}
		last = autostore->nr++;
		autostore->val[last].index = msr_index;
	} else if (!in_vmcs12_store_list && in_autostore_list) {
		last = --autostore->nr;
		autostore->val[msr_autostore_index] = autostore->val[last];
	}
}

static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
	unsigned long invalid_mask;

	invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
	return (val & invalid_mask) == 0;
}

/*
 * Load the guest's/host's CR3 at nested entry/exit.  @nested_ept is true if
 * we are emulating VM-Entry into a guest with EPT enabled.  Returns 0 on
 * success, -EINVAL on failure; on failure, the invalid-state exit
 * qualification code is assigned to *entry_failure_code.
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
			       u32 *entry_failure_code)
{
	if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
		if (CC(!nested_cr3_valid(vcpu, cr3))) {
			*entry_failure_code = ENTRY_FAIL_DEFAULT;
			return -EINVAL;
		}

		/*
		 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
		 * must not be dereferenced.
		 */
		if (is_pae_paging(vcpu) && !nested_ept) {
			if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
				*entry_failure_code = ENTRY_FAIL_PDPTE;
				return -EINVAL;
			}
		}
	}

	if (!nested_ept)
		kvm_mmu_new_cr3(vcpu, cr3, false);

	vcpu->arch.cr3 = cr3;
	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);

	kvm_init_mmu(vcpu, false);

	return 0;
}

/*
 * Returns true if KVM is able to configure the CPU to tag TLB entries
 * populated by L2 differently from TLB entries populated by L1.
 *
 * If L0 uses EPT, L1 and L2 run with different EPTPs because guest_mode
 * is part of kvm_mmu_page_role; thus TLB entries are tagged with
 * different EPTPs.
 *
 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
 * with different VPIDs (L1 entries are tagged with vmx->vpid
 * while L2 entries are tagged with vmx->nested.vpid02).
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	return enable_ept ||
	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}

static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
}

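/*
 * Returns true if every bit set in @subset (within @mask) is also set in
 * @superset.  Used by the vmx_restore_*() helpers to validate VMX capability
 * MSR values written by userspace against what KVM currently advertises.
 */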
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	superset &= mask;
	subset &= mask;

	return (superset | subset) == superset;
}

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved =
		/* feature (except bit 48; see below) */
		BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
		/* reserved */
		BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
	u64 vmx_basic = vmx->nested.msrs.basic;

	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
	 */
	if (data & BIT_ULL(48))
		return -EINVAL;

	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
	    vmx_basic_vmcs_revision_id(data))
		return -EINVAL;

	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
		return -EINVAL;

	vmx->nested.msrs.basic = data;
	return 0;
}

static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u64 supported;
	u32 *lowp, *highp;

	switch (msr_index) {
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
		lowp = &vmx->nested.msrs.pinbased_ctls_low;
		highp = &vmx->nested.msrs.pinbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
		lowp = &vmx->nested.msrs.procbased_ctls_low;
		highp = &vmx->nested.msrs.procbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		lowp = &vmx->nested.msrs.exit_ctls_low;
		highp = &vmx->nested.msrs.exit_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		lowp = &vmx->nested.msrs.entry_ctls_low;
		highp = &vmx->nested.msrs.entry_ctls_high;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		lowp = &vmx->nested.msrs.secondary_ctls_low;
		highp = &vmx->nested.msrs.secondary_ctls_high;
		break;
	default:
		BUG();
	}

	supported = vmx_control_msr(*lowp, *highp);

	/* Check must-be-1 bits are still 1. */
	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
		return -EINVAL;

	/* Check must-be-0 bits are still 0. */
	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
		return -EINVAL;

	*lowp = data;
	*highp = data >> 32;
	return 0;
}

static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved_bits =
		/* feature */
		BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
		BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
		/* reserved */
		GENMASK_ULL(13, 9) | BIT_ULL(31);
	u64 vmx_misc;

	vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				   vmx->nested.msrs.misc_high);

	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
		return -EINVAL;

	if ((vmx->nested.msrs.pinbased_ctls_high &
	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    vmx_misc_preemption_timer_rate(data) !=
	    vmx_misc_preemption_timer_rate(vmx_misc))
		return -EINVAL;

	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
		return -EINVAL;

	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
		return -EINVAL;

	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
		return -EINVAL;

	vmx->nested.msrs.misc_low = data;
	vmx->nested.msrs.misc_high = data >> 32;

	return 0;
}

static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
	u64 vmx_ept_vpid_cap;

	vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
					   vmx->nested.msrs.vpid_caps);

	/* Every bit is either reserved or a feature bit. */
	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
		return -EINVAL;

	vmx->nested.msrs.ept_caps = data;
	vmx->nested.msrs.vpid_caps = data >> 32;
	return 0;
}

static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u64 *msr;

	switch (msr_index) {
	case MSR_IA32_VMX_CR0_FIXED0:
		msr = &vmx->nested.msrs.cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		msr = &vmx->nested.msrs.cr4_fixed0;
		break;
	default:
		BUG();
	}

	/*
	 * 1 bits (which indicates bits which "must-be-1" during VMX operation)
	 * must be 1 in the restored value.
	 */
	if (!is_bitwise_subset(data, *msr, -1ULL))
		return -EINVAL;

	*msr = data;
	return 0;
}

/*
 * Called when userspace is restoring VMX MSRs.
 *
 * Returns 0 on success, non-0 otherwise.
 */
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Don't allow changes to the VMX capability MSRs while the vCPU
	 * is in VMX operation.
	 */
	if (vmx->nested.vmxon)
		return -EBUSY;

	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		return vmx_restore_vmx_basic(vmx, data);
	case MSR_IA32_VMX_PINBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		/*
		 * The "non-true" VMX capability MSRs are generated from the
		 * "true" MSRs, so we do not support restoring them directly.
		 *
		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
		 * should restore the "true" MSRs with the must-be-1 bits
		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
		 * DEFAULT SETTINGS".
		 */
		return -EINVAL;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		return vmx_restore_control_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_MISC:
		return vmx_restore_vmx_misc(vmx, data);
	case MSR_IA32_VMX_CR0_FIXED0:
	case MSR_IA32_VMX_CR4_FIXED0:
		return vmx_restore_fixed0_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_CR0_FIXED1:
	case MSR_IA32_VMX_CR4_FIXED1:
		/*
		 * These MSRs are generated based on the vCPU's CPUID, so we
		 * do not support restoring them directly.
		 */
		return -EINVAL;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
	case MSR_IA32_VMX_VMCS_ENUM:
		vmx->nested.msrs.vmcs_enum = data;
		return 0;
	case MSR_IA32_VMX_VMFUNC:
		if (data & ~vmx->nested.msrs.vmfunc_controls)
			return -EINVAL;
		vmx->nested.msrs.vmfunc_controls = data;
		return 0;
	default:
		/*
		 * The rest of the VMX capability MSRs do not support restore.
		 */
		return -EINVAL;
	}
}

/* Returns 0 on success, non-0 otherwise. */
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{
	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		*pdata = msrs->basic;
		break;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_PINBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->pinbased_ctls_low,
			msrs->pinbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->procbased_ctls_low,
			msrs->procbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
		*pdata = vmx_control_msr(
			msrs->exit_ctls_low,
			msrs->exit_ctls_high);
		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		*pdata = vmx_control_msr(
			msrs->entry_ctls_low,
			msrs->entry_ctls_high);
		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_MISC:
		*pdata = vmx_control_msr(
			msrs->misc_low,
			msrs->misc_high);
		break;
	case MSR_IA32_VMX_CR0_FIXED0:
		*pdata = msrs->cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR0_FIXED1:
		*pdata = msrs->cr0_fixed1;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		*pdata = msrs->cr4_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED1:
		*pdata = msrs->cr4_fixed1;
		break;
	case MSR_IA32_VMX_VMCS_ENUM:
		*pdata = msrs->vmcs_enum;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*pdata = vmx_control_msr(
			msrs->secondary_ctls_low,
			msrs->secondary_ctls_high);
		break;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		*pdata = msrs->ept_caps |
			((u64)msrs->vpid_caps << 32);
		break;
	case MSR_IA32_VMX_VMFUNC:
		*pdata = msrs->vmfunc_controls;
		break;
	default:
		return 1;
	}

	return 0;
}

/*
 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
 * been modified by the L1 guest. Note, "writable" in this context means
 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
 * VM-exit information fields (which are actually writable if the vCPU is
 * configured to support "VMWRITE to any supported field in the VMCS").
 */
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i;

	if (WARN_ON(!shadow_vmcs))
		return;

	preempt_disable();

	vmcs_load(shadow_vmcs);

	for (i = 0; i < max_shadow_read_write_fields; i++) {
		field = shadow_read_write_fields[i];
		val = __vmcs_readl(field.encoding);
		vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);

	preempt_enable();
}

static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
	const struct shadow_vmcs_field *fields[] = {
		shadow_read_write_fields,
		shadow_read_only_fields
	};
	const int max_fields[] = {
		max_shadow_read_write_fields,
		max_shadow_read_only_fields
	};
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i, q;

	if (WARN_ON(!shadow_vmcs))
		return;

	vmcs_load(shadow_vmcs);

	for (q = 0; q < ARRAY_SIZE(fields); q++) {
		for (i = 0; i < max_fields[q]; i++) {
			field = fields[q][i];
			val = vmcs12_read_any(vmcs12, field.encoding,
					      field.offset);
			__vmcs_writel(field.encoding, val);
		}
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);
}

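/*
 * Copy fields from the Hyper-V enlightened VMCS to the cached vmcs12,
 * honoring the clean-field bits so that field groups the guest has marked
 * clean are skipped.
 */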
static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;

	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
	vmcs12->tpr_threshold = evmcs->tpr_threshold;
	vmcs12->guest_rip = evmcs->guest_rip;

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
		vmcs12->guest_rsp = evmcs->guest_rsp;
		vmcs12->guest_rflags = evmcs->guest_rflags;
		vmcs12->guest_interruptibility_info =
			evmcs->guest_interruptibility_info;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
		vmcs12->cpu_based_vm_exec_control =
			evmcs->cpu_based_vm_exec_control;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
		vmcs12->exception_bitmap = evmcs->exception_bitmap;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
		vmcs12->vm_entry_intr_info_field =
			evmcs->vm_entry_intr_info_field;
		vmcs12->vm_entry_exception_error_code =
			evmcs->vm_entry_exception_error_code;
		vmcs12->vm_entry_instruction_len =
			evmcs->vm_entry_instruction_len;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
		vmcs12->host_cr0 = evmcs->host_cr0;
		vmcs12->host_cr3 = evmcs->host_cr3;
		vmcs12->host_cr4 = evmcs->host_cr4;
		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
		vmcs12->host_rip = evmcs->host_rip;
		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
		vmcs12->host_es_selector = evmcs->host_es_selector;
		vmcs12->host_cs_selector = evmcs->host_cs_selector;
		vmcs12->host_ss_selector = evmcs->host_ss_selector;
		vmcs12->host_ds_selector = evmcs->host_ds_selector;
		vmcs12->host_fs_selector = evmcs->host_fs_selector;
		vmcs12->host_gs_selector = evmcs->host_gs_selector;
		vmcs12->host_tr_selector = evmcs->host_tr_selector;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
		vmcs12->pin_based_vm_exec_control =
			evmcs->pin_based_vm_exec_control;
		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
		vmcs12->secondary_vm_exec_control =
			evmcs->secondary_vm_exec_control;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
		vmcs12->msr_bitmap = evmcs->msr_bitmap;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
		vmcs12->guest_es_base = evmcs->guest_es_base;
		vmcs12->guest_cs_base = evmcs->guest_cs_base;
		vmcs12->guest_ss_base = evmcs->guest_ss_base;
		vmcs12->guest_ds_base = evmcs->guest_ds_base;
		vmcs12->guest_fs_base = evmcs->guest_fs_base;
		vmcs12->guest_gs_base = evmcs->guest_gs_base;
		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
		vmcs12->guest_tr_base = evmcs->guest_tr_base;
		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
1626 vmcs12->guest_es_limit = evmcs->guest_es_limit;
1627 vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
1628 vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
1629 vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
1630 vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
1631 vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
1632 vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
1633 vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
1634 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
1635 vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
1636 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
1637 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
1638 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
1639 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
1640 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
1641 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
1642 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
1643 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
1644 vmcs12->guest_es_selector = evmcs->guest_es_selector;
1645 vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
1646 vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
1647 vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
1648 vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
1649 vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
1650 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
1651 vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
1652 }
1653
1654 if (unlikely(!(evmcs->hv_clean_fields &
1655 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
1656 vmcs12->tsc_offset = evmcs->tsc_offset;
1657 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
1658 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
1659 }
1660
1661 if (unlikely(!(evmcs->hv_clean_fields &
1662 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
1663 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
1664 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
1665 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
1666 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
1667 vmcs12->guest_cr0 = evmcs->guest_cr0;
1668 vmcs12->guest_cr3 = evmcs->guest_cr3;
1669 vmcs12->guest_cr4 = evmcs->guest_cr4;
1670 vmcs12->guest_dr7 = evmcs->guest_dr7;
1671 }
1672
1673 if (unlikely(!(evmcs->hv_clean_fields &
1674 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
1675 vmcs12->host_fs_base = evmcs->host_fs_base;
1676 vmcs12->host_gs_base = evmcs->host_gs_base;
1677 vmcs12->host_tr_base = evmcs->host_tr_base;
1678 vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
1679 vmcs12->host_idtr_base = evmcs->host_idtr_base;
1680 vmcs12->host_rsp = evmcs->host_rsp;
1681 }
1682
1683 if (unlikely(!(evmcs->hv_clean_fields &
1684 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
1685 vmcs12->ept_pointer = evmcs->ept_pointer;
1686 vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
1687 }
1688
1689 if (unlikely(!(evmcs->hv_clean_fields &
1690 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
1691 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
1692 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
1693 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
1694 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
1695 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
1696 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
1697 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
1698 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
1699 vmcs12->guest_pending_dbg_exceptions =
1700 evmcs->guest_pending_dbg_exceptions;
1701 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
1702 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
1703 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
1704 vmcs12->guest_activity_state = evmcs->guest_activity_state;
1705 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
1706 }
1707
1708 /*
1709 * Not used?
1710 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
1711 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
1712 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
1713 * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
1714 * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
1715 * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
1716 * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
1717 * vmcs12->page_fault_error_code_mask =
1718 * evmcs->page_fault_error_code_mask;
1719 * vmcs12->page_fault_error_code_match =
1720 * evmcs->page_fault_error_code_match;
1721 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
1722 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
1723 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
1724 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
1725 */
1726
1727 /*
1728 * Read only fields:
1729 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
1730 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
1731 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
1732 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
1733 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
1734 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
1735 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
1736 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
1737 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
1738 * vmcs12->exit_qualification = evmcs->exit_qualification;
1739 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
1740 *
1741 * Not present in struct vmcs12:
1742 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
1743 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
1744 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
1745 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
1746 */
1747
1748 return 0;
1749}
1750
1751static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
1752{
1753 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1754 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1755
1756 /*
1757 * Should not be changed by KVM:
1758 *
1759 * evmcs->host_es_selector = vmcs12->host_es_selector;
1760 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
1761 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
1762 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
1763 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
1764 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
1765 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
1766 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
1767 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
1768 * evmcs->host_cr0 = vmcs12->host_cr0;
1769 * evmcs->host_cr3 = vmcs12->host_cr3;
1770 * evmcs->host_cr4 = vmcs12->host_cr4;
1771 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
1772 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
1773 * evmcs->host_rip = vmcs12->host_rip;
1774 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
1775 * evmcs->host_fs_base = vmcs12->host_fs_base;
1776 * evmcs->host_gs_base = vmcs12->host_gs_base;
1777 * evmcs->host_tr_base = vmcs12->host_tr_base;
1778 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
1779 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
1780 * evmcs->host_rsp = vmcs12->host_rsp;
Sean Christopherson3731905ef2019-05-07 08:36:27 -07001781 * sync_vmcs02_to_vmcs12() doesn't read these:
Sean Christopherson55d23752018-12-03 13:53:18 -08001782 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
1783 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
1784 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
1785 * evmcs->ept_pointer = vmcs12->ept_pointer;
1786 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
1787 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
1788 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
1789 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
1790 * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
1791 * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
1792 * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
1793 * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
1794 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
1795 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
1796 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
1797 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
1798 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
1799 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
1800 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
1801 * evmcs->page_fault_error_code_mask =
1802 * vmcs12->page_fault_error_code_mask;
1803 * evmcs->page_fault_error_code_match =
1804 * vmcs12->page_fault_error_code_match;
1805 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
1806 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
1807 * evmcs->tsc_offset = vmcs12->tsc_offset;
1808 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
1809 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
1810 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
1811 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
1812 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
1813 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
1814 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
1815 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
1816 *
1817 * Not present in struct vmcs12:
1818 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
1819 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
1820 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
1821 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
1822 */
1823
1824 evmcs->guest_es_selector = vmcs12->guest_es_selector;
1825 evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
1826 evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
1827 evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
1828 evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
1829 evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
1830 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
1831 evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
1832
1833 evmcs->guest_es_limit = vmcs12->guest_es_limit;
1834 evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
1835 evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
1836 evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
1837 evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
1838 evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
1839 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
1840 evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
1841 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
1842 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
1843
1844 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
1845 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
1846 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
1847 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
1848 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
1849 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
1850 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
1851 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
1852
1853 evmcs->guest_es_base = vmcs12->guest_es_base;
1854 evmcs->guest_cs_base = vmcs12->guest_cs_base;
1855 evmcs->guest_ss_base = vmcs12->guest_ss_base;
1856 evmcs->guest_ds_base = vmcs12->guest_ds_base;
1857 evmcs->guest_fs_base = vmcs12->guest_fs_base;
1858 evmcs->guest_gs_base = vmcs12->guest_gs_base;
1859 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
1860 evmcs->guest_tr_base = vmcs12->guest_tr_base;
1861 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
1862 evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
1863
1864 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
1865 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
1866
1867 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
1868 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
1869 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
1870 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
1871
1872 evmcs->guest_pending_dbg_exceptions =
1873 vmcs12->guest_pending_dbg_exceptions;
1874 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
1875 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
1876
1877 evmcs->guest_activity_state = vmcs12->guest_activity_state;
1878 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
1879
1880 evmcs->guest_cr0 = vmcs12->guest_cr0;
1881 evmcs->guest_cr3 = vmcs12->guest_cr3;
1882 evmcs->guest_cr4 = vmcs12->guest_cr4;
1883 evmcs->guest_dr7 = vmcs12->guest_dr7;
1884
1885 evmcs->guest_physical_address = vmcs12->guest_physical_address;
1886
1887 evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
1888 evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
1889 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
1890 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
1891 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
1892 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
1893 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
1894 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
1895
1896 evmcs->exit_qualification = vmcs12->exit_qualification;
1897
1898 evmcs->guest_linear_address = vmcs12->guest_linear_address;
1899 evmcs->guest_rsp = vmcs12->guest_rsp;
1900 evmcs->guest_rflags = vmcs12->guest_rflags;
1901
1902 evmcs->guest_interruptibility_info =
1903 vmcs12->guest_interruptibility_info;
1904 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
1905 evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
1906 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
1907 evmcs->vm_entry_exception_error_code =
1908 vmcs12->vm_entry_exception_error_code;
1909 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
1910
1911 evmcs->guest_rip = vmcs12->guest_rip;
1912
1913 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
1914
1915 return 0;
1916}
1917
1918/*
1919	 * This is the equivalent of the nested hypervisor executing the vmptrld
1920 * instruction.
1921 */
1922static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
1923 bool from_launch)
1924{
1925 struct vcpu_vmx *vmx = to_vmx(vcpu);
Vitaly Kuznetsova21a39c2019-06-28 13:23:32 +02001926 bool evmcs_gpa_changed = false;
Vitaly Kuznetsov11e34912019-06-28 13:23:33 +02001927 u64 evmcs_gpa;
Sean Christopherson55d23752018-12-03 13:53:18 -08001928
1929 if (likely(!vmx->nested.enlightened_vmcs_enabled))
1930 return 1;
1931
Vitaly Kuznetsov11e34912019-06-28 13:23:33 +02001932 if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
Sean Christopherson55d23752018-12-03 13:53:18 -08001933 return 1;
1934
Vitaly Kuznetsov11e34912019-06-28 13:23:33 +02001935 if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
Sean Christopherson55d23752018-12-03 13:53:18 -08001936 if (!vmx->nested.hv_evmcs)
1937 vmx->nested.current_vmptr = -1ull;
1938
1939 nested_release_evmcs(vcpu);
1940
Vitaly Kuznetsov11e34912019-06-28 13:23:33 +02001941 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
KarimAllah Ahmeddee9c042019-01-31 21:24:42 +01001942 &vmx->nested.hv_evmcs_map))
Sean Christopherson55d23752018-12-03 13:53:18 -08001943 return 0;
1944
KarimAllah Ahmeddee9c042019-01-31 21:24:42 +01001945 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
Sean Christopherson55d23752018-12-03 13:53:18 -08001946
1947 /*
1948		 * Currently, KVM only supports eVMCS version 1
1949		 * (== KVM_EVMCS_VERSION) and thus we expect the guest to set the
1950		 * first u32 field of the eVMCS, which specifies the eVMCS
1951		 * VersionNumber, to this value.
1952		 *
1953		 * The guest should learn the eVMCS versions supported by the host
1954		 * by examining CPUID.0x4000000A.EAX[0:15]. The host userspace VMM
1955		 * is expected to set this CPUID leaf according to the value
1956		 * returned in vmcs_version from nested_enable_evmcs().
1957		 *
1958		 * However, it turns out that Microsoft Hyper-V fails to comply
1959		 * with its own invented interface: when Hyper-V uses eVMCS, it
1960		 * just sets the first u32 field of the eVMCS to the revision_id
1961		 * specified in MSR_IA32_VMX_BASIC, instead of to an eVMCS version
1962		 * number, i.e. one of the supported versions specified in
1963		 * CPUID.0x4000000A.EAX[0:15].
1964		 *
1965		 * To overcome this Hyper-V bug, accept either a supported eVMCS
1966		 * version or the VMCS12 revision_id as valid values for the first
1967		 * u32 field of the eVMCS.
1968 */
1969 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
1970 (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
1971 nested_release_evmcs(vcpu);
1972 return 0;
1973 }
1974
1975 vmx->nested.dirty_vmcs12 = true;
Vitaly Kuznetsov11e34912019-06-28 13:23:33 +02001976 vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
Sean Christopherson55d23752018-12-03 13:53:18 -08001977
Vitaly Kuznetsova21a39c2019-06-28 13:23:32 +02001978 evmcs_gpa_changed = true;
Sean Christopherson55d23752018-12-03 13:53:18 -08001979 /*
1980		 * Unlike a normal vmcs12, an enlightened vmcs12 is not fully
1981		 * reloaded from the guest's memory (read-only fields, fields not
1982		 * present in struct hv_enlightened_vmcs, ...). Make sure there
1983		 * are no leftovers.
1984 */
1985 if (from_launch) {
1986 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1987 memset(vmcs12, 0, sizeof(*vmcs12));
1988 vmcs12->hdr.revision_id = VMCS12_REVISION;
1989 }
1990
1991 }
Vitaly Kuznetsova21a39c2019-06-28 13:23:32 +02001992
1993 /*
1994	 * Clean fields data can't be used on VMLAUNCH or when we switch
1995	 * between different L2 guests, as KVM keeps a single VMCS12 per L1.
1996 */
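	/*
	 * As copy_enlightened_to_vmcs12() above illustrates, each clean-fields
	 * bit covers a group of eVMCS fields (e.g.
	 * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC covers guest RSP, RFLAGS
	 * and the interruptibility info); a set bit allows KVM to skip copying
	 * that group. Clearing all the bits here forces the next sync to copy
	 * every field from the eVMCS.
	 */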
1997 if (from_launch || evmcs_gpa_changed)
1998 vmx->nested.hv_evmcs->hv_clean_fields &=
1999 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2000
Sean Christopherson55d23752018-12-03 13:53:18 -08002001 return 1;
2002}
2003
Sean Christopherson3731905ef2019-05-07 08:36:27 -07002004void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
Sean Christopherson55d23752018-12-03 13:53:18 -08002005{
2006 struct vcpu_vmx *vmx = to_vmx(vcpu);
2007
2008 /*
2009	 * hv_evmcs may end up not being mapped after migration (when
2010	 * L2 was running); map it here to make sure vmcs12 changes are
2011	 * properly reflected.
2012 */
2013 if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs)
2014 nested_vmx_handle_enlightened_vmptrld(vcpu, false);
2015
2016 if (vmx->nested.hv_evmcs) {
2017 copy_vmcs12_to_enlightened(vmx);
2018 /* All fields are clean */
2019 vmx->nested.hv_evmcs->hv_clean_fields |=
2020 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2021 } else {
2022 copy_vmcs12_to_shadow(vmx);
2023 }
2024
Sean Christopherson3731905ef2019-05-07 08:36:27 -07002025 vmx->nested.need_vmcs12_to_shadow_sync = false;
Sean Christopherson55d23752018-12-03 13:53:18 -08002026}
2027
2028static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
2029{
2030 struct vcpu_vmx *vmx =
2031 container_of(timer, struct vcpu_vmx, nested.preemption_timer);
2032
2033 vmx->nested.preemption_timer_expired = true;
2034 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
2035 kvm_vcpu_kick(&vmx->vcpu);
2036
2037 return HRTIMER_NORESTART;
2038}
2039
2040static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
2041{
2042 u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
2043 struct vcpu_vmx *vmx = to_vmx(vcpu);
2044
2045 /*
2046 * A timer value of zero is architecturally guaranteed to cause
2047 * a VMExit prior to executing any instructions in the guest.
2048 */
2049 if (preemption_timeout == 0) {
2050 vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
2051 return;
2052 }
2053
2054 if (vcpu->arch.virtual_tsc_khz == 0)
2055 return;
2056
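	/*
	 * Convert the value from (scaled) TSC ticks to nanoseconds: the
	 * emulated timer counts once per 2^VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE
	 * TSC cycles, so ns = (value << rate) * 1000000 / tsc_khz. As a purely
	 * illustrative example, value = 1000 with a 2 GHz virtual TSC is
	 * 32000 cycles, i.e. 16000 ns.
	 */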
2057 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
2058 preemption_timeout *= 1000000;
2059 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
2060 hrtimer_start(&vmx->nested.preemption_timer,
2061 ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
2062}
2063
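/*
 * Compute the EFER value L2 will run with. In rough terms: if L1 loads EFER
 * on VM-entry, use vmcs12's guest EFER as-is; otherwise derive it from L1's
 * EFER, forcing LMA/LME on or off to match the requested VM_ENTRY_IA32E_MODE.
 */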
2064static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2065{
2066 if (vmx->nested.nested_run_pending &&
2067 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
2068 return vmcs12->guest_ia32_efer;
2069 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
2070 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
2071 else
2072 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
2073}
2074
2075static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
2076{
2077 /*
2078 * If vmcs02 hasn't been initialized, set the constant vmcs02 state
2079 * according to L0's settings (vmcs12 is irrelevant here). Host
2080 * fields that come from L0 and are not constant, e.g. HOST_CR3,
2081 * will be set as needed prior to VMLAUNCH/VMRESUME.
2082 */
2083 if (vmx->nested.vmcs02_initialized)
2084 return;
2085 vmx->nested.vmcs02_initialized = true;
2086
2087 /*
2088	 * We don't care what the EPTP value is; we just need to guarantee
2089	 * it's valid so we don't get a false positive when doing early
2090 * consistency checks.
2091 */
2092 if (enable_ept && nested_early_check)
2093 vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));
2094
2095 /* All VMFUNCs are currently emulated through L0 vmexits. */
2096 if (cpu_has_vmx_vmfunc())
2097 vmcs_write64(VM_FUNCTION_CONTROL, 0);
2098
2099 if (cpu_has_vmx_posted_intr())
2100 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
2101
2102 if (cpu_has_vmx_msr_bitmap())
2103 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
2104
Sean Christopherson4d6c9892019-05-07 09:06:30 -07002105 /*
2106 * The PML address never changes, so it is constant in vmcs02.
2107 * Conceptually we want to copy the PML index from vmcs01 here,
2108 * and then back to vmcs01 on nested vmexit. But since we flush
2109 * the log and reset GUEST_PML_INDEX on each vmexit, the PML
2110 * index is also effectively constant in vmcs02.
2111 */
2112 if (enable_pml) {
Sean Christopherson55d23752018-12-03 13:53:18 -08002113 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
Sean Christopherson4d6c9892019-05-07 09:06:30 -07002114 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
2115 }
Sean Christopherson55d23752018-12-03 13:53:18 -08002116
Sean Christophersonc538d572019-05-07 09:06:29 -07002117 if (cpu_has_vmx_encls_vmexit())
2118 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
Sean Christopherson55d23752018-12-03 13:53:18 -08002119
2120 /*
2121 * Set the MSR load/store lists to match L0's settings. Only the
2122	 * addresses are constant (for vmcs02); the counts can change based
2123 * on L2's behavior, e.g. switching to/from long mode.
2124 */
Aaron Lewis662f1d12019-11-07 21:14:39 -08002125 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
Sean Christopherson55d23752018-12-03 13:53:18 -08002126 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
2127 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
2128
2129 vmx_set_constant_host_state(vmx);
2130}
2131
Paolo Bonzinib1346ab2019-06-06 17:24:00 +02002132static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
Sean Christopherson55d23752018-12-03 13:53:18 -08002133 struct vmcs12 *vmcs12)
2134{
2135 prepare_vmcs02_constant_state(vmx);
2136
2137 vmcs_write64(VMCS_LINK_POINTER, -1ull);
2138
2139 if (enable_vpid) {
2140 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
2141 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
2142 else
2143 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2144 }
2145}
2146
2147static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2148{
2149 u32 exec_control, vmcs12_exec_ctrl;
2150 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
2151
2152 if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
Paolo Bonzinib1346ab2019-06-06 17:24:00 +02002153 prepare_vmcs02_early_rare(vmx, vmcs12);
Sean Christopherson55d23752018-12-03 13:53:18 -08002154
2155 /*
Sean Christopherson55d23752018-12-03 13:53:18 -08002156 * PIN CONTROLS
2157 */
Sean Christophersonc075c3e2019-05-07 12:17:53 -07002158 exec_control = vmx_pin_based_exec_ctrl(vmx);
Sean Christopherson804939e2019-05-07 12:18:05 -07002159 exec_control |= (vmcs12->pin_based_vm_exec_control &
2160 ~PIN_BASED_VMX_PREEMPTION_TIMER);
Sean Christopherson55d23752018-12-03 13:53:18 -08002161
2162 /* Posted interrupts setting is only taken from vmcs12. */
2163 if (nested_cpu_has_posted_intr(vmcs12)) {
2164 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
2165 vmx->nested.pi_pending = false;
2166 } else {
2167 exec_control &= ~PIN_BASED_POSTED_INTR;
2168 }
Sean Christopherson3af80fe2019-05-07 12:18:00 -07002169 pin_controls_set(vmx, exec_control);
Sean Christopherson55d23752018-12-03 13:53:18 -08002170
2171 /*
2172 * EXEC CONTROLS
2173 */
2174 exec_control = vmx_exec_control(vmx); /* L0's desires */
2175 exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2176 exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2177 exec_control &= ~CPU_BASED_TPR_SHADOW;
2178 exec_control |= vmcs12->cpu_based_vm_exec_control;
2179
Liran Alon02d496cf2019-11-11 14:30:55 +02002180 vmx->nested.l1_tpr_threshold = -1;
Sean Christophersonca2f5462019-05-07 09:06:33 -07002181 if (exec_control & CPU_BASED_TPR_SHADOW)
Sean Christopherson55d23752018-12-03 13:53:18 -08002182 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
Sean Christopherson55d23752018-12-03 13:53:18 -08002183#ifdef CONFIG_X86_64
Sean Christophersonca2f5462019-05-07 09:06:33 -07002184 else
Sean Christopherson55d23752018-12-03 13:53:18 -08002185 exec_control |= CPU_BASED_CR8_LOAD_EXITING |
2186 CPU_BASED_CR8_STORE_EXITING;
2187#endif
Sean Christopherson55d23752018-12-03 13:53:18 -08002188
2189 /*
2190 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
2191 * for I/O port accesses.
2192 */
Sean Christopherson55d23752018-12-03 13:53:18 -08002193 exec_control |= CPU_BASED_UNCOND_IO_EXITING;
Sean Christophersonde0286b2019-05-07 12:18:01 -07002194 exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
2195
2196 /*
2197 * This bit will be computed in nested_get_vmcs12_pages, because
2198 * we do not have access to L1's MSR bitmap yet. For now, keep
2199 * the same bit as before, hoping to avoid multiple VMWRITEs that
2200 * only set/clear this bit.
2201 */
2202 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
2203 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
2204
Sean Christopherson3af80fe2019-05-07 12:18:00 -07002205 exec_controls_set(vmx, exec_control);
Sean Christopherson55d23752018-12-03 13:53:18 -08002206
2207 /*
2208 * SECONDARY EXEC CONTROLS
2209 */
2210 if (cpu_has_secondary_exec_ctrls()) {
2211 exec_control = vmx->secondary_exec_control;
2212
2213 /* Take the following fields only from vmcs12 */
2214 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2215 SECONDARY_EXEC_ENABLE_INVPCID |
2216 SECONDARY_EXEC_RDTSCP |
2217 SECONDARY_EXEC_XSAVES |
Tao Xue69e72fa2019-07-16 14:55:49 +08002218 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
Sean Christopherson55d23752018-12-03 13:53:18 -08002219 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2220 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2221 SECONDARY_EXEC_ENABLE_VMFUNC);
2222 if (nested_cpu_has(vmcs12,
2223 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
2224 vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
2225 ~SECONDARY_EXEC_ENABLE_PML;
2226 exec_control |= vmcs12_exec_ctrl;
2227 }
2228
2229 /* VMCS shadowing for L2 is emulated for now */
2230 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
2231
Sean Christopherson469debd2019-05-07 12:18:02 -07002232 /*
2233 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
2234 * will not have to rewrite the controls just for this bit.
2235 */
2236 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
2237 (vmcs12->guest_cr4 & X86_CR4_UMIP))
2238 exec_control |= SECONDARY_EXEC_DESC;
2239
Sean Christopherson55d23752018-12-03 13:53:18 -08002240 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
2241 vmcs_write16(GUEST_INTR_STATUS,
2242 vmcs12->guest_intr_status);
2243
Sean Christopherson3af80fe2019-05-07 12:18:00 -07002244 secondary_exec_controls_set(vmx, exec_control);
Sean Christopherson55d23752018-12-03 13:53:18 -08002245 }
2246
2247 /*
2248 * ENTRY CONTROLS
2249 *
2250 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
2251 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
2252 * on the related bits (if supported by the CPU) in the hope that
2253 * we can avoid VMWrites during vmx_set_efer().
2254 */
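	/*
	 * As a purely illustrative example: for a 64-bit L2 whose EFER happens
	 * to equal the host's, only VM_ENTRY_IA32E_MODE is speculated here and
	 * vmx_set_efer() should not need to toggle VM_ENTRY_LOAD_IA32_EFER.
	 */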
2255 exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) &
2256 ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
2257 if (cpu_has_load_ia32_efer()) {
2258 if (guest_efer & EFER_LMA)
2259 exec_control |= VM_ENTRY_IA32E_MODE;
2260 if (guest_efer != host_efer)
2261 exec_control |= VM_ENTRY_LOAD_IA32_EFER;
2262 }
Sean Christopherson3af80fe2019-05-07 12:18:00 -07002263 vm_entry_controls_set(vmx, exec_control);
Sean Christopherson55d23752018-12-03 13:53:18 -08002264
2265 /*
2266 * EXIT CONTROLS
2267 *
2268 * L2->L1 exit controls are emulated - the hardware exit is to L0 so
2269 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
2270 * bits may be modified by vmx_set_efer() in prepare_vmcs02().
2271 */
2272 exec_control = vmx_vmexit_ctrl();
2273 if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
2274 exec_control |= VM_EXIT_LOAD_IA32_EFER;
Sean Christopherson3af80fe2019-05-07 12:18:00 -07002275 vm_exit_controls_set(vmx, exec_control);
Sean Christopherson55d23752018-12-03 13:53:18 -08002276
2277 /*
2278 * Interrupt/Exception Fields
2279 */
2280 if (vmx->nested.nested_run_pending) {
2281 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2282 vmcs12->vm_entry_intr_info_field);
2283 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2284 vmcs12->vm_entry_exception_error_code);
2285 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2286 vmcs12->vm_entry_instruction_len);
2287 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2288 vmcs12->guest_interruptibility_info);
2289 vmx->loaded_vmcs->nmi_known_unmasked =
2290 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
2291 } else {
2292 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
2293 }
2294}
2295
Paolo Bonzinib1346ab2019-06-06 17:24:00 +02002296static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
Sean Christopherson55d23752018-12-03 13:53:18 -08002297{
2298 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2299
2300 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2301 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
2302 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
2303 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
2304 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
2305 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
2306 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
2307 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
2308 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
2309 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
2310 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
2311 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
2312 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
2313 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
2314 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
2315 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
2316 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
2317 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
2318 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
2319 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
Sean Christopherson1c6f0b42019-05-07 08:36:25 -07002320 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
2321 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
Sean Christopherson55d23752018-12-03 13:53:18 -08002322 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
2323 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
2324 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
2325 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
2326 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
2327 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
2328 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
2329 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
2330 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
2331 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
2332 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
2333 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
2334 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
2335 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
2336 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
2337 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
2338 }
2339
2340 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2341 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
2342 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
2343 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
2344 vmcs12->guest_pending_dbg_exceptions);
2345 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
2346 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
2347
2348 /*
2349		 * L1 may access L2's PDPTRs, so save them to construct
2350		 * vmcs12.
2351 */
2352 if (enable_ept) {
2353 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2354 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2355 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2356 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2357 }
Sean Christophersonc27e5b02019-05-07 09:06:39 -07002358
2359 if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
2360 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2361 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
Sean Christopherson55d23752018-12-03 13:53:18 -08002362 }
2363
2364 if (nested_cpu_has_xsaves(vmcs12))
2365 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
2366
2367 /*
2368 * Whether page-faults are trapped is determined by a combination of
2369 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
2370 * If enable_ept, L0 doesn't care about page faults and we should
2371 * set all of these to L1's desires. However, if !enable_ept, L0 does
2372 * care about (at least some) page faults, and because it is not easy
2373 * (if at all possible?) to merge L0 and L1's desires, we simply ask
2374 * to exit on each and every L2 page fault. This is done by setting
2375 * MASK=MATCH=0 and (see below) EB.PF=1.
2376 * Note that below we don't need special code to set EB.PF beyond the
2377 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
2378 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
2379 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
2380 */
2381 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
2382 enable_ept ? vmcs12->page_fault_error_code_mask : 0);
2383 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
2384 enable_ept ? vmcs12->page_fault_error_code_match : 0);
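	/*
	 * Concretely, per the SDM a #PF causes a VM-exit iff
	 * EB.PF == ((PFEC & MASK) == MATCH). With MASK == MATCH == 0 the
	 * comparison is always true, so the decision collapses to EB.PF alone,
	 * which is exactly what the !enable_ept case above relies on.
	 */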
2385
2386 if (cpu_has_vmx_apicv()) {
2387 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
2388 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
2389 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
2390 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
2391 }
2392
Aaron Lewis662f1d12019-11-07 21:14:39 -08002393 /*
2394 * Make sure the msr_autostore list is up to date before we set the
2395 * count in the vmcs02.
2396 */
2397 prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
2398
2399 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
Sean Christopherson55d23752018-12-03 13:53:18 -08002400 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2401 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2402
2403 set_cr4_guest_host_mask(vmx);
Sean Christopherson55d23752018-12-03 13:53:18 -08002404}
2405
2406/*
2407 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
2408 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
2409 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
2410 * guest in a way that will both be appropriate to L1's requests, and our
2411 * needs. In addition to modifying the active vmcs (which is vmcs02), this
2412 * function also has necessary side effects, such as setting various
2413 * vcpu->arch fields.
2414 * Returns 0 on success, -EINVAL on failure. The exit qualification code for
2415 * an invalid state is assigned to *entry_failure_code on failure.
2416 */
2417static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2418 u32 *entry_failure_code)
2419{
2420 struct vcpu_vmx *vmx = to_vmx(vcpu);
2421 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
Sean Christophersonc7554efc2019-05-07 09:06:40 -07002422 bool load_guest_pdptrs_vmcs12 = false;
Sean Christopherson55d23752018-12-03 13:53:18 -08002423
Sean Christophersonc7554efc2019-05-07 09:06:40 -07002424 if (vmx->nested.dirty_vmcs12 || hv_evmcs) {
Paolo Bonzinib1346ab2019-06-06 17:24:00 +02002425 prepare_vmcs02_rare(vmx, vmcs12);
Sean Christopherson55d23752018-12-03 13:53:18 -08002426 vmx->nested.dirty_vmcs12 = false;
Sean Christopherson55d23752018-12-03 13:53:18 -08002427
Sean Christophersonc7554efc2019-05-07 09:06:40 -07002428 load_guest_pdptrs_vmcs12 = !hv_evmcs ||
2429 !(hv_evmcs->hv_clean_fields &
2430 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
Sean Christopherson55d23752018-12-03 13:53:18 -08002431 }
2432
2433 if (vmx->nested.nested_run_pending &&
2434 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
2435 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
2436 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
2437 } else {
2438 kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
2439 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
2440 }
Sean Christopherson3b013a22019-05-07 09:06:28 -07002441 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
2442 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
2443 vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
Sean Christopherson55d23752018-12-03 13:53:18 -08002444 vmx_set_rflags(vcpu, vmcs12->guest_rflags);
2445
Sean Christopherson55d23752018-12-03 13:53:18 -08002446 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
2447 * bitwise-or of what L1 wants to trap for L2, and what we want to
2448 * trap. Note that CR0.TS also needs updating - we do this later.
2449 */
2450 update_exception_bitmap(vcpu);
2451 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
2452 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2453
2454 if (vmx->nested.nested_run_pending &&
2455 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
2456 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
2457 vcpu->arch.pat = vmcs12->guest_ia32_pat;
2458 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2459 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
2460 }
2461
2462 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
2463
2464 if (kvm_has_tsc_control)
2465 decache_tsc_multiplier(vmx);
2466
2467 if (enable_vpid) {
2468 /*
2469		 * There is no direct mapping between vpid02 and vpid12; vpid02
2470		 * is per-vCPU for L0 and is reused, while a change in the value
2471		 * of vpid12 is handled with one INVVPID during nested vmentry.
2472		 * vpid12 is allocated by L1 for L2, so it does not influence the
2473		 * global bitmap (for vpid01 and vpid02 allocation) even if L1
2474		 * spawns a lot of nested vCPUs.
2475 */
2476 if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
2477 if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
2478 vmx->nested.last_vpid = vmcs12->virtual_processor_id;
2479 __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
2480 }
2481 } else {
2482 /*
2483			 * If L1 uses EPT, then L0 needs to execute INVEPT on
2484 * EPTP02 instead of EPTP01. Therefore, delay TLB
2485 * flush until vmcs02->eptp is fully updated by
2486 * KVM_REQ_LOAD_CR3. Note that this assumes
2487 * KVM_REQ_TLB_FLUSH is evaluated after
2488 * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
2489 */
2490 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2491 }
2492 }
2493
2494 if (nested_cpu_has_ept(vmcs12))
2495 nested_ept_init_mmu_context(vcpu);
Sean Christopherson55d23752018-12-03 13:53:18 -08002496
2497 /*
2498	 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
2499	 * bits which we consider mandatory enabled.
2500	 * The CR0_READ_SHADOW is what L2 should have expected to read given
2501	 * the specifications by L1; it's not enough to take
2502	 * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may
2503	 * have more bits set than L1 expected.
2504 */
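	/*
	 * As a reminder of the semantics relied on here: for each CR0/CR4 bit
	 * set in the guest/host mask, L2 reads return the corresponding
	 * read-shadow bit, and an L2 write of a value that differs from the
	 * read shadow causes a VM-exit; unmasked bits access the real
	 * (vmcs02) register directly.
	 */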
2505 vmx_set_cr0(vcpu, vmcs12->guest_cr0);
2506 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2507
2508 vmx_set_cr4(vcpu, vmcs12->guest_cr4);
2509 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
2510
2511 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
2512 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
2513 vmx_set_efer(vcpu, vcpu->arch.efer);
2514
2515 /*
2516 * Guest state is invalid and unrestricted guest is disabled,
2517 * which means L1 attempted VMEntry to L2 with invalid state.
2518 * Fail the VMEntry.
2519 */
2520 if (vmx->emulation_required) {
2521 *entry_failure_code = ENTRY_FAIL_DEFAULT;
Sean Christophersonc80add02019-04-11 12:18:09 -07002522 return -EINVAL;
Sean Christopherson55d23752018-12-03 13:53:18 -08002523 }
2524
2525	/* Load the vmcs12 guest CR3; paging is handled by either EPT or shadow page tables. */
2526 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
2527 entry_failure_code))
Sean Christophersonc80add02019-04-11 12:18:09 -07002528 return -EINVAL;
Sean Christopherson55d23752018-12-03 13:53:18 -08002529
Sean Christopherson04f11ef2019-09-27 14:45:16 -07002530 /*
2531 * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12
2532 * on nested VM-Exit, which can occur without actually running L2 and
2533 * thus without hitting vmx_set_cr3(), e.g. if L1 is entering L2 with
2534 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
2535 * transition to HLT instead of running L2.
2536 */
2537 if (enable_ept)
2538 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
2539
Sean Christophersonc7554efc2019-05-07 09:06:40 -07002540 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
2541 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
2542 is_pae_paging(vcpu)) {
2543 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2544 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2545 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2546 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2547 }
2548
Sean Christopherson55d23752018-12-03 13:53:18 -08002549 if (!enable_ept)
2550 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
2551
Oliver Upton71f73472019-11-13 16:17:19 -08002552 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2553 SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
2554 vmcs12->guest_ia32_perf_global_ctrl))
2555 return -EINVAL;
2556
Paolo Bonzinie9c16c72019-04-30 22:07:26 +02002557 kvm_rsp_write(vcpu, vmcs12->guest_rsp);
2558 kvm_rip_write(vcpu, vmcs12->guest_rip);
Sean Christopherson55d23752018-12-03 13:53:18 -08002559 return 0;
2560}
2561
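/*
 * Consistency checks on the vmcs12 NMI controls: as encoded below, "virtual
 * NMIs" requires "NMI exiting", and NMI-window exiting in turn requires
 * "virtual NMIs".
 */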
2562static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
2563{
Sean Christopherson5497b952019-07-11 08:58:29 -07002564 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
2565 nested_cpu_has_virtual_nmis(vmcs12)))
Sean Christopherson55d23752018-12-03 13:53:18 -08002566 return -EINVAL;
2567
Sean Christopherson5497b952019-07-11 08:58:29 -07002568 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
2569 nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING)))
Sean Christopherson55d23752018-12-03 13:53:18 -08002570 return -EINVAL;
2571
2572 return 0;
2573}
2574
2575static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
2576{
2577 struct vcpu_vmx *vmx = to_vmx(vcpu);
2578 int maxphyaddr = cpuid_maxphyaddr(vcpu);
2579
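	/*
	 * Rough EPTP layout being validated below, per the SDM: bits 2:0 are
	 * the memory type, bits 5:3 are (page-walk length - 1), bit 6 enables
	 * accessed/dirty flags, bits 11:7 are reserved, and the remaining bits
	 * up to MAXPHYADDR hold the PML4 table's page frame.
	 */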
2580 /* Check for memory type validity */
2581 switch (address & VMX_EPTP_MT_MASK) {
2582 case VMX_EPTP_MT_UC:
Sean Christopherson5497b952019-07-11 08:58:29 -07002583 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
Sean Christopherson55d23752018-12-03 13:53:18 -08002584 return false;
2585 break;
2586 case VMX_EPTP_MT_WB:
Sean Christopherson5497b952019-07-11 08:58:29 -07002587 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
Sean Christopherson55d23752018-12-03 13:53:18 -08002588 return false;
2589 break;
2590 default:
2591 return false;
2592 }
2593
2594	/* Only a 4-level page-walk length is valid. */
Sean Christopherson5497b952019-07-11 08:58:29 -07002595 if (CC((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4))
Sean Christopherson55d23752018-12-03 13:53:18 -08002596 return false;
2597
2598 /* Reserved bits should not be set */
Sean Christopherson5497b952019-07-11 08:58:29 -07002599 if (CC(address >> maxphyaddr || ((address >> 7) & 0x1f)))
Sean Christopherson55d23752018-12-03 13:53:18 -08002600 return false;
2601
2602 /* AD, if set, should be supported */
2603 if (address & VMX_EPTP_AD_ENABLE_BIT) {
Sean Christopherson5497b952019-07-11 08:58:29 -07002604 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
Sean Christopherson55d23752018-12-03 13:53:18 -08002605 return false;
2606 }
2607
2608 return true;
2609}
2610
Krish Sadhukhan461b4ba2018-12-12 13:30:07 -05002611/*
2612 * Checks related to VM-Execution Control Fields
2613 */
2614static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
2615 struct vmcs12 *vmcs12)
2616{
2617 struct vcpu_vmx *vmx = to_vmx(vcpu);
2618
Sean Christopherson5497b952019-07-11 08:58:29 -07002619 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
2620 vmx->nested.msrs.pinbased_ctls_low,
2621 vmx->nested.msrs.pinbased_ctls_high)) ||
2622 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
2623 vmx->nested.msrs.procbased_ctls_low,
2624 vmx->nested.msrs.procbased_ctls_high)))
Krish Sadhukhan461b4ba2018-12-12 13:30:07 -05002625 return -EINVAL;
2626
2627 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
Sean Christopherson5497b952019-07-11 08:58:29 -07002628 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control,
2629 vmx->nested.msrs.secondary_ctls_low,
2630 vmx->nested.msrs.secondary_ctls_high)))
Krish Sadhukhan461b4ba2018-12-12 13:30:07 -05002631 return -EINVAL;
2632
Sean Christopherson5497b952019-07-11 08:58:29 -07002633 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) ||
Krish Sadhukhan461b4ba2018-12-12 13:30:07 -05002634 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
2635 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
2636 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
2637 nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
2638 nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
2639 nested_vmx_check_nmi_controls(vmcs12) ||
2640 nested_vmx_check_pml_controls(vcpu, vmcs12) ||
2641 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
2642 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
2643 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
Sean Christopherson5497b952019-07-11 08:58:29 -07002644 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
Krish Sadhukhan461b4ba2018-12-12 13:30:07 -05002645 return -EINVAL;
2646
Sean Christophersonbc441212019-02-12 16:42:23 -08002647 if (!nested_cpu_has_preemption_timer(vmcs12) &&
2648 nested_cpu_has_save_preemption_timer(vmcs12))
2649 return -EINVAL;
2650
Krish Sadhukhan461b4ba2018-12-12 13:30:07 -05002651 if (nested_cpu_has_ept(vmcs12) &&
Sean Christopherson5497b952019-07-11 08:58:29 -07002652 CC(!valid_ept_address(vcpu, vmcs12->ept_pointer)))
Krish Sadhukhan461b4ba2018-12-12 13:30:07 -05002653 return -EINVAL;
2654
2655 if (nested_cpu_has_vmfunc(vmcs12)) {
Sean Christopherson5497b952019-07-11 08:58:29 -07002656 if (CC(vmcs12->vm_function_control &
2657 ~vmx->nested.msrs.vmfunc_controls))
Krish Sadhukhan461b4ba2018-12-12 13:30:07 -05002658 return -EINVAL;
2659
2660 if (nested_cpu_has_eptp_switching(vmcs12)) {
Sean Christopherson5497b952019-07-11 08:58:29 -07002661 if (CC(!nested_cpu_has_ept(vmcs12)) ||
2662 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address)))
Krish Sadhukhan461b4ba2018-12-12 13:30:07 -05002663 return -EINVAL;
2664 }
2665 }
2666
2667 return 0;
2668}
2669
Krish Sadhukhan61446ba2018-12-12 13:30:09 -05002670/*
2671 * Checks related to VM-Exit Control Fields
2672 */
2673static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
2674 struct vmcs12 *vmcs12)
2675{
2676 struct vcpu_vmx *vmx = to_vmx(vcpu);
2677
Sean Christopherson5497b952019-07-11 08:58:29 -07002678 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls,
2679 vmx->nested.msrs.exit_ctls_low,
2680 vmx->nested.msrs.exit_ctls_high)) ||
2681 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)))
Krish Sadhukhan61446ba2018-12-12 13:30:09 -05002682 return -EINVAL;
2683
2684 return 0;
2685}
2686
Krish Sadhukhan5fbf9632018-12-12 13:30:10 -05002687/*
2688 * Checks related to VM-Entry Control Fields
2689 */
2690static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
2691 struct vmcs12 *vmcs12)
Sean Christopherson55d23752018-12-03 13:53:18 -08002692{
2693 struct vcpu_vmx *vmx = to_vmx(vcpu);
Sean Christopherson55d23752018-12-03 13:53:18 -08002694
Sean Christopherson5497b952019-07-11 08:58:29 -07002695 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls,
2696 vmx->nested.msrs.entry_ctls_low,
2697 vmx->nested.msrs.entry_ctls_high)))
Krish Sadhukhan5fbf9632018-12-12 13:30:10 -05002698 return -EINVAL;
Sean Christopherson55d23752018-12-03 13:53:18 -08002699
2700 /*
2701 * From the Intel SDM, volume 3:
2702 * Fields relevant to VM-entry event injection must be set properly.
2703 * These fields are the VM-entry interruption-information field, the
2704 * VM-entry exception error code, and the VM-entry instruction length.
2705 */
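	/*
	 * As a purely illustrative example, injecting #GP(0) into a
	 * protected-mode L2 would pass the checks below with
	 * intr_info = VALID | HARD_EXCEPTION | vector 13, the
	 * deliver-error-code bit set, and a zero error code.
	 */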
2706 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
2707 u32 intr_info = vmcs12->vm_entry_intr_info_field;
2708 u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
2709 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
2710 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
2711 bool should_have_error_code;
2712 bool urg = nested_cpu_has2(vmcs12,
2713 SECONDARY_EXEC_UNRESTRICTED_GUEST);
2714 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
2715
2716 /* VM-entry interruption-info field: interruption type */
Sean Christopherson5497b952019-07-11 08:58:29 -07002717 if (CC(intr_type == INTR_TYPE_RESERVED) ||
2718 CC(intr_type == INTR_TYPE_OTHER_EVENT &&
2719 !nested_cpu_supports_monitor_trap_flag(vcpu)))
Krish Sadhukhan5fbf9632018-12-12 13:30:10 -05002720 return -EINVAL;
Sean Christopherson55d23752018-12-03 13:53:18 -08002721
2722 /* VM-entry interruption-info field: vector */
Sean Christopherson5497b952019-07-11 08:58:29 -07002723 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
2724 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
2725 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
Krish Sadhukhan5fbf9632018-12-12 13:30:10 -05002726 return -EINVAL;
Sean Christopherson55d23752018-12-03 13:53:18 -08002727
2728 /* VM-entry interruption-info field: deliver error code */
2729 should_have_error_code =
2730 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
2731 x86_exception_has_error_code(vector);
Sean Christopherson5497b952019-07-11 08:58:29 -07002732 if (CC(has_error_code != should_have_error_code))
Krish Sadhukhan5fbf9632018-12-12 13:30:10 -05002733 return -EINVAL;
Sean Christopherson55d23752018-12-03 13:53:18 -08002734
2735 /* VM-entry exception error code */
Sean Christopherson5497b952019-07-11 08:58:29 -07002736 if (CC(has_error_code &&
Sean Christopherson567926c2019-10-01 09:21:23 -07002737 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
Krish Sadhukhan5fbf9632018-12-12 13:30:10 -05002738 return -EINVAL;
Sean Christopherson55d23752018-12-03 13:53:18 -08002739
2740 /* VM-entry interruption-info field: reserved bits */
Sean Christopherson5497b952019-07-11 08:58:29 -07002741 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
Krish Sadhukhan5fbf9632018-12-12 13:30:10 -05002742 return -EINVAL;
Sean Christopherson55d23752018-12-03 13:53:18 -08002743
2744 /* VM-entry instruction length */
2745 switch (intr_type) {
2746 case INTR_TYPE_SOFT_EXCEPTION:
2747 case INTR_TYPE_SOFT_INTR:
2748 case INTR_TYPE_PRIV_SW_EXCEPTION:
Sean Christopherson5497b952019-07-11 08:58:29 -07002749 if (CC(vmcs12->vm_entry_instruction_len > 15) ||
2750 CC(vmcs12->vm_entry_instruction_len == 0 &&
2751 CC(!nested_cpu_has_zero_length_injection(vcpu))))
Krish Sadhukhan5fbf9632018-12-12 13:30:10 -05002752 return -EINVAL;
Sean Christopherson55d23752018-12-03 13:53:18 -08002753 }
2754 }
2755
Krish Sadhukhan5fbf9632018-12-12 13:30:10 -05002756 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
2757 return -EINVAL;
2758
2759 return 0;
2760}
2761
Sean Christopherson5478ba32019-04-11 12:18:06 -07002762static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
2763 struct vmcs12 *vmcs12)
2764{
2765 if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
2766 nested_check_vm_exit_controls(vcpu, vmcs12) ||
2767 nested_check_vm_entry_controls(vcpu, vmcs12))
Paolo Bonzini98d9e852019-04-12 10:19:57 +02002768 return -EINVAL;
Sean Christopherson5478ba32019-04-11 12:18:06 -07002769
2770 return 0;
2771}
2772
Paolo Bonzini98d9e852019-04-12 10:19:57 +02002773static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
2774 struct vmcs12 *vmcs12)
Krish Sadhukhan5fbf9632018-12-12 13:30:10 -05002775{
2776 bool ia32e;
2777
Sean Christopherson5497b952019-07-11 08:58:29 -07002778 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
2779 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
2780 CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3)))
Krish Sadhukhan254b2f32018-12-12 13:30:11 -05002781 return -EINVAL;
Krish Sadhukhan711eff32019-02-07 14:05:30 -05002782
Sean Christopherson5497b952019-07-11 08:58:29 -07002783 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
2784 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
Krish Sadhukhan711eff32019-02-07 14:05:30 -05002785 return -EINVAL;
2786
Krish Sadhukhanf6b0db1f2019-04-08 17:35:11 -04002787 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
Sean Christopherson5497b952019-07-11 08:58:29 -07002788 CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
Krish Sadhukhanf6b0db1f2019-04-08 17:35:11 -04002789 return -EINVAL;
2790
Oliver Uptonc547cb62019-11-13 16:17:17 -08002791 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2792 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
2793 vmcs12->host_ia32_perf_global_ctrl)))
2794 return -EINVAL;
2795
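	/*
	 * Which settings of the "host address-space size" VM-exit control
	 * and of several host-state fields are legal depends on whether the
	 * logical processor (L1, here) is in IA-32e mode when it executes
	 * VMLAUNCH/VMRESUME.
	 */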
Paolo Bonzinifd3edd42019-09-25 18:33:53 +02002796#ifdef CONFIG_X86_64
2797 ia32e = !!(vcpu->arch.efer & EFER_LMA);
2798#else
2799 ia32e = false;
2800#endif
2801
2802 if (ia32e) {
2803 if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) ||
2804 CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
2805 return -EINVAL;
2806 } else {
2807 if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) ||
2808 CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
2809 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
2810 CC((vmcs12->host_rip) >> 32))
2811 return -EINVAL;
2812 }
Krish Sadhukhan1ef23e12019-07-03 19:54:35 -04002813
Sean Christopherson5497b952019-07-11 08:58:29 -07002814 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2815 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2816 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2817 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2818 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2819 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2820 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2821 CC(vmcs12->host_cs_selector == 0) ||
2822 CC(vmcs12->host_tr_selector == 0) ||
2823 CC(vmcs12->host_ss_selector == 0 && !ia32e))
Krish Sadhukhan1ef23e12019-07-03 19:54:35 -04002824 return -EINVAL;
2825
2826#ifdef CONFIG_X86_64
Sean Christopherson5497b952019-07-11 08:58:29 -07002827 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
2828 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
2829 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
2830 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
Paolo Bonzinifd3edd42019-09-25 18:33:53 +02002831 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
2832 CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
Krish Sadhukhan58450382019-08-09 12:26:19 -07002833 return -EINVAL;
Krish Sadhukhan1ef23e12019-07-03 19:54:35 -04002834#endif
2835
Krish Sadhukhan5fbf9632018-12-12 13:30:10 -05002836 /*
2837 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
2838 * IA32_EFER MSR must be 0 in the field for that register. In addition,
2839 * the values of the LMA and LME bits in the field must each be that of
2840 * the host address-space size VM-exit control.
2841 */
2842 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
Sean Christopherson5497b952019-07-11 08:58:29 -07002843 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
2844 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
2845 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
Krish Sadhukhan254b2f32018-12-12 13:30:11 -05002846 return -EINVAL;
Krish Sadhukhan5fbf9632018-12-12 13:30:10 -05002847 }
2848
Sean Christopherson55d23752018-12-03 13:53:18 -08002849 return 0;
2850}
2851
2852static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
2853 struct vmcs12 *vmcs12)
2854{
KarimAllah Ahmed88925302019-01-31 21:24:41 +01002855 int r = 0;
Sean Christopherson55d23752018-12-03 13:53:18 -08002856 struct vmcs12 *shadow;
KarimAllah Ahmed88925302019-01-31 21:24:41 +01002857 struct kvm_host_map map;
Sean Christopherson55d23752018-12-03 13:53:18 -08002858
2859 if (vmcs12->vmcs_link_pointer == -1ull)
2860 return 0;
2861
Sean Christopherson5497b952019-07-11 08:58:29 -07002862 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
Sean Christopherson55d23752018-12-03 13:53:18 -08002863 return -EINVAL;
2864
Sean Christopherson5497b952019-07-11 08:58:29 -07002865 if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)))
Sean Christopherson55d23752018-12-03 13:53:18 -08002866 return -EINVAL;
2867
KarimAllah Ahmed88925302019-01-31 21:24:41 +01002868 shadow = map.hva;
2869
Sean Christopherson5497b952019-07-11 08:58:29 -07002870 if (CC(shadow->hdr.revision_id != VMCS12_REVISION) ||
2871 CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
Sean Christopherson55d23752018-12-03 13:53:18 -08002872 r = -EINVAL;
KarimAllah Ahmed88925302019-01-31 21:24:41 +01002873
2874 kvm_vcpu_unmap(vcpu, &map, false);
Sean Christopherson55d23752018-12-03 13:53:18 -08002875 return r;
2876}
2877
Sean Christopherson55d23752018-12-03 13:53:18 -08002878/*
2879 * Checks related to Guest Non-register State
2880 */
2881static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
2882{
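	/*
	 * KVM advertises only the HLT activity state (beyond Active) to L1,
	 * so entering L2 in any other state, e.g. Shutdown or Wait-for-SIPI,
	 * is rejected.
	 */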
Sean Christopherson5497b952019-07-11 08:58:29 -07002883 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
2884 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT))
Sean Christopherson55d23752018-12-03 13:53:18 -08002885 return -EINVAL;
2886
2887 return 0;
2888}
2889
Sean Christopherson5478ba32019-04-11 12:18:06 -07002890static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
2891 struct vmcs12 *vmcs12,
2892 u32 *exit_qual)
Sean Christopherson55d23752018-12-03 13:53:18 -08002893{
2894 bool ia32e;
2895
2896 *exit_qual = ENTRY_FAIL_DEFAULT;
2897
Sean Christopherson5497b952019-07-11 08:58:29 -07002898 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
2899 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
Sean Christophersonc80add02019-04-11 12:18:09 -07002900 return -EINVAL;
Sean Christopherson55d23752018-12-03 13:53:18 -08002901
Krish Sadhukhande2bc2b2019-04-08 17:35:12 -04002902 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
Sean Christopherson5497b952019-07-11 08:58:29 -07002903 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
Sean Christophersonc80add02019-04-11 12:18:09 -07002904 return -EINVAL;
Sean Christopherson55d23752018-12-03 13:53:18 -08002905
2906 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
2907 *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
Sean Christophersonc80add02019-04-11 12:18:09 -07002908 return -EINVAL;
Sean Christopherson55d23752018-12-03 13:53:18 -08002909 }
2910
Oliver Uptonbfc6ad62019-11-13 16:17:16 -08002911 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2912 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
2913 vmcs12->guest_ia32_perf_global_ctrl)))
2914 return -EINVAL;
2915
Sean Christopherson55d23752018-12-03 13:53:18 -08002916 /*
2917 * If the load IA32_EFER VM-entry control is 1, the following checks
2918 * are performed on the field for the IA32_EFER MSR:
2919 * - Bits reserved in the IA32_EFER MSR must be 0.
2920 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
2921 * the IA-32e mode guest VM-exit control. It must also be identical
2922 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
2923 * CR0.PG) is 1.
2924 */
2925 if (to_vmx(vcpu)->nested.nested_run_pending &&
2926 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
2927 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
Sean Christopherson5497b952019-07-11 08:58:29 -07002928 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
2929 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
2930 CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
2931 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
Sean Christophersonc80add02019-04-11 12:18:09 -07002932 return -EINVAL;
Sean Christopherson55d23752018-12-03 13:53:18 -08002933 }
2934
2935 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
Sean Christopherson5497b952019-07-11 08:58:29 -07002936 (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
2937 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
Sean Christophersonc80add02019-04-11 12:18:09 -07002938 return -EINVAL;
Sean Christopherson55d23752018-12-03 13:53:18 -08002939
Sean Christopherson9c3e9222019-04-11 12:18:05 -07002940 if (nested_check_guest_non_reg_state(vmcs12))
Sean Christophersonc80add02019-04-11 12:18:09 -07002941 return -EINVAL;
Sean Christopherson55d23752018-12-03 13:53:18 -08002942
2943 return 0;
2944}
2945
Sean Christopherson453eafb2018-12-20 12:25:17 -08002946static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
Sean Christopherson55d23752018-12-03 13:53:18 -08002947{
2948 struct vcpu_vmx *vmx = to_vmx(vcpu);
2949 unsigned long cr3, cr4;
Sean Christophersonf1727b42019-01-25 07:40:58 -08002950 bool vm_fail;
Sean Christopherson55d23752018-12-03 13:53:18 -08002951
2952 if (!nested_early_check)
2953 return 0;
2954
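	/*
	 * Temporarily zero the MSR autoload counts so the VM-Enter performed
	 * purely for consistency checking doesn't load or save any MSRs; the
	 * counts are restored once the check completes.
	 */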
2955 if (vmx->msr_autoload.host.nr)
2956 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
2957 if (vmx->msr_autoload.guest.nr)
2958 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
2959
2960 preempt_disable();
2961
2962 vmx_prepare_switch_to_guest(vcpu);
2963
2964 /*
2965 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
2966 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
2967 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
2968 * there is no need to preserve other bits or save/restore the field.
2969 */
2970 vmcs_writel(GUEST_RFLAGS, 0);
2971
Sean Christopherson55d23752018-12-03 13:53:18 -08002972 cr3 = __get_current_cr3_fast();
2973 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
2974 vmcs_writel(HOST_CR3, cr3);
2975 vmx->loaded_vmcs->host_state.cr3 = cr3;
2976 }
2977
2978 cr4 = cr4_read_shadow();
2979 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
2980 vmcs_writel(HOST_CR4, cr4);
2981 vmx->loaded_vmcs->host_state.cr4 = cr4;
2982 }
2983
Sean Christopherson55d23752018-12-03 13:53:18 -08002984 asm(
Sean Christopherson453eafb2018-12-20 12:25:17 -08002985 "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
Sean Christopherson5a878162019-01-25 07:41:02 -08002986 "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
2987 "je 1f \n\t"
Sean Christophersonfbda0fd2019-01-25 07:41:01 -08002988 __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
Sean Christopherson5a878162019-01-25 07:41:02 -08002989 "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
2990 "1: \n\t"
Sean Christopherson453eafb2018-12-20 12:25:17 -08002991 "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
Sean Christopherson55d23752018-12-03 13:53:18 -08002992
2993 /* Check if vmlaunch or vmresume is needed */
Sean Christopherson74dfa272019-01-25 07:41:00 -08002994 "cmpb $0, %c[launched](%[loaded_vmcs])\n\t"
Sean Christopherson453eafb2018-12-20 12:25:17 -08002995
Sean Christophersonf1727b42019-01-25 07:40:58 -08002996 /*
2997 * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
2998 * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
2999 * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the
Sean Christophersonbbc0b822019-01-25 07:40:59 -08003000 * result of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
Sean Christophersonf1727b42019-01-25 07:40:58 -08003001 */
Sean Christopherson453eafb2018-12-20 12:25:17 -08003002 "call vmx_vmenter\n\t"
3003
Sean Christophersonbbc0b822019-01-25 07:40:59 -08003004 CC_SET(be)
3005 : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail)
Sean Christopherson5a878162019-01-25 07:41:02 -08003006 : [HOST_RSP]"r"((unsigned long)HOST_RSP),
Sean Christopherson74dfa272019-01-25 07:41:00 -08003007 [loaded_vmcs]"r"(vmx->loaded_vmcs),
3008 [launched]"i"(offsetof(struct loaded_vmcs, launched)),
Sean Christopherson5a878162019-01-25 07:41:02 -08003009 [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
Sean Christopherson453eafb2018-12-20 12:25:17 -08003010 [wordsize]"i"(sizeof(ulong))
Jan Beulich5a253552019-05-27 02:45:44 -06003011 : "memory"
Sean Christopherson55d23752018-12-03 13:53:18 -08003012 );
3013
Sean Christopherson55d23752018-12-03 13:53:18 -08003014 if (vmx->msr_autoload.host.nr)
3015 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
3016 if (vmx->msr_autoload.guest.nr)
3017 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
3018
Sean Christophersonf1727b42019-01-25 07:40:58 -08003019 if (vm_fail) {
Sean Christopherson380e0052019-07-11 08:58:30 -07003020 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);
3021
Wanpeng Li541e8862019-05-17 16:49:50 +08003022 preempt_enable();
Sean Christopherson380e0052019-07-11 08:58:30 -07003023
3024 trace_kvm_nested_vmenter_failed(
3025 "early hardware check VM-instruction error: ", error);
3026 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
Sean Christopherson55d23752018-12-03 13:53:18 -08003027 return 1;
3028 }
3029
3030 /*
3031 * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
3032 */
3033 local_irq_enable();
3034 if (hw_breakpoint_active())
3035 set_debugreg(__this_cpu_read(cpu_dr7), 7);
Wanpeng Li541e8862019-05-17 16:49:50 +08003036 preempt_enable();
Sean Christopherson55d23752018-12-03 13:53:18 -08003037
3038 /*
3039 * A non-failing VMEntry means we somehow entered guest mode with
3040 * an illegal RIP, and that's just the tip of the iceberg. There
3041 * is no telling what memory has been modified or what state has
3042 * been exposed to unknown code. Hitting this all but guarantees
3043 * a (very critical) hardware issue.
3044 */
3045 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
3046 VMX_EXIT_REASONS_FAILED_VMENTRY));
3047
3048 return 0;
3049}
Sean Christopherson55d23752018-12-03 13:53:18 -08003050
3051static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
3052 struct vmcs12 *vmcs12);
3053
Jim Mattson671ddc72019-10-15 10:44:05 -07003054static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
Sean Christopherson55d23752018-12-03 13:53:18 -08003055{
3056 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3057 struct vcpu_vmx *vmx = to_vmx(vcpu);
KarimAllah Ahmed96c66e82019-01-31 21:24:37 +01003058 struct kvm_host_map *map;
Sean Christopherson55d23752018-12-03 13:53:18 -08003059 struct page *page;
3060 u64 hpa;
3061
3062 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3063 /*
3064 * Translate L1 physical address to host physical
3065 * address for vmcs02. Keep the page pinned, so this
3066 * physical address remains valid. We keep a reference
3067 * to it so we can release it later.
3068 */
3069 if (vmx->nested.apic_access_page) { /* shouldn't happen */
Liran Alonb11494b2019-11-21 00:31:47 +02003070 kvm_release_page_clean(vmx->nested.apic_access_page);
Sean Christopherson55d23752018-12-03 13:53:18 -08003071 vmx->nested.apic_access_page = NULL;
3072 }
3073 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
Sean Christopherson55d23752018-12-03 13:53:18 -08003074 if (!is_error_page(page)) {
3075 vmx->nested.apic_access_page = page;
3076 hpa = page_to_phys(vmx->nested.apic_access_page);
3077 vmcs_write64(APIC_ACCESS_ADDR, hpa);
3078 } else {
Jim Mattson671ddc72019-10-15 10:44:05 -07003079 pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n",
3080 __func__);
3081 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3082 vcpu->run->internal.suberror =
3083 KVM_INTERNAL_ERROR_EMULATION;
3084 vcpu->run->internal.ndata = 0;
3085 return false;
Sean Christopherson55d23752018-12-03 13:53:18 -08003086 }
3087 }
3088
3089 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
KarimAllah Ahmed96c66e82019-01-31 21:24:37 +01003090 map = &vmx->nested.virtual_apic_map;
Sean Christopherson55d23752018-12-03 13:53:18 -08003091
KarimAllah Ahmed96c66e82019-01-31 21:24:37 +01003092 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
3093 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
Paolo Bonzini69090812019-04-15 15:16:17 +02003094 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
3095 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
3096 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3097 /*
3098 * The processor will never use the TPR shadow; simply
3099 * clear the bit from the execution control. Such a
3100 * configuration is useless, but it happens in tests.
3101 * For any other configuration, failing the vm entry is
3102 * _not_ what the processor does but it's basically the
3103 * only possibility we have.
3104 */
Sean Christopherson2183f562019-05-07 12:17:56 -07003105 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
Paolo Bonzini69090812019-04-15 15:16:17 +02003106 } else {
Sean Christophersonca2f5462019-05-07 09:06:33 -07003107 /*
3108 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
3109 * force VM-Entry to fail.
3110 */
3111 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
Sean Christopherson55d23752018-12-03 13:53:18 -08003112 }
3113 }
3114
3115 if (nested_cpu_has_posted_intr(vmcs12)) {
KarimAllah Ahmed3278e042019-01-31 21:24:38 +01003116 map = &vmx->nested.pi_desc_map;
3117
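		/*
		 * The posted-interrupt descriptor is only 64-byte aligned,
		 * not page aligned, so preserve the offset within the mapped
		 * page for both the host pointer and the address written to
		 * vmcs02.
		 */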
3118 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
3119 vmx->nested.pi_desc =
3120 (struct pi_desc *)(((void *)map->hva) +
3121 offset_in_page(vmcs12->posted_intr_desc_addr));
3122 vmcs_write64(POSTED_INTR_DESC_ADDR,
3123 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
Sean Christopherson55d23752018-12-03 13:53:18 -08003124 }
Sean Christopherson55d23752018-12-03 13:53:18 -08003125 }
3126 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
Sean Christopherson2183f562019-05-07 12:17:56 -07003127 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
Sean Christopherson55d23752018-12-03 13:53:18 -08003128 else
Sean Christopherson2183f562019-05-07 12:17:56 -07003129 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
Jim Mattson671ddc72019-10-15 10:44:05 -07003130 return true;
Sean Christopherson55d23752018-12-03 13:53:18 -08003131}
3132
3133/*
3134 * Intel's VMX Instruction Reference specifies a common set of prerequisites
3135 * for running VMX instructions (except VMXON, whose prerequisites are
3136 * slightly different). It also specifies what exception to inject otherwise.
3137 * Note that many of these exceptions have priority over VM exits, so they
3138 * don't have to be checked again here.
3139 */
3140static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
3141{
3142 if (!to_vmx(vcpu)->nested.vmxon) {
3143 kvm_queue_exception(vcpu, UD_VECTOR);
3144 return 0;
3145 }
3146
3147 if (vmx_get_cpl(vcpu)) {
3148 kvm_inject_gp(vcpu, 0);
3149 return 0;
3150 }
3151
3152 return 1;
3153}
3154
3155static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
3156{
3157 u8 rvi = vmx_get_rvi();
3158 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
3159
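	/*
	 * An interrupt is deliverable only if its priority class (upper
	 * nibble of the vector, here RVI) is higher than the processor
	 * priority class (upper nibble of VPPR).
	 */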
3160 return ((rvi & 0xf0) > (vppr & 0xf0));
3161}
3162
3163static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3164 struct vmcs12 *vmcs12);
3165
3166/*
3167 * If from_vmentry is false, this is being called from state restore (either RSM
3168 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
Jim Mattson671ddc72019-10-15 10:44:05 -07003169 *
3170 * Returns:
3171 * NVMX_ENTRY_SUCCESS: Entered VMX non-root mode
3172 * NVMX_ENTRY_VMFAIL: Consistency check VMFail
3173 * NVMX_ENTRY_VMEXIT: Consistency check VMExit
3174 * NVMX_ENTRY_KVM_INTERNAL_ERROR: KVM internal error
Sean Christopherson55d23752018-12-03 13:53:18 -08003175 */
Jim Mattson671ddc72019-10-15 10:44:05 -07003176enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
3177 bool from_vmentry)
Sean Christopherson55d23752018-12-03 13:53:18 -08003178{
3179 struct vcpu_vmx *vmx = to_vmx(vcpu);
3180 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3181 bool evaluate_pending_interrupts;
3182 u32 exit_reason = EXIT_REASON_INVALID_STATE;
3183 u32 exit_qual;
3184
Sean Christopherson2183f562019-05-07 12:17:56 -07003185 evaluate_pending_interrupts = exec_controls_get(vmx) &
Sean Christopherson55d23752018-12-03 13:53:18 -08003186 (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
3187 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
3188 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
3189
3190 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
3191 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
3192 if (kvm_mpx_supported() &&
3193 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
3194 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3195
Sean Christophersonf087a022019-06-07 11:55:34 -07003196 /*
3197 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
3198 * nested early checks are disabled. In the event of a "late" VM-Fail,
3199 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
3200 * software model to the pre-VMEntry host state. When EPT is disabled,
3201 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
3202 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing
3203 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
3204 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested
3205 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
3206 * guaranteed to be overwritten with a shadow CR3 prior to re-entering
3207 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
3208 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
3209 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
3210 * path would need to manually save/restore vmcs01.GUEST_CR3.
3211 */
3212 if (!enable_ept && !nested_early_check)
3213 vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3214
Sean Christopherson55d23752018-12-03 13:53:18 -08003215 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
3216
3217 prepare_vmcs02_early(vmx, vmcs12);
3218
3219 if (from_vmentry) {
Jim Mattson671ddc72019-10-15 10:44:05 -07003220 if (unlikely(!nested_get_vmcs12_pages(vcpu)))
3221 return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
Sean Christopherson55d23752018-12-03 13:53:18 -08003222
3223 if (nested_vmx_check_vmentry_hw(vcpu)) {
3224 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
Jim Mattson671ddc72019-10-15 10:44:05 -07003225 return NVMX_VMENTRY_VMFAIL;
Sean Christopherson55d23752018-12-03 13:53:18 -08003226 }
3227
Sean Christopherson5478ba32019-04-11 12:18:06 -07003228 if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
Sean Christopherson55d23752018-12-03 13:53:18 -08003229 goto vmentry_fail_vmexit;
3230 }
3231
3232 enter_guest_mode(vcpu);
3233 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
3234 vcpu->arch.tsc_offset += vmcs12->tsc_offset;
3235
3236 if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
3237 goto vmentry_fail_vmexit_guest_mode;
3238
3239 if (from_vmentry) {
3240 exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
3241 exit_qual = nested_vmx_load_msr(vcpu,
3242 vmcs12->vm_entry_msr_load_addr,
3243 vmcs12->vm_entry_msr_load_count);
3244 if (exit_qual)
3245 goto vmentry_fail_vmexit_guest_mode;
3246 } else {
3247 /*
3248 * The MMU is not initialized to point at the right entities yet and
3249 * "get pages" would need to read data from the guest (i.e. we will
3250 * need to perform gpa to hpa translation). Request a call
3251 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs
3252 * have already been set at vmentry time and should not be reset.
3253 */
3254 kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
3255 }
3256
3257 /*
3258 * If L1 had an IRQ/NMI pending when it executed
3259 * VMLAUNCH/VMRESUME that wasn't delivered because delivery was
3260 * disallowed (e.g. interrupts disabled), L0 needs to
3261 * evaluate whether this pending event should cause an exit from L2
3262 * to L1 or be delivered directly to L2 (e.g. in case L1 doesn't
3263 * intercept EXTERNAL_INTERRUPT).
3264 *
3265 * Usually this would be handled by the processor noticing an
3266 * IRQ/NMI window request, or checking RVI during evaluation of
3267 * pending virtual interrupts. However, this setting was done
3268 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
3269 * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
3270 */
3271 if (unlikely(evaluate_pending_interrupts))
3272 kvm_make_request(KVM_REQ_EVENT, vcpu);
3273
3274 /*
Paolo Bonzini359a6c32019-01-29 19:14:46 +01003275 * Do not start the preemption timer hrtimer until after we know
3276 * we are successful, so that only nested_vmx_vmexit needs to cancel
3277 * the timer.
3278 */
3279 vmx->nested.preemption_timer_expired = false;
3280 if (nested_cpu_has_preemption_timer(vmcs12))
3281 vmx_start_preemption_timer(vcpu);
3282
3283 /*
Sean Christopherson55d23752018-12-03 13:53:18 -08003284 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
3285 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
3286 * returned as far as L1 is concerned. It will only return (and set
3287 * the success flag) when L2 exits (see nested_vmx_vmexit()).
3288 */
Jim Mattson671ddc72019-10-15 10:44:05 -07003289 return NVMX_VMENTRY_SUCCESS;
Sean Christopherson55d23752018-12-03 13:53:18 -08003290
3291 /*
3292 * A failed consistency check that leads to a VMExit during L1's
3293 * VMEnter to L2 is a variation of a normal VMexit, as explained in
3294 * 26.7 "VM-entry failures during or after loading guest state".
3295 */
3296vmentry_fail_vmexit_guest_mode:
3297 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
3298 vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3299 leave_guest_mode(vcpu);
3300
3301vmentry_fail_vmexit:
3302 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3303
3304 if (!from_vmentry)
Jim Mattson671ddc72019-10-15 10:44:05 -07003305 return NVMX_VMENTRY_VMEXIT;
Sean Christopherson55d23752018-12-03 13:53:18 -08003306
3307 load_vmcs12_host_state(vcpu, vmcs12);
3308 vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
3309 vmcs12->exit_qualification = exit_qual;
3310 if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
Sean Christopherson3731905ef2019-05-07 08:36:27 -07003311 vmx->nested.need_vmcs12_to_shadow_sync = true;
Jim Mattson671ddc72019-10-15 10:44:05 -07003312 return NVMX_VMENTRY_VMEXIT;
Sean Christopherson55d23752018-12-03 13:53:18 -08003313}
3314
3315/*
3316 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
3317 * for running an L2 nested guest.
3318 */
3319static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
3320{
3321 struct vmcs12 *vmcs12;
Jim Mattson671ddc72019-10-15 10:44:05 -07003322 enum nvmx_vmentry_status status;
Sean Christopherson55d23752018-12-03 13:53:18 -08003323 struct vcpu_vmx *vmx = to_vmx(vcpu);
3324 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
Sean Christopherson55d23752018-12-03 13:53:18 -08003325
3326 if (!nested_vmx_check_permission(vcpu))
3327 return 1;
3328
Vitaly Kuznetsova21a39c2019-06-28 13:23:32 +02003329 if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch))
Sean Christopherson55d23752018-12-03 13:53:18 -08003330 return 1;
3331
3332 if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
3333 return nested_vmx_failInvalid(vcpu);
3334
3335 vmcs12 = get_vmcs12(vcpu);
3336
3337 /*
3338 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
3339 * that there *is* a valid VMCS pointer, RFLAGS.CF is set
3340 * rather than RFLAGS.ZF, and no error number is stored to the
3341 * VM-instruction error field.
3342 */
3343 if (vmcs12->hdr.shadow_vmcs)
3344 return nested_vmx_failInvalid(vcpu);
3345
3346 if (vmx->nested.hv_evmcs) {
3347 copy_enlightened_to_vmcs12(vmx);
3348 /* Enlightened VMCS doesn't have launch state */
3349 vmcs12->launch_state = !launch;
3350 } else if (enable_shadow_vmcs) {
3351 copy_shadow_to_vmcs12(vmx);
3352 }
3353
3354 /*
3355 * The nested entry process starts with enforcing various prerequisites
3356 * on vmcs12 as required by the Intel SDM, and acting appropriately when
3357 * they fail: As the SDM explains, some conditions should cause the
3358 * instruction to fail, while others will cause the instruction to seem
3359 * to succeed, but return an EXIT_REASON_INVALID_STATE.
3360 * To speed up the normal (success) code path, we should avoid checking
3361 * for misconfigurations which will anyway be caught by the processor
3362 * when using the merged vmcs02.
3363 */
3364 if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
3365 return nested_vmx_failValid(vcpu,
3366 VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
3367
3368 if (vmcs12->launch_state == launch)
3369 return nested_vmx_failValid(vcpu,
3370 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
3371 : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
3372
Paolo Bonzini98d9e852019-04-12 10:19:57 +02003373 if (nested_vmx_check_controls(vcpu, vmcs12))
3374 return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
Sean Christopherson5478ba32019-04-11 12:18:06 -07003375
Paolo Bonzini98d9e852019-04-12 10:19:57 +02003376 if (nested_vmx_check_host_state(vcpu, vmcs12))
3377 return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
Sean Christopherson55d23752018-12-03 13:53:18 -08003378
3379 /*
3380 * We're finally done with prerequisite checking, and can start with
3381 * the nested entry.
3382 */
3383 vmx->nested.nested_run_pending = 1;
Jim Mattson671ddc72019-10-15 10:44:05 -07003384 status = nested_vmx_enter_non_root_mode(vcpu, true);
3385 if (unlikely(status != NVMX_VMENTRY_SUCCESS))
3386 goto vmentry_failed;
Sean Christopherson55d23752018-12-03 13:53:18 -08003387
3388 /* Hide L1D cache contents from the nested guest. */
3389 vmx->vcpu.arch.l1tf_flush_l1d = true;
3390
3391 /*
3392 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
3393 * also be used as part of restoring nVMX state for
3394 * snapshot restore (migration).
3395 *
3396 * In this flow, it is assumed that vmcs12 cache was
3397 * transferred as part of captured nVMX state and should
3398 * therefore not be read from guest memory (which may not
3399 * exist on destination host yet).
3400 */
3401 nested_cache_shadow_vmcs12(vcpu, vmcs12);
3402
3403 /*
Jim Mattson9ebdfe52018-11-26 11:22:32 -08003404 * If we're entering a halted L2 vcpu and the L2 vcpu won't be
3405 * awakened by event injection or by an NMI-window VM-exit or
3406 * by an interrupt-window VM-exit, halt the vcpu.
Sean Christopherson55d23752018-12-03 13:53:18 -08003407 */
3408 if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
Jim Mattson9ebdfe52018-11-26 11:22:32 -08003409 !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
3410 !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) &&
3411 !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) &&
3412 (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
Sean Christopherson55d23752018-12-03 13:53:18 -08003413 vmx->nested.nested_run_pending = 0;
3414 return kvm_vcpu_halt(vcpu);
3415 }
3416 return 1;
Jim Mattson671ddc72019-10-15 10:44:05 -07003417
3418vmentry_failed:
3419 vmx->nested.nested_run_pending = 0;
3420 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
3421 return 0;
3422 if (status == NVMX_VMENTRY_VMEXIT)
3423 return 1;
3424 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
3425 return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
Sean Christopherson55d23752018-12-03 13:53:18 -08003426}
3427
3428/*
3429 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
3430 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
3431 * This function returns the new value we should put in vmcs12.guest_cr0.
3432 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
3433 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
3434 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
3435 * didn't trap the bit, because if L1 did, so would L0).
3436 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
3437 * been modified by L2, and L1 knows it. So just leave the old value of
3438 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
3439 * isn't relevant, because if L0 traps this bit it can set it to anything.
3440 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
3441 * changed these bits, and therefore they need to be updated, but L0
3442 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather
3443 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
3444 */
3445static inline unsigned long
3446vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3447{
3448 return
3449 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
3450 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
3451 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
3452 vcpu->arch.cr0_guest_owned_bits));
3453}
3454
3455static inline unsigned long
3456vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3457{
3458 return
3459 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
3460 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
3461 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
3462 vcpu->arch.cr4_guest_owned_bits));
3463}
3464
3465static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
3466 struct vmcs12 *vmcs12)
3467{
3468 u32 idt_vectoring;
3469 unsigned int nr;
3470
3471 if (vcpu->arch.exception.injected) {
3472 nr = vcpu->arch.exception.nr;
3473 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3474
3475 if (kvm_exception_is_soft(nr)) {
3476 vmcs12->vm_exit_instruction_len =
3477 vcpu->arch.event_exit_inst_len;
3478 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
3479 } else
3480 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
3481
3482 if (vcpu->arch.exception.has_error_code) {
3483 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
3484 vmcs12->idt_vectoring_error_code =
3485 vcpu->arch.exception.error_code;
3486 }
3487
3488 vmcs12->idt_vectoring_info_field = idt_vectoring;
3489 } else if (vcpu->arch.nmi_injected) {
3490 vmcs12->idt_vectoring_info_field =
3491 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
3492 } else if (vcpu->arch.interrupt.injected) {
3493 nr = vcpu->arch.interrupt.nr;
3494 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3495
3496 if (vcpu->arch.interrupt.soft) {
3497 idt_vectoring |= INTR_TYPE_SOFT_INTR;
3498 vmcs12->vm_entry_instruction_len =
3499 vcpu->arch.event_exit_inst_len;
3500 } else
3501 idt_vectoring |= INTR_TYPE_EXT_INTR;
3502
3503 vmcs12->idt_vectoring_info_field = idt_vectoring;
3504 }
3505}
3506
3507
3508static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
3509{
3510 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3511 gfn_t gfn;
3512
3513 /*
3514 * Don't need to mark the APIC access page dirty; it is never
3515 * written to by the CPU during APIC virtualization.
3516 */
3517
3518 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3519 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
3520 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3521 }
3522
3523 if (nested_cpu_has_posted_intr(vmcs12)) {
3524 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
3525 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3526 }
3527}
3528
3529static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
3530{
3531 struct vcpu_vmx *vmx = to_vmx(vcpu);
3532 int max_irr;
3533 void *vapic_page;
3534 u16 status;
3535
3536 if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
3537 return;
3538
3539 vmx->nested.pi_pending = false;
3540 if (!pi_test_and_clear_on(vmx->nested.pi_desc))
3541 return;
3542
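	/*
	 * Find the highest pending vector in the 256-bit PIR;
	 * find_last_bit() returns the bitmap size (256) if no bits are set.
	 */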
3543 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
3544 if (max_irr != 256) {
KarimAllah Ahmed96c66e82019-01-31 21:24:37 +01003545 vapic_page = vmx->nested.virtual_apic_map.hva;
3546 if (!vapic_page)
3547 return;
3548
Sean Christopherson55d23752018-12-03 13:53:18 -08003549 __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
3550 vapic_page, &max_irr);
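		/*
		 * RVI occupies the low byte of the guest interrupt status;
		 * bump it if the newly posted vector is higher so that it is
		 * evaluated for virtual interrupt delivery when L2 runs.
		 */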
Sean Christopherson55d23752018-12-03 13:53:18 -08003551 status = vmcs_read16(GUEST_INTR_STATUS);
3552 if ((u8)max_irr > ((u8)status & 0xff)) {
3553 status &= ~0xff;
3554 status |= (u8)max_irr;
3555 vmcs_write16(GUEST_INTR_STATUS, status);
3556 }
3557 }
3558
3559 nested_mark_vmcs12_pages_dirty(vcpu);
3560}
3561
3562static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
3563 unsigned long exit_qual)
3564{
3565 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3566 unsigned int nr = vcpu->arch.exception.nr;
3567 u32 intr_info = nr | INTR_INFO_VALID_MASK;
3568
3569 if (vcpu->arch.exception.has_error_code) {
3570 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
3571 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3572 }
3573
3574 if (kvm_exception_is_soft(nr))
3575 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3576 else
3577 intr_info |= INTR_TYPE_HARD_EXCEPTION;
3578
3579 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
3580 vmx_get_nmi_mask(vcpu))
3581 intr_info |= INTR_INFO_UNBLOCK_NMI;
3582
3583 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
3584}
3585
3586static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
3587{
3588 struct vcpu_vmx *vmx = to_vmx(vcpu);
3589 unsigned long exit_qual;
3590 bool block_nested_events =
3591 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
Liran Alon4b9852f2019-08-26 13:24:49 +03003592 struct kvm_lapic *apic = vcpu->arch.apic;
3593
3594 if (lapic_in_kernel(vcpu) &&
3595 test_bit(KVM_APIC_INIT, &apic->pending_events)) {
3596 if (block_nested_events)
3597 return -EBUSY;
Liran Alone64a8502019-11-11 14:16:05 +02003598 clear_bit(KVM_APIC_INIT, &apic->pending_events);
Liran Alon4b9852f2019-08-26 13:24:49 +03003599 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
3600 return 0;
3601 }
Sean Christopherson55d23752018-12-03 13:53:18 -08003602
3603 if (vcpu->arch.exception.pending &&
3604 nested_vmx_check_exception(vcpu, &exit_qual)) {
3605 if (block_nested_events)
3606 return -EBUSY;
3607 nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
3608 return 0;
3609 }
3610
3611 if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
3612 vmx->nested.preemption_timer_expired) {
3613 if (block_nested_events)
3614 return -EBUSY;
3615 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
3616 return 0;
3617 }
3618
3619 if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
3620 if (block_nested_events)
3621 return -EBUSY;
3622 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
3623 NMI_VECTOR | INTR_TYPE_NMI_INTR |
3624 INTR_INFO_VALID_MASK, 0);
3625 /*
3626 * The NMI-triggered VM exit counts as injection:
3627 * clear this one and block further NMIs.
3628 */
3629 vcpu->arch.nmi_pending = 0;
3630 vmx_set_nmi_mask(vcpu, true);
3631 return 0;
3632 }
3633
3634 if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
3635 nested_exit_on_intr(vcpu)) {
3636 if (block_nested_events)
3637 return -EBUSY;
3638 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
3639 return 0;
3640 }
3641
3642 vmx_complete_nested_posted_interrupt(vcpu);
3643 return 0;
3644}
3645
3646static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
3647{
3648 ktime_t remaining =
3649 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
3650 u64 value;
3651
3652 if (ktime_to_ns(remaining) <= 0)
3653 return 0;
3654
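	/*
	 * Convert the remaining wall-clock time to guest TSC ticks
	 * (ns * kHz / 10^6), then scale down by the emulated preemption
	 * timer rate, i.e. one timer tick per 2^5 TSC cycles.
	 */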
3655 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
3656 do_div(value, 1000000);
3657 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
3658}
3659
Sean Christopherson7952d762019-05-07 08:36:29 -07003660static bool is_vmcs12_ext_field(unsigned long field)
Sean Christopherson55d23752018-12-03 13:53:18 -08003661{
Sean Christopherson7952d762019-05-07 08:36:29 -07003662 switch (field) {
3663 case GUEST_ES_SELECTOR:
3664 case GUEST_CS_SELECTOR:
3665 case GUEST_SS_SELECTOR:
3666 case GUEST_DS_SELECTOR:
3667 case GUEST_FS_SELECTOR:
3668 case GUEST_GS_SELECTOR:
3669 case GUEST_LDTR_SELECTOR:
3670 case GUEST_TR_SELECTOR:
3671 case GUEST_ES_LIMIT:
3672 case GUEST_CS_LIMIT:
3673 case GUEST_SS_LIMIT:
3674 case GUEST_DS_LIMIT:
3675 case GUEST_FS_LIMIT:
3676 case GUEST_GS_LIMIT:
3677 case GUEST_LDTR_LIMIT:
3678 case GUEST_TR_LIMIT:
3679 case GUEST_GDTR_LIMIT:
3680 case GUEST_IDTR_LIMIT:
3681 case GUEST_ES_AR_BYTES:
3682 case GUEST_DS_AR_BYTES:
3683 case GUEST_FS_AR_BYTES:
3684 case GUEST_GS_AR_BYTES:
3685 case GUEST_LDTR_AR_BYTES:
3686 case GUEST_TR_AR_BYTES:
3687 case GUEST_ES_BASE:
3688 case GUEST_CS_BASE:
3689 case GUEST_SS_BASE:
3690 case GUEST_DS_BASE:
3691 case GUEST_FS_BASE:
3692 case GUEST_GS_BASE:
3693 case GUEST_LDTR_BASE:
3694 case GUEST_TR_BASE:
3695 case GUEST_GDTR_BASE:
3696 case GUEST_IDTR_BASE:
3697 case GUEST_PENDING_DBG_EXCEPTIONS:
3698 case GUEST_BNDCFGS:
3699 return true;
3700 default:
3701 break;
3702 }
Sean Christopherson55d23752018-12-03 13:53:18 -08003703
Sean Christopherson7952d762019-05-07 08:36:29 -07003704 return false;
3705}
3706
3707static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
3708 struct vmcs12 *vmcs12)
3709{
3710 struct vcpu_vmx *vmx = to_vmx(vcpu);
Sean Christopherson55d23752018-12-03 13:53:18 -08003711
3712 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
3713 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
3714 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
3715 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
3716 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
3717 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
3718 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
3719 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
3720 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
3721 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
3722 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
3723 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
3724 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
3725 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
3726 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
3727 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
3728 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
3729 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
3730 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
Sean Christopherson55d23752018-12-03 13:53:18 -08003731 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
3732 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
3733 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
3734 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
3735 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
3736 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
3737 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
3738 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
3739 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
3740 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
3741 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
3742 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
3743 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
3744 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
3745 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
Sean Christopherson7952d762019-05-07 08:36:29 -07003746 vmcs12->guest_pending_dbg_exceptions =
3747 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
3748 if (kvm_mpx_supported())
3749 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3750
3751 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
3752}
3753
3754static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
3755 struct vmcs12 *vmcs12)
3756{
3757 struct vcpu_vmx *vmx = to_vmx(vcpu);
3758 int cpu;
3759
3760 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
3761 return;
3762
3763
3764 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
3765
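	/*
	 * The "rare" guest fields can only be read via VMREAD from vmcs02,
	 * so temporarily make vmcs02 the loaded VMCS on this CPU, sync the
	 * fields, then switch back to vmcs01.
	 */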
3766 cpu = get_cpu();
3767 vmx->loaded_vmcs = &vmx->nested.vmcs02;
3768 vmx_vcpu_load(&vmx->vcpu, cpu);
3769
3770 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
3771
3772 vmx->loaded_vmcs = &vmx->vmcs01;
3773 vmx_vcpu_load(&vmx->vcpu, cpu);
3774 put_cpu();
3775}
3776
3777/*
3778 * Update the guest state fields of vmcs12 to reflect changes that
3779 * occurred while L2 was running. (The "IA-32e mode guest" bit of the
3780 * VM-entry controls is also updated, since this is really a guest
3781 * state bit.)
3782 */
3783static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3784{
3785 struct vcpu_vmx *vmx = to_vmx(vcpu);
3786
3787 if (vmx->nested.hv_evmcs)
3788 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
3789
3790 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs;
3791
3792 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
3793 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
3794
3795 vmcs12->guest_rsp = kvm_rsp_read(vcpu);
3796 vmcs12->guest_rip = kvm_rip_read(vcpu);
3797 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
3798
3799 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
3800 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
Sean Christopherson55d23752018-12-03 13:53:18 -08003801
Sean Christophersonde70d272019-05-07 09:06:36 -07003802 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
3803 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
3804 vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
Sean Christopherson55d23752018-12-03 13:53:18 -08003805
3806 vmcs12->guest_interruptibility_info =
3807 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
Sean Christopherson7952d762019-05-07 08:36:29 -07003808
Sean Christopherson55d23752018-12-03 13:53:18 -08003809 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
3810 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
3811 else
3812 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
3813
Paolo Bonzinib4b65b52019-01-29 19:12:35 +01003814 if (nested_cpu_has_preemption_timer(vmcs12) &&
3815 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
Sean Christopherson55d23752018-12-03 13:53:18 -08003816 vmcs12->vmx_preemption_timer_value =
3817 vmx_get_preemption_timer_value(vcpu);
Sean Christopherson55d23752018-12-03 13:53:18 -08003818
3819 /*
3820 * In some cases (usually, nested EPT), L2 is allowed to change its
3821 * own CR3 without exiting. If it has changed it, we must keep it.
3822 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
3823 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
3824 *
3825 * Additionally, restore L2's PDPTR to vmcs12.
3826 */
3827 if (enable_ept) {
3828 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
Sean Christophersonc7554efc2019-05-07 09:06:40 -07003829 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
3830 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
3831 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
3832 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
3833 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
3834 }
Sean Christopherson55d23752018-12-03 13:53:18 -08003835 }
3836
3837 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
3838
3839 if (nested_cpu_has_vid(vmcs12))
3840 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
3841
3842 vmcs12->vm_entry_controls =
3843 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
3844 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
3845
Sean Christopherson699a1ac2019-05-07 09:06:37 -07003846 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
Sean Christopherson55d23752018-12-03 13:53:18 -08003847 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
Sean Christopherson55d23752018-12-03 13:53:18 -08003848
Sean Christopherson55d23752018-12-03 13:53:18 -08003849 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
3850 vmcs12->guest_ia32_efer = vcpu->arch.efer;
Sean Christopherson55d23752018-12-03 13:53:18 -08003851}
3852
3853/*
3854 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
3855 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
3856 * and this function updates it to reflect the changes to the guest state while
3857 * L2 was running (and perhaps took some exits that were handled directly by L0
3858 * without going back to L1), and to reflect the exit reason.
3859 * Note that we do not have to copy here all VMCS fields, just those that
3860 * could have changed by the L2 guest or the exit - i.e., the guest-state and
3861 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
3862 * which already writes to vmcs12 directly.
3863 */
3864static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
3865 u32 exit_reason, u32 exit_intr_info,
3866 unsigned long exit_qualification)
3867{
Sean Christopherson55d23752018-12-03 13:53:18 -08003868 /* update exit information fields: */
Sean Christopherson55d23752018-12-03 13:53:18 -08003869 vmcs12->vm_exit_reason = exit_reason;
3870 vmcs12->exit_qualification = exit_qualification;
3871 vmcs12->vm_exit_intr_info = exit_intr_info;
3872
3873 vmcs12->idt_vectoring_info_field = 0;
3874 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3875 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
3876
3877 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
3878 vmcs12->launch_state = 1;
3879
3880 /* vm_entry_intr_info_field is cleared on exit. Emulate this
3881 * instead of reading the real value. */
3882 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
3883
3884 /*
3885 * Transfer the event that L0 or L1 may have wanted to inject into
3886 * L2 to IDT_VECTORING_INFO_FIELD.
3887 */
3888 vmcs12_save_pending_event(vcpu, vmcs12);
Krish Sadhukhana0d4f802018-12-04 19:00:13 -05003889
3890 /*
3891 * According to spec, there's no need to store the guest's
3892 * MSRs if the exit is due to a VM-entry failure that occurs
3893 * during or after loading the guest state. Since this exit
3894 * does not fall in that category, we need to save the MSRs.
3895 */
3896 if (nested_vmx_store_msr(vcpu,
3897 vmcs12->vm_exit_msr_store_addr,
3898 vmcs12->vm_exit_msr_store_count))
3899 nested_vmx_abort(vcpu,
3900 VMX_ABORT_SAVE_GUEST_MSR_FAIL);
Sean Christopherson55d23752018-12-03 13:53:18 -08003901 }
3902
3903 /*
3904 * Drop what we picked up for L2 via vmx_complete_interrupts. It is
3905 * preserved above and would only end up incorrectly in L1.
3906 */
3907 vcpu->arch.nmi_injected = false;
3908 kvm_clear_exception_queue(vcpu);
3909 kvm_clear_interrupt_queue(vcpu);
3910}
3911
3912/*
3913 * A part of what we need to do when the nested L2 guest exits and we want to
3914 * run its L1 parent, is to reset L1's guest state to the host state specified
3915 * in vmcs12.
3916 * This function is to be called not only on normal nested exit, but also on
3917 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
3918 * Failures During or After Loading Guest State").
3919 * This function should be called when the active VMCS is L1's (vmcs01).
3920 */
3921static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3922 struct vmcs12 *vmcs12)
3923{
3924 struct kvm_segment seg;
3925 u32 entry_failure_code;
3926
3927 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
3928 vcpu->arch.efer = vmcs12->host_ia32_efer;
3929 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
3930 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
3931 else
3932 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
3933 vmx_set_efer(vcpu, vcpu->arch.efer);
3934
Paolo Bonzinie9c16c72019-04-30 22:07:26 +02003935 kvm_rsp_write(vcpu, vmcs12->host_rsp);
3936 kvm_rip_write(vcpu, vmcs12->host_rip);
Sean Christopherson55d23752018-12-03 13:53:18 -08003937 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
3938 vmx_set_interrupt_shadow(vcpu, 0);
3939
3940 /*
3941 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
3942 * actually changed, because vmx_set_cr0 refers to efer set above.
3943 *
3944 * CR0_GUEST_HOST_MASK is already set in the original vmcs01
3945 * (KVM doesn't change it);
3946 */
3947 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
3948 vmx_set_cr0(vcpu, vmcs12->host_cr0);
3949
3950 /* Same as above - no reason to call set_cr4_guest_host_mask(). */
3951 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
3952 vmx_set_cr4(vcpu, vmcs12->host_cr4);
3953
3954 nested_ept_uninit_mmu_context(vcpu);
3955
3956 /*
3957 * Only PDPTE load can fail as the value of cr3 was checked on entry and
3958 * couldn't have changed.
3959 */
3960 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
3961 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
3962
3963 if (!enable_ept)
3964 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
3965
3966 /*
3967 * If vmcs01 doesn't use VPID, CPU flushes TLB on every
3968 * VMEntry/VMExit. Thus, no need to flush TLB.
3969 *
3970 * If vmcs12 doesn't use VPID, L1 expects TLB to be
3971 * flushed on every VMEntry/VMExit.
3972 *
3973 * Otherwise, we can preserve TLB entries as long as we are
3974 * able to tag L1 TLB entries differently than L2 TLB entries.
3975 *
3976 * If vmcs12 uses EPT, we need to execute this flush on EPTP01
3977 * and therefore we request the TLB flush to happen only after VMCS EPTP
3978 * has been set by KVM_REQ_LOAD_CR3.
3979 */
3980 if (enable_vpid &&
3981 (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
3982 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3983 }
3984
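	/*
	 * Emulate the host state that is loaded on VM-exit: the SYSENTER
	 * MSRs and descriptor-table bases come from vmcs12's host-state
	 * area, while the GDTR and IDTR limits are architecturally set to
	 * 0xFFFF.
	 */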
3985 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
3986 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
3987 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
3988 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
3989 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
3990 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
3991 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
3992
3993 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
3994 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
3995 vmcs_write64(GUEST_BNDCFGS, 0);
3996
3997 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
3998 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
3999 vcpu->arch.pat = vmcs12->host_ia32_pat;
4000 }
4001 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
Oliver Upton458151f2019-11-13 16:17:18 -08004002 SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
4003 vmcs12->host_ia32_perf_global_ctrl);
Sean Christopherson55d23752018-12-03 13:53:18 -08004004
4005 /* Set L1 segment info according to Intel SDM
4006 27.5.2 Loading Host Segment and Descriptor-Table Registers */
4007 seg = (struct kvm_segment) {
4008 .base = 0,
4009 .limit = 0xFFFFFFFF,
4010 .selector = vmcs12->host_cs_selector,
4011 .type = 11,
4012 .present = 1,
4013 .s = 1,
4014 .g = 1
4015 };
4016 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4017 seg.l = 1;
4018 else
4019 seg.db = 1;
4020 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
4021 seg = (struct kvm_segment) {
4022 .base = 0,
4023 .limit = 0xFFFFFFFF,
4024 .type = 3,
4025 .present = 1,
4026 .s = 1,
4027 .db = 1,
4028 .g = 1
4029 };
4030 seg.selector = vmcs12->host_ds_selector;
4031 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
4032 seg.selector = vmcs12->host_es_selector;
4033 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
4034 seg.selector = vmcs12->host_ss_selector;
4035 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
4036 seg.selector = vmcs12->host_fs_selector;
4037 seg.base = vmcs12->host_fs_base;
4038 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
4039 seg.selector = vmcs12->host_gs_selector;
4040 seg.base = vmcs12->host_gs_base;
4041 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
4042 seg = (struct kvm_segment) {
4043 .base = vmcs12->host_tr_base,
4044 .limit = 0x67,
4045 .selector = vmcs12->host_tr_selector,
4046 .type = 11,
4047 .present = 1
4048 };
4049 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
4050
4051 kvm_set_dr(vcpu, 7, 0x400);
4052 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4053
4054 if (cpu_has_vmx_msr_bitmap())
4055 vmx_update_msr_bitmap(vcpu);
4056
4057 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
4058 vmcs12->vm_exit_msr_load_count))
4059 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4060}
4061
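/*
 * Sketch of the lookup order in the helper below: GUEST_IA32_EFER when vmcs01
 * uses the "load EFER" entry control; host_efer when the CPU has the dedicated
 * load-EFER control but it wasn't used (implying guest and host EFER matched);
 * otherwise the MSR autoload list, then the shared MSR array, and finally
 * host_efer as a last resort.
 */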
4062static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
4063{
4064 struct shared_msr_entry *efer_msr;
4065 unsigned int i;
4066
4067 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
4068 return vmcs_read64(GUEST_IA32_EFER);
4069
4070 if (cpu_has_load_ia32_efer())
4071 return host_efer;
4072
4073 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
4074 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
4075 return vmx->msr_autoload.guest.val[i].value;
4076 }
4077
4078 efer_msr = find_msr_entry(vmx, MSR_EFER);
4079 if (efer_msr)
4080 return efer_msr->data;
4081
4082 return host_efer;
4083}
4084
4085static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
4086{
4087 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4088 struct vcpu_vmx *vmx = to_vmx(vcpu);
4089 struct vmx_msr_entry g, h;
Sean Christopherson55d23752018-12-03 13:53:18 -08004090 gpa_t gpa;
4091 u32 i, j;
4092
4093 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
4094
4095 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
4096 /*
4097 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
4098 * as vmcs01.GUEST_DR7 contains a userspace defined value
4099 * and vcpu->arch.dr7 is not squirreled away before the
4100 * nested VMENTER (not worth adding a variable in nested_vmx).
4101 */
4102 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
4103 kvm_set_dr(vcpu, 7, DR7_FIXED_1);
4104 else
4105 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
4106 }
4107
4108 /*
4109 * Note that calling vmx_set_{efer,cr0,cr4} is important as they
4110 * handle a variety of side effects to KVM's software model.
4111 */
4112 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
4113
4114 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
4115 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
4116
4117 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4118 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
4119
4120 nested_ept_uninit_mmu_context(vcpu);
Sean Christophersonf087a022019-06-07 11:55:34 -07004121 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
Sean Christophersoncb3c1e22019-09-27 14:45:22 -07004122 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
Sean Christopherson55d23752018-12-03 13:53:18 -08004123
4124 /*
4125 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
4126 * from vmcs01 (if necessary). The PDPTRs are not loaded on
 4127 * VMFail; like everything else, we just need to ensure our
4128 * software model is up-to-date.
4129 */
Sean Christophersonf087a022019-06-07 11:55:34 -07004130 if (enable_ept)
4131 ept_save_pdptrs(vcpu);
Sean Christopherson55d23752018-12-03 13:53:18 -08004132
4133 kvm_mmu_reset_context(vcpu);
4134
4135 if (cpu_has_vmx_msr_bitmap())
4136 vmx_update_msr_bitmap(vcpu);
4137
4138 /*
4139 * This nasty bit of open coding is a compromise between blindly
4140 * loading L1's MSRs using the exit load lists (incorrect emulation
4141 * of VMFail), leaving the nested VM's MSRs in the software model
4142 * (incorrect behavior) and snapshotting the modified MSRs (too
 4143 * expensive since the lists are unbounded by hardware). For each
4144 * MSR that was (prematurely) loaded from the nested VMEntry load
4145 * list, reload it from the exit load list if it exists and differs
4146 * from the guest value. The intent is to stuff host state as
4147 * silently as possible, not to fully process the exit load list.
4148 */
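	/*
	 * In outline: for each entry 'g' in the VM-entry MSR-load list, look
	 * for a matching entry 'h' in the VM-exit MSR-load list; if one exists
	 * with a different value and it passes the load checks, write h.value
	 * back with kvm_set_msr().
	 */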
Sean Christopherson55d23752018-12-03 13:53:18 -08004149 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
4150 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
4151 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
4152 pr_debug_ratelimited(
4153 "%s read MSR index failed (%u, 0x%08llx)\n",
4154 __func__, i, gpa);
4155 goto vmabort;
4156 }
4157
4158 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
4159 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
4160 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
4161 pr_debug_ratelimited(
4162 "%s read MSR failed (%u, 0x%08llx)\n",
4163 __func__, j, gpa);
4164 goto vmabort;
4165 }
4166 if (h.index != g.index)
4167 continue;
4168 if (h.value == g.value)
4169 break;
4170
4171 if (nested_vmx_load_msr_check(vcpu, &h)) {
4172 pr_debug_ratelimited(
4173 "%s check failed (%u, 0x%x, 0x%x)\n",
4174 __func__, j, h.index, h.reserved);
4175 goto vmabort;
4176 }
4177
Sean Christophersonf20935d2019-09-05 14:22:54 -07004178 if (kvm_set_msr(vcpu, h.index, h.value)) {
Sean Christopherson55d23752018-12-03 13:53:18 -08004179 pr_debug_ratelimited(
4180 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
4181 __func__, j, h.index, h.value);
4182 goto vmabort;
4183 }
4184 }
4185 }
4186
4187 return;
4188
4189vmabort:
4190 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4191}
4192
4193/*
4194 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
4195 * and modify vmcs12 to make it see what it would expect to see there if
 4196 * L2 were its real guest. Must only be called when in L2 (is_guest_mode()).
4197 */
4198void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
4199 u32 exit_intr_info, unsigned long exit_qualification)
4200{
4201 struct vcpu_vmx *vmx = to_vmx(vcpu);
4202 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4203
4204 /* trying to cancel vmlaunch/vmresume is a bug */
4205 WARN_ON_ONCE(vmx->nested.nested_run_pending);
4206
4207 leave_guest_mode(vcpu);
4208
Paolo Bonzinib4b65b52019-01-29 19:12:35 +01004209 if (nested_cpu_has_preemption_timer(vmcs12))
4210 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
4211
Sean Christopherson55d23752018-12-03 13:53:18 -08004212 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
4213 vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
4214
4215 if (likely(!vmx->fail)) {
Sean Christopherson3731905ef2019-05-07 08:36:27 -07004216 sync_vmcs02_to_vmcs12(vcpu, vmcs12);
Sean Christophersonf4f83162019-05-07 08:36:26 -07004217
4218 if (exit_reason != -1)
Sean Christopherson55d23752018-12-03 13:53:18 -08004219 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
4220 exit_qualification);
4221
4222 /*
Sean Christopherson3731905ef2019-05-07 08:36:27 -07004223 * Must happen outside of sync_vmcs02_to_vmcs12() as it will
Sean Christopherson55d23752018-12-03 13:53:18 -08004224 * also be used to capture vmcs12 cache as part of
4225 * capturing nVMX state for snapshot (migration).
4226 *
4227 * Otherwise, this flush will dirty guest memory at a
4228 * point it is already assumed by user-space to be
4229 * immutable.
4230 */
4231 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
Sean Christopherson55d23752018-12-03 13:53:18 -08004232 } else {
4233 /*
4234 * The only expected VM-instruction error is "VM entry with
4235 * invalid control field(s)." Anything else indicates a
4236 * problem with L0. And we should never get here with a
4237 * VMFail of any type if early consistency checks are enabled.
4238 */
4239 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
4240 VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4241 WARN_ON_ONCE(nested_early_check);
4242 }
4243
4244 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
4245
4246 /* Update any VMCS fields that might have changed while L2 ran */
4247 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
4248 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
4249 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
Liran Alon02d496cf2019-11-11 14:30:55 +02004250 if (vmx->nested.l1_tpr_threshold != -1)
4251 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
Sean Christopherson55d23752018-12-03 13:53:18 -08004252
4253 if (kvm_has_tsc_control)
4254 decache_tsc_multiplier(vmx);
4255
4256 if (vmx->nested.change_vmcs01_virtual_apic_mode) {
4257 vmx->nested.change_vmcs01_virtual_apic_mode = false;
4258 vmx_set_virtual_apic_mode(vcpu);
Sean Christopherson55d23752018-12-03 13:53:18 -08004259 }
4260
Sean Christopherson55d23752018-12-03 13:53:18 -08004261 /* Unpin physical memory we referred to in vmcs02 */
4262 if (vmx->nested.apic_access_page) {
Liran Alonb11494b2019-11-21 00:31:47 +02004263 kvm_release_page_clean(vmx->nested.apic_access_page);
Sean Christopherson55d23752018-12-03 13:53:18 -08004264 vmx->nested.apic_access_page = NULL;
4265 }
KarimAllah Ahmed96c66e82019-01-31 21:24:37 +01004266 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
KarimAllah Ahmed3278e042019-01-31 21:24:38 +01004267 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
4268 vmx->nested.pi_desc = NULL;
Sean Christopherson55d23752018-12-03 13:53:18 -08004269
4270 /*
 4271 * While L2 was running, the mmu_notifier may have forced a reload of the
 4272 * APIC-access page's hpa for the L2 vmcs; reload it for L1 before entering L1.
4273 */
4274 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4275
4276 if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
Sean Christopherson3731905ef2019-05-07 08:36:27 -07004277 vmx->nested.need_vmcs12_to_shadow_sync = true;
Sean Christopherson55d23752018-12-03 13:53:18 -08004278
4279 /* in case we halted in L2 */
4280 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4281
4282 if (likely(!vmx->fail)) {
4283 /*
4284 * TODO: SDM says that with acknowledge interrupt on
4285 * exit, bit 31 of the VM-exit interrupt information
4286 * (valid interrupt) is always set to 1 on
4287 * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
4288 * need kvm_cpu_has_interrupt(). See the commit
4289 * message for details.
4290 */
4291 if (nested_exit_intr_ack_set(vcpu) &&
4292 exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
4293 kvm_cpu_has_interrupt(vcpu)) {
4294 int irq = kvm_cpu_get_interrupt(vcpu);
4295 WARN_ON(irq < 0);
4296 vmcs12->vm_exit_intr_info = irq |
4297 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
4298 }
4299
4300 if (exit_reason != -1)
4301 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
4302 vmcs12->exit_qualification,
4303 vmcs12->idt_vectoring_info_field,
4304 vmcs12->vm_exit_intr_info,
4305 vmcs12->vm_exit_intr_error_code,
4306 KVM_ISA_VMX);
4307
4308 load_vmcs12_host_state(vcpu, vmcs12);
4309
4310 return;
4311 }
4312
4313 /*
4314 * After an early L2 VM-entry failure, we're now back
4315 * in L1 which thinks it just finished a VMLAUNCH or
4316 * VMRESUME instruction, so we need to set the failure
4317 * flag and the VM-instruction error field of the VMCS
4318 * accordingly, and skip the emulated instruction.
4319 */
4320 (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4321
4322 /*
4323 * Restore L1's host state to KVM's software model. We're here
4324 * because a consistency check was caught by hardware, which
4325 * means some amount of guest state has been propagated to KVM's
4326 * model and needs to be unwound to the host's state.
4327 */
4328 nested_vmx_restore_host_state(vcpu);
4329
4330 vmx->fail = 0;
4331}
4332
4333/*
4334 * Decode the memory-address operand of a vmx instruction, as recorded on an
4335 * exit caused by such an instruction (run by a guest hypervisor).
 4336 * On success, returns 0. When the operand is invalid, returns 1 and injects
 4337 * #UD, #GP(0), or #SS(0).
4338 */
4339int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
Eugene Korenevskyfdb28612019-06-06 00:19:16 +03004340 u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
Sean Christopherson55d23752018-12-03 13:53:18 -08004341{
4342 gva_t off;
4343 bool exn;
4344 struct kvm_segment s;
4345
4346 /*
4347 * According to Vol. 3B, "Information for VM Exits Due to Instruction
4348 * Execution", on an exit, vmx_instruction_info holds most of the
4349 * addressing components of the operand. Only the displacement part
4350 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
4351 * For how an actual address is calculated from all these components,
4352 * refer to Vol. 1, "Operand Addressing".
4353 */
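	/*
	 * Bit layout of vmx_instruction_info as decoded below (see the SDM's
	 * VM-exit instruction-information field format):
	 *   [1:0]   scaling            [9:7]   address size
	 *   [10]    mem/reg operand    [17:15] segment register
	 *   [21:18] index register     [22]    index register invalid
	 *   [26:23] base register      [27]    base register invalid
	 */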
4354 int scaling = vmx_instruction_info & 3;
4355 int addr_size = (vmx_instruction_info >> 7) & 7;
4356 bool is_reg = vmx_instruction_info & (1u << 10);
4357 int seg_reg = (vmx_instruction_info >> 15) & 7;
4358 int index_reg = (vmx_instruction_info >> 18) & 0xf;
4359 bool index_is_valid = !(vmx_instruction_info & (1u << 22));
4360 int base_reg = (vmx_instruction_info >> 23) & 0xf;
4361 bool base_is_valid = !(vmx_instruction_info & (1u << 27));
4362
4363 if (is_reg) {
4364 kvm_queue_exception(vcpu, UD_VECTOR);
4365 return 1;
4366 }
4367
4368 /* Addr = segment_base + offset */
4369 /* offset = base + [index * scale] + displacement */
4370 off = exit_qualification; /* holds the displacement */
Sean Christopherson946c5222019-01-23 14:39:23 -08004371 if (addr_size == 1)
4372 off = (gva_t)sign_extend64(off, 31);
4373 else if (addr_size == 0)
4374 off = (gva_t)sign_extend64(off, 15);
Sean Christopherson55d23752018-12-03 13:53:18 -08004375 if (base_is_valid)
4376 off += kvm_register_read(vcpu, base_reg);
4377 if (index_is_valid)
4378 off += kvm_register_read(vcpu, index_reg)<<scaling;
4379 vmx_get_segment(vcpu, &s, seg_reg);
Sean Christopherson55d23752018-12-03 13:53:18 -08004380
Sean Christopherson8570f9e2019-01-23 14:39:24 -08004381 /*
4382 * The effective address, i.e. @off, of a memory operand is truncated
4383 * based on the address size of the instruction. Note that this is
4384 * the *effective address*, i.e. the address prior to accounting for
4385 * the segment's base.
4386 */
Sean Christopherson55d23752018-12-03 13:53:18 -08004387 if (addr_size == 1) /* 32 bit */
Sean Christopherson8570f9e2019-01-23 14:39:24 -08004388 off &= 0xffffffff;
4389 else if (addr_size == 0) /* 16 bit */
4390 off &= 0xffff;
Sean Christopherson55d23752018-12-03 13:53:18 -08004391
4392 /* Checks for #GP/#SS exceptions. */
4393 exn = false;
4394 if (is_long_mode(vcpu)) {
Sean Christopherson8570f9e2019-01-23 14:39:24 -08004395 /*
4396 * The virtual/linear address is never truncated in 64-bit
4397 * mode, e.g. a 32-bit address size can yield a 64-bit virtual
4398 * address when using FS/GS with a non-zero base.
4399 */
Liran Alon6694e482019-07-15 18:47:44 +03004400 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
4401 *ret = s.base + off;
4402 else
4403 *ret = off;
Sean Christopherson8570f9e2019-01-23 14:39:24 -08004404
Sean Christopherson55d23752018-12-03 13:53:18 -08004405 /* Long mode: #GP(0)/#SS(0) if the memory address is in a
4406 * non-canonical form. This is the only check on the memory
4407 * destination for long mode!
4408 */
4409 exn = is_noncanonical_address(*ret, vcpu);
Paolo Bonzinie0dfacb2019-01-30 17:25:38 +01004410 } else {
Sean Christopherson8570f9e2019-01-23 14:39:24 -08004411 /*
4412 * When not in long mode, the virtual/linear address is
4413 * unconditionally truncated to 32 bits regardless of the
4414 * address size.
4415 */
4416 *ret = (s.base + off) & 0xffffffff;
4417
Sean Christopherson55d23752018-12-03 13:53:18 -08004418 /* Protected mode: apply checks for segment validity in the
4419 * following order:
4420 * - segment type check (#GP(0) may be thrown)
4421 * - usability check (#GP(0)/#SS(0))
4422 * - limit check (#GP(0)/#SS(0))
4423 */
4424 if (wr)
4425 /* #GP(0) if the destination operand is located in a
4426 * read-only data segment or any code segment.
4427 */
4428 exn = ((s.type & 0xa) == 0 || (s.type & 8));
4429 else
4430 /* #GP(0) if the source operand is located in an
4431 * execute-only code segment
4432 */
4433 exn = ((s.type & 0xa) == 8);
4434 if (exn) {
4435 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
4436 return 1;
4437 }
4438 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
4439 */
4440 exn = (s.unusable != 0);
Sean Christopherson34333cc2019-01-23 14:39:25 -08004441
4442 /*
4443 * Protected mode: #GP(0)/#SS(0) if the memory operand is
4444 * outside the segment limit. All CPUs that support VMX ignore
4445 * limit checks for flat segments, i.e. segments with base==0,
4446 * limit==0xffffffff and of type expand-up data or code.
Sean Christopherson55d23752018-12-03 13:53:18 -08004447 */
Sean Christopherson34333cc2019-01-23 14:39:25 -08004448 if (!(s.base == 0 && s.limit == 0xffffffff &&
4449 ((s.type & 8) || !(s.type & 4))))
Eugene Korenevskyfdb28612019-06-06 00:19:16 +03004450 exn = exn || ((u64)off + len - 1 > s.limit);
Sean Christopherson55d23752018-12-03 13:53:18 -08004451 }
4452 if (exn) {
4453 kvm_queue_exception_e(vcpu,
4454 seg_reg == VCPU_SREG_SS ?
4455 SS_VECTOR : GP_VECTOR,
4456 0);
4457 return 1;
4458 }
4459
4460 return 0;
4461}
4462
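/*
 * Advertise the "load IA32_PERF_GLOBAL_CTRL" VM-entry/VM-exit controls to L1
 * only when the vPMU actually implements MSR_CORE_PERF_GLOBAL_CTRL for this
 * vCPU, so that L1 is never offered a control KVM cannot emulate.
 */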
Oliver Upton03a8871a2019-11-13 16:17:20 -08004463void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
4464{
4465 struct vcpu_vmx *vmx;
4466
4467 if (!nested_vmx_allowed(vcpu))
4468 return;
4469
4470 vmx = to_vmx(vcpu);
4471 if (kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) {
4472 vmx->nested.msrs.entry_ctls_high |=
4473 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4474 vmx->nested.msrs.exit_ctls_high |=
4475 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4476 } else {
4477 vmx->nested.msrs.entry_ctls_high &=
4478 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4479 vmx->nested.msrs.exit_ctls_high &=
 4480 ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4481 }
4482}
4483
Sean Christopherson55d23752018-12-03 13:53:18 -08004484static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
4485{
4486 gva_t gva;
4487 struct x86_exception e;
4488
4489 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
Eugene Korenevskyfdb28612019-06-06 00:19:16 +03004490 vmcs_read32(VMX_INSTRUCTION_INFO), false,
4491 sizeof(*vmpointer), &gva))
Sean Christopherson55d23752018-12-03 13:53:18 -08004492 return 1;
4493
4494 if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
4495 kvm_inject_page_fault(vcpu, &e);
4496 return 1;
4497 }
4498
4499 return 0;
4500}
4501
4502/*
4503 * Allocate a shadow VMCS and associate it with the currently loaded
4504 * VMCS, unless such a shadow VMCS already exists. The newly allocated
4505 * VMCS is also VMCLEARed, so that it is ready for use.
4506 */
4507static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
4508{
4509 struct vcpu_vmx *vmx = to_vmx(vcpu);
4510 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
4511
4512 /*
4513 * We should allocate a shadow vmcs for vmcs01 only when L1
4514 * executes VMXON and free it when L1 executes VMXOFF.
4515 * As it is invalid to execute VMXON twice, we shouldn't reach
 4516 * here when vmcs01 already has an allocated shadow vmcs.
4517 */
4518 WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
4519
4520 if (!loaded_vmcs->shadow_vmcs) {
4521 loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
4522 if (loaded_vmcs->shadow_vmcs)
4523 vmcs_clear(loaded_vmcs->shadow_vmcs);
4524 }
4525 return loaded_vmcs->shadow_vmcs;
4526}
4527
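/*
 * Roughly, emulated VMXON sets up everything a nested run will need: vmcs02,
 * the cached vmcs12 and shadow vmcs12 copies, an optional shadow VMCS for
 * vmcs01, the emulated VMX-preemption timer, and vpid02.
 */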
4528static int enter_vmx_operation(struct kvm_vcpu *vcpu)
4529{
4530 struct vcpu_vmx *vmx = to_vmx(vcpu);
4531 int r;
4532
4533 r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
4534 if (r < 0)
4535 goto out_vmcs02;
4536
Ben Gardon41836832019-02-11 11:02:52 -08004537 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
Sean Christopherson55d23752018-12-03 13:53:18 -08004538 if (!vmx->nested.cached_vmcs12)
4539 goto out_cached_vmcs12;
4540
Ben Gardon41836832019-02-11 11:02:52 -08004541 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
Sean Christopherson55d23752018-12-03 13:53:18 -08004542 if (!vmx->nested.cached_shadow_vmcs12)
4543 goto out_cached_shadow_vmcs12;
4544
4545 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
4546 goto out_shadow_vmcs;
4547
4548 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
4549 HRTIMER_MODE_REL_PINNED);
4550 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
4551
4552 vmx->nested.vpid02 = allocate_vpid();
4553
4554 vmx->nested.vmcs02_initialized = false;
4555 vmx->nested.vmxon = true;
Luwei Kangee85dec2018-10-24 16:05:16 +08004556
4557 if (pt_mode == PT_MODE_HOST_GUEST) {
4558 vmx->pt_desc.guest.ctl = 0;
4559 pt_update_intercept_for_msr(vmx);
4560 }
4561
Sean Christopherson55d23752018-12-03 13:53:18 -08004562 return 0;
4563
4564out_shadow_vmcs:
4565 kfree(vmx->nested.cached_shadow_vmcs12);
4566
4567out_cached_shadow_vmcs12:
4568 kfree(vmx->nested.cached_vmcs12);
4569
4570out_cached_vmcs12:
4571 free_loaded_vmcs(&vmx->nested.vmcs02);
4572
4573out_vmcs02:
4574 return -ENOMEM;
4575}
4576
4577/*
4578 * Emulate the VMXON instruction.
4579 * Currently, we just remember that VMX is active, and do not save or even
4580 * inspect the argument to VMXON (the so-called "VMXON pointer") because we
4581 * do not currently need to store anything in that guest-allocated memory
4582 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their
4583 * argument is different from the VMXON pointer (which the spec says they do).
4584 */
4585static int handle_vmon(struct kvm_vcpu *vcpu)
4586{
4587 int ret;
4588 gpa_t vmptr;
KarimAllah Ahmed2e408932019-01-31 21:24:31 +01004589 uint32_t revision;
Sean Christopherson55d23752018-12-03 13:53:18 -08004590 struct vcpu_vmx *vmx = to_vmx(vcpu);
Sean Christopherson32ad73d2019-12-20 20:44:55 -08004591 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED
4592 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
Sean Christopherson55d23752018-12-03 13:53:18 -08004593
4594 /*
4595 * The Intel VMX Instruction Reference lists a bunch of bits that are
4596 * prerequisite to running VMXON, most notably cr4.VMXE must be set to
4597 * 1 (see vmx_set_cr4() for when we allow the guest to set this).
4598 * Otherwise, we should fail with #UD. But most faulting conditions
4599 * have already been checked by hardware, prior to the VM-exit for
4600 * VMXON. We do test guest cr4.VMXE because processor CR4 always has
4601 * that bit set to 1 in non-root mode.
4602 */
4603 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
4604 kvm_queue_exception(vcpu, UD_VECTOR);
4605 return 1;
4606 }
4607
4608 /* CPL=0 must be checked manually. */
4609 if (vmx_get_cpl(vcpu)) {
4610 kvm_inject_gp(vcpu, 0);
4611 return 1;
4612 }
4613
4614 if (vmx->nested.vmxon)
4615 return nested_vmx_failValid(vcpu,
4616 VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
4617
4618 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
4619 != VMXON_NEEDED_FEATURES) {
4620 kvm_inject_gp(vcpu, 0);
4621 return 1;
4622 }
4623
4624 if (nested_vmx_get_vmptr(vcpu, &vmptr))
4625 return 1;
4626
4627 /*
4628 * SDM 3: 24.11.5
 4629 * The first 4 bytes of the VMXON region contain the supported
 4630 * VMCS revision identifier.
 4631 *
 4632 * Note: IA32_VMX_BASIC[48] will never be 1 for the nested case; if it
 4633 * were, it would limit the VMXON pointer to a 32-bit physical address.
4634 */
KarimAllah Ahmede0bf2662019-01-31 21:24:43 +01004635 if (!page_address_valid(vcpu, vmptr))
Sean Christopherson55d23752018-12-03 13:53:18 -08004636 return nested_vmx_failInvalid(vcpu);
4637
KarimAllah Ahmed2e408932019-01-31 21:24:31 +01004638 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
4639 revision != VMCS12_REVISION)
Sean Christopherson55d23752018-12-03 13:53:18 -08004640 return nested_vmx_failInvalid(vcpu);
4641
Sean Christopherson55d23752018-12-03 13:53:18 -08004642 vmx->nested.vmxon_ptr = vmptr;
4643 ret = enter_vmx_operation(vcpu);
4644 if (ret)
4645 return ret;
4646
4647 return nested_vmx_succeed(vcpu);
4648}
4649
4650static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
4651{
4652 struct vcpu_vmx *vmx = to_vmx(vcpu);
4653
4654 if (vmx->nested.current_vmptr == -1ull)
4655 return;
4656
Sean Christopherson7952d762019-05-07 08:36:29 -07004657 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
4658
Sean Christopherson55d23752018-12-03 13:53:18 -08004659 if (enable_shadow_vmcs) {
 4660 /* Copy to memory all shadowed fields in case they were modified. */
4662 copy_shadow_to_vmcs12(vmx);
Sean Christopherson55d23752018-12-03 13:53:18 -08004663 vmx_disable_shadow_vmcs(vmx);
4664 }
4665 vmx->nested.posted_intr_nv = -1;
4666
4667 /* Flush VMCS12 to guest memory */
4668 kvm_vcpu_write_guest_page(vcpu,
4669 vmx->nested.current_vmptr >> PAGE_SHIFT,
4670 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
4671
4672 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
4673
4674 vmx->nested.current_vmptr = -1ull;
4675}
4676
4677/* Emulate the VMXOFF instruction */
4678static int handle_vmoff(struct kvm_vcpu *vcpu)
4679{
4680 if (!nested_vmx_check_permission(vcpu))
4681 return 1;
Liran Alon4b9852f2019-08-26 13:24:49 +03004682
Sean Christopherson55d23752018-12-03 13:53:18 -08004683 free_nested(vcpu);
Liran Alon4b9852f2019-08-26 13:24:49 +03004684
 4685 /* Process an INIT that was latched while the CPU was in VMX operation */
4686 kvm_make_request(KVM_REQ_EVENT, vcpu);
4687
Sean Christopherson55d23752018-12-03 13:53:18 -08004688 return nested_vmx_succeed(vcpu);
4689}
4690
4691/* Emulate the VMCLEAR instruction */
4692static int handle_vmclear(struct kvm_vcpu *vcpu)
4693{
4694 struct vcpu_vmx *vmx = to_vmx(vcpu);
4695 u32 zero = 0;
4696 gpa_t vmptr;
Vitaly Kuznetsov11e34912019-06-28 13:23:33 +02004697 u64 evmcs_gpa;
Sean Christopherson55d23752018-12-03 13:53:18 -08004698
4699 if (!nested_vmx_check_permission(vcpu))
4700 return 1;
4701
4702 if (nested_vmx_get_vmptr(vcpu, &vmptr))
4703 return 1;
4704
KarimAllah Ahmede0bf2662019-01-31 21:24:43 +01004705 if (!page_address_valid(vcpu, vmptr))
Sean Christopherson55d23752018-12-03 13:53:18 -08004706 return nested_vmx_failValid(vcpu,
4707 VMXERR_VMCLEAR_INVALID_ADDRESS);
4708
4709 if (vmptr == vmx->nested.vmxon_ptr)
4710 return nested_vmx_failValid(vcpu,
4711 VMXERR_VMCLEAR_VMXON_POINTER);
4712
Vitaly Kuznetsov11e34912019-06-28 13:23:33 +02004713 /*
4714 * When Enlightened VMEntry is enabled on the calling CPU we treat
 4715 * the memory area pointed to by vmptr as an Enlightened VMCS (as there's no good
4716 * way to distinguish it from VMCS12) and we must not corrupt it by
4717 * writing to the non-existent 'launch_state' field. The area doesn't
4718 * have to be the currently active EVMCS on the calling CPU and there's
4719 * nothing KVM has to do to transition it from 'active' to 'non-active'
4720 * state. It is possible that the area will stay mapped as
4721 * vmx->nested.hv_evmcs but this shouldn't be a problem.
4722 */
4723 if (likely(!vmx->nested.enlightened_vmcs_enabled ||
4724 !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
Sean Christopherson55d23752018-12-03 13:53:18 -08004725 if (vmptr == vmx->nested.current_vmptr)
4726 nested_release_vmcs12(vcpu);
4727
4728 kvm_vcpu_write_guest(vcpu,
4729 vmptr + offsetof(struct vmcs12,
4730 launch_state),
4731 &zero, sizeof(zero));
4732 }
4733
4734 return nested_vmx_succeed(vcpu);
4735}
4736
4737static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
4738
4739/* Emulate the VMLAUNCH instruction */
4740static int handle_vmlaunch(struct kvm_vcpu *vcpu)
4741{
4742 return nested_vmx_run(vcpu, true);
4743}
4744
4745/* Emulate the VMRESUME instruction */
4746static int handle_vmresume(struct kvm_vcpu *vcpu)
4747{
4748
4749 return nested_vmx_run(vcpu, false);
4750}
4751
4752static int handle_vmread(struct kvm_vcpu *vcpu)
4753{
4754 unsigned long field;
4755 u64 field_value;
4756 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4757 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
Eugene Korenevskyfdb28612019-06-06 00:19:16 +03004758 int len;
Sean Christopherson55d23752018-12-03 13:53:18 -08004759 gva_t gva = 0;
4760 struct vmcs12 *vmcs12;
Paolo Bonzinif7eea632019-09-14 00:26:27 +02004761 struct x86_exception e;
Sean Christopherson1c6f0b42019-05-07 08:36:25 -07004762 short offset;
Sean Christopherson55d23752018-12-03 13:53:18 -08004763
4764 if (!nested_vmx_check_permission(vcpu))
4765 return 1;
4766
4767 if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
4768 return nested_vmx_failInvalid(vcpu);
4769
4770 if (!is_guest_mode(vcpu))
4771 vmcs12 = get_vmcs12(vcpu);
4772 else {
4773 /*
 4774 * When vmcs12->vmcs_link_pointer is -1ull, any VMREAD
 4775 * to a shadowed field sets the ALU flags for VMfailInvalid.
4776 */
4777 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
4778 return nested_vmx_failInvalid(vcpu);
4779 vmcs12 = get_shadow_vmcs12(vcpu);
4780 }
4781
4782 /* Decode instruction info and find the field to read */
4783 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
Sean Christopherson1c6f0b42019-05-07 08:36:25 -07004784
4785 offset = vmcs_field_to_offset(field);
4786 if (offset < 0)
Sean Christopherson55d23752018-12-03 13:53:18 -08004787 return nested_vmx_failValid(vcpu,
4788 VMXERR_UNSUPPORTED_VMCS_COMPONENT);
4789
Sean Christopherson7952d762019-05-07 08:36:29 -07004790 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
4791 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4792
Sean Christopherson1c6f0b42019-05-07 08:36:25 -07004793 /* Read the field, zero-extended to a u64 field_value */
4794 field_value = vmcs12_read_any(vmcs12, field, offset);
4795
Sean Christopherson55d23752018-12-03 13:53:18 -08004796 /*
4797 * Now copy part of this value to register or memory, as requested.
4798 * Note that the number of bits actually copied is 32 or 64 depending
4799 * on the guest's mode (32 or 64 bit), not on the given field's length.
4800 */
4801 if (vmx_instruction_info & (1u << 10)) {
4802 kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
4803 field_value);
4804 } else {
Eugene Korenevskyfdb28612019-06-06 00:19:16 +03004805 len = is_64_bit_mode(vcpu) ? 8 : 4;
Sean Christopherson55d23752018-12-03 13:53:18 -08004806 if (get_vmx_mem_address(vcpu, exit_qualification,
Eugene Korenevskyfdb28612019-06-06 00:19:16 +03004807 vmx_instruction_info, true, len, &gva))
Sean Christopherson55d23752018-12-03 13:53:18 -08004808 return 1;
4809 /* _system ok, nested_vmx_check_permission has verified cpl=0 */
Paolo Bonzinif7eea632019-09-14 00:26:27 +02004810 if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e))
4811 kvm_inject_page_fault(vcpu, &e);
Sean Christopherson55d23752018-12-03 13:53:18 -08004812 }
4813
4814 return nested_vmx_succeed(vcpu);
4815}
4816
Sean Christophersone2174292019-05-07 08:36:28 -07004817static bool is_shadow_field_rw(unsigned long field)
4818{
4819 switch (field) {
4820#define SHADOW_FIELD_RW(x, y) case x:
4821#include "vmcs_shadow_fields.h"
4822 return true;
4823 default:
4824 break;
4825 }
4826 return false;
4827}
4828
4829static bool is_shadow_field_ro(unsigned long field)
4830{
4831 switch (field) {
4832#define SHADOW_FIELD_RO(x, y) case x:
4833#include "vmcs_shadow_fields.h"
4834 return true;
4835 default:
4836 break;
4837 }
4838 return false;
4839}
Sean Christopherson55d23752018-12-03 13:53:18 -08004840
4841static int handle_vmwrite(struct kvm_vcpu *vcpu)
4842{
4843 unsigned long field;
Eugene Korenevskyfdb28612019-06-06 00:19:16 +03004844 int len;
Sean Christopherson55d23752018-12-03 13:53:18 -08004845 gva_t gva;
4846 struct vcpu_vmx *vmx = to_vmx(vcpu);
4847 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4848 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4849
4850 /* The value to write might be 32 or 64 bits, depending on L1's long
4851 * mode, and eventually we need to write that into a field of several
4852 * possible lengths. The code below first zero-extends the value to 64
 4853 * bits (field_value), and then copies only the appropriate number of
4854 * bits into the vmcs12 field.
4855 */
4856 u64 field_value = 0;
4857 struct x86_exception e;
4858 struct vmcs12 *vmcs12;
Sean Christopherson1c6f0b42019-05-07 08:36:25 -07004859 short offset;
Sean Christopherson55d23752018-12-03 13:53:18 -08004860
4861 if (!nested_vmx_check_permission(vcpu))
4862 return 1;
4863
4864 if (vmx->nested.current_vmptr == -1ull)
4865 return nested_vmx_failInvalid(vcpu);
4866
4867 if (vmx_instruction_info & (1u << 10))
4868 field_value = kvm_register_readl(vcpu,
4869 (((vmx_instruction_info) >> 3) & 0xf));
4870 else {
Eugene Korenevskyfdb28612019-06-06 00:19:16 +03004871 len = is_64_bit_mode(vcpu) ? 8 : 4;
Sean Christopherson55d23752018-12-03 13:53:18 -08004872 if (get_vmx_mem_address(vcpu, exit_qualification,
Eugene Korenevskyfdb28612019-06-06 00:19:16 +03004873 vmx_instruction_info, false, len, &gva))
Sean Christopherson55d23752018-12-03 13:53:18 -08004874 return 1;
Eugene Korenevskyfdb28612019-06-06 00:19:16 +03004875 if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) {
Sean Christopherson55d23752018-12-03 13:53:18 -08004876 kvm_inject_page_fault(vcpu, &e);
4877 return 1;
4878 }
4879 }
4880
4881
4882 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
4883 /*
4884 * If the vCPU supports "VMWRITE to any supported field in the
4885 * VMCS," then the "read-only" fields are actually read/write.
4886 */
4887 if (vmcs_field_readonly(field) &&
4888 !nested_cpu_has_vmwrite_any_field(vcpu))
4889 return nested_vmx_failValid(vcpu,
4890 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
4891
Sean Christopherson7952d762019-05-07 08:36:29 -07004892 if (!is_guest_mode(vcpu)) {
Sean Christopherson55d23752018-12-03 13:53:18 -08004893 vmcs12 = get_vmcs12(vcpu);
Sean Christopherson7952d762019-05-07 08:36:29 -07004894
4895 /*
4896 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
 4897 * vmcs12, else we may clobber a field or consume a stale value.
4898 */
4899 if (!is_shadow_field_rw(field))
4900 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4901 } else {
Sean Christopherson55d23752018-12-03 13:53:18 -08004902 /*
 4903 * When vmcs12->vmcs_link_pointer is -1ull, any VMWRITE
 4904 * to a shadowed field sets the ALU flags for VMfailInvalid.
4905 */
4906 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
4907 return nested_vmx_failInvalid(vcpu);
4908 vmcs12 = get_shadow_vmcs12(vcpu);
4909 }
4910
Sean Christopherson1c6f0b42019-05-07 08:36:25 -07004911 offset = vmcs_field_to_offset(field);
4912 if (offset < 0)
Sean Christopherson55d23752018-12-03 13:53:18 -08004913 return nested_vmx_failValid(vcpu,
4914 VMXERR_UNSUPPORTED_VMCS_COMPONENT);
4915
4916 /*
Sean Christophersonb6437802019-05-07 08:36:24 -07004917 * Some Intel CPUs intentionally drop the reserved bits of the AR byte
4918 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM
4919 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
4920 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
4921 * from L1 will return a different value than VMREAD from L2 (L1 sees
4922 * the stripped down value, L2 sees the full value as stored by KVM).
Sean Christopherson55d23752018-12-03 13:53:18 -08004923 */
Sean Christophersonb6437802019-05-07 08:36:24 -07004924 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
4925 field_value &= 0x1f0ff;
4926
Sean Christopherson1c6f0b42019-05-07 08:36:25 -07004927 vmcs12_write_any(vmcs12, field, offset, field_value);
Sean Christopherson55d23752018-12-03 13:53:18 -08004928
4929 /*
Sean Christophersone2174292019-05-07 08:36:28 -07004930 * Do not track vmcs12 dirty-state if in guest-mode as we actually
4931 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated
4932 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
4933 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
Sean Christopherson55d23752018-12-03 13:53:18 -08004934 */
Sean Christophersone2174292019-05-07 08:36:28 -07004935 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
4936 /*
4937 * L1 can read these fields without exiting, ensure the
4938 * shadow VMCS is up-to-date.
4939 */
4940 if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
4941 preempt_disable();
4942 vmcs_load(vmx->vmcs01.shadow_vmcs);
Sean Christophersonfadcead2019-05-07 08:36:23 -07004943
Sean Christophersone2174292019-05-07 08:36:28 -07004944 __vmcs_writel(field, field_value);
Sean Christophersonfadcead2019-05-07 08:36:23 -07004945
Sean Christophersone2174292019-05-07 08:36:28 -07004946 vmcs_clear(vmx->vmcs01.shadow_vmcs);
4947 vmcs_load(vmx->loaded_vmcs->vmcs);
4948 preempt_enable();
Sean Christopherson55d23752018-12-03 13:53:18 -08004949 }
Sean Christophersone2174292019-05-07 08:36:28 -07004950 vmx->nested.dirty_vmcs12 = true;
Sean Christopherson55d23752018-12-03 13:53:18 -08004951 }
4952
4953 return nested_vmx_succeed(vcpu);
4954}
4955
4956static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
4957{
4958 vmx->nested.current_vmptr = vmptr;
4959 if (enable_shadow_vmcs) {
Sean Christophersonfe7f895d2019-05-07 12:17:57 -07004960 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
Sean Christopherson55d23752018-12-03 13:53:18 -08004961 vmcs_write64(VMCS_LINK_POINTER,
4962 __pa(vmx->vmcs01.shadow_vmcs));
Sean Christopherson3731905ef2019-05-07 08:36:27 -07004963 vmx->nested.need_vmcs12_to_shadow_sync = true;
Sean Christopherson55d23752018-12-03 13:53:18 -08004964 }
4965 vmx->nested.dirty_vmcs12 = true;
4966}
4967
4968/* Emulate the VMPTRLD instruction */
4969static int handle_vmptrld(struct kvm_vcpu *vcpu)
4970{
4971 struct vcpu_vmx *vmx = to_vmx(vcpu);
4972 gpa_t vmptr;
4973
4974 if (!nested_vmx_check_permission(vcpu))
4975 return 1;
4976
4977 if (nested_vmx_get_vmptr(vcpu, &vmptr))
4978 return 1;
4979
KarimAllah Ahmede0bf2662019-01-31 21:24:43 +01004980 if (!page_address_valid(vcpu, vmptr))
Sean Christopherson55d23752018-12-03 13:53:18 -08004981 return nested_vmx_failValid(vcpu,
4982 VMXERR_VMPTRLD_INVALID_ADDRESS);
4983
4984 if (vmptr == vmx->nested.vmxon_ptr)
4985 return nested_vmx_failValid(vcpu,
4986 VMXERR_VMPTRLD_VMXON_POINTER);
4987
4988 /* Forbid normal VMPTRLD if Enlightened version was used */
4989 if (vmx->nested.hv_evmcs)
4990 return 1;
4991
4992 if (vmx->nested.current_vmptr != vmptr) {
KarimAllah Ahmedb146b832019-01-31 21:24:35 +01004993 struct kvm_host_map map;
Sean Christopherson55d23752018-12-03 13:53:18 -08004994 struct vmcs12 *new_vmcs12;
Sean Christopherson55d23752018-12-03 13:53:18 -08004995
KarimAllah Ahmedb146b832019-01-31 21:24:35 +01004996 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) {
Sean Christopherson55d23752018-12-03 13:53:18 -08004997 /*
4998 * Reads from an unbacked page return all 1s,
4999 * which means that the 32 bits located at the
5000 * given physical address won't match the required
5001 * VMCS12_REVISION identifier.
5002 */
Vitaly Kuznetsov826c1362019-01-09 18:22:56 +01005003 return nested_vmx_failValid(vcpu,
Sean Christopherson55d23752018-12-03 13:53:18 -08005004 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
Sean Christopherson55d23752018-12-03 13:53:18 -08005005 }
KarimAllah Ahmedb146b832019-01-31 21:24:35 +01005006
5007 new_vmcs12 = map.hva;
5008
Sean Christopherson55d23752018-12-03 13:53:18 -08005009 if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
5010 (new_vmcs12->hdr.shadow_vmcs &&
5011 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
KarimAllah Ahmedb146b832019-01-31 21:24:35 +01005012 kvm_vcpu_unmap(vcpu, &map, false);
Sean Christopherson55d23752018-12-03 13:53:18 -08005013 return nested_vmx_failValid(vcpu,
5014 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5015 }
5016
5017 nested_release_vmcs12(vcpu);
5018
5019 /*
5020 * Load VMCS12 from guest memory since it is not already
5021 * cached.
5022 */
5023 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
KarimAllah Ahmedb146b832019-01-31 21:24:35 +01005024 kvm_vcpu_unmap(vcpu, &map, false);
Sean Christopherson55d23752018-12-03 13:53:18 -08005025
5026 set_current_vmptr(vmx, vmptr);
5027 }
5028
5029 return nested_vmx_succeed(vcpu);
5030}
5031
5032/* Emulate the VMPTRST instruction */
5033static int handle_vmptrst(struct kvm_vcpu *vcpu)
5034{
5035 unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
5036 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5037 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
5038 struct x86_exception e;
5039 gva_t gva;
5040
5041 if (!nested_vmx_check_permission(vcpu))
5042 return 1;
5043
5044 if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
5045 return 1;
5046
Eugene Korenevskyfdb28612019-06-06 00:19:16 +03005047 if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
5048 true, sizeof(gpa_t), &gva))
Sean Christopherson55d23752018-12-03 13:53:18 -08005049 return 1;
5050 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
5051 if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
5052 sizeof(gpa_t), &e)) {
5053 kvm_inject_page_fault(vcpu, &e);
5054 return 1;
5055 }
5056 return nested_vmx_succeed(vcpu);
5057}
5058
5059/* Emulate the INVEPT instruction */
5060static int handle_invept(struct kvm_vcpu *vcpu)
5061{
5062 struct vcpu_vmx *vmx = to_vmx(vcpu);
5063 u32 vmx_instruction_info, types;
5064 unsigned long type;
5065 gva_t gva;
5066 struct x86_exception e;
5067 struct {
5068 u64 eptp, gpa;
5069 } operand;
5070
5071 if (!(vmx->nested.msrs.secondary_ctls_high &
5072 SECONDARY_EXEC_ENABLE_EPT) ||
5073 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
5074 kvm_queue_exception(vcpu, UD_VECTOR);
5075 return 1;
5076 }
5077
5078 if (!nested_vmx_check_permission(vcpu))
5079 return 1;
5080
5081 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5082 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
5083
5084 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
5085
5086 if (type >= 32 || !(types & (1 << type)))
5087 return nested_vmx_failValid(vcpu,
5088 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5089
5090 /* According to the Intel VMX instruction reference, the memory
5091 * operand is read even if it isn't needed (e.g., for type==global)
5092 */
5093 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
Eugene Korenevskyfdb28612019-06-06 00:19:16 +03005094 vmx_instruction_info, false, sizeof(operand), &gva))
Sean Christopherson55d23752018-12-03 13:53:18 -08005095 return 1;
5096 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
5097 kvm_inject_page_fault(vcpu, &e);
5098 return 1;
5099 }
5100
5101 switch (type) {
5102 case VMX_EPT_EXTENT_GLOBAL:
Sean Christopherson55d23752018-12-03 13:53:18 -08005103 case VMX_EPT_EXTENT_CONTEXT:
Jim Mattsonb1190192019-06-13 09:16:08 -07005104 /*
5105 * TODO: Sync the necessary shadow EPT roots here, rather than
5106 * at the next emulated VM-entry.
5107 */
Sean Christopherson55d23752018-12-03 13:53:18 -08005108 break;
5109 default:
5110 BUG_ON(1);
5111 break;
5112 }
5113
5114 return nested_vmx_succeed(vcpu);
5115}
5116
5117static int handle_invvpid(struct kvm_vcpu *vcpu)
5118{
5119 struct vcpu_vmx *vmx = to_vmx(vcpu);
5120 u32 vmx_instruction_info;
5121 unsigned long type, types;
5122 gva_t gva;
5123 struct x86_exception e;
5124 struct {
5125 u64 vpid;
5126 u64 gla;
5127 } operand;
5128 u16 vpid02;
5129
5130 if (!(vmx->nested.msrs.secondary_ctls_high &
5131 SECONDARY_EXEC_ENABLE_VPID) ||
5132 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
5133 kvm_queue_exception(vcpu, UD_VECTOR);
5134 return 1;
5135 }
5136
5137 if (!nested_vmx_check_permission(vcpu))
5138 return 1;
5139
5140 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5141 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
5142
5143 types = (vmx->nested.msrs.vpid_caps &
5144 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
5145
5146 if (type >= 32 || !(types & (1 << type)))
5147 return nested_vmx_failValid(vcpu,
5148 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5149
 5150 /* According to the Intel VMX instruction reference, the memory
5151 * operand is read even if it isn't needed (e.g., for type==global)
5152 */
5153 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
Eugene Korenevskyfdb28612019-06-06 00:19:16 +03005154 vmx_instruction_info, false, sizeof(operand), &gva))
Sean Christopherson55d23752018-12-03 13:53:18 -08005155 return 1;
5156 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
5157 kvm_inject_page_fault(vcpu, &e);
5158 return 1;
5159 }
5160 if (operand.vpid >> 16)
5161 return nested_vmx_failValid(vcpu,
5162 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5163
5164 vpid02 = nested_get_vpid02(vcpu);
5165 switch (type) {
5166 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
5167 if (!operand.vpid ||
5168 is_noncanonical_address(operand.gla, vcpu))
5169 return nested_vmx_failValid(vcpu,
5170 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5171 if (cpu_has_vmx_invvpid_individual_addr()) {
5172 __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
5173 vpid02, operand.gla);
5174 } else
5175 __vmx_flush_tlb(vcpu, vpid02, false);
5176 break;
5177 case VMX_VPID_EXTENT_SINGLE_CONTEXT:
5178 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
5179 if (!operand.vpid)
5180 return nested_vmx_failValid(vcpu,
5181 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5182 __vmx_flush_tlb(vcpu, vpid02, false);
5183 break;
5184 case VMX_VPID_EXTENT_ALL_CONTEXT:
5185 __vmx_flush_tlb(vcpu, vpid02, false);
5186 break;
5187 default:
5188 WARN_ON_ONCE(1);
5189 return kvm_skip_emulated_instruction(vcpu);
5190 }
5191
5192 return nested_vmx_succeed(vcpu);
5193}
5194
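/*
 * Loosely: emulate VMFUNC leaf 0 (EPTP switching) on behalf of L2 by reading
 * entry 'index' from the EPTP-list page that L1 configured in vmcs12,
 * validating it, and making it the active vmcs12->ept_pointer (with an MMU
 * reload). A nonzero return makes handle_vmfunc() reflect the failure to L1.
 */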
5195static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
5196 struct vmcs12 *vmcs12)
5197{
Sean Christopherson2b3eaf82019-04-30 10:36:19 -07005198 u32 index = kvm_rcx_read(vcpu);
Sean Christopherson55d23752018-12-03 13:53:18 -08005199 u64 address;
5200 bool accessed_dirty;
5201 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
5202
5203 if (!nested_cpu_has_eptp_switching(vmcs12) ||
5204 !nested_cpu_has_ept(vmcs12))
5205 return 1;
5206
5207 if (index >= VMFUNC_EPTP_ENTRIES)
5208 return 1;
5209
5210
5211 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
5212 &address, index * 8, 8))
5213 return 1;
5214
5215 accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);
5216
5217 /*
5218 * If the (L2) guest does a vmfunc to the currently
5219 * active ept pointer, we don't have to do anything else
5220 */
5221 if (vmcs12->ept_pointer != address) {
5222 if (!valid_ept_address(vcpu, address))
5223 return 1;
5224
5225 kvm_mmu_unload(vcpu);
5226 mmu->ept_ad = accessed_dirty;
5227 mmu->mmu_role.base.ad_disabled = !accessed_dirty;
5228 vmcs12->ept_pointer = address;
5229 /*
 5230 * TODO: Check what the correct approach is if the
 5231 * mmu reload fails. Currently, we just let the next
 5232 * reload potentially fail.
5233 */
5234 kvm_mmu_reload(vcpu);
5235 }
5236
5237 return 0;
5238}
5239
5240static int handle_vmfunc(struct kvm_vcpu *vcpu)
5241{
5242 struct vcpu_vmx *vmx = to_vmx(vcpu);
5243 struct vmcs12 *vmcs12;
Sean Christopherson2b3eaf82019-04-30 10:36:19 -07005244 u32 function = kvm_rax_read(vcpu);
Sean Christopherson55d23752018-12-03 13:53:18 -08005245
5246 /*
5247 * VMFUNC is only supported for nested guests, but we always enable the
5248 * secondary control for simplicity; for non-nested mode, fake that we
5249 * didn't by injecting #UD.
5250 */
5251 if (!is_guest_mode(vcpu)) {
5252 kvm_queue_exception(vcpu, UD_VECTOR);
5253 return 1;
5254 }
5255
5256 vmcs12 = get_vmcs12(vcpu);
5257 if ((vmcs12->vm_function_control & (1 << function)) == 0)
5258 goto fail;
5259
5260 switch (function) {
5261 case 0:
5262 if (nested_vmx_eptp_switching(vcpu, vmcs12))
5263 goto fail;
5264 break;
5265 default:
5266 goto fail;
5267 }
5268 return kvm_skip_emulated_instruction(vcpu);
5269
5270fail:
5271 nested_vmx_vmexit(vcpu, vmx->exit_reason,
5272 vmcs_read32(VM_EXIT_INTR_INFO),
5273 vmcs_readl(EXIT_QUALIFICATION));
5274 return 1;
5275}
5276
5277
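/*
 * Decide, from L1's I/O bitmaps, whether an I/O instruction executed by L2
 * must be reflected to L1: io_bitmap_a covers ports 0x0000-0x7fff and
 * io_bitmap_b covers ports 0x8000-0xffff, one bit per port; a multi-byte
 * access exits if any touched port's bit is set.
 */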
5278static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
5279 struct vmcs12 *vmcs12)
5280{
5281 unsigned long exit_qualification;
5282 gpa_t bitmap, last_bitmap;
5283 unsigned int port;
5284 int size;
5285 u8 b;
5286
5287 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
5288 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
5289
5290 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5291
5292 port = exit_qualification >> 16;
5293 size = (exit_qualification & 7) + 1;
5294
5295 last_bitmap = (gpa_t)-1;
5296 b = -1;
5297
5298 while (size > 0) {
5299 if (port < 0x8000)
5300 bitmap = vmcs12->io_bitmap_a;
5301 else if (port < 0x10000)
5302 bitmap = vmcs12->io_bitmap_b;
5303 else
5304 return true;
5305 bitmap += (port & 0x7fff) / 8;
5306
5307 if (last_bitmap != bitmap)
5308 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
5309 return true;
5310 if (b & (1 << (port & 7)))
5311 return true;
5312
5313 port++;
5314 size--;
5315 last_bitmap = bitmap;
5316 }
5317
5318 return false;
5319}
5320
5321/*
 5322 * Return 1 if we should exit from L2 to L1 to handle an MSR access,
5323 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
5324 * disinterest in the current event (read or write a specific MSR) by using an
5325 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
5326 */
5327static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
5328 struct vmcs12 *vmcs12, u32 exit_reason)
5329{
Sean Christopherson2b3eaf82019-04-30 10:36:19 -07005330 u32 msr_index = kvm_rcx_read(vcpu);
Sean Christopherson55d23752018-12-03 13:53:18 -08005331 gpa_t bitmap;
5332
5333 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
5334 return true;
5335
5336 /*
5337 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
5338 * for the four combinations of read/write and low/high MSR numbers.
5339 * First we need to figure out which of the four to use:
5340 */
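	/*
	 * Rough layout of the 4-KByte MSR bitmap, matching the math below:
	 *   0x000: reads of MSRs 0x00000000-0x00001fff
	 *   0x400: reads of MSRs 0xc0000000-0xc0001fff
	 *   0x800: writes of MSRs 0x00000000-0x00001fff
	 *   0xc00: writes of MSRs 0xc0000000-0xc0001fff
	 */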
5341 bitmap = vmcs12->msr_bitmap;
5342 if (exit_reason == EXIT_REASON_MSR_WRITE)
5343 bitmap += 2048;
5344 if (msr_index >= 0xc0000000) {
5345 msr_index -= 0xc0000000;
5346 bitmap += 1024;
5347 }
5348
5349 /* Then read the msr_index'th bit from this bitmap: */
5350 if (msr_index < 1024*8) {
5351 unsigned char b;
5352 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
5353 return true;
5354 return 1 & (b >> (msr_index & 7));
5355 } else
5356 return true; /* let L1 handle the wrong parameter */
5357}
5358
5359/*
5360 * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
5361 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
5362 * intercept (via guest_host_mask etc.) the current event.
5363 */
5364static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
5365 struct vmcs12 *vmcs12)
5366{
5367 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5368 int cr = exit_qualification & 15;
5369 int reg;
5370 unsigned long val;
5371
5372 switch ((exit_qualification >> 4) & 3) {
5373 case 0: /* mov to cr */
5374 reg = (exit_qualification >> 8) & 15;
5375 val = kvm_register_readl(vcpu, reg);
5376 switch (cr) {
5377 case 0:
5378 if (vmcs12->cr0_guest_host_mask &
5379 (val ^ vmcs12->cr0_read_shadow))
5380 return true;
5381 break;
5382 case 3:
5383 if ((vmcs12->cr3_target_count >= 1 &&
5384 vmcs12->cr3_target_value0 == val) ||
5385 (vmcs12->cr3_target_count >= 2 &&
5386 vmcs12->cr3_target_value1 == val) ||
5387 (vmcs12->cr3_target_count >= 3 &&
5388 vmcs12->cr3_target_value2 == val) ||
5389 (vmcs12->cr3_target_count >= 4 &&
5390 vmcs12->cr3_target_value3 == val))
5391 return false;
5392 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
5393 return true;
5394 break;
5395 case 4:
5396 if (vmcs12->cr4_guest_host_mask &
5397 (vmcs12->cr4_read_shadow ^ val))
5398 return true;
5399 break;
5400 case 8:
5401 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
5402 return true;
5403 break;
5404 }
5405 break;
5406 case 2: /* clts */
5407 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
5408 (vmcs12->cr0_read_shadow & X86_CR0_TS))
5409 return true;
5410 break;
5411 case 1: /* mov from cr */
5412 switch (cr) {
5413 case 3:
5414 if (vmcs12->cpu_based_vm_exec_control &
5415 CPU_BASED_CR3_STORE_EXITING)
5416 return true;
5417 break;
5418 case 8:
5419 if (vmcs12->cpu_based_vm_exec_control &
5420 CPU_BASED_CR8_STORE_EXITING)
5421 return true;
5422 break;
5423 }
5424 break;
5425 case 3: /* lmsw */
5426 /*
5427 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
5428 * cr0. Other attempted changes are ignored, with no exit.
5429 */
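		/*
		 * For reference: lmsw touches only CR0[3:0], i.e. PE, MP, EM
		 * and TS; the 0xe mask below covers MP/EM/TS, and PE is
		 * special-cased since lmsw can set but never clear it.
		 */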
5430 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5431 if (vmcs12->cr0_guest_host_mask & 0xe &
5432 (val ^ vmcs12->cr0_read_shadow))
5433 return true;
5434 if ((vmcs12->cr0_guest_host_mask & 0x1) &&
5435 !(vmcs12->cr0_read_shadow & 0x1) &&
5436 (val & 0x1))
5437 return true;
5438 break;
5439 }
5440 return false;
5441}
5442
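/*
 * Consult the VMREAD/VMWRITE bitmap L1 supplied (one bit per field encoding)
 * to decide whether a VMREAD/VMWRITE executed by L2 must be reflected to L1;
 * field encodings with bit 15 or above set always cause an exit to L1.
 */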
5443static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
5444 struct vmcs12 *vmcs12, gpa_t bitmap)
5445{
5446 u32 vmx_instruction_info;
5447 unsigned long field;
5448 u8 b;
5449
5450 if (!nested_cpu_has_shadow_vmcs(vmcs12))
5451 return true;
5452
5453 /* Decode instruction info and find the field to access */
5454 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5455 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5456
5457 /* Out-of-range fields always cause a VM exit from L2 to L1 */
5458 if (field >> 15)
5459 return true;
5460
5461 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
5462 return true;
5463
5464 return 1 & (b >> (field & 7));
5465}
5466
5467/*
5468 * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
5469 * should handle it ourselves in L0 (and then continue L2). Only call this
5470 * when in is_guest_mode (L2).
5471 */
5472bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
5473{
5474 u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
5475 struct vcpu_vmx *vmx = to_vmx(vcpu);
5476 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5477
5478 if (vmx->nested.nested_run_pending)
5479 return false;
5480
5481 if (unlikely(vmx->fail)) {
Sean Christopherson380e0052019-07-11 08:58:30 -07005482 trace_kvm_nested_vmenter_failed(
5483 "hardware VM-instruction error: ",
5484 vmcs_read32(VM_INSTRUCTION_ERROR));
Sean Christopherson55d23752018-12-03 13:53:18 -08005485 return true;
5486 }
5487
5488 /*
5489 * The host physical addresses of some pages of guest memory
5490 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
5491 * Page). The CPU may write to these pages via their host
5492 * physical address while L2 is running, bypassing any
5493 * address-translation-based dirty tracking (e.g. EPT write
5494 * protection).
5495 *
5496 * Mark them dirty on every exit from L2 to prevent them from
5497 * getting out of sync with dirty tracking.
5498 */
5499 nested_mark_vmcs12_pages_dirty(vcpu);
5500
5501 trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
5502 vmcs_readl(EXIT_QUALIFICATION),
5503 vmx->idt_vectoring_info,
5504 intr_info,
5505 vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
5506 KVM_ISA_VMX);
5507
5508 switch (exit_reason) {
5509 case EXIT_REASON_EXCEPTION_NMI:
5510 if (is_nmi(intr_info))
5511 return false;
5512 else if (is_page_fault(intr_info))
5513 return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
5514 else if (is_debug(intr_info) &&
5515 vcpu->guest_debug &
5516 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
5517 return false;
5518 else if (is_breakpoint(intr_info) &&
5519 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5520 return false;
5521 return vmcs12->exception_bitmap &
5522 (1u << (intr_info & INTR_INFO_VECTOR_MASK));
5523 case EXIT_REASON_EXTERNAL_INTERRUPT:
5524 return false;
5525 case EXIT_REASON_TRIPLE_FAULT:
5526 return true;
5527 case EXIT_REASON_PENDING_INTERRUPT:
5528 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
5529 case EXIT_REASON_NMI_WINDOW:
5530 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
5531 case EXIT_REASON_TASK_SWITCH:
5532 return true;
5533 case EXIT_REASON_CPUID:
5534 return true;
5535 case EXIT_REASON_HLT:
5536 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
5537 case EXIT_REASON_INVD:
5538 return true;
5539 case EXIT_REASON_INVLPG:
5540 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5541 case EXIT_REASON_RDPMC:
5542 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
5543 case EXIT_REASON_RDRAND:
5544 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
5545 case EXIT_REASON_RDSEED:
5546 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
5547 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
5548 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
5549 case EXIT_REASON_VMREAD:
5550 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5551 vmcs12->vmread_bitmap);
5552 case EXIT_REASON_VMWRITE:
5553 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5554 vmcs12->vmwrite_bitmap);
5555 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
5556 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
5557 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
5558 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
5559 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
5560 /*
5561 * VMX instructions trap unconditionally. This allows L1 to
5562 * emulate them for its L2 guest, i.e., allows 3-level nesting!
5563 */
5564 return true;
5565 case EXIT_REASON_CR_ACCESS:
5566 return nested_vmx_exit_handled_cr(vcpu, vmcs12);
5567 case EXIT_REASON_DR_ACCESS:
5568 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
5569 case EXIT_REASON_IO_INSTRUCTION:
5570 return nested_vmx_exit_handled_io(vcpu, vmcs12);
5571 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
5572 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
5573 case EXIT_REASON_MSR_READ:
5574 case EXIT_REASON_MSR_WRITE:
5575 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
5576 case EXIT_REASON_INVALID_STATE:
5577 return true;
5578 case EXIT_REASON_MWAIT_INSTRUCTION:
5579 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
5580 case EXIT_REASON_MONITOR_TRAP_FLAG:
5581 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
5582 case EXIT_REASON_MONITOR_INSTRUCTION:
5583 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
5584 case EXIT_REASON_PAUSE_INSTRUCTION:
5585 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
5586 nested_cpu_has2(vmcs12,
5587 SECONDARY_EXEC_PAUSE_LOOP_EXITING);
5588 case EXIT_REASON_MCE_DURING_VMENTRY:
5589 return false;
5590 case EXIT_REASON_TPR_BELOW_THRESHOLD:
5591 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
5592 case EXIT_REASON_APIC_ACCESS:
5593 case EXIT_REASON_APIC_WRITE:
5594 case EXIT_REASON_EOI_INDUCED:
5595 /*
5596 * The controls for "virtualize APIC accesses," "APIC-
5597 * register virtualization," and "virtual-interrupt
5598 * delivery" only come from vmcs12.
5599 */
5600 return true;
5601 case EXIT_REASON_EPT_VIOLATION:
5602 /*
5603 * L0 always deals with the EPT violation. If nested EPT is
5604 * used, and the nested mmu code discovers that the address is
5605 * missing in the guest EPT table (EPT12), the EPT violation
5606		 * will be injected with nested_ept_inject_page_fault().
5607 */
5608 return false;
5609 case EXIT_REASON_EPT_MISCONFIG:
5610 /*
5611		 * L2 never uses L1's EPT directly, but rather L0's own EPT
5612		 * table (shadow on EPT) or a merged EPT table that L0 built
5613		 * (EPT on EPT). So any problem with the structure of the
5614		 * table is L0's fault.
5615 */
5616 return false;
5617 case EXIT_REASON_INVPCID:
5618 return
5619 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
5620 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5621 case EXIT_REASON_WBINVD:
5622 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
5623 case EXIT_REASON_XSETBV:
5624 return true;
5625 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
5626 /*
5627 * This should never happen, since it is not possible to
5628		 * set XSS to a non-zero value in either L1 or L2. If it
5629		 * were possible, XSS would have to be checked against
5630 * the XSS exit bitmap in vmcs12.
5631 */
5632 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
5633 case EXIT_REASON_PREEMPTION_TIMER:
5634 return false;
5635 case EXIT_REASON_PML_FULL:
5636		/* We emulate PML support for L1. */
5637 return false;
5638 case EXIT_REASON_VMFUNC:
5639 /* VM functions are emulated through L2->L0 vmexits. */
5640 return false;
5641 case EXIT_REASON_ENCLS:
5642 /* SGX is never exposed to L1 */
5643 return false;
Tao Xubf653b72019-07-16 14:55:51 +08005644 case EXIT_REASON_UMWAIT:
5645 case EXIT_REASON_TPAUSE:
5646 return nested_cpu_has2(vmcs12,
5647 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
Sean Christopherson55d23752018-12-03 13:53:18 -08005648 default:
5649 return true;
5650 }
5651}
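
/*
 * For context: the predicate above only decides whether an exit that occurred
 * while running L2 belongs to L1.  The expected call site in L0's exit
 * handler (vmx_handle_exit() in vmx.c) looks roughly like the fragment below,
 * where a "true" return reflects the exit into L1 as a nested VM-exit and a
 * "false" return lets L0 handle the exit on L1's behalf:
 *
 *	if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
 *		return nested_vmx_reflect_vmexit(vcpu, exit_reason);
 */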
5652
5653
5654static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
5655 struct kvm_nested_state __user *user_kvm_nested_state,
5656 u32 user_data_size)
5657{
5658 struct vcpu_vmx *vmx;
5659 struct vmcs12 *vmcs12;
5660 struct kvm_nested_state kvm_state = {
5661 .flags = 0,
Liran Alon6ca00df2019-06-16 15:03:10 +03005662 .format = KVM_STATE_NESTED_FORMAT_VMX,
Sean Christopherson55d23752018-12-03 13:53:18 -08005663 .size = sizeof(kvm_state),
Liran Alon6ca00df2019-06-16 15:03:10 +03005664 .hdr.vmx.vmxon_pa = -1ull,
5665 .hdr.vmx.vmcs12_pa = -1ull,
Sean Christopherson55d23752018-12-03 13:53:18 -08005666 };
Liran Alon6ca00df2019-06-16 15:03:10 +03005667 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
5668 &user_kvm_nested_state->data.vmx[0];
Sean Christopherson55d23752018-12-03 13:53:18 -08005669
5670 if (!vcpu)
Liran Alon6ca00df2019-06-16 15:03:10 +03005671 return kvm_state.size + sizeof(*user_vmx_nested_state);
Sean Christopherson55d23752018-12-03 13:53:18 -08005672
5673 vmx = to_vmx(vcpu);
5674 vmcs12 = get_vmcs12(vcpu);
5675
Sean Christopherson55d23752018-12-03 13:53:18 -08005676 if (nested_vmx_allowed(vcpu) &&
5677 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
Liran Alon6ca00df2019-06-16 15:03:10 +03005678 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
5679 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;
Sean Christopherson55d23752018-12-03 13:53:18 -08005680
5681 if (vmx_has_valid_vmcs12(vcpu)) {
Liran Alon6ca00df2019-06-16 15:03:10 +03005682 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
Sean Christopherson55d23752018-12-03 13:53:18 -08005683
Liran Alon323d73a2019-06-26 16:09:27 +03005684 if (vmx->nested.hv_evmcs)
5685 kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
5686
Sean Christopherson55d23752018-12-03 13:53:18 -08005687 if (is_guest_mode(vcpu) &&
5688 nested_cpu_has_shadow_vmcs(vmcs12) &&
5689 vmcs12->vmcs_link_pointer != -1ull)
Liran Alon6ca00df2019-06-16 15:03:10 +03005690 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
Sean Christopherson55d23752018-12-03 13:53:18 -08005691 }
5692
5693 if (vmx->nested.smm.vmxon)
Liran Alon6ca00df2019-06-16 15:03:10 +03005694 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
Sean Christopherson55d23752018-12-03 13:53:18 -08005695
5696 if (vmx->nested.smm.guest_mode)
Liran Alon6ca00df2019-06-16 15:03:10 +03005697 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
Sean Christopherson55d23752018-12-03 13:53:18 -08005698
5699 if (is_guest_mode(vcpu)) {
5700 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
5701
5702 if (vmx->nested.nested_run_pending)
5703 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
5704 }
5705 }
5706
5707 if (user_data_size < kvm_state.size)
5708 goto out;
5709
5710 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
5711 return -EFAULT;
5712
5713 if (!vmx_has_valid_vmcs12(vcpu))
5714 goto out;
5715
5716 /*
5717 * When running L2, the authoritative vmcs12 state is in the
5718 * vmcs02. When running L1, the authoritative vmcs12 state is
5719 * in the shadow or enlightened vmcs linked to vmcs01, unless
Sean Christopherson3731905ef2019-05-07 08:36:27 -07005720 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
Sean Christopherson55d23752018-12-03 13:53:18 -08005721 * vmcs12 state is in the vmcs12 already.
5722 */
5723 if (is_guest_mode(vcpu)) {
Sean Christopherson3731905ef2019-05-07 08:36:27 -07005724 sync_vmcs02_to_vmcs12(vcpu, vmcs12);
Sean Christopherson7952d762019-05-07 08:36:29 -07005725 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
Sean Christopherson3731905ef2019-05-07 08:36:27 -07005726 } else if (!vmx->nested.need_vmcs12_to_shadow_sync) {
Sean Christopherson55d23752018-12-03 13:53:18 -08005727 if (vmx->nested.hv_evmcs)
5728 copy_enlightened_to_vmcs12(vmx);
5729 else if (enable_shadow_vmcs)
5730 copy_shadow_to_vmcs12(vmx);
5731 }
5732
Liran Alon6ca00df2019-06-16 15:03:10 +03005733 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
5734 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);
5735
Tom Roeder3a33d032019-01-24 13:48:20 -08005736 /*
5737 * Copy over the full allocated size of vmcs12 rather than just the size
5738 * of the struct.
5739 */
Liran Alon6ca00df2019-06-16 15:03:10 +03005740 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
Sean Christopherson55d23752018-12-03 13:53:18 -08005741 return -EFAULT;
5742
5743 if (nested_cpu_has_shadow_vmcs(vmcs12) &&
5744 vmcs12->vmcs_link_pointer != -1ull) {
Liran Alon6ca00df2019-06-16 15:03:10 +03005745 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
Tom Roeder3a33d032019-01-24 13:48:20 -08005746 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
Sean Christopherson55d23752018-12-03 13:53:18 -08005747 return -EFAULT;
5748 }
5749
5750out:
5751 return kvm_state.size;
5752}
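
/*
 * A sketch of how userspace is expected to drive the function above via the
 * KVM_GET_NESTED_STATE ioctl (illustrative only; vcpu_fd is the vCPU file
 * descriptor, and the size-probing dance is mediated by the generic ioctl
 * code in x86.c, which returns -E2BIG and writes back the required size when
 * the supplied buffer is too small):
 *
 *	struct kvm_nested_state probe = { .size = sizeof(probe) };
 *	struct kvm_nested_state *state;
 *
 *	if (ioctl(vcpu_fd, KVM_GET_NESTED_STATE, &probe) < 0 && errno == E2BIG) {
 *		state = calloc(1, probe.size);
 *		state->size = probe.size;
 *		ioctl(vcpu_fd, KVM_GET_NESTED_STATE, state);
 *	}
 */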
5753
5754/*
5755 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
5756 */
5757void vmx_leave_nested(struct kvm_vcpu *vcpu)
5758{
5759 if (is_guest_mode(vcpu)) {
5760 to_vmx(vcpu)->nested.nested_run_pending = 0;
5761 nested_vmx_vmexit(vcpu, -1, 0, 0);
5762 }
5763 free_nested(vcpu);
5764}
5765
5766static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
5767 struct kvm_nested_state __user *user_kvm_nested_state,
5768 struct kvm_nested_state *kvm_state)
5769{
5770 struct vcpu_vmx *vmx = to_vmx(vcpu);
5771 struct vmcs12 *vmcs12;
5772 u32 exit_qual;
Liran Alon6ca00df2019-06-16 15:03:10 +03005773 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
5774 &user_kvm_nested_state->data.vmx[0];
Sean Christopherson55d23752018-12-03 13:53:18 -08005775 int ret;
5776
Liran Alon6ca00df2019-06-16 15:03:10 +03005777 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
Sean Christopherson55d23752018-12-03 13:53:18 -08005778 return -EINVAL;
5779
Liran Alon6ca00df2019-06-16 15:03:10 +03005780 if (kvm_state->hdr.vmx.vmxon_pa == -1ull) {
5781 if (kvm_state->hdr.vmx.smm.flags)
Sean Christopherson55d23752018-12-03 13:53:18 -08005782 return -EINVAL;
5783
Liran Alon6ca00df2019-06-16 15:03:10 +03005784 if (kvm_state->hdr.vmx.vmcs12_pa != -1ull)
Sean Christopherson55d23752018-12-03 13:53:18 -08005785 return -EINVAL;
5786
Liran Alon323d73a2019-06-26 16:09:27 +03005787 /*
5788		 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
5789		 * enable the eVMCS capability on the vCPU. However, the
5790		 * code has since been changed so that the flag signals that
5791		 * vmcs12 should be copied into the eVMCS in guest memory.
5792		 *
5793		 * To preserve backwards compatibility, allow userspace
5794		 * to set this flag even when there is no VMXON region.
5795 */
Paolo Bonzini9fd58872019-06-19 16:52:27 +02005796 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
5797 return -EINVAL;
5798 } else {
5799 if (!nested_vmx_allowed(vcpu))
5800 return -EINVAL;
Sean Christopherson55d23752018-12-03 13:53:18 -08005801
Paolo Bonzini9fd58872019-06-19 16:52:27 +02005802 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
5803 return -EINVAL;
Liran Alon323d73a2019-06-26 16:09:27 +03005804 }
Sean Christopherson55d23752018-12-03 13:53:18 -08005805
Liran Alon6ca00df2019-06-16 15:03:10 +03005806 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
Sean Christopherson55d23752018-12-03 13:53:18 -08005807 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
5808 return -EINVAL;
5809
Liran Alon6ca00df2019-06-16 15:03:10 +03005810 if (kvm_state->hdr.vmx.smm.flags &
Sean Christopherson55d23752018-12-03 13:53:18 -08005811 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
5812 return -EINVAL;
5813
5814 /*
5815 * SMM temporarily disables VMX, so we cannot be in guest mode,
5816 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
5817 * must be zero.
5818 */
Liran Alon65b712f12019-06-25 14:26:42 +03005819 if (is_smm(vcpu) ?
5820 (kvm_state->flags &
5821 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
5822 : kvm_state->hdr.vmx.smm.flags)
Sean Christopherson55d23752018-12-03 13:53:18 -08005823 return -EINVAL;
5824
Liran Alon6ca00df2019-06-16 15:03:10 +03005825 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
5826 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
Sean Christopherson55d23752018-12-03 13:53:18 -08005827 return -EINVAL;
5828
Liran Alon323d73a2019-06-26 16:09:27 +03005829 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
5830 (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
Paolo Bonzini9fd58872019-06-19 16:52:27 +02005831 return -EINVAL;
5832
Liran Alon323d73a2019-06-26 16:09:27 +03005833 vmx_leave_nested(vcpu);
Paolo Bonzini9fd58872019-06-19 16:52:27 +02005834
Liran Alon6ca00df2019-06-16 15:03:10 +03005835 if (kvm_state->hdr.vmx.vmxon_pa == -1ull)
Sean Christopherson55d23752018-12-03 13:53:18 -08005836 return 0;
5837
Liran Alon6ca00df2019-06-16 15:03:10 +03005838 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
Sean Christopherson55d23752018-12-03 13:53:18 -08005839 ret = enter_vmx_operation(vcpu);
5840 if (ret)
5841 return ret;
5842
5843 /* Empty 'VMXON' state is permitted */
Jim Mattsone8ab8d22019-01-17 11:55:58 -08005844 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12))
Sean Christopherson55d23752018-12-03 13:53:18 -08005845 return 0;
5846
Liran Alon6ca00df2019-06-16 15:03:10 +03005847 if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) {
5848 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
5849 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
Sean Christopherson55d23752018-12-03 13:53:18 -08005850 return -EINVAL;
5851
Liran Alon6ca00df2019-06-16 15:03:10 +03005852 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
Sean Christopherson55d23752018-12-03 13:53:18 -08005853 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
5854 /*
5855 * Sync eVMCS upon entry as we may not have
5856 * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
5857 */
Sean Christopherson3731905ef2019-05-07 08:36:27 -07005858 vmx->nested.need_vmcs12_to_shadow_sync = true;
Sean Christopherson55d23752018-12-03 13:53:18 -08005859 } else {
5860 return -EINVAL;
5861 }
5862
Liran Alon6ca00df2019-06-16 15:03:10 +03005863 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
Sean Christopherson55d23752018-12-03 13:53:18 -08005864 vmx->nested.smm.vmxon = true;
5865 vmx->nested.vmxon = false;
5866
Liran Alon6ca00df2019-06-16 15:03:10 +03005867 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
Sean Christopherson55d23752018-12-03 13:53:18 -08005868 vmx->nested.smm.guest_mode = true;
5869 }
5870
5871 vmcs12 = get_vmcs12(vcpu);
Liran Alon6ca00df2019-06-16 15:03:10 +03005872 if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
Sean Christopherson55d23752018-12-03 13:53:18 -08005873 return -EFAULT;
5874
5875 if (vmcs12->hdr.revision_id != VMCS12_REVISION)
5876 return -EINVAL;
5877
5878 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
5879 return 0;
5880
Sean Christopherson21be4ca2019-05-08 11:04:32 -07005881 vmx->nested.nested_run_pending =
5882 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
5883
5884 ret = -EINVAL;
Sean Christopherson55d23752018-12-03 13:53:18 -08005885 if (nested_cpu_has_shadow_vmcs(vmcs12) &&
5886 vmcs12->vmcs_link_pointer != -1ull) {
5887 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
5888
Liran Alon6ca00df2019-06-16 15:03:10 +03005889 if (kvm_state->size <
5890 sizeof(*kvm_state) +
5891 sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
Sean Christopherson21be4ca2019-05-08 11:04:32 -07005892 goto error_guest_mode;
Sean Christopherson55d23752018-12-03 13:53:18 -08005893
5894 if (copy_from_user(shadow_vmcs12,
Liran Alon6ca00df2019-06-16 15:03:10 +03005895 user_vmx_nested_state->shadow_vmcs12,
5896 sizeof(*shadow_vmcs12))) {
Sean Christopherson21be4ca2019-05-08 11:04:32 -07005897 ret = -EFAULT;
5898 goto error_guest_mode;
5899 }
Sean Christopherson55d23752018-12-03 13:53:18 -08005900
5901 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
5902 !shadow_vmcs12->hdr.shadow_vmcs)
Sean Christopherson21be4ca2019-05-08 11:04:32 -07005903 goto error_guest_mode;
Sean Christopherson55d23752018-12-03 13:53:18 -08005904 }
5905
Sean Christopherson5478ba32019-04-11 12:18:06 -07005906 if (nested_vmx_check_controls(vcpu, vmcs12) ||
5907 nested_vmx_check_host_state(vcpu, vmcs12) ||
5908 nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
Sean Christopherson21be4ca2019-05-08 11:04:32 -07005909 goto error_guest_mode;
Sean Christopherson55d23752018-12-03 13:53:18 -08005910
5911 vmx->nested.dirty_vmcs12 = true;
5912 ret = nested_vmx_enter_non_root_mode(vcpu, false);
Sean Christopherson21be4ca2019-05-08 11:04:32 -07005913 if (ret)
5914 goto error_guest_mode;
Sean Christopherson55d23752018-12-03 13:53:18 -08005915
5916 return 0;
Sean Christopherson21be4ca2019-05-08 11:04:32 -07005917
5918error_guest_mode:
5919 vmx->nested.nested_run_pending = 0;
5920 return ret;
Sean Christopherson55d23752018-12-03 13:53:18 -08005921}
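
/*
 * The restore counterpart to the above: userspace feeds the blob produced by
 * KVM_GET_NESTED_STATE back in through KVM_SET_NESTED_STATE, and it must do
 * so only after guest CPUID has been set, since the checks above rely on
 * nested_vmx_allowed().  Illustrative only:
 *
 *	ioctl(vcpu_fd, KVM_SET_NESTED_STATE, state);
 */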
5922
Xiaoyao Li1b842922019-10-20 17:11:01 +08005923void nested_vmx_set_vmcs_shadowing_bitmap(void)
Sean Christopherson55d23752018-12-03 13:53:18 -08005924{
5925 if (enable_shadow_vmcs) {
Sean Christopherson55d23752018-12-03 13:53:18 -08005926 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
Sean Christophersonfadcead2019-05-07 08:36:23 -07005927 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
Sean Christopherson55d23752018-12-03 13:53:18 -08005928 }
5929}
5930
5931/*
5932 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
5933 * returned for the various VMX controls MSRs when nested VMX is enabled.
5934 * The same values should also be used to verify that vmcs12 control fields are
5935 * valid during nested entry from L1 to L2.
5936 * Each of these control msrs has a low and high 32-bit half: A low bit is on
5937 * if the corresponding bit in the (32-bit) control field *must* be on, and a
5938 * bit in the high half is on if the corresponding bit in the control field
5939 * may be on. See also vmx_control_verify().
5940 */
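/*
 * Concretely, a 32-bit control-field value @val is consistent with such an
 * MSR when every must-be-one bit is set and no bit outside the may-be-one
 * mask is set.  A minimal sketch of that check (the in-tree helper is
 * vmx_control_verify(); the name below is only illustrative):
 *
 *	static inline bool example_ctl_valid(u32 val, u32 low, u32 high)
 *	{
 *		return (val & low) == low && (val & high) == val;
 *	}
 */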
5941void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
5942 bool apicv)
5943{
5944 /*
5945 * Note that as a general rule, the high half of the MSRs (bits in
5946 * the control fields which may be 1) should be initialized by the
5947 * intersection of the underlying hardware's MSR (i.e., features which
5948 * can be supported) and the list of features we want to expose -
5949 * because they are known to be properly supported in our code.
5950 * Also, usually, the low half of the MSRs (bits which must be 1) can
5951 * be set to 0, meaning that L1 may turn off any of these bits. The
5952	 * reason is that if one of these bits is necessary for L0, it will
5953	 * already be set in vmcs01, and prepare_vmcs02, which bitwise-ORs the
5954	 * control fields of vmcs01 and vmcs12, keeps it set in vmcs02 - and
5955 * nested_vmx_exit_reflected() will not pass related exits to L1.
5956 * These rules have exceptions below.
5957 */
5958
5959 /* pin-based controls */
5960 rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
5961 msrs->pinbased_ctls_low,
5962 msrs->pinbased_ctls_high);
5963 msrs->pinbased_ctls_low |=
5964 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
5965 msrs->pinbased_ctls_high &=
5966 PIN_BASED_EXT_INTR_MASK |
5967 PIN_BASED_NMI_EXITING |
5968 PIN_BASED_VIRTUAL_NMIS |
5969 (apicv ? PIN_BASED_POSTED_INTR : 0);
5970 msrs->pinbased_ctls_high |=
5971 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
5972 PIN_BASED_VMX_PREEMPTION_TIMER;
5973
5974 /* exit controls */
5975 rdmsr(MSR_IA32_VMX_EXIT_CTLS,
5976 msrs->exit_ctls_low,
5977 msrs->exit_ctls_high);
5978 msrs->exit_ctls_low =
5979 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
5980
5981 msrs->exit_ctls_high &=
5982#ifdef CONFIG_X86_64
5983 VM_EXIT_HOST_ADDR_SPACE_SIZE |
5984#endif
5985 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
5986 msrs->exit_ctls_high |=
5987 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
5988 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
5989 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
5990
5991 /* We support free control of debug control saving. */
5992 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
5993
5994 /* entry controls */
5995 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
5996 msrs->entry_ctls_low,
5997 msrs->entry_ctls_high);
5998 msrs->entry_ctls_low =
5999 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
6000 msrs->entry_ctls_high &=
6001#ifdef CONFIG_X86_64
6002 VM_ENTRY_IA32E_MODE |
6003#endif
6004 VM_ENTRY_LOAD_IA32_PAT;
6005 msrs->entry_ctls_high |=
6006 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
6007
6008 /* We support free control of debug control loading. */
6009 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
6010
6011 /* cpu-based controls */
6012 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
6013 msrs->procbased_ctls_low,
6014 msrs->procbased_ctls_high);
6015 msrs->procbased_ctls_low =
6016 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
6017 msrs->procbased_ctls_high &=
6018 CPU_BASED_VIRTUAL_INTR_PENDING |
6019 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
6020 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
6021 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
6022 CPU_BASED_CR3_STORE_EXITING |
6023#ifdef CONFIG_X86_64
6024 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
6025#endif
6026 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
6027 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
6028 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
6029 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
6030 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
6031 /*
6032 * We can allow some features even when not supported by the
6033 * hardware. For example, L1 can specify an MSR bitmap - and we
6034 * can use it to avoid exits to L1 - even when L0 runs L2
6035 * without MSR bitmaps.
6036 */
6037 msrs->procbased_ctls_high |=
6038 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
6039 CPU_BASED_USE_MSR_BITMAPS;
6040
6041 /* We support free control of CR3 access interception. */
6042 msrs->procbased_ctls_low &=
6043 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
6044
6045 /*
6046 * secondary cpu-based controls. Do not include those that
6047	 * depend on CPUID bits; they are added later by vmx_cpuid_update().
6048 */
Vitaly Kuznetsov6b1971c2019-02-07 11:42:14 +01006049 if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
6050 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
6051 msrs->secondary_ctls_low,
6052 msrs->secondary_ctls_high);
6053
Sean Christopherson55d23752018-12-03 13:53:18 -08006054 msrs->secondary_ctls_low = 0;
6055 msrs->secondary_ctls_high &=
6056 SECONDARY_EXEC_DESC |
Paolo Bonzini6defc592019-07-02 14:39:29 +02006057 SECONDARY_EXEC_RDTSCP |
Sean Christopherson55d23752018-12-03 13:53:18 -08006058 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
Paolo Bonzini6defc592019-07-02 14:39:29 +02006059 SECONDARY_EXEC_WBINVD_EXITING |
Sean Christopherson55d23752018-12-03 13:53:18 -08006060 SECONDARY_EXEC_APIC_REGISTER_VIRT |
6061 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
Paolo Bonzini6defc592019-07-02 14:39:29 +02006062 SECONDARY_EXEC_RDRAND_EXITING |
6063 SECONDARY_EXEC_ENABLE_INVPCID |
6064 SECONDARY_EXEC_RDSEED_EXITING |
6065 SECONDARY_EXEC_XSAVES;
Sean Christopherson55d23752018-12-03 13:53:18 -08006066
6067 /*
6068 * We can emulate "VMCS shadowing," even if the hardware
6069 * doesn't support it.
6070 */
6071 msrs->secondary_ctls_high |=
6072 SECONDARY_EXEC_SHADOW_VMCS;
6073
6074 if (enable_ept) {
6075 /* nested EPT: emulate EPT also to L1 */
6076 msrs->secondary_ctls_high |=
6077 SECONDARY_EXEC_ENABLE_EPT;
6078 msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
6079 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
6080 if (cpu_has_vmx_ept_execute_only())
6081 msrs->ept_caps |=
6082 VMX_EPT_EXECUTE_ONLY_BIT;
6083 msrs->ept_caps &= ept_caps;
6084 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
6085 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
6086 VMX_EPT_1GB_PAGE_BIT;
6087 if (enable_ept_ad_bits) {
6088 msrs->secondary_ctls_high |=
6089 SECONDARY_EXEC_ENABLE_PML;
6090 msrs->ept_caps |= VMX_EPT_AD_BIT;
6091 }
6092 }
6093
6094 if (cpu_has_vmx_vmfunc()) {
6095 msrs->secondary_ctls_high |=
6096 SECONDARY_EXEC_ENABLE_VMFUNC;
6097 /*
6098 * Advertise EPTP switching unconditionally
6099		 * since we emulate it.
6100 */
6101 if (enable_ept)
6102 msrs->vmfunc_controls =
6103 VMX_VMFUNC_EPTP_SWITCHING;
6104 }
6105
6106 /*
6107 * Old versions of KVM use the single-context version without
6108 * checking for support, so declare that it is supported even
6109	 * though it is treated as global context. The alternative,
6110	 * failing the single-context invvpid, would be worse.
6111 */
6112 if (enable_vpid) {
6113 msrs->secondary_ctls_high |=
6114 SECONDARY_EXEC_ENABLE_VPID;
6115 msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
6116 VMX_VPID_EXTENT_SUPPORTED_MASK;
6117 }
6118
6119 if (enable_unrestricted_guest)
6120 msrs->secondary_ctls_high |=
6121 SECONDARY_EXEC_UNRESTRICTED_GUEST;
6122
6123 if (flexpriority_enabled)
6124 msrs->secondary_ctls_high |=
6125 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6126
6127 /* miscellaneous data */
6128 rdmsr(MSR_IA32_VMX_MISC,
6129 msrs->misc_low,
6130 msrs->misc_high);
6131 msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
6132 msrs->misc_low |=
6133 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
6134 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
6135 VMX_MISC_ACTIVITY_HLT;
6136 msrs->misc_high = 0;
6137
6138 /*
6139 * This MSR reports some information about VMX support. We
6140 * should return information about the VMX we emulate for the
6141 * guest, and the VMCS structure we give it - not about the
6142 * VMX support of the underlying hardware.
6143 */
6144 msrs->basic =
6145 VMCS12_REVISION |
6146 VMX_BASIC_TRUE_CTLS |
6147 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
6148 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
6149
6150 if (cpu_has_vmx_basic_inout())
6151 msrs->basic |= VMX_BASIC_INOUT;
6152
6153 /*
6154 * These MSRs specify bits which the guest must keep fixed on
6155 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
6156 * We picked the standard core2 setting.
6157 */
6158#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
6159#define VMXON_CR4_ALWAYSON X86_CR4_VMXE
6160 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
6161 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
6162
6163 /* These MSRs specify bits which the guest must keep fixed off. */
6164 rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
6165 rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
6166
6167 /* highest index: VMX_PREEMPTION_TIMER_VALUE */
6168 msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
6169}
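
/*
 * Worked example for one of the values set above: the low five bits of
 * IA32_VMX_MISC advertise the rate of the VMX-preemption timer relative to
 * the TSC as a power of two, so reporting
 * VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE (5) tells L1 that the emulated
 * timer counts down once every 2^5 = 32 TSC cycles.
 */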
6170
6171void nested_vmx_hardware_unsetup(void)
6172{
6173 int i;
6174
6175 if (enable_shadow_vmcs) {
6176 for (i = 0; i < VMX_BITMAP_NR; i++)
6177 free_page((unsigned long)vmx_bitmap[i]);
6178 }
6179}
6180
6181__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
6182{
6183 int i;
6184
6185 if (!cpu_has_vmx_shadow_vmcs())
6186 enable_shadow_vmcs = 0;
6187 if (enable_shadow_vmcs) {
6188 for (i = 0; i < VMX_BITMAP_NR; i++) {
Ben Gardon41836832019-02-11 11:02:52 -08006189 /*
6190 * The vmx_bitmap is not tied to a VM and so should
6191 * not be charged to a memcg.
6192 */
Sean Christopherson55d23752018-12-03 13:53:18 -08006193 vmx_bitmap[i] = (unsigned long *)
6194 __get_free_page(GFP_KERNEL);
6195 if (!vmx_bitmap[i]) {
6196 nested_vmx_hardware_unsetup();
6197 return -ENOMEM;
6198 }
6199 }
6200
6201 init_vmcs_shadow_fields();
6202 }
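
	/*
	 * Note on the bitmaps set up above: a 1 bit in the VMREAD/VMWRITE
	 * bitmap makes the corresponding VMREAD/VMWRITE from L1 trap to L0,
	 * while a 0 bit lets the CPU satisfy it from the shadow VMCS, so
	 * only the fields KVM is prepared to shadow have their bits cleared.
	 */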
6203
Liran Aloncc877672019-11-18 21:11:21 +02006204 exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear;
6205 exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch;
6206 exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld;
6207 exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst;
6208 exit_handlers[EXIT_REASON_VMREAD] = handle_vmread;
6209 exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
6210 exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite;
6211 exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff;
6212 exit_handlers[EXIT_REASON_VMON] = handle_vmon;
6213 exit_handlers[EXIT_REASON_INVEPT] = handle_invept;
6214 exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
6215 exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;
Sean Christopherson55d23752018-12-03 13:53:18 -08006216
6217 kvm_x86_ops->check_nested_events = vmx_check_nested_events;
6218 kvm_x86_ops->get_nested_state = vmx_get_nested_state;
6219 kvm_x86_ops->set_nested_state = vmx_set_nested_state;
Liran Aloncc877672019-11-18 21:11:21 +02006220 kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages;
Sean Christopherson55d23752018-12-03 13:53:18 -08006221 kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs;
Vitaly Kuznetsove2e871a2018-12-10 18:21:55 +01006222 kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version;
Sean Christopherson55d23752018-12-03 13:53:18 -08006223
6224 return 0;
6225}
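
/*
 * For reference, the expected wiring from vmx.c's hardware_setup() when the
 * "nested" module parameter is enabled looks roughly like this (illustrative
 * only; the exact identifiers live in vmx.c):
 *
 *	if (nested) {
 *		nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
 *					   vmx_capability.ept, enable_apicv);
 *		r = nested_vmx_hardware_setup(vmx_exit_handlers);
 *		if (r)
 *			return r;
 *	}
 */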