Blame - arch/x86/kvm/mmu/spte.c - SHIFTPHONES/kernel/common

blob: d9c5665a55e97d5e1426be6140fa69c559f9496a [file] [log] [blame]

Paolo Bonzini	5a9624a	2020-10-16 10:29:37 -0400	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0-only
				2	/*
				3	* Kernel-based Virtual Machine driver for Linux
				4	*
				5	* Macros and functions to access KVM PTEs (also known as SPTEs)
				6	*
				7	* Copyright (C) 2006 Qumranet, Inc.
				8	* Copyright 2020 Red Hat, Inc. and/or its affiliates.
				9	*/
				10
				11
				12	#include <linux/kvm_host.h>
				13	#include "mmu.h"
				14	#include "mmu_internal.h"
				15	#include "x86.h"
				16	#include "spte.h"
				17
				18	#include <asm/e820/api.h>
				19
				20	u64 __read_mostly shadow_nx_mask;
				21	u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
				22	u64 __read_mostly shadow_user_mask;
				23	u64 __read_mostly shadow_accessed_mask;
				24	u64 __read_mostly shadow_dirty_mask;
				25	u64 __read_mostly shadow_mmio_value;
				26	u64 __read_mostly shadow_mmio_access_mask;
				27	u64 __read_mostly shadow_present_mask;
				28	u64 __read_mostly shadow_me_mask;
				29	u64 __read_mostly shadow_acc_track_mask;
				30
				31	u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
				32	u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
				33
				34	u8 __read_mostly shadow_phys_bits;
				35
				36	static u64 generation_mmio_spte_mask(u64 gen)
				37	{
				38	u64 mask;
				39
				40	WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
				41	BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK \| MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
				42
				43	mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
				44	mask \|= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
				45	return mask;
				46	}
				47
				48	u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
				49	{
				50	u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
				51	u64 mask = generation_mmio_spte_mask(gen);
				52	u64 gpa = gfn << PAGE_SHIFT;
				53
				54	access &= shadow_mmio_access_mask;
				55	mask \|= shadow_mmio_value \| access;
				56	mask \|= gpa \| shadow_nonpresent_or_rsvd_mask;
				57	mask \|= (gpa & shadow_nonpresent_or_rsvd_mask)
				58	<< shadow_nonpresent_or_rsvd_mask_len;
				59
				60	return mask;
				61	}
				62
				63	static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
				64	{
				65	if (pfn_valid(pfn))
				66	return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
				67	/*
				68	* Some reserved pages, such as those from NVDIMM
				69	* DAX devices, are not for MMIO, and can be mapped
				70	* with cached memory type for better performance.
				71	* However, the above check misconceives those pages
				72	* as MMIO, and results in KVM mapping them with UC
				73	* memory type, which would hurt the performance.
				74	* Therefore, we check the host memory type in addition
				75	* and only treat UC/UC-/WC pages as MMIO.
				76	*/
				77	(!pat_enabled() \|\| pat_pfn_immune_to_uc_mtrr(pfn));
				78
				79	return !e820__mapped_raw_any(pfn_to_hpa(pfn),
				80	pfn_to_hpa(pfn + 1) - 1,
				81	E820_TYPE_RAM);
				82	}
				83
				84	int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
				85	gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative,
				86	bool can_unsync, bool host_writable, bool ad_disabled,
				87	u64 *new_spte)
				88	{
				89	u64 spte = 0;
				90	int ret = 0;
				91
				92	if (ad_disabled)
				93	spte \|= SPTE_AD_DISABLED_MASK;
				94	else if (kvm_vcpu_ad_need_write_protect(vcpu))
				95	spte \|= SPTE_AD_WRPROT_ONLY_MASK;
				96
				97	/*
				98	* For the EPT case, shadow_present_mask is 0 if hardware
				99	* supports exec-only page table entries. In that case,
				100	* ACC_USER_MASK and shadow_user_mask are used to represent
				101	* read access. See FNAME(gpte_access) in paging_tmpl.h.
				102	*/
				103	spte \|= shadow_present_mask;
				104	if (!speculative)
				105	spte \|= spte_shadow_accessed_mask(spte);
				106
				107	if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
				108	is_nx_huge_page_enabled()) {
				109	pte_access &= ~ACC_EXEC_MASK;
				110	}
				111
				112	if (pte_access & ACC_EXEC_MASK)
				113	spte \|= shadow_x_mask;
				114	else
				115	spte \|= shadow_nx_mask;
				116
				117	if (pte_access & ACC_USER_MASK)
				118	spte \|= shadow_user_mask;
				119
				120	if (level > PG_LEVEL_4K)
				121	spte \|= PT_PAGE_SIZE_MASK;
				122	if (tdp_enabled)
				123	spte \|= kvm_x86_ops.get_mt_mask(vcpu, gfn,
				124	kvm_is_mmio_pfn(pfn));
				125
				126	if (host_writable)
				127	spte \|= SPTE_HOST_WRITEABLE;
				128	else
				129	pte_access &= ~ACC_WRITE_MASK;
				130
				131	if (!kvm_is_mmio_pfn(pfn))
				132	spte \|= shadow_me_mask;
				133
				134	spte \|= (u64)pfn << PAGE_SHIFT;
				135
				136	if (pte_access & ACC_WRITE_MASK) {
				137	spte \|= PT_WRITABLE_MASK \| SPTE_MMU_WRITEABLE;
				138
				139	/*
				140	* Optimization: for pte sync, if spte was writable the hash
				141	* lookup is unnecessary (and expensive). Write protection
				142	* is responsibility of mmu_get_page / kvm_sync_page.
				143	* Same reasoning can be applied to dirty page accounting.
				144	*/
				145	if (!can_unsync && is_writable_pte(old_spte))
				146	goto out;
				147
				148	if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
				149	pgprintk("%s: found shadow page for %llx, marking ro\n",
				150	__func__, gfn);
				151	ret \|= SET_SPTE_WRITE_PROTECTED_PT;
				152	pte_access &= ~ACC_WRITE_MASK;
				153	spte &= ~(PT_WRITABLE_MASK \| SPTE_MMU_WRITEABLE);
				154	}
				155	}
				156
				157	if (pte_access & ACC_WRITE_MASK)
				158	spte \|= spte_shadow_dirty_mask(spte);
				159
				160	if (speculative)
				161	spte = mark_spte_for_access_track(spte);
				162
				163	out:
				164	*new_spte = spte;
				165	return ret;
				166	}
				167
				168	u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
				169	{
				170	u64 spte;
				171
				172	spte = __pa(child_pt) \| shadow_present_mask \| PT_WRITABLE_MASK \|
				173	shadow_user_mask \| shadow_x_mask \| shadow_me_mask;
				174
				175	if (ad_disabled)
				176	spte \|= SPTE_AD_DISABLED_MASK;
				177	else
				178	spte \|= shadow_accessed_mask;
				179
				180	return spte;
				181	}
				182
				183	u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
				184	{
				185	u64 new_spte;
				186
				187	new_spte = old_spte & ~PT64_BASE_ADDR_MASK;
				188	new_spte \|= (u64)new_pfn << PAGE_SHIFT;
				189
				190	new_spte &= ~PT_WRITABLE_MASK;
				191	new_spte &= ~SPTE_HOST_WRITEABLE;
				192
				193	new_spte = mark_spte_for_access_track(new_spte);
				194
				195	return new_spte;
				196	}
				197
				198	static u8 kvm_get_shadow_phys_bits(void)
				199	{
				200	/*
				201	* boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
				202	* in CPU detection code, but the processor treats those reduced bits as
				203	* 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
				204	* the physical address bits reported by CPUID.
				205	*/
				206	if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
				207	return cpuid_eax(0x80000008) & 0xff;
				208
				209	/*
				210	* Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
				211	* custom CPUID. Proceed with whatever the kernel found since these features
				212	* aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
				213	*/
				214	return boot_cpu_data.x86_phys_bits;
				215	}
				216
				217	u64 mark_spte_for_access_track(u64 spte)
				218	{
				219	if (spte_ad_enabled(spte))
				220	return spte & ~shadow_accessed_mask;
				221
				222	if (is_access_track_spte(spte))
				223	return spte;
				224
				225	/*
				226	* Making an Access Tracking PTE will result in removal of write access
				227	* from the PTE. So, verify that we will be able to restore the write
				228	* access in the fast page fault path later on.
				229	*/
				230	WARN_ONCE((spte & PT_WRITABLE_MASK) &&
				231	!spte_can_locklessly_be_made_writable(spte),
				232	"kvm: Writable SPTE is not locklessly dirty-trackable\n");
				233
				234	WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
				235	shadow_acc_track_saved_bits_shift),
				236	"kvm: Access Tracking saved bit locations are not zero\n");
				237
				238	spte \|= (spte & shadow_acc_track_saved_bits_mask) <<
				239	shadow_acc_track_saved_bits_shift;
				240	spte &= ~shadow_acc_track_mask;
				241
				242	return spte;
				243	}
				244
				245	void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
				246	{
				247	BUG_ON((u64)(unsigned)access_mask != access_mask);
				248	WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len));
				249	WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
				250	shadow_mmio_value = mmio_value \| SPTE_MMIO_MASK;
				251	shadow_mmio_access_mask = access_mask;
				252	}
				253	EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
				254
				255	/*
				256	* Sets the shadow PTE masks used by the MMU.
				257	*
				258	* Assumptions:
				259	* - Setting either @accessed_mask or @dirty_mask requires setting both
				260	* - At least one of @accessed_mask or @acc_track_mask must be set
				261	*/
				262	void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
				263	u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
				264	u64 acc_track_mask, u64 me_mask)
				265	{
				266	BUG_ON(!dirty_mask != !accessed_mask);
				267	BUG_ON(!accessed_mask && !acc_track_mask);
				268	BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);
				269
				270	shadow_user_mask = user_mask;
				271	shadow_accessed_mask = accessed_mask;
				272	shadow_dirty_mask = dirty_mask;
				273	shadow_nx_mask = nx_mask;
				274	shadow_x_mask = x_mask;
				275	shadow_present_mask = p_mask;
				276	shadow_acc_track_mask = acc_track_mask;
				277	shadow_me_mask = me_mask;
				278	}
				279	EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
				280
				281	void kvm_mmu_reset_all_pte_masks(void)
				282	{
				283	u8 low_phys_bits;
				284
				285	shadow_user_mask = 0;
				286	shadow_accessed_mask = 0;
				287	shadow_dirty_mask = 0;
				288	shadow_nx_mask = 0;
				289	shadow_x_mask = 0;
				290	shadow_present_mask = 0;
				291	shadow_acc_track_mask = 0;
				292
				293	shadow_phys_bits = kvm_get_shadow_phys_bits();
				294
				295	/*
				296	* If the CPU has 46 or less physical address bits, then set an
				297	* appropriate mask to guard against L1TF attacks. Otherwise, it is
				298	* assumed that the CPU is not vulnerable to L1TF.
				299	*
				300	* Some Intel CPUs address the L1 cache using more PA bits than are
				301	* reported by CPUID. Use the PA width of the L1 cache when possible
				302	* to achieve more effective mitigation, e.g. if system RAM overlaps
				303	* the most significant bits of legal physical address space.
				304	*/
				305	shadow_nonpresent_or_rsvd_mask = 0;
				306	low_phys_bits = boot_cpu_data.x86_phys_bits;
				307	if (boot_cpu_has_bug(X86_BUG_L1TF) &&
				308	!WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
				309	52 - shadow_nonpresent_or_rsvd_mask_len)) {
				310	low_phys_bits = boot_cpu_data.x86_cache_bits
				311	- shadow_nonpresent_or_rsvd_mask_len;
				312	shadow_nonpresent_or_rsvd_mask =
				313	rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
				314	}
				315
				316	shadow_nonpresent_or_rsvd_lower_gfn_mask =
				317	GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
				318	}