// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/kvm_ras.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>

#include "trace.h"

static struct kvm_pgtable *hyp_pgtable;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

static unsigned long hyp_idmap_start;
static unsigned long hyp_idmap_end;
static phys_addr_t hyp_idmap_vector;

static unsigned long io_map_base;

/*
 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
 * CONFIG_LOCKUP_DETECTOR or CONFIG_LOCKDEP. Additionally, holding the lock
 * for too long will also starve other vCPUs. We also have to make sure that
 * the page tables are not freed while we release the lock.
 */
static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
			      phys_addr_t end,
			      int (*fn)(struct kvm_pgtable *, u64, u64),
			      bool resched)
{
	int ret;
	u64 next;

	do {
		struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
		if (!pgt)
			return -EINVAL;

		next = stage2_pgd_addr_end(kvm, addr, end);
		ret = fn(pgt, addr, next - addr);
		if (ret)
			break;

		if (resched && next != end)
			cond_resched_lock(&kvm->mmu_lock);
	} while (addr = next, addr != end);

	return ret;
}

#define stage2_apply_range_resched(kvm, addr, end, fn)			\
	stage2_apply_range(kvm, addr, end, fn, true)

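/*
 * A memslot has dirty logging enabled once userspace has turned on
 * KVM_MEM_LOG_DIRTY_PAGES (which allocates the dirty bitmap) on a slot
 * that is not read-only.
 */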
static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
}

/**
 * kvm_flush_remote_tlbs() - flush all VM TLB entries
 * @kvm:	pointer to kvm structure.
 *
 * Interface to HYP function to flush all VM TLB entries
 */
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
}

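/*
 * Anything without a valid struct page (i.e. not ordinary, kernel-managed
 * RAM) is treated as a device by the stage-2 fault handler and mapped with
 * device attributes.
 */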
static bool kvm_is_device_pfn(unsigned long pfn)
{
	return !pfn_valid(pfn);
}

/*
 * Unmapping vs dcache management:
 *
 * If a guest maps certain memory pages as uncached, all writes will
 * bypass the data cache and go directly to RAM. However, the CPUs
 * can still speculate reads (not writes) and fill cache lines with
 * data.
 *
 * Those cache lines will be *clean* cache lines though, so a
 * clean+invalidate operation is equivalent to an invalidate
 * operation, because no cache lines are marked dirty.
 *
 * Those clean cache lines could be filled prior to an uncached write
 * by the guest, and the cache coherent IO subsystem would therefore
 * end up writing old data to disk.
 *
 * This is why right after unmapping a page/section and invalidating
 * the corresponding TLBs, we flush to make sure the IO subsystem will
 * never hit in the cache.
 *
 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
 * we then fully enforce cacheability of RAM, no matter what the guest
 * does.
 */
/**
 * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 * @mmu:   The KVM stage-2 MMU pointer
 * @start: The intermediate physical base address of the range to unmap
 * @size:  The size of the area to unmap
 * @may_block: Whether or not we are permitted to block
 *
 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 * destroying the VM), otherwise another faulting VCPU may come in and mess
 * with things behind our backs.
 */
static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
				 bool may_block)
{
	struct kvm *kvm = mmu->kvm;
	phys_addr_t end = start + size;

	assert_spin_locked(&kvm->mmu_lock);
	WARN_ON(size & ~PAGE_MASK);
	WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
				   may_block));
}

static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
{
	__unmap_stage2_range(mmu, start, size, true);
}

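/*
 * Walk the stage-2 page tables covering a memslot's IPA range and perform
 * cache maintenance on the memory backing its mappings, rescheduling
 * between page-table ranges when required.
 */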
static void stage2_flush_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;

	stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
}

/**
 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
 * @kvm: The struct kvm pointer
 *
 * Go through the stage 2 page tables and invalidate any cache lines
 * backing memory already mapped to the VM.
 */
static void stage2_flush_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots)
		stage2_flush_memslot(kvm, memslot);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

/**
 * free_hyp_pgds - free Hyp-mode page tables
 */
void free_hyp_pgds(void)
{
	mutex_lock(&kvm_hyp_pgd_mutex);
	if (hyp_pgtable) {
		kvm_pgtable_hyp_destroy(hyp_pgtable);
		kfree(hyp_pgtable);
	}
	mutex_unlock(&kvm_hyp_pgd_mutex);
}

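/*
 * Install a mapping in the shared hyp page table; kvm_hyp_pgd_mutex
 * serialises concurrent updates.
 */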
static int __create_hyp_mappings(unsigned long start, unsigned long size,
				 unsigned long phys, enum kvm_pgtable_prot prot)
{
	int err;

	mutex_lock(&kvm_hyp_pgd_mutex);
	err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
	mutex_unlock(&kvm_hyp_pgd_mutex);

	return err;
}

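/*
 * Convert a kernel virtual address to a physical address, handling both
 * linearly-mapped addresses and vmalloc'ed addresses (the latter are
 * resolved through vmalloc_to_page()).
 */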
static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
{
	if (!is_vmalloc_addr(kaddr)) {
		BUG_ON(!virt_addr_valid(kaddr));
		return __pa(kaddr);
	} else {
		return page_to_phys(vmalloc_to_page(kaddr)) +
		       offset_in_page(kaddr);
	}
}

/**
 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
 * @from:	The virtual kernel start address of the range
 * @to:		The virtual kernel end address of the range (exclusive)
 * @prot:	The protection to be applied to this range
 *
 * The same virtual address as the kernel virtual address is also used
 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 * physical pages.
 */
int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
{
	phys_addr_t phys_addr;
	unsigned long virt_addr;
	unsigned long start = kern_hyp_va((unsigned long)from);
	unsigned long end = kern_hyp_va((unsigned long)to);

	if (is_kernel_in_hyp_mode())
		return 0;

	start = start & PAGE_MASK;
	end = PAGE_ALIGN(end);

	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
		int err;

		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
		err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
					    prot);
		if (err)
			return err;
	}

	return 0;
}

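/*
 * Allocate a range of private hyp VA just below the current io_map_base
 * watermark (which starts at the idmap and grows downwards), map
 * @phys_addr there with @prot, and return the hyp VA through @haddr.
 */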
static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
					unsigned long *haddr,
					enum kvm_pgtable_prot prot)
{
	unsigned long base;
	int ret = 0;

	mutex_lock(&kvm_hyp_pgd_mutex);

	/*
	 * This assumes that we have enough space below the idmap
	 * page to allocate our VAs. If not, the check below will
	 * kick. A potential alternative would be to detect that
	 * overflow and switch to an allocation above the idmap.
	 *
	 * The allocated size is always a multiple of PAGE_SIZE.
	 */
	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
	base = io_map_base - size;

	/*
	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
	 * allocating the new area, as it would indicate we've
	 * overflowed the idmap/IO address range.
	 */
	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
		ret = -ENOMEM;
	else
		io_map_base = base;

	mutex_unlock(&kvm_hyp_pgd_mutex);

	if (ret)
		goto out;

	ret = __create_hyp_mappings(base, size, phys_addr, prot);
	if (ret)
		goto out;

	*haddr = base + offset_in_page(phys_addr);
out:
	return ret;
}

/**
 * create_hyp_io_mappings - Map IO into both kernel and HYP
 * @phys_addr:	The physical start address which gets mapped
 * @size:	Size of the region being mapped
 * @kaddr:	Kernel VA for this mapping
 * @haddr:	HYP VA for this mapping
 */
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
			   void __iomem **kaddr,
			   void __iomem **haddr)
{
	unsigned long addr;
	int ret;

	*kaddr = ioremap(phys_addr, size);
	if (!*kaddr)
		return -ENOMEM;

	if (is_kernel_in_hyp_mode()) {
		*haddr = *kaddr;
		return 0;
	}

	ret = __create_hyp_private_mapping(phys_addr, size,
					   &addr, PAGE_HYP_DEVICE);
	if (ret) {
		iounmap(*kaddr);
		*kaddr = NULL;
		*haddr = NULL;
		return ret;
	}

	*haddr = (void __iomem *)addr;
	return 0;
}

/**
 * create_hyp_exec_mappings - Map an executable range into HYP
 * @phys_addr:	The physical start address which gets mapped
 * @size:	Size of the region being mapped
 * @haddr:	HYP VA for this mapping
 */
int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
			     void **haddr)
{
	unsigned long addr;
	int ret;

	BUG_ON(is_kernel_in_hyp_mode());

	ret = __create_hyp_private_mapping(phys_addr, size,
					   &addr, PAGE_HYP_EXEC);
	if (ret) {
		*haddr = NULL;
		return ret;
	}

	*haddr = (void *)addr;
	return 0;
}

/**
 * kvm_init_stage2_mmu - Initialise a S2 MMU structure
 * @kvm:	The pointer to the KVM structure
 * @mmu:	The pointer to the s2 MMU structure
 *
 * Allocates only the stage-2 HW PGD level table(s).
 * Note we don't need locking here as this is only called when the VM is
 * created, which can only be done once.
 */
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
{
	int cpu, err;
	struct kvm_pgtable *pgt;

	if (mmu->pgt != NULL) {
		kvm_err("kvm_arch already initialized?\n");
		return -EINVAL;
	}

	pgt = kzalloc(sizeof(*pgt), GFP_KERNEL);
	if (!pgt)
		return -ENOMEM;

	err = kvm_pgtable_stage2_init(pgt, kvm);
	if (err)
		goto out_free_pgtable;

	mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
	if (!mmu->last_vcpu_ran) {
		err = -ENOMEM;
		goto out_destroy_pgtable;
	}

	for_each_possible_cpu(cpu)
		*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;

	mmu->kvm = kvm;
	mmu->pgt = pgt;
	mmu->pgd_phys = __pa(pgt->pgd);
	mmu->vmid.vmid_gen = 0;
	return 0;

out_destroy_pgtable:
	kvm_pgtable_stage2_destroy(pgt);
out_free_pgtable:
	kfree(pgt);
	return err;
}

static void stage2_unmap_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	hva_t hva = memslot->userspace_addr;
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = PAGE_SIZE * memslot->npages;
	hva_t reg_end = hva + size;

	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them to find out if we should
	 * unmap any of them.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma = find_vma(current->mm, hva);
		hva_t vm_start, vm_end;

		if (!vma || vma->vm_start >= reg_end)
			break;

		/*
		 * Take the intersection of this VMA with the memory region
		 */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (!(vma->vm_flags & VM_PFNMAP)) {
			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
			unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
		}
		hva = vm_end;
	} while (hva < reg_end);
}

/**
 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
 * @kvm: The struct kvm pointer
 *
 * Go through the memregions and unmap any regular RAM
 * backing memory already mapped to the VM.
 */
void stage2_unmap_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	mmap_read_lock(current->mm);
	spin_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots)
		stage2_unmap_memslot(kvm, memslot);

	spin_unlock(&kvm->mmu_lock);
	mmap_read_unlock(current->mm);
	srcu_read_unlock(&kvm->srcu, idx);
}

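/*
 * Tear down the stage-2 page tables for an MMU. The pointers are cleared
 * under the mmu_lock so that faulting vCPUs see the MMU as gone, while the
 * actual page-table freeing happens outside the lock.
 */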
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
{
	struct kvm *kvm = mmu->kvm;
	struct kvm_pgtable *pgt = NULL;

	spin_lock(&kvm->mmu_lock);
	pgt = mmu->pgt;
	if (pgt) {
		mmu->pgd_phys = 0;
		mmu->pgt = NULL;
		free_percpu(mmu->last_vcpu_ran);
	}
	spin_unlock(&kvm->mmu_lock);

	if (pgt) {
		kvm_pgtable_stage2_destroy(pgt);
		kfree(pgt);
	}
}

/**
 * kvm_phys_addr_ioremap - map a device range to guest IPA
 *
 * @kvm:	The KVM pointer
 * @guest_ipa:	The IPA at which to insert the mapping
 * @pa:		The physical address of the device
 * @size:	The size of the mapping
 * @writable:   Whether or not to create a writable mapping
 */
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
			  phys_addr_t pa, unsigned long size, bool writable)
{
	phys_addr_t addr;
	int ret = 0;
	struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
	struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
				     KVM_PGTABLE_PROT_R |
				     (writable ? KVM_PGTABLE_PROT_W : 0);

	size += offset_in_page(guest_ipa);
	guest_ipa &= PAGE_MASK;

	for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
		ret = kvm_mmu_topup_memory_cache(&cache,
						 kvm_mmu_cache_min_pages(kvm));
		if (ret)
			break;

		spin_lock(&kvm->mmu_lock);
		ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
					     &cache);
		spin_unlock(&kvm->mmu_lock);
		if (ret)
			break;

		pa += PAGE_SIZE;
	}

	kvm_mmu_free_memory_cache(&cache);
	return ret;
}

/**
 * stage2_wp_range() - write protect stage2 memory region range
 * @mmu:	The KVM stage-2 MMU pointer
 * @addr:	Start address of range
 * @end:	End address of range
 */
static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
	struct kvm *kvm = mmu->kvm;
	stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
}

/**
 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
 * @kvm:	The KVM pointer
 * @slot:	The memory slot to write protect
 *
 * Called to start logging dirty pages after memory region
 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
 * all present PUD, PMD and PTEs are write protected in the memory region.
 * Afterwards read of dirty page log can be called.
 *
 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
 * serializing operations for VM memory regions.
 */
void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
	phys_addr_t start, end;

	if (WARN_ON_ONCE(!memslot))
		return;

	start = memslot->base_gfn << PAGE_SHIFT;
	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;

	spin_lock(&kvm->mmu_lock);
	stage2_wp_range(&kvm->arch.mmu, start, end);
	spin_unlock(&kvm->mmu_lock);
	kvm_flush_remote_tlbs(kvm);
}

/**
 * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
 * @kvm:	The KVM pointer
 * @slot:	The memory slot associated with mask
 * @gfn_offset:	The gfn offset in memory slot
 * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
 *		slot to be write protected
 *
 * Walks the bits set in @mask and write protects the associated PTEs. The
 * caller must acquire kvm_mmu_lock.
 */
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		gfn_t gfn_offset, unsigned long mask)
{
	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
	phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;

	stage2_wp_range(&kvm->arch.mmu, start, end);
}

/*
 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
 * dirty pages.
 *
 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
 * enable dirty logging for them.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		gfn_t gfn_offset, unsigned long mask)
{
	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}

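/*
 * Perform the required data-cache maintenance on a range of guest pages
 * before the mapping becomes visible to the guest, so the guest does not
 * observe stale data.
 */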
static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
{
	__clean_dcache_guest_page(pfn, size);
}

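/*
 * Invalidate the instruction cache for a range of guest pages so that
 * stale instructions are not executed once an executable mapping is
 * installed.
 */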
static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
{
	__invalidate_icache_guest_page(pfn, size);
}

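/*
 * Deliver a BUS_MCEERR_AR SIGBUS for the faulting user address so that
 * userspace (the VMM) can handle the memory failure, as it would for a
 * direct access to the poisoned page.
 */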
static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
{
	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
}

static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
					       unsigned long hva,
					       unsigned long map_size)
{
	gpa_t gpa_start;
	hva_t uaddr_start, uaddr_end;
	size_t size;

	/* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
	if (map_size == PAGE_SIZE)
		return true;

	size = memslot->npages * PAGE_SIZE;

	gpa_start = memslot->base_gfn << PAGE_SHIFT;

	uaddr_start = memslot->userspace_addr;
	uaddr_end = uaddr_start + size;

	/*
	 * Pages belonging to memslots that don't have the same alignment
	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
	 *
	 * Consider a layout like the following:
	 *
	 *    memslot->userspace_addr:
	 *    +-----+--------------------+--------------------+---+
	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
	 *    +-----+--------------------+--------------------+---+
	 *
	 *    memslot->base_gfn << PAGE_SHIFT:
	 *      +---+--------------------+--------------------+-----+
	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
	 *      +---+--------------------+--------------------+-----+
	 *
	 * If we create those stage-2 blocks, we'll end up with this incorrect
	 * mapping:
	 *   d -> f
	 *   e -> g
	 *   f -> h
	 */
	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
		return false;

	/*
	 * Next, let's make sure we're not trying to map anything not covered
	 * by the memslot. This means we have to prohibit block size mappings
	 * for the beginning and end of a non-block aligned and non-block sized
	 * memory slot (illustrated by the head and tail parts of the
	 * userspace view above containing pages 'abcde' and 'xyz',
	 * respectively).
	 *
	 * Note that it doesn't matter if we do the check using the
	 * userspace_addr or the base_gfn, as both are equally aligned (per
	 * the check above) and equally sized.
	 */
	return (hva & ~(map_size - 1)) >= uaddr_start &&
	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
}

/*
 * Check if the given hva is backed by a transparent huge page (THP) and
 * whether it can be mapped using block mapping in stage2. If so, adjust
 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
 * supported. This will need to be updated to support other THP sizes.
 *
 * Returns the size of the mapping.
 */
static unsigned long
transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
			    unsigned long hva, kvm_pfn_t *pfnp,
			    phys_addr_t *ipap)
{
	kvm_pfn_t pfn = *pfnp;

	/*
	 * Make sure the adjustment is done only for THP pages. Also make
	 * sure that the HVA and IPA are sufficiently aligned and that the
	 * block map is contained within the memslot.
	 */
	if (kvm_is_transparent_hugepage(pfn) &&
	    fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
		/*
		 * The address we faulted on is backed by a transparent huge
		 * page. However, because we map the compound huge page and
		 * not the individual tail page, we need to transfer the
		 * refcount to the head page. We have to be careful that the
		 * THP doesn't start to split while we are adjusting the
		 * refcounts.
		 *
		 * We are sure this doesn't happen, because mmu_notifier_retry
		 * was successful and we are holding the mmu_lock, so if this
		 * THP is trying to split, it will be blocked in the mmu
		 * notifier before touching any of the pages, specifically
		 * before being able to call __split_huge_page_refcount().
		 *
		 * We can therefore safely transfer the refcount from PG_tail
		 * to PG_head and switch the pfn from a tail page to the head
		 * page accordingly.
		 */
		*ipap &= PMD_MASK;
		kvm_release_pfn_clean(pfn);
		pfn &= ~(PTRS_PER_PMD - 1);
		kvm_get_pfn(pfn);
		*pfnp = pfn;

		return PMD_SIZE;
	}

	/* Use page mapping if we cannot use block mapping. */
	return PAGE_SIZE;
}

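/*
 * Handle a stage-2 fault on memory backed by a memslot: pin the underlying
 * page with gfn_to_pfn_prot(), pick the largest mapping size the memslot
 * and VMA allow, perform the required cache maintenance, and install (or
 * relax the permissions of) the stage-2 mapping under the mmu_lock,
 * bailing out if an MMU notifier invalidated the page in the meantime.
 */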
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
			  struct kvm_memory_slot *memslot, unsigned long hva,
			  unsigned long fault_status)
{
	int ret = 0;
	bool write_fault, writable, force_pte = false;
	bool exec_fault;
	bool device = false;
	unsigned long mmu_seq;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
	struct vm_area_struct *vma;
	short vma_shift;
	gfn_t gfn;
	kvm_pfn_t pfn;
	bool logging_active = memslot_is_logging(memslot);
	unsigned long vma_pagesize;
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
	struct kvm_pgtable *pgt;

	write_fault = kvm_is_write_fault(vcpu);
	exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
	VM_BUG_ON(write_fault && exec_fault);

	if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
		kvm_err("Unexpected L2 read permission error\n");
		return -EFAULT;
	}

	/* Let's check if we will get back a huge page backed by hugetlbfs */
	mmap_read_lock(current->mm);
	vma = find_vma_intersection(current->mm, hva, hva + 1);
	if (unlikely(!vma)) {
		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
		mmap_read_unlock(current->mm);
		return -EFAULT;
	}

	if (is_vm_hugetlb_page(vma))
		vma_shift = huge_page_shift(hstate_vma(vma));
	else
		vma_shift = PAGE_SHIFT;

	if (logging_active ||
	    (vma->vm_flags & VM_PFNMAP)) {
		force_pte = true;
		vma_shift = PAGE_SHIFT;
	}

	switch (vma_shift) {
	case PUD_SHIFT:
		if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
			break;
		fallthrough;
	case CONT_PMD_SHIFT:
		vma_shift = PMD_SHIFT;
		fallthrough;
	case PMD_SHIFT:
		if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
			break;
		fallthrough;
	case CONT_PTE_SHIFT:
		vma_shift = PAGE_SHIFT;
		force_pte = true;
		fallthrough;
	case PAGE_SHIFT:
		break;
	default:
		WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
	}

	vma_pagesize = 1UL << vma_shift;
	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
		fault_ipa &= ~(vma_pagesize - 1);

	gfn = fault_ipa >> PAGE_SHIFT;
	mmap_read_unlock(current->mm);

	/*
	 * Permission faults just need to update the existing leaf entry,
	 * and so normally don't require allocations from the memcache. The
	 * only exception to this is when dirty logging is enabled at runtime
	 * and a write fault needs to collapse a block entry into a table.
	 */
	if (fault_status != FSC_PERM || (logging_active && write_fault)) {
		ret = kvm_mmu_topup_memory_cache(memcache,
						 kvm_mmu_cache_min_pages(kvm));
		if (ret)
			return ret;
	}

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	/*
	 * Ensure the read of mmu_notifier_seq happens before we call
	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
	 * the page we just got a reference to getting unmapped before we have
	 * a chance to grab the mmu_lock, which ensures that if the page gets
	 * unmapped afterwards, the call to kvm_unmap_hva will take it away
	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
	 */
	smp_rmb();

	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
	if (pfn == KVM_PFN_ERR_HWPOISON) {
		kvm_send_hwpoison_signal(hva, vma_shift);
		return 0;
	}
	if (is_error_noslot_pfn(pfn))
		return -EFAULT;

	if (kvm_is_device_pfn(pfn)) {
		device = true;
	} else if (logging_active && !write_fault) {
		/*
		 * Only actually map the page as writable if this was a write
		 * fault.
		 */
		writable = false;
	}

	if (exec_fault && device)
		return -ENOEXEC;

	spin_lock(&kvm->mmu_lock);
	pgt = vcpu->arch.hw_mmu->pgt;
	if (mmu_notifier_retry(kvm, mmu_seq))
		goto out_unlock;

	/*
	 * If we are not forced to use page mapping, check if we are
	 * backed by a THP and thus use block mapping if possible.
	 */
	if (vma_pagesize == PAGE_SIZE && !force_pte)
		vma_pagesize = transparent_hugepage_adjust(memslot, hva,
							   &pfn, &fault_ipa);
	if (writable) {
		prot |= KVM_PGTABLE_PROT_W;
		kvm_set_pfn_dirty(pfn);
		mark_page_dirty(kvm, gfn);
	}

	if (fault_status != FSC_PERM && !device)
		clean_dcache_guest_page(pfn, vma_pagesize);

	if (exec_fault) {
		prot |= KVM_PGTABLE_PROT_X;
		invalidate_icache_guest_page(pfn, vma_pagesize);
	}

	if (device)
		prot |= KVM_PGTABLE_PROT_DEVICE;
	else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
		prot |= KVM_PGTABLE_PROT_X;

	if (fault_status == FSC_PERM && !(logging_active && writable)) {
		ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
	} else {
		ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
					     __pfn_to_phys(pfn), prot,
					     memcache);
	}

out_unlock:
	spin_unlock(&kvm->mmu_lock);
	kvm_set_pfn_accessed(pfn);
	kvm_release_pfn_clean(pfn);
	return ret;
}

/* Resolve the access fault by making the page young again. */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
	pte_t pte;
	kvm_pte_t kpte;
	struct kvm_s2_mmu *mmu;

	trace_kvm_access_fault(fault_ipa);

	spin_lock(&vcpu->kvm->mmu_lock);
	mmu = vcpu->arch.hw_mmu;
	kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
	spin_unlock(&vcpu->kvm->mmu_lock);

	pte = __pte(kpte);
	if (pte_valid(pte))
		kvm_set_pfn_accessed(pte_pfn(pte));
}

/**
 * kvm_handle_guest_abort - handles all 2nd stage aborts
 * @vcpu:	the VCPU pointer
 *
 * Any abort that gets to the host is almost guaranteed to be caused by a
 * missing second stage translation table entry, which can mean either that the
 * guest simply needs more memory and we must allocate an appropriate page, or
 * that the guest tried to access I/O memory, which is emulated by user space.
 * The distinction is based on the IPA causing the fault and whether this
 * memory region has been registered as standard RAM by user space.
 */
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
{
	unsigned long fault_status;
	phys_addr_t fault_ipa;
	struct kvm_memory_slot *memslot;
	unsigned long hva;
	bool is_iabt, write_fault, writable;
	gfn_t gfn;
	int ret, idx;

	fault_status = kvm_vcpu_trap_get_fault_type(vcpu);

	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);

	/* Synchronous External Abort? */
	if (kvm_vcpu_abt_issea(vcpu)) {
		/*
		 * For RAS the host kernel may handle this abort.
		 * There is no need to pass the error into the guest.
		 */
		if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
			kvm_inject_vabt(vcpu);

		return 1;
	}

	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
			      kvm_vcpu_get_hfar(vcpu), fault_ipa);

	/* Check the stage-2 fault is trans. fault or write fault */
	if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
	    fault_status != FSC_ACCESS) {
		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
			kvm_vcpu_trap_get_class(vcpu),
			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
			(unsigned long)kvm_vcpu_get_esr(vcpu));
		return -EFAULT;
	}

	idx = srcu_read_lock(&vcpu->kvm->srcu);

	gfn = fault_ipa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(vcpu->kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
	write_fault = kvm_is_write_fault(vcpu);
	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
		/*
		 * The guest has put either its instructions or its page-tables
		 * somewhere it shouldn't have. Userspace won't be able to do
		 * anything about this (there's no syndrome for a start), so
		 * re-inject the abort back into the guest.
		 */
		if (is_iabt) {
			ret = -ENOEXEC;
			goto out;
		}

		if (kvm_vcpu_dabt_iss1tw(vcpu)) {
			kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
			ret = 1;
			goto out_unlock;
		}

		/*
		 * Check for a cache maintenance operation. Since we
		 * ended-up here, we know it is outside of any memory
		 * slot. But we can't find out if that is for a device,
		 * or if the guest is just being stupid. The only thing
		 * we know for sure is that this range cannot be cached.
		 *
		 * So let's assume that the guest is just being
		 * cautious, and skip the instruction.
		 */
		if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
			kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
			ret = 1;
			goto out_unlock;
		}

		/*
		 * The IPA is reported as [MAX:12], so we need to
		 * complement it with the bottom 12 bits from the
		 * faulting VA. This is always 12 bits, irrespective
		 * of the page size.
		 */
		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
		ret = io_mem_abort(vcpu, fault_ipa);
		goto out_unlock;
	}

	/* Userspace should not be able to register out-of-bounds IPAs */
	VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));

	if (fault_status == FSC_ACCESS) {
		handle_access_fault(vcpu, fault_ipa);
		ret = 1;
		goto out_unlock;
	}

	ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
	if (ret == 0)
		ret = 1;
out:
	if (ret == -ENOEXEC) {
		kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
		ret = 1;
	}
out_unlock:
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	return ret;
}

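/*
 * Apply @handler to every guest-physical range that intersects the host
 * virtual address range [start, end), by walking the memslots and
 * translating the overlapping portion of each slot into a GPA range.
 */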
static int handle_hva_to_gpa(struct kvm *kvm,
			     unsigned long start,
			     unsigned long end,
			     int (*handler)(struct kvm *kvm,
					    gpa_t gpa, u64 size,
					    void *data),
			     void *data)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int ret = 0;

	slots = kvm_memslots(kvm);

	/* we only care about the pages that the guest sees */
	kvm_for_each_memslot(memslot, slots) {
		unsigned long hva_start, hva_end;
		gfn_t gpa;

		hva_start = max(start, memslot->userspace_addr);
		hva_end = min(end, memslot->userspace_addr +
					(memslot->npages << PAGE_SHIFT));
		if (hva_start >= hva_end)
			continue;

		gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
		ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
	}

	return ret;
}

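/*
 * MMU notifier helper: tear down the stage-2 mappings for the given GPA
 * range, only allowing the walker to reschedule when the notifier range
 * is marked as blockable.
 */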
static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
	unsigned flags = *(unsigned *)data;
	bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE;

	__unmap_stage2_range(&kvm->arch.mmu, gpa, size, may_block);
	return 0;
}

int kvm_unmap_hva_range(struct kvm *kvm,
			unsigned long start, unsigned long end, unsigned flags)
{
	if (!kvm->arch.mmu.pgt)
		return 0;

	trace_kvm_unmap_hva_range(start, end);
	handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags);
	return 0;
}

static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
	kvm_pfn_t *pfn = (kvm_pfn_t *)data;

	WARN_ON(size != PAGE_SIZE);

	/*
	 * The MMU notifiers will have unmapped a huge PMD before calling
	 * ->change_pte() (which in turn calls kvm_set_spte_hva()) and
	 * therefore we never need to clear out a huge PMD through this
	 * calling path and a memcache is not required.
	 */
	kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE,
			       __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL);
	return 0;
}

int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	unsigned long end = hva + PAGE_SIZE;
	kvm_pfn_t pfn = pte_pfn(pte);

	if (!kvm->arch.mmu.pgt)
		return 0;

	trace_kvm_set_spte_hva(hva);

	/*
	 * We've moved a page around, probably through CoW, so let's treat it
	 * just like a translation fault and clean the cache to the PoC.
	 */
	clean_dcache_guest_page(pfn, PAGE_SIZE);
	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn);
	return 0;
}

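/*
 * Age a single stage-2 mapping: clear the access flag on the leaf entry
 * (if any) and report whether it was previously young, so that the core
 * MMU notifier code can track page activity.
 */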
static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
	pte_t pte;
	kvm_pte_t kpte;

	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
	kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa);
	pte = __pte(kpte);
	return pte_valid(pte) && pte_young(pte);
}

static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
	return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa);
}

int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
{
	if (!kvm->arch.mmu.pgt)
		return 0;
	trace_kvm_age_hva(start, end);
	return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
}

int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	if (!kvm->arch.mmu.pgt)
		return 0;
	trace_kvm_test_age_hva(hva);
	return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
				 kvm_test_age_hva_handler, NULL);
}

phys_addr_t kvm_mmu_get_httbr(void)
{
	return __pa(hyp_pgtable->pgd);
}

phys_addr_t kvm_get_idmap_vector(void)
{
	return hyp_idmap_vector;
}

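/*
 * Identity-map the HYP init text into the hyp page table with executable
 * permissions; this code must run from an identity mapping while the EL2
 * MMU is being enabled.
 */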
Will Deacon0f9d09b2020-09-11 14:25:12 +01001186static int kvm_map_idmap_text(void)
Marc Zyngier0535a3e2016-06-30 18:40:43 +01001187{
Will Deacon0f9d09b2020-09-11 14:25:12 +01001188 unsigned long size = hyp_idmap_end - hyp_idmap_start;
1189 int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
1190 PAGE_HYP_EXEC);
Marc Zyngier0535a3e2016-06-30 18:40:43 +01001191 if (err)
1192 kvm_err("Failed to idmap %lx-%lx\n",
1193 hyp_idmap_start, hyp_idmap_end);
1194
1195 return err;
1196}
1197
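/**
 * kvm_mmu_init - set up the hypervisor (EL2) stage-1 page tables
 *
 * Compute the idmap text extents, allocate and initialise the hyp
 * page-table structure, and map the idmap text executable at EL2 so the
 * CPU can pass through the identity-mapped trampoline when the EL2 MMU
 * is turned on.
 */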
int kvm_mmu_init(void)
{
	int err;
	u32 hyp_va_bits;

	hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
	hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
	hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);

	/*
	 * We rely on the linker script to ensure at build time that the HYP
	 * init code does not cross a page boundary.
	 */
	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
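	/*
	 * The check above relies on XOR: start ^ (end - 1) only has bits set
	 * where the two addresses differ, so masking with PAGE_MASK yields
	 * zero exactly when the first and last byte of the idmap text share
	 * a page. With hypothetical 4K-page values, 0x40021000 ^ 0x400217ff
	 * is 0x7ff, which PAGE_MASK clears; 0x40021800 ^ 0x400220ff is
	 * 0x38ff, whose upper bits survive PAGE_MASK and would trigger the
	 * BUG.
	 */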
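	/*
	 * TCR_EL1.T0SZ encodes the TTBR0 region as 2^(64 - T0SZ) bytes, so
	 * the idmap's T0SZ value directly gives the EL2 VA width below:
	 * e.g. T0SZ == 16 means 48-bit virtual addresses.
	 */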
	hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
	kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
	kvm_debug("HYP VA range: %lx:%lx\n",
		  kern_hyp_va(PAGE_OFFSET),
		  kern_hyp_va((unsigned long)high_memory - 1));

	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
	    hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
		/*
		 * The idmap page intersects with the VA space; it is not
		 * safe to continue further.
		 */
		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
		err = -EINVAL;
		goto out;
	}

	hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
	if (!hyp_pgtable) {
		kvm_err("Hyp mode page-table not allocated\n");
		err = -ENOMEM;
		goto out;
	}

	err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits);
	if (err)
		goto out_free_pgtable;

	err = kvm_map_idmap_text();
	if (err)
		goto out_destroy_pgtable;

	io_map_base = hyp_idmap_start;
	return 0;

out_destroy_pgtable:
	kvm_pgtable_hyp_destroy(hyp_pgtable);
out_free_pgtable:
	kfree(hyp_pgtable);
	hyp_pgtable = NULL;
out:
	return err;
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
				   const struct kvm_userspace_memory_region *mem,
				   struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	/*
	 * At this point the memslot has been committed and its dirty_bitmap[]
	 * has been allocated; dirty pages will be tracked while the memory
	 * slot is write-protected.
	 */
	if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
		/*
		 * If we're using the initial-all-set mode, we don't need to
		 * write-protect any pages up front because they're all
		 * reported as dirty. Huge pages and normal pages will be
		 * write-protected gradually.
		 */
		if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
			kvm_mmu_wp_memory_region(kvm, mem->slot);
	}
}
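
/*
 * For reference, a hedged userspace sketch (not part of this file) of how
 * the dirty-logging path above is typically exercised; the fd and buffer
 * names are made up for illustration:
 *
 *	struct kvm_userspace_memory_region region = {
 *		.slot            = 0,
 *		.flags           = KVM_MEM_LOG_DIRTY_PAGES,
 *		.guest_phys_addr = 0x80000000,
 *		.memory_size     = mem_size,
 *		.userspace_addr  = (unsigned long)host_buf,
 *	};
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
 *
 * Turning KVM_MEM_LOG_DIRTY_PAGES on for an already-registered slot is a
 * KVM_MR_FLAGS_ONLY change, which is what makes the commit path above
 * write-protect the region (unless the manual-protect/initial-all-set
 * mode is enabled).
 */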
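/*
 * Validate a new, moved or reflagged memslot and eagerly map any VM_PFNMAP
 * (device) VMAs it covers into stage-2. Normal memory backing the slot is
 * not mapped here; it is faulted in lazily when the guest first touches it.
 */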
int kvm_arch_prepare_memory_region(struct kvm *kvm,
				   struct kvm_memory_slot *memslot,
				   const struct kvm_userspace_memory_region *mem,
				   enum kvm_mr_change change)
{
	hva_t hva = mem->userspace_addr;
	hva_t reg_end = hva + mem->memory_size;
	bool writable = !(mem->flags & KVM_MEM_READONLY);
	int ret = 0;

	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
	    change != KVM_MR_FLAGS_ONLY)
		return 0;

	/*
	 * Prevent userspace from creating a memory region outside of the IPA
	 * space addressable by the KVM guest.
	 */
	if (memslot->base_gfn + memslot->npages >=
	    (kvm_phys_size(kvm) >> PAGE_SHIFT))
		return -EFAULT;

	mmap_read_lock(current->mm);
	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them to find out if we can map
	 * any of them right now.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma = find_vma(current->mm, hva);
		hva_t vm_start, vm_end;

		if (!vma || vma->vm_start >= reg_end)
			break;

		/*
		 * Take the intersection of this VMA with the memory region.
		 */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (vma->vm_flags & VM_PFNMAP) {
			gpa_t gpa = mem->guest_phys_addr +
				    (vm_start - mem->userspace_addr);
			phys_addr_t pa;

			pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
			pa += vm_start - vma->vm_start;

			/* IO region dirty page logging not allowed */
			if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
				ret = -EINVAL;
				goto out;
			}

			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
						    vm_end - vm_start,
						    writable);
			if (ret)
				break;
		}
		hva = vm_end;
	} while (hva < reg_end);

	if (change == KVM_MR_FLAGS_ONLY)
		goto out;

	spin_lock(&kvm->mmu_lock);
	if (ret)
		unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size);
	else if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
		stage2_flush_memslot(kvm, memslot);
	spin_unlock(&kvm->mmu_lock);
out:
	mmap_read_unlock(current->mm);
	return ret;
}
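
/*
 * No arm64-specific per-memslot metadata to free or refresh, so the two
 * arch hooks below are intentionally empty.
 */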
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
}

void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}

void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
	kvm_free_stage2_pgd(&kvm->arch.mmu);
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = slot->npages << PAGE_SHIFT;

	spin_lock(&kvm->mmu_lock);
	unmap_stage2_range(&kvm->arch.mmu, gpa, size);
	spin_unlock(&kvm->mmu_lock);
}

/*
 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
 *
 * Main problems:
 * - S/W ops are local to a CPU (not broadcast)
 * - We have line migration behind our back (speculation)
 * - System caches don't support S/W at all (damn!)
 *
 * In the face of the above, the best we can do is to try and convert
 * S/W ops to VA ops. Because the guest is not allowed to infer the
 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
 * which is a rather good thing for us.
 *
 * Also, it is only used when turning caches on/off ("The expected
 * usage of the cache maintenance instructions that operate by set/way
 * is associated with the powerdown and powerup of caches, if this is
 * required by the implementation.").
 *
 * We use the following policy:
 *
 * - If we trap an S/W operation, we enable VM trapping to detect
 *   caches being turned on/off, and do a full clean.
 *
 * - We flush the caches both when the caches are turned on and when
 *   they are turned off.
 *
 * - Once the caches are enabled, we stop trapping VM ops.
 */
void kvm_set_way_flush(struct kvm_vcpu *vcpu)
{
	unsigned long hcr = *vcpu_hcr(vcpu);

	/*
	 * If this is the first time we see an S/W operation
	 * (i.e. HCR_TVM is not set), flush the whole of guest memory and
	 * enable VM trapping.
	 *
	 * Otherwise, rely on the VM trapping to wait for the MMU +
	 * caches to be turned off. At that point, we'll be able to
	 * clean the caches again.
	 */
	if (!(hcr & HCR_TVM)) {
		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
					vcpu_has_cache_enabled(vcpu));
		stage2_flush_vm(vcpu->kvm);
		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
	}
}

void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
{
	bool now_enabled = vcpu_has_cache_enabled(vcpu);

	/*
	 * If switching the MMU+caches on, we need to invalidate the caches.
	 * If switching them off, we need to clean the caches.
	 * Clean + invalidate always does the trick.
	 */
	if (now_enabled != was_enabled)
		stage2_flush_vm(vcpu->kvm);

	/* Caches are now on, stop trapping VM ops (until the next S/W op) */
	if (now_enabled)
		*vcpu_hcr(vcpu) &= ~HCR_TVM;

	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
}
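
/*
 * Putting the two helpers above together, the expected sequence for a
 * guest that uses S/W maintenance around a cache/MMU toggle is roughly
 * (a sketch of the intended flow, not an additional code path):
 *
 *   1. The guest issues a set/way op (e.g. DC CISW); the trap handler
 *      calls kvm_set_way_flush(), which flushes guest memory and sets
 *      HCR_TVM so that VM control register writes now trap.
 *   2. The guest then writes SCTLR_EL1 to turn its caches/MMU off or on;
 *      the trapped write calls kvm_toggle_cache(), which flushes again if
 *      the cacheability state changed.
 *   3. Once the caches are back on, HCR_TVM is cleared and the guest runs
 *      without further VM register trapping until the next S/W op.
 */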