blob: 77cb2d28f2a43a96dd0114ef7d10f2fa60a48b96 [file] [log] [blame]
Thomas Gleixnerd94d71c2019-05-29 07:12:40 -07001// SPDX-License-Identifier: GPL-2.0-only
Christoffer Dall749cf76c2013-01-20 18:28:06 -05002/*
3 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
4 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
Christoffer Dall749cf76c2013-01-20 18:28:06 -05005 */
Christoffer Dall342cd0a2013-01-20 18:28:06 -05006
7#include <linux/mman.h>
8#include <linux/kvm_host.h>
9#include <linux/io.h>
Christoffer Dallad361f02012-11-01 17:14:45 +010010#include <linux/hugetlb.h>
James Morse196f8782017-06-20 17:11:48 +010011#include <linux/sched/signal.h>
Christoffer Dall45e96ea2013-01-20 18:43:58 -050012#include <trace/events/kvm.h>
Christoffer Dall342cd0a2013-01-20 18:28:06 -050013#include <asm/pgalloc.h>
Christoffer Dall94f8e642013-01-20 18:28:12 -050014#include <asm/cacheflush.h>
Christoffer Dall342cd0a2013-01-20 18:28:06 -050015#include <asm/kvm_arm.h>
16#include <asm/kvm_mmu.h>
Will Deacon0f9d09b2020-09-11 14:25:12 +010017#include <asm/kvm_pgtable.h>
James Morse0db5e022019-01-29 18:48:49 +000018#include <asm/kvm_ras.h>
Christoffer Dalld5d81842013-01-20 18:28:07 -050019#include <asm/kvm_asm.h>
Christoffer Dall94f8e642013-01-20 18:28:12 -050020#include <asm/kvm_emulate.h>
Marc Zyngier1e947ba2015-01-29 11:59:54 +000021#include <asm/virt.h>
Christoffer Dalld5d81842013-01-20 18:28:07 -050022
23#include "trace.h"
Christoffer Dall342cd0a2013-01-20 18:28:06 -050024
Will Deacon0f9d09b2020-09-11 14:25:12 +010025static struct kvm_pgtable *hyp_pgtable;
Christoffer Dall342cd0a2013-01-20 18:28:06 -050026static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
27
Marc Zyngier5a677ce2013-04-12 19:12:06 +010028static unsigned long hyp_idmap_start;
29static unsigned long hyp_idmap_end;
30static phys_addr_t hyp_idmap_vector;
31
Marc Zyngiere3f019b2017-12-04 17:04:38 +000032static unsigned long io_map_base;
33
Marc Zyngier6d674e22019-12-11 16:56:48 +000034
Will Deacon52bae932020-09-11 14:25:17 +010035/*
36 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
37 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
38 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
39 * long will also starve other vCPUs. We have to also make sure that the page
40 * tables are not freed while we released the lock.
41 */
42static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
43 phys_addr_t end,
44 int (*fn)(struct kvm_pgtable *, u64, u64),
45 bool resched)
46{
47 int ret;
48 u64 next;
49
50 do {
51 struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
52 if (!pgt)
53 return -EINVAL;
54
55 next = stage2_pgd_addr_end(kvm, addr, end);
56 ret = fn(pgt, addr, next - addr);
57 if (ret)
58 break;
59
60 if (resched && next != end)
61 cond_resched_lock(&kvm->mmu_lock);
62 } while (addr = next, addr != end);
63
64 return ret;
65}
66
/* stage2_apply_range() variant that is allowed to yield mmu_lock between chunks. */
#define stage2_apply_range_resched(kvm, addr, end, fn)			\
	stage2_apply_range((kvm), (addr), (end), (fn), true)
69
Mario Smarduch15a49a42015-01-15 15:58:58 -080070static bool memslot_is_logging(struct kvm_memory_slot *memslot)
71{
Mario Smarduch15a49a42015-01-15 15:58:58 -080072 return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
Mario Smarduch72760302015-01-15 15:59:01 -080073}
74
75/**
76 * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
77 * @kvm: pointer to kvm structure.
78 *
79 * Interface to HYP function to flush all VM TLB entries
80 */
81void kvm_flush_remote_tlbs(struct kvm *kvm)
82{
Christoffer Dalla0e50aa2019-01-04 21:09:05 +010083 kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
Mario Smarduch15a49a42015-01-15 15:58:58 -080084}
Christoffer Dallad361f02012-11-01 17:14:45 +010085
Ard Biesheuvele6fab542015-11-10 15:11:20 +010086static bool kvm_is_device_pfn(unsigned long pfn)
87{
88 return !pfn_valid(pfn);
89}
90
Marc Zyngier363ef892014-12-19 16:48:06 +000091/*
92 * Unmapping vs dcache management:
93 *
94 * If a guest maps certain memory pages as uncached, all writes will
95 * bypass the data cache and go directly to RAM. However, the CPUs
96 * can still speculate reads (not writes) and fill cache lines with
97 * data.
98 *
99 * Those cache lines will be *clean* cache lines though, so a
100 * clean+invalidate operation is equivalent to an invalidate
101 * operation, because no cache lines are marked dirty.
102 *
103 * Those clean cache lines could be filled prior to an uncached write
104 * by the guest, and the cache coherent IO subsystem would therefore
105 * end up writing old data to disk.
106 *
107 * This is why right after unmapping a page/section and invalidating
Will Deacon52bae932020-09-11 14:25:17 +0100108 * the corresponding TLBs, we flush to make sure the IO subsystem will
109 * never hit in the cache.
Marc Zyngiere48d53a2018-04-06 12:27:28 +0100110 *
111 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
112 * we then fully enforce cacheability of RAM, no matter what the guest
113 * does.
Marc Zyngier363ef892014-12-19 16:48:06 +0000114 */
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000115/**
116 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
Xiaofei Tanc9c02792020-09-17 09:47:49 +0800117 * @mmu: The KVM stage-2 MMU pointer
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000118 * @start: The intermediate physical base address of the range to unmap
119 * @size: The size of the area to unmap
Xiaofei Tanc9c02792020-09-17 09:47:49 +0800120 * @may_block: Whether or not we are permitted to block
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000121 *
122 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
123 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
124 * destroying the VM), otherwise another faulting VCPU may come in and mess
125 * with things behind our backs.
126 */
Will Deaconb5331372020-08-11 11:27:25 +0100127static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
128 bool may_block)
Christoffer Dall4f853a72014-05-09 23:31:31 +0200129{
Christoffer Dalla0e50aa2019-01-04 21:09:05 +0100130 struct kvm *kvm = mmu->kvm;
Will Deacon52bae932020-09-11 14:25:17 +0100131 phys_addr_t end = start + size;
Christoffer Dall4f853a72014-05-09 23:31:31 +0200132
Suzuki K Poulose8b3405e2017-04-03 15:12:43 +0100133 assert_spin_locked(&kvm->mmu_lock);
Jia He47a91b72018-05-21 11:05:30 +0800134 WARN_ON(size & ~PAGE_MASK);
Will Deacon52bae932020-09-11 14:25:17 +0100135 WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
136 may_block));
Marc Zyngier000d3992013-03-05 02:43:17 +0000137}
138
Will Deaconb5331372020-08-11 11:27:25 +0100139static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
140{
141 __unmap_stage2_range(mmu, start, size, true);
142}
143
Marc Zyngier9d218a12014-01-15 12:50:23 +0000144static void stage2_flush_memslot(struct kvm *kvm,
145 struct kvm_memory_slot *memslot)
146{
147 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
148 phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
Marc Zyngier9d218a12014-01-15 12:50:23 +0000149
Quentin Perret8d5207b2020-09-11 14:25:23 +0100150 stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
Marc Zyngier9d218a12014-01-15 12:50:23 +0000151}
152
153/**
154 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
155 * @kvm: The struct kvm pointer
156 *
157 * Go through the stage 2 page tables and invalidate any cache lines
158 * backing memory already mapped to the VM.
159 */
Marc Zyngier3c1e7162014-12-19 16:05:31 +0000160static void stage2_flush_vm(struct kvm *kvm)
Marc Zyngier9d218a12014-01-15 12:50:23 +0000161{
162 struct kvm_memslots *slots;
163 struct kvm_memory_slot *memslot;
164 int idx;
165
166 idx = srcu_read_lock(&kvm->srcu);
167 spin_lock(&kvm->mmu_lock);
168
169 slots = kvm_memslots(kvm);
170 kvm_for_each_memslot(memslot, slots)
171 stage2_flush_memslot(kvm, memslot);
172
173 spin_unlock(&kvm->mmu_lock);
174 srcu_read_unlock(&kvm->srcu, idx);
175}
176
Marc Zyngier000d3992013-03-05 02:43:17 +0000177/**
Marc Zyngier4f728272013-04-12 19:12:05 +0100178 * free_hyp_pgds - free Hyp-mode page tables
Marc Zyngier000d3992013-03-05 02:43:17 +0000179 */
Marc Zyngier4f728272013-04-12 19:12:05 +0100180void free_hyp_pgds(void)
Marc Zyngier000d3992013-03-05 02:43:17 +0000181{
Marc Zyngierd157f4a2013-04-12 19:12:07 +0100182 mutex_lock(&kvm_hyp_pgd_mutex);
Will Deacon0f9d09b2020-09-11 14:25:12 +0100183 if (hyp_pgtable) {
184 kvm_pgtable_hyp_destroy(hyp_pgtable);
185 kfree(hyp_pgtable);
Marc Zyngiere3f019b2017-12-04 17:04:38 +0000186 }
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500187 mutex_unlock(&kvm_hyp_pgd_mutex);
188}
189
Will Deacon0f9d09b2020-09-11 14:25:12 +0100190static int __create_hyp_mappings(unsigned long start, unsigned long size,
191 unsigned long phys, enum kvm_pgtable_prot prot)
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500192{
Will Deacon0f9d09b2020-09-11 14:25:12 +0100193 int err;
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500194
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500195 mutex_lock(&kvm_hyp_pgd_mutex);
Will Deacon0f9d09b2020-09-11 14:25:12 +0100196 err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500197 mutex_unlock(&kvm_hyp_pgd_mutex);
Will Deacon0f9d09b2020-09-11 14:25:12 +0100198
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500199 return err;
200}
201
Christoffer Dall40c27292013-11-15 13:14:12 -0800202static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
203{
204 if (!is_vmalloc_addr(kaddr)) {
205 BUG_ON(!virt_addr_valid(kaddr));
206 return __pa(kaddr);
207 } else {
208 return page_to_phys(vmalloc_to_page(kaddr)) +
209 offset_in_page(kaddr);
210 }
211}
212
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500213/**
Marc Zyngier06e8c3b2012-10-28 01:09:14 +0100214 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500215 * @from: The virtual kernel start address of the range
216 * @to: The virtual kernel end address of the range (exclusive)
Marc Zyngierc8dddec2016-06-13 15:00:45 +0100217 * @prot: The protection to be applied to this range
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500218 *
Marc Zyngier06e8c3b2012-10-28 01:09:14 +0100219 * The same virtual address as the kernel virtual address is also used
220 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
221 * physical pages.
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500222 */
Will Deacon0f9d09b2020-09-11 14:25:12 +0100223int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500224{
Christoffer Dall40c27292013-11-15 13:14:12 -0800225 phys_addr_t phys_addr;
226 unsigned long virt_addr;
Marc Zyngier6c41a412016-06-30 18:40:51 +0100227 unsigned long start = kern_hyp_va((unsigned long)from);
228 unsigned long end = kern_hyp_va((unsigned long)to);
Marc Zyngier6060df82013-04-12 19:12:01 +0100229
Marc Zyngier1e947ba2015-01-29 11:59:54 +0000230 if (is_kernel_in_hyp_mode())
231 return 0;
232
Christoffer Dall40c27292013-11-15 13:14:12 -0800233 start = start & PAGE_MASK;
234 end = PAGE_ALIGN(end);
Marc Zyngier6060df82013-04-12 19:12:01 +0100235
Christoffer Dall40c27292013-11-15 13:14:12 -0800236 for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
237 int err;
238
239 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
Will Deacon0f9d09b2020-09-11 14:25:12 +0100240 err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
Marc Zyngierc8dddec2016-06-13 15:00:45 +0100241 prot);
Christoffer Dall40c27292013-11-15 13:14:12 -0800242 if (err)
243 return err;
244 }
245
246 return 0;
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500247}
248
Marc Zyngierdc2e4632018-02-13 11:00:29 +0000249static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
Will Deacon0f9d09b2020-09-11 14:25:12 +0100250 unsigned long *haddr,
251 enum kvm_pgtable_prot prot)
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500252{
Marc Zyngiere3f019b2017-12-04 17:04:38 +0000253 unsigned long base;
254 int ret = 0;
Marc Zyngier6060df82013-04-12 19:12:01 +0100255
Marc Zyngiere3f019b2017-12-04 17:04:38 +0000256 mutex_lock(&kvm_hyp_pgd_mutex);
Marc Zyngier6060df82013-04-12 19:12:01 +0100257
Marc Zyngiere3f019b2017-12-04 17:04:38 +0000258 /*
Fuad Tabba656012c2020-04-01 15:03:10 +0100259 * This assumes that we have enough space below the idmap
Marc Zyngiere3f019b2017-12-04 17:04:38 +0000260 * page to allocate our VAs. If not, the check below will
261 * kick. A potential alternative would be to detect that
262 * overflow and switch to an allocation above the idmap.
263 *
264 * The allocated size is always a multiple of PAGE_SIZE.
265 */
266 size = PAGE_ALIGN(size + offset_in_page(phys_addr));
267 base = io_map_base - size;
Marc Zyngier1bb32a42017-12-04 16:43:23 +0000268
Marc Zyngiere3f019b2017-12-04 17:04:38 +0000269 /*
270 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
271 * allocating the new area, as it would indicate we've
272 * overflowed the idmap/IO address range.
273 */
274 if ((base ^ io_map_base) & BIT(VA_BITS - 1))
275 ret = -ENOMEM;
276 else
277 io_map_base = base;
278
279 mutex_unlock(&kvm_hyp_pgd_mutex);
280
281 if (ret)
282 goto out;
283
Will Deacon0f9d09b2020-09-11 14:25:12 +0100284 ret = __create_hyp_mappings(base, size, phys_addr, prot);
Marc Zyngiere3f019b2017-12-04 17:04:38 +0000285 if (ret)
286 goto out;
287
Marc Zyngierdc2e4632018-02-13 11:00:29 +0000288 *haddr = base + offset_in_page(phys_addr);
Marc Zyngiere3f019b2017-12-04 17:04:38 +0000289out:
Marc Zyngierdc2e4632018-02-13 11:00:29 +0000290 return ret;
291}
292
293/**
294 * create_hyp_io_mappings - Map IO into both kernel and HYP
295 * @phys_addr: The physical start address which gets mapped
296 * @size: Size of the region being mapped
297 * @kaddr: Kernel VA for this mapping
298 * @haddr: HYP VA for this mapping
299 */
300int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
301 void __iomem **kaddr,
302 void __iomem **haddr)
303{
304 unsigned long addr;
305 int ret;
306
307 *kaddr = ioremap(phys_addr, size);
308 if (!*kaddr)
309 return -ENOMEM;
310
311 if (is_kernel_in_hyp_mode()) {
312 *haddr = *kaddr;
313 return 0;
314 }
315
316 ret = __create_hyp_private_mapping(phys_addr, size,
317 &addr, PAGE_HYP_DEVICE);
Marc Zyngier1bb32a42017-12-04 16:43:23 +0000318 if (ret) {
319 iounmap(*kaddr);
320 *kaddr = NULL;
Marc Zyngierdc2e4632018-02-13 11:00:29 +0000321 *haddr = NULL;
Marc Zyngier1bb32a42017-12-04 16:43:23 +0000322 return ret;
323 }
324
Marc Zyngierdc2e4632018-02-13 11:00:29 +0000325 *haddr = (void __iomem *)addr;
326 return 0;
327}
328
329/**
330 * create_hyp_exec_mappings - Map an executable range into HYP
331 * @phys_addr: The physical start address which gets mapped
332 * @size: Size of the region being mapped
333 * @haddr: HYP VA for this mapping
334 */
335int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
336 void **haddr)
337{
338 unsigned long addr;
339 int ret;
340
341 BUG_ON(is_kernel_in_hyp_mode());
342
343 ret = __create_hyp_private_mapping(phys_addr, size,
344 &addr, PAGE_HYP_EXEC);
345 if (ret) {
346 *haddr = NULL;
347 return ret;
348 }
349
350 *haddr = (void *)addr;
Marc Zyngier1bb32a42017-12-04 16:43:23 +0000351 return 0;
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500352}
353
Christoffer Dalld5d81842013-01-20 18:28:07 -0500354/**
Christoffer Dalla0e50aa2019-01-04 21:09:05 +0100355 * kvm_init_stage2_mmu - Initialise a S2 MMU strucrure
356 * @kvm: The pointer to the KVM structure
357 * @mmu: The pointer to the s2 MMU structure
Christoffer Dalld5d81842013-01-20 18:28:07 -0500358 *
Will Deacon71233d02020-09-11 14:25:13 +0100359 * Allocates only the stage-2 HW PGD level table(s).
Christoffer Dalld5d81842013-01-20 18:28:07 -0500360 * Note we don't need locking here as this is only called when the VM is
361 * created, which can only be done once.
362 */
Christoffer Dalla0e50aa2019-01-04 21:09:05 +0100363int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
Christoffer Dalld5d81842013-01-20 18:28:07 -0500364{
Will Deacon71233d02020-09-11 14:25:13 +0100365 int cpu, err;
366 struct kvm_pgtable *pgt;
Christoffer Dalld5d81842013-01-20 18:28:07 -0500367
Will Deacon71233d02020-09-11 14:25:13 +0100368 if (mmu->pgt != NULL) {
Christoffer Dalld5d81842013-01-20 18:28:07 -0500369 kvm_err("kvm_arch already initialized?\n");
370 return -EINVAL;
371 }
372
Will Deacon71233d02020-09-11 14:25:13 +0100373 pgt = kzalloc(sizeof(*pgt), GFP_KERNEL);
374 if (!pgt)
Marc Zyngiera9873702015-03-10 19:06:59 +0000375 return -ENOMEM;
376
Will Deacon71233d02020-09-11 14:25:13 +0100377 err = kvm_pgtable_stage2_init(pgt, kvm);
378 if (err)
379 goto out_free_pgtable;
Christoffer Dalle329fb72018-12-11 15:26:31 +0100380
Christoffer Dalla0e50aa2019-01-04 21:09:05 +0100381 mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
382 if (!mmu->last_vcpu_ran) {
Will Deacon71233d02020-09-11 14:25:13 +0100383 err = -ENOMEM;
384 goto out_destroy_pgtable;
Christoffer Dalla0e50aa2019-01-04 21:09:05 +0100385 }
386
387 for_each_possible_cpu(cpu)
388 *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
389
390 mmu->kvm = kvm;
Will Deacon71233d02020-09-11 14:25:13 +0100391 mmu->pgt = pgt;
392 mmu->pgd_phys = __pa(pgt->pgd);
Christoffer Dalla0e50aa2019-01-04 21:09:05 +0100393 mmu->vmid.vmid_gen = 0;
Christoffer Dalld5d81842013-01-20 18:28:07 -0500394 return 0;
Will Deacon71233d02020-09-11 14:25:13 +0100395
396out_destroy_pgtable:
397 kvm_pgtable_stage2_destroy(pgt);
398out_free_pgtable:
399 kfree(pgt);
400 return err;
Christoffer Dalld5d81842013-01-20 18:28:07 -0500401}
402
Christoffer Dall957db102014-11-27 10:35:03 +0100403static void stage2_unmap_memslot(struct kvm *kvm,
404 struct kvm_memory_slot *memslot)
405{
406 hva_t hva = memslot->userspace_addr;
407 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
408 phys_addr_t size = PAGE_SIZE * memslot->npages;
409 hva_t reg_end = hva + size;
410
411 /*
412 * A memory region could potentially cover multiple VMAs, and any holes
413 * between them, so iterate over all of them to find out if we should
414 * unmap any of them.
415 *
416 * +--------------------------------------------+
417 * +---------------+----------------+ +----------------+
418 * | : VMA 1 | VMA 2 | | VMA 3 : |
419 * +---------------+----------------+ +----------------+
420 * | memory region |
421 * +--------------------------------------------+
422 */
423 do {
424 struct vm_area_struct *vma = find_vma(current->mm, hva);
425 hva_t vm_start, vm_end;
426
427 if (!vma || vma->vm_start >= reg_end)
428 break;
429
430 /*
431 * Take the intersection of this VMA with the memory region
432 */
433 vm_start = max(hva, vma->vm_start);
434 vm_end = min(reg_end, vma->vm_end);
435
436 if (!(vma->vm_flags & VM_PFNMAP)) {
437 gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
Christoffer Dalla0e50aa2019-01-04 21:09:05 +0100438 unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
Christoffer Dall957db102014-11-27 10:35:03 +0100439 }
440 hva = vm_end;
441 } while (hva < reg_end);
442}
443
444/**
445 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
446 * @kvm: The struct kvm pointer
447 *
Fuad Tabba656012c2020-04-01 15:03:10 +0100448 * Go through the memregions and unmap any regular RAM
Christoffer Dall957db102014-11-27 10:35:03 +0100449 * backing memory already mapped to the VM.
450 */
451void stage2_unmap_vm(struct kvm *kvm)
452{
453 struct kvm_memslots *slots;
454 struct kvm_memory_slot *memslot;
455 int idx;
456
457 idx = srcu_read_lock(&kvm->srcu);
Michel Lespinasse89154dd2020-06-08 21:33:29 -0700458 mmap_read_lock(current->mm);
Christoffer Dall957db102014-11-27 10:35:03 +0100459 spin_lock(&kvm->mmu_lock);
460
461 slots = kvm_memslots(kvm);
462 kvm_for_each_memslot(memslot, slots)
463 stage2_unmap_memslot(kvm, memslot);
464
465 spin_unlock(&kvm->mmu_lock);
Michel Lespinasse89154dd2020-06-08 21:33:29 -0700466 mmap_read_unlock(current->mm);
Christoffer Dall957db102014-11-27 10:35:03 +0100467 srcu_read_unlock(&kvm->srcu, idx);
468}
469
Christoffer Dalla0e50aa2019-01-04 21:09:05 +0100470void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
Christoffer Dalld5d81842013-01-20 18:28:07 -0500471{
Christoffer Dalla0e50aa2019-01-04 21:09:05 +0100472 struct kvm *kvm = mmu->kvm;
Will Deacon71233d02020-09-11 14:25:13 +0100473 struct kvm_pgtable *pgt = NULL;
Christoffer Dalld5d81842013-01-20 18:28:07 -0500474
Suzuki K Poulose8b3405e2017-04-03 15:12:43 +0100475 spin_lock(&kvm->mmu_lock);
Will Deacon71233d02020-09-11 14:25:13 +0100476 pgt = mmu->pgt;
477 if (pgt) {
Will Deacon71233d02020-09-11 14:25:13 +0100478 mmu->pgd_phys = 0;
479 mmu->pgt = NULL;
480 free_percpu(mmu->last_vcpu_ran);
Suzuki K Poulose6c0d7062017-05-03 15:17:51 +0100481 }
Suzuki K Poulose8b3405e2017-04-03 15:12:43 +0100482 spin_unlock(&kvm->mmu_lock);
483
Will Deacon71233d02020-09-11 14:25:13 +0100484 if (pgt) {
485 kvm_pgtable_stage2_destroy(pgt);
486 kfree(pgt);
Christoffer Dalla0e50aa2019-01-04 21:09:05 +0100487 }
Christoffer Dalld5d81842013-01-20 18:28:07 -0500488}
489
Christoffer Dalld5d81842013-01-20 18:28:07 -0500490/**
491 * kvm_phys_addr_ioremap - map a device range to guest IPA
492 *
493 * @kvm: The KVM pointer
494 * @guest_ipa: The IPA at which to insert the mapping
495 * @pa: The physical address of the device
496 * @size: The size of the mapping
Xiaofei Tanc9c02792020-09-17 09:47:49 +0800497 * @writable: Whether or not to create a writable mapping
Christoffer Dalld5d81842013-01-20 18:28:07 -0500498 */
499int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
Ard Biesheuvelc40f2f82014-09-17 14:56:18 -0700500 phys_addr_t pa, unsigned long size, bool writable)
Christoffer Dalld5d81842013-01-20 18:28:07 -0500501{
Will Deacon02bbd372020-09-11 14:25:15 +0100502 phys_addr_t addr;
Christoffer Dalld5d81842013-01-20 18:28:07 -0500503 int ret = 0;
Sean Christophersonc1a33ae2020-07-02 19:35:42 -0700504 struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
Will Deacon02bbd372020-09-11 14:25:15 +0100505 struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
506 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
507 KVM_PGTABLE_PROT_R |
508 (writable ? KVM_PGTABLE_PROT_W : 0);
Christoffer Dalld5d81842013-01-20 18:28:07 -0500509
Will Deacon02bbd372020-09-11 14:25:15 +0100510 size += offset_in_page(guest_ipa);
511 guest_ipa &= PAGE_MASK;
Christoffer Dalld5d81842013-01-20 18:28:07 -0500512
Will Deacon02bbd372020-09-11 14:25:15 +0100513 for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
Sean Christophersonc1a33ae2020-07-02 19:35:42 -0700514 ret = kvm_mmu_topup_memory_cache(&cache,
515 kvm_mmu_cache_min_pages(kvm));
Christoffer Dalld5d81842013-01-20 18:28:07 -0500516 if (ret)
Will Deacon02bbd372020-09-11 14:25:15 +0100517 break;
518
Christoffer Dalld5d81842013-01-20 18:28:07 -0500519 spin_lock(&kvm->mmu_lock);
Will Deacon02bbd372020-09-11 14:25:15 +0100520 ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
521 &cache);
Christoffer Dalld5d81842013-01-20 18:28:07 -0500522 spin_unlock(&kvm->mmu_lock);
523 if (ret)
Will Deacon02bbd372020-09-11 14:25:15 +0100524 break;
Christoffer Dalld5d81842013-01-20 18:28:07 -0500525
Will Deacon02bbd372020-09-11 14:25:15 +0100526 pa += PAGE_SIZE;
Christoffer Dalld5d81842013-01-20 18:28:07 -0500527 }
528
Sean Christophersonc1a33ae2020-07-02 19:35:42 -0700529 kvm_mmu_free_memory_cache(&cache);
Christoffer Dalld5d81842013-01-20 18:28:07 -0500530 return ret;
531}
532
Mario Smarduchc6473552015-01-15 15:58:56 -0800533/**
Mario Smarduchc6473552015-01-15 15:58:56 -0800534 * stage2_wp_range() - write protect stage2 memory region range
Xiaofei Tanc9c02792020-09-17 09:47:49 +0800535 * @mmu: The KVM stage-2 MMU pointer
Mario Smarduchc6473552015-01-15 15:58:56 -0800536 * @addr: Start address of range
537 * @end: End address of range
538 */
Christoffer Dalla0e50aa2019-01-04 21:09:05 +0100539static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
Mario Smarduchc6473552015-01-15 15:58:56 -0800540{
Christoffer Dalla0e50aa2019-01-04 21:09:05 +0100541 struct kvm *kvm = mmu->kvm;
Quentin Perretcc38d612020-09-11 14:25:21 +0100542 stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
Mario Smarduchc6473552015-01-15 15:58:56 -0800543}
544
545/**
546 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
547 * @kvm: The KVM pointer
548 * @slot: The memory slot to write protect
549 *
550 * Called to start logging dirty pages after memory region
551 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
Punit Agrawal4ea5af52018-12-11 17:10:37 +0000552 * all present PUD, PMD and PTEs are write protected in the memory region.
Mario Smarduchc6473552015-01-15 15:58:56 -0800553 * Afterwards read of dirty page log can be called.
554 *
555 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
556 * serializing operations for VM memory regions.
557 */
558void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
559{
Paolo Bonzini9f6b8022015-05-17 16:20:07 +0200560 struct kvm_memslots *slots = kvm_memslots(kvm);
561 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
Sean Christopherson0577d1a2020-02-18 13:07:31 -0800562 phys_addr_t start, end;
563
564 if (WARN_ON_ONCE(!memslot))
565 return;
566
567 start = memslot->base_gfn << PAGE_SHIFT;
568 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
Mario Smarduchc6473552015-01-15 15:58:56 -0800569
570 spin_lock(&kvm->mmu_lock);
Christoffer Dalla0e50aa2019-01-04 21:09:05 +0100571 stage2_wp_range(&kvm->arch.mmu, start, end);
Mario Smarduchc6473552015-01-15 15:58:56 -0800572 spin_unlock(&kvm->mmu_lock);
573 kvm_flush_remote_tlbs(kvm);
574}
Mario Smarduch53c810c2015-01-15 15:58:57 -0800575
576/**
Kai Huang3b0f1d02015-01-28 10:54:23 +0800577 * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
Mario Smarduch53c810c2015-01-15 15:58:57 -0800578 * @kvm: The KVM pointer
579 * @slot: The memory slot associated with mask
580 * @gfn_offset: The gfn offset in memory slot
581 * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory
582 * slot to be write protected
583 *
584 * Walks bits set in mask write protects the associated pte's. Caller must
585 * acquire kvm_mmu_lock.
586 */
Kai Huang3b0f1d02015-01-28 10:54:23 +0800587static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
Mario Smarduch53c810c2015-01-15 15:58:57 -0800588 struct kvm_memory_slot *slot,
589 gfn_t gfn_offset, unsigned long mask)
590{
591 phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
592 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
593 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
594
Christoffer Dalla0e50aa2019-01-04 21:09:05 +0100595 stage2_wp_range(&kvm->arch.mmu, start, end);
Mario Smarduch53c810c2015-01-15 15:58:57 -0800596}
Mario Smarduchc6473552015-01-15 15:58:56 -0800597
Kai Huang3b0f1d02015-01-28 10:54:23 +0800598/*
599 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
600 * dirty pages.
601 *
602 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
603 * enable dirty logging for them.
604 */
605void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
606 struct kvm_memory_slot *slot,
607 gfn_t gfn_offset, unsigned long mask)
608{
609 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
610}
611
Marc Zyngier17ab9d52017-10-23 17:11:22 +0100612static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
Marc Zyngier0d3e4d42015-01-05 21:13:24 +0000613{
Marc Zyngier17ab9d52017-10-23 17:11:22 +0100614 __clean_dcache_guest_page(pfn, size);
Marc Zyngiera15f6932017-10-23 17:11:15 +0100615}
616
Marc Zyngier17ab9d52017-10-23 17:11:22 +0100617static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
Marc Zyngiera15f6932017-10-23 17:11:15 +0100618{
Marc Zyngier17ab9d52017-10-23 17:11:22 +0100619 __invalidate_icache_guest_page(pfn, size);
Marc Zyngier0d3e4d42015-01-05 21:13:24 +0000620}
621
James Morse1559b752019-12-17 12:38:09 +0000622static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
James Morse196f8782017-06-20 17:11:48 +0100623{
Eric W. Biederman795a8372018-04-16 13:39:10 -0500624 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
James Morse196f8782017-06-20 17:11:48 +0100625}
626
Suzuki K Poulosea80868f2019-03-12 09:52:51 +0000627static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
628 unsigned long hva,
629 unsigned long map_size)
Christoffer Dall6794ad52018-11-02 08:53:22 +0100630{
Shaokun Zhangc2be79a2019-02-19 17:22:21 +0800631 gpa_t gpa_start;
Christoffer Dall6794ad52018-11-02 08:53:22 +0100632 hva_t uaddr_start, uaddr_end;
633 size_t size;
634
Suzuki K Poulose9f283612020-05-07 20:35:45 +0800635 /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
636 if (map_size == PAGE_SIZE)
637 return true;
638
Christoffer Dall6794ad52018-11-02 08:53:22 +0100639 size = memslot->npages * PAGE_SIZE;
640
641 gpa_start = memslot->base_gfn << PAGE_SHIFT;
Christoffer Dall6794ad52018-11-02 08:53:22 +0100642
643 uaddr_start = memslot->userspace_addr;
644 uaddr_end = uaddr_start + size;
645
646 /*
647 * Pages belonging to memslots that don't have the same alignment
Suzuki K Poulosea80868f2019-03-12 09:52:51 +0000648 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
649 * PMD/PUD entries, because we'll end up mapping the wrong pages.
Christoffer Dall6794ad52018-11-02 08:53:22 +0100650 *
651 * Consider a layout like the following:
652 *
653 * memslot->userspace_addr:
654 * +-----+--------------------+--------------------+---+
Suzuki K Poulosea80868f2019-03-12 09:52:51 +0000655 * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz|
Christoffer Dall6794ad52018-11-02 08:53:22 +0100656 * +-----+--------------------+--------------------+---+
657 *
Suzuki K Poulose9f283612020-05-07 20:35:45 +0800658 * memslot->base_gfn << PAGE_SHIFT:
Christoffer Dall6794ad52018-11-02 08:53:22 +0100659 * +---+--------------------+--------------------+-----+
Suzuki K Poulosea80868f2019-03-12 09:52:51 +0000660 * |abc|def Stage-2 block | Stage-2 block |tvxyz|
Christoffer Dall6794ad52018-11-02 08:53:22 +0100661 * +---+--------------------+--------------------+-----+
662 *
Suzuki K Poulosea80868f2019-03-12 09:52:51 +0000663 * If we create those stage-2 blocks, we'll end up with this incorrect
Christoffer Dall6794ad52018-11-02 08:53:22 +0100664 * mapping:
665 * d -> f
666 * e -> g
667 * f -> h
668 */
Suzuki K Poulosea80868f2019-03-12 09:52:51 +0000669 if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
Christoffer Dall6794ad52018-11-02 08:53:22 +0100670 return false;
671
672 /*
673 * Next, let's make sure we're not trying to map anything not covered
Suzuki K Poulosea80868f2019-03-12 09:52:51 +0000674 * by the memslot. This means we have to prohibit block size mappings
675 * for the beginning and end of a non-block aligned and non-block sized
Christoffer Dall6794ad52018-11-02 08:53:22 +0100676 * memory slot (illustrated by the head and tail parts of the
677 * userspace view above containing pages 'abcde' and 'xyz',
678 * respectively).
679 *
680 * Note that it doesn't matter if we do the check using the
681 * userspace_addr or the base_gfn, as both are equally aligned (per
682 * the check above) and equally sized.
683 */
Suzuki K Poulosea80868f2019-03-12 09:52:51 +0000684 return (hva & ~(map_size - 1)) >= uaddr_start &&
685 (hva & ~(map_size - 1)) + map_size <= uaddr_end;
Christoffer Dall6794ad52018-11-02 08:53:22 +0100686}
687
Suzuki K Poulose0529c902020-05-07 20:35:46 +0800688/*
689 * Check if the given hva is backed by a transparent huge page (THP) and
690 * whether it can be mapped using block mapping in stage2. If so, adjust
691 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
692 * supported. This will need to be updated to support other THP sizes.
693 *
694 * Returns the size of the mapping.
695 */
696static unsigned long
697transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
698 unsigned long hva, kvm_pfn_t *pfnp,
699 phys_addr_t *ipap)
700{
701 kvm_pfn_t pfn = *pfnp;
702
703 /*
704 * Make sure the adjustment is done only for THP pages. Also make
705 * sure that the HVA and IPA are sufficiently aligned and that the
706 * block map is contained within the memslot.
707 */
708 if (kvm_is_transparent_hugepage(pfn) &&
709 fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
710 /*
711 * The address we faulted on is backed by a transparent huge
712 * page. However, because we map the compound huge page and
713 * not the individual tail page, we need to transfer the
714 * refcount to the head page. We have to be careful that the
715 * THP doesn't start to split while we are adjusting the
716 * refcounts.
717 *
718 * We are sure this doesn't happen, because mmu_notifier_retry
719 * was successful and we are holding the mmu_lock, so if this
720 * THP is trying to split, it will be blocked in the mmu
721 * notifier before touching any of the pages, specifically
722 * before being able to call __split_huge_page_refcount().
723 *
724 * We can therefore safely transfer the refcount from PG_tail
725 * to PG_head and switch the pfn from a tail page to the head
726 * page accordingly.
727 */
728 *ipap &= PMD_MASK;
729 kvm_release_pfn_clean(pfn);
730 pfn &= ~(PTRS_PER_PMD - 1);
731 kvm_get_pfn(pfn);
732 *pfnp = pfn;
733
734 return PMD_SIZE;
735 }
736
737 /* Use page mapping if we cannot use block mapping. */
738 return PAGE_SIZE;
739}
740
Christoffer Dall94f8e642013-01-20 18:28:12 -0500741static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
Christoffer Dall98047882014-08-19 12:18:04 +0200742 struct kvm_memory_slot *memslot, unsigned long hva,
Christoffer Dall94f8e642013-01-20 18:28:12 -0500743 unsigned long fault_status)
744{
Will Deaconffd1b632020-09-30 11:24:42 +0100745 int ret = 0;
Punit Agrawal6396b852018-12-11 17:10:35 +0000746 bool write_fault, writable, force_pte = false;
Will Deacon6f745f12020-09-11 14:25:25 +0100747 bool exec_fault;
748 bool device = false;
Christoffer Dall94f8e642013-01-20 18:28:12 -0500749 unsigned long mmu_seq;
Christoffer Dallad361f02012-11-01 17:14:45 +0100750 struct kvm *kvm = vcpu->kvm;
Christoffer Dall94f8e642013-01-20 18:28:12 -0500751 struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
Christoffer Dallad361f02012-11-01 17:14:45 +0100752 struct vm_area_struct *vma;
James Morse1559b752019-12-17 12:38:09 +0000753 short vma_shift;
Will Deacon6f745f12020-09-11 14:25:25 +0100754 gfn_t gfn;
Dan Williamsba049e92016-01-15 16:56:11 -0800755 kvm_pfn_t pfn;
Mario Smarduch15a49a42015-01-15 15:58:58 -0800756 bool logging_active = memslot_is_logging(memslot);
Yanan Wang7d894832020-12-02 04:10:34 +0800757 unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
758 unsigned long vma_pagesize, fault_granule;
Will Deacon6f745f12020-09-11 14:25:25 +0100759 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
760 struct kvm_pgtable *pgt;
Christoffer Dall94f8e642013-01-20 18:28:12 -0500761
Yanan Wang7d894832020-12-02 04:10:34 +0800762 fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level);
Ard Biesheuvela7d079c2014-09-09 11:27:09 +0100763 write_fault = kvm_is_write_fault(vcpu);
Marc Zyngierc4ad98e2020-09-15 11:42:17 +0100764 exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
Marc Zyngierd0e22b42017-10-23 17:11:19 +0100765 VM_BUG_ON(write_fault && exec_fault);
766
767 if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
Christoffer Dall94f8e642013-01-20 18:28:12 -0500768 kvm_err("Unexpected L2 read permission error\n");
769 return -EFAULT;
770 }
771
Christoffer Dallad361f02012-11-01 17:14:45 +0100772 /* Let's check if we will get back a huge page backed by hugetlbfs */
Michel Lespinasse89154dd2020-06-08 21:33:29 -0700773 mmap_read_lock(current->mm);
Christoffer Dallad361f02012-11-01 17:14:45 +0100774 vma = find_vma_intersection(current->mm, hva, hva + 1);
Ard Biesheuvel37b54402014-09-17 14:56:17 -0700775 if (unlikely(!vma)) {
776 kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
Michel Lespinasse89154dd2020-06-08 21:33:29 -0700777 mmap_read_unlock(current->mm);
Ard Biesheuvel37b54402014-09-17 14:56:17 -0700778 return -EFAULT;
779 }
780
James Morse1559b752019-12-17 12:38:09 +0000781 if (is_vm_hugetlb_page(vma))
782 vma_shift = huge_page_shift(hstate_vma(vma));
783 else
784 vma_shift = PAGE_SHIFT;
785
Suzuki K Poulosea80868f2019-03-12 09:52:51 +0000786 if (logging_active ||
Alexandru Elisei523b3992020-09-10 14:33:51 +0100787 (vma->vm_flags & VM_PFNMAP)) {
Suzuki K Poulosea80868f2019-03-12 09:52:51 +0000788 force_pte = true;
Alexandru Elisei523b3992020-09-10 14:33:51 +0100789 vma_shift = PAGE_SHIFT;
Suzuki K Poulosea80868f2019-03-12 09:52:51 +0000790 }
791
Gavin Shan2f40c462020-10-26 10:06:26 +1100792 switch (vma_shift) {
Gavin Shanfaf00032020-11-03 11:30:09 +1100793#ifndef __PAGETABLE_PMD_FOLDED
Gavin Shan2f40c462020-10-26 10:06:26 +1100794 case PUD_SHIFT:
795 if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
796 break;
797 fallthrough;
Gavin Shanfaf00032020-11-03 11:30:09 +1100798#endif
Gavin Shan2f40c462020-10-26 10:06:26 +1100799 case CONT_PMD_SHIFT:
800 vma_shift = PMD_SHIFT;
801 fallthrough;
802 case PMD_SHIFT:
803 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
804 break;
805 fallthrough;
806 case CONT_PTE_SHIFT:
Alexandru Elisei523b3992020-09-10 14:33:51 +0100807 vma_shift = PAGE_SHIFT;
Gavin Shan2f40c462020-10-26 10:06:26 +1100808 force_pte = true;
809 fallthrough;
810 case PAGE_SHIFT:
811 break;
812 default:
813 WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
Alexandru Elisei523b3992020-09-10 14:33:51 +0100814 }
815
816 vma_pagesize = 1UL << vma_shift;
Will Deacon6f745f12020-09-11 14:25:25 +0100817 if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
Alexandru Elisei523b3992020-09-10 14:33:51 +0100818 fault_ipa &= ~(vma_pagesize - 1);
Will Deacon6f745f12020-09-11 14:25:25 +0100819
820 gfn = fault_ipa >> PAGE_SHIFT;
Michel Lespinasse89154dd2020-06-08 21:33:29 -0700821 mmap_read_unlock(current->mm);
Christoffer Dallad361f02012-11-01 17:14:45 +0100822
Will Deacon6f745f12020-09-11 14:25:25 +0100823 /*
824 * Permission faults just need to update the existing leaf entry,
825 * and so normally don't require allocations from the memcache. The
826 * only exception to this is when dirty logging is enabled at runtime
827 * and a write fault needs to collapse a block entry into a table.
828 */
829 if (fault_status != FSC_PERM || (logging_active && write_fault)) {
830 ret = kvm_mmu_topup_memory_cache(memcache,
831 kvm_mmu_cache_min_pages(kvm));
832 if (ret)
833 return ret;
834 }
Christoffer Dall94f8e642013-01-20 18:28:12 -0500835
836 mmu_seq = vcpu->kvm->mmu_notifier_seq;
837 /*
838 * Ensure the read of mmu_notifier_seq happens before we call
839 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
840 * the page we just got a reference to gets unmapped before we have a
841 * chance to grab the mmu_lock, which ensure that if the page gets
842 * unmapped afterwards, the call to kvm_unmap_hva will take it away
843 * from us again properly. This smp_rmb() interacts with the smp_wmb()
844 * in kvm_mmu_notifier_invalidate_<page|range_end>.
845 */
846 smp_rmb();
847
Christoffer Dallad361f02012-11-01 17:14:45 +0100848 pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
James Morse196f8782017-06-20 17:11:48 +0100849 if (pfn == KVM_PFN_ERR_HWPOISON) {
James Morse1559b752019-12-17 12:38:09 +0000850 kvm_send_hwpoison_signal(hva, vma_shift);
James Morse196f8782017-06-20 17:11:48 +0100851 return 0;
852 }
Christoffer Dall9ac71592016-08-17 10:46:10 +0200853 if (is_error_noslot_pfn(pfn))
Christoffer Dall94f8e642013-01-20 18:28:12 -0500854 return -EFAULT;
855
Mario Smarduch15a49a42015-01-15 15:58:58 -0800856 if (kvm_is_device_pfn(pfn)) {
Will Deacon6f745f12020-09-11 14:25:25 +0100857 device = true;
Santosh Shukla91a2c342020-10-26 16:54:07 +0530858 force_pte = true;
Will Deacon6f745f12020-09-11 14:25:25 +0100859 } else if (logging_active && !write_fault) {
Mario Smarduch15a49a42015-01-15 15:58:58 -0800860 /*
861 * Only actually map the page as writable if this was a write
862 * fault.
863 */
Will Deacon6f745f12020-09-11 14:25:25 +0100864 writable = false;
Mario Smarduch15a49a42015-01-15 15:58:58 -0800865 }
Kim Phillipsb8865762014-06-26 01:45:51 +0100866
Will Deacon6f745f12020-09-11 14:25:25 +0100867 if (exec_fault && device)
Marc Zyngier6d674e22019-12-11 16:56:48 +0000868 return -ENOEXEC;
869
Christoffer Dallad361f02012-11-01 17:14:45 +0100870 spin_lock(&kvm->mmu_lock);
Will Deacon6f745f12020-09-11 14:25:25 +0100871 pgt = vcpu->arch.hw_mmu->pgt;
Christoffer Dallad361f02012-11-01 17:14:45 +0100872 if (mmu_notifier_retry(kvm, mmu_seq))
Christoffer Dall94f8e642013-01-20 18:28:12 -0500873 goto out_unlock;
Mario Smarduch15a49a42015-01-15 15:58:58 -0800874
Suzuki K Poulose0529c902020-05-07 20:35:46 +0800875 /*
876 * If we are not forced to use page mapping, check if we are
877 * backed by a THP and thus use block mapping if possible.
878 */
879 if (vma_pagesize == PAGE_SIZE && !force_pte)
880 vma_pagesize = transparent_hugepage_adjust(memslot, hva,
881 &pfn, &fault_ipa);
Yanan Wang509552e2021-01-14 20:13:50 +0800882 if (writable)
Will Deacon6f745f12020-09-11 14:25:25 +0100883 prot |= KVM_PGTABLE_PROT_W;
Punit Agrawal3f58bf62018-12-11 17:10:34 +0000884
Will Deacon6f745f12020-09-11 14:25:25 +0100885 if (fault_status != FSC_PERM && !device)
Punit Agrawal3f58bf62018-12-11 17:10:34 +0000886 clean_dcache_guest_page(pfn, vma_pagesize);
887
Will Deacon6f745f12020-09-11 14:25:25 +0100888 if (exec_fault) {
889 prot |= KVM_PGTABLE_PROT_X;
Punit Agrawal3f58bf62018-12-11 17:10:34 +0000890 invalidate_icache_guest_page(pfn, vma_pagesize);
Will Deacon6f745f12020-09-11 14:25:25 +0100891 }
Punit Agrawal3f58bf62018-12-11 17:10:34 +0000892
Will Deacon6f745f12020-09-11 14:25:25 +0100893 if (device)
894 prot |= KVM_PGTABLE_PROT_DEVICE;
895 else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
896 prot |= KVM_PGTABLE_PROT_X;
Punit Agrawal6396b852018-12-11 17:10:35 +0000897
Yanan Wang7d894832020-12-02 04:10:34 +0800898 /*
899 * Under the premise of getting a FSC_PERM fault, we just need to relax
900 * permissions only if vma_pagesize equals fault_granule. Otherwise,
901 * kvm_pgtable_stage2_map() should be called to change block size.
902 */
903 if (fault_status == FSC_PERM && vma_pagesize == fault_granule) {
Will Deacon6f745f12020-09-11 14:25:25 +0100904 ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
Christoffer Dallad361f02012-11-01 17:14:45 +0100905 } else {
Will Deacon6f745f12020-09-11 14:25:25 +0100906 ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
907 __pfn_to_phys(pfn), prot,
908 memcache);
Christoffer Dall94f8e642013-01-20 18:28:12 -0500909 }
Christoffer Dallad361f02012-11-01 17:14:45 +0100910
Yanan Wang509552e2021-01-14 20:13:50 +0800911 /* Mark the page dirty only if the fault is handled successfully */
912 if (writable && !ret) {
913 kvm_set_pfn_dirty(pfn);
914 mark_page_dirty(kvm, gfn);
915 }
916
Christoffer Dall94f8e642013-01-20 18:28:12 -0500917out_unlock:
Christoffer Dallad361f02012-11-01 17:14:45 +0100918 spin_unlock(&kvm->mmu_lock);
Marc Zyngier35307b92015-03-12 18:16:51 +0000919 kvm_set_pfn_accessed(pfn);
Christoffer Dall94f8e642013-01-20 18:28:12 -0500920 kvm_release_pfn_clean(pfn);
Yanan Wang509552e2021-01-14 20:13:50 +0800921 return ret != -EAGAIN ? ret : 0;
Christoffer Dall94f8e642013-01-20 18:28:12 -0500922}
923
Will Deaconee8efad2020-09-11 14:25:19 +0100924/* Resolve the access fault by making the page young again. */
Marc Zyngieraeda9132015-03-12 18:16:52 +0000925static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
926{
Will Deaconee8efad2020-09-11 14:25:19 +0100927 pte_t pte;
928 kvm_pte_t kpte;
929 struct kvm_s2_mmu *mmu;
Marc Zyngieraeda9132015-03-12 18:16:52 +0000930
931 trace_kvm_access_fault(fault_ipa);
932
933 spin_lock(&vcpu->kvm->mmu_lock);
Will Deaconee8efad2020-09-11 14:25:19 +0100934 mmu = vcpu->arch.hw_mmu;
935 kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
Marc Zyngieraeda9132015-03-12 18:16:52 +0000936 spin_unlock(&vcpu->kvm->mmu_lock);
Will Deaconee8efad2020-09-11 14:25:19 +0100937
938 pte = __pte(kpte);
939 if (pte_valid(pte))
940 kvm_set_pfn_accessed(pte_pfn(pte));
Marc Zyngieraeda9132015-03-12 18:16:52 +0000941}
942
Christoffer Dall94f8e642013-01-20 18:28:12 -0500943/**
944 * kvm_handle_guest_abort - handles all 2nd stage aborts
945 * @vcpu: the VCPU pointer
Christoffer Dall94f8e642013-01-20 18:28:12 -0500946 *
947 * Any abort that gets to the host is almost guaranteed to be caused by a
948 * missing second stage translation table entry, which can mean that either the
949 * guest simply needs more memory and we must allocate an appropriate page or it
950 * can mean that the guest tried to access I/O memory, which is emulated by user
951 * space. The distinction is based on the IPA causing the fault and whether this
952 * memory region has been registered as standard RAM by user space.
953 */
Tianjia Zhang74cc7e02020-06-23 21:14:15 +0800954int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500955{
Christoffer Dall94f8e642013-01-20 18:28:12 -0500956 unsigned long fault_status;
957 phys_addr_t fault_ipa;
958 struct kvm_memory_slot *memslot;
Christoffer Dall98047882014-08-19 12:18:04 +0200959 unsigned long hva;
960 bool is_iabt, write_fault, writable;
Christoffer Dall94f8e642013-01-20 18:28:12 -0500961 gfn_t gfn;
962 int ret, idx;
963
Tyler Baicar621f48e2017-06-21 12:17:14 -0600964 fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
965
966 fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
James Morsebb428922017-07-18 13:37:41 +0100967 is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
Tyler Baicar621f48e2017-06-21 12:17:14 -0600968
James Morsebb428922017-07-18 13:37:41 +0100969 /* Synchronous External Abort? */
Will Deaconc9a636f2020-07-29 11:28:18 +0100970 if (kvm_vcpu_abt_issea(vcpu)) {
James Morsebb428922017-07-18 13:37:41 +0100971 /*
972 * For RAS the host kernel may handle this abort.
973 * There is no need to pass the error into the guest.
974 */
Will Deacon84b951a2020-07-29 11:28:19 +0100975 if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
James Morsebb428922017-07-18 13:37:41 +0100976 kvm_inject_vabt(vcpu);
Will Deacon84b951a2020-07-29 11:28:19 +0100977
978 return 1;
Marc Zyngier40557102016-09-06 14:02:15 +0100979 }
980
Gavin Shan3a949f42020-06-30 11:57:05 +1000981 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
Marc Zyngier7393b592012-09-17 19:27:09 +0100982 kvm_vcpu_get_hfar(vcpu), fault_ipa);
Christoffer Dall94f8e642013-01-20 18:28:12 -0500983
984 /* Check the stage-2 fault is trans. fault or write fault */
Marc Zyngier35307b92015-03-12 18:16:51 +0000985 if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
986 fault_status != FSC_ACCESS) {
Christoffer Dall0496daa52014-09-26 12:29:34 +0200987 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
988 kvm_vcpu_trap_get_class(vcpu),
989 (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
Gavin Shan3a949f42020-06-30 11:57:05 +1000990 (unsigned long)kvm_vcpu_get_esr(vcpu));
Christoffer Dall94f8e642013-01-20 18:28:12 -0500991 return -EFAULT;
992 }
993
994 idx = srcu_read_lock(&vcpu->kvm->srcu);
995
996 gfn = fault_ipa >> PAGE_SHIFT;
Christoffer Dall98047882014-08-19 12:18:04 +0200997 memslot = gfn_to_memslot(vcpu->kvm, gfn);
998 hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
Ard Biesheuvela7d079c2014-09-09 11:27:09 +0100999 write_fault = kvm_is_write_fault(vcpu);
Christoffer Dall98047882014-08-19 12:18:04 +02001000 if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
Will Deacon022c8322020-07-29 11:28:21 +01001001 /*
1002 * The guest has put either its instructions or its page-tables
1003 * somewhere it shouldn't have. Userspace won't be able to do
1004 * anything about this (there's no syndrome for a start), so
1005 * re-inject the abort back into the guest.
1006 */
Christoffer Dall94f8e642013-01-20 18:28:12 -05001007 if (is_iabt) {
Marc Zyngier6d674e22019-12-11 16:56:48 +00001008 ret = -ENOEXEC;
1009 goto out;
Christoffer Dall94f8e642013-01-20 18:28:12 -05001010 }
1011
Marc Zyngierc4ad98e2020-09-15 11:42:17 +01001012 if (kvm_vcpu_abt_iss1tw(vcpu)) {
Will Deacon022c8322020-07-29 11:28:21 +01001013 kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1014 ret = 1;
1015 goto out_unlock;
1016 }
1017
Marc Zyngiercfe39502012-12-12 14:42:09 +00001018 /*
Marc Zyngier57c841f2016-01-29 15:01:28 +00001019 * Check for a cache maintenance operation. Since we
1020 * ended-up here, we know it is outside of any memory
1021 * slot. But we can't find out if that is for a device,
1022 * or if the guest is just being stupid. The only thing
1023 * we know for sure is that this range cannot be cached.
1024 *
1025 * So let's assume that the guest is just being
1026 * cautious, and skip the instruction.
1027 */
Will Deacon54dc0d22020-07-29 11:28:20 +01001028 if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
Marc Zyngiercdb5e022020-10-14 09:29:27 +01001029 kvm_incr_pc(vcpu);
Marc Zyngier57c841f2016-01-29 15:01:28 +00001030 ret = 1;
1031 goto out_unlock;
1032 }
1033
1034 /*
Marc Zyngiercfe39502012-12-12 14:42:09 +00001035 * The IPA is reported as [MAX:12], so we need to
1036 * complement it with the bottom 12 bits from the
1037 * faulting VA. This is always 12 bits, irrespective
1038 * of the page size.
1039 */
1040 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
Tianjia Zhang74cc7e02020-06-23 21:14:15 +08001041 ret = io_mem_abort(vcpu, fault_ipa);
Christoffer Dall94f8e642013-01-20 18:28:12 -05001042 goto out_unlock;
1043 }
1044
Christoffer Dallc3058d52014-10-10 12:14:29 +02001045 /* Userspace should not be able to register out-of-bounds IPAs */
Suzuki K Poulosee55cac52018-09-26 17:32:44 +01001046 VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
Christoffer Dallc3058d52014-10-10 12:14:29 +02001047
Marc Zyngieraeda9132015-03-12 18:16:52 +00001048 if (fault_status == FSC_ACCESS) {
1049 handle_access_fault(vcpu, fault_ipa);
1050 ret = 1;
1051 goto out_unlock;
1052 }
1053
Christoffer Dall98047882014-08-19 12:18:04 +02001054 ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
Christoffer Dall94f8e642013-01-20 18:28:12 -05001055 if (ret == 0)
1056 ret = 1;
Marc Zyngier6d674e22019-12-11 16:56:48 +00001057out:
1058 if (ret == -ENOEXEC) {
1059 kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1060 ret = 1;
1061 }
Christoffer Dall94f8e642013-01-20 18:28:12 -05001062out_unlock:
1063 srcu_read_unlock(&vcpu->kvm->srcu, idx);
1064 return ret;
Christoffer Dall342cd0a2013-01-20 18:28:06 -05001065}
1066
Marc Zyngier1d2ebac2015-03-12 18:16:50 +00001067static int handle_hva_to_gpa(struct kvm *kvm,
1068 unsigned long start,
1069 unsigned long end,
1070 int (*handler)(struct kvm *kvm,
Suzuki K Poulose056aad62017-03-20 18:26:42 +00001071 gpa_t gpa, u64 size,
1072 void *data),
Marc Zyngier1d2ebac2015-03-12 18:16:50 +00001073 void *data)
Christoffer Dalld5d81842013-01-20 18:28:07 -05001074{
1075 struct kvm_memslots *slots;
1076 struct kvm_memory_slot *memslot;
Marc Zyngier1d2ebac2015-03-12 18:16:50 +00001077 int ret = 0;
Christoffer Dalld5d81842013-01-20 18:28:07 -05001078
1079 slots = kvm_memslots(kvm);
1080
1081 /* we only care about the pages that the guest sees */
1082 kvm_for_each_memslot(memslot, slots) {
1083 unsigned long hva_start, hva_end;
Suzuki K Poulose056aad62017-03-20 18:26:42 +00001084 gfn_t gpa;
Christoffer Dalld5d81842013-01-20 18:28:07 -05001085
1086 hva_start = max(start, memslot->userspace_addr);
1087 hva_end = min(end, memslot->userspace_addr +
1088 (memslot->npages << PAGE_SHIFT));
1089 if (hva_start >= hva_end)
1090 continue;
1091
Suzuki K Poulose056aad62017-03-20 18:26:42 +00001092 gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
1093 ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
Christoffer Dalld5d81842013-01-20 18:28:07 -05001094 }
Marc Zyngier1d2ebac2015-03-12 18:16:50 +00001095
1096 return ret;
Christoffer Dalld5d81842013-01-20 18:28:07 -05001097}
1098
Suzuki K Poulose056aad62017-03-20 18:26:42 +00001099static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
Christoffer Dalld5d81842013-01-20 18:28:07 -05001100{
Will Deaconb5331372020-08-11 11:27:25 +01001101 unsigned flags = *(unsigned *)data;
1102 bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE;
1103
1104 __unmap_stage2_range(&kvm->arch.mmu, gpa, size, may_block);
Marc Zyngier1d2ebac2015-03-12 18:16:50 +00001105 return 0;
Christoffer Dalld5d81842013-01-20 18:28:07 -05001106}
1107
Christoffer Dalld5d81842013-01-20 18:28:07 -05001108int kvm_unmap_hva_range(struct kvm *kvm,
Will Deaconfdfe7cb2020-08-11 11:27:24 +01001109 unsigned long start, unsigned long end, unsigned flags)
Christoffer Dalld5d81842013-01-20 18:28:07 -05001110{
Will Deacon063deeb2020-09-11 14:25:26 +01001111 if (!kvm->arch.mmu.pgt)
Christoffer Dalld5d81842013-01-20 18:28:07 -05001112 return 0;
1113
1114 trace_kvm_unmap_hva_range(start, end);
Will Deaconb5331372020-08-11 11:27:25 +01001115 handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags);
Christoffer Dalld5d81842013-01-20 18:28:07 -05001116 return 0;
1117}
1118
Suzuki K Poulose056aad62017-03-20 18:26:42 +00001119static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
Christoffer Dalld5d81842013-01-20 18:28:07 -05001120{
Will Deacone9edb172020-09-11 14:25:16 +01001121 kvm_pfn_t *pfn = (kvm_pfn_t *)data;
Christoffer Dalld5d81842013-01-20 18:28:07 -05001122
Suzuki K Poulose056aad62017-03-20 18:26:42 +00001123 WARN_ON(size != PAGE_SIZE);
Will Deacone9edb172020-09-11 14:25:16 +01001124
Mario Smarduch15a49a42015-01-15 15:58:58 -08001125 /*
Will Deacone9edb172020-09-11 14:25:16 +01001126 * The MMU notifiers will have unmapped a huge PMD before calling
1127 * ->change_pte() (which in turn calls kvm_set_spte_hva()) and
1128 * therefore we never need to clear out a huge PMD through this
1129 * calling path and a memcache is not required.
Mario Smarduch15a49a42015-01-15 15:58:58 -08001130 */
Will Deacone9edb172020-09-11 14:25:16 +01001131 kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE,
1132 __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL);
Marc Zyngier1d2ebac2015-03-12 18:16:50 +00001133 return 0;
Christoffer Dalld5d81842013-01-20 18:28:07 -05001134}
1135
Lan Tianyu748c0e32018-12-06 21:21:10 +08001136int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
Christoffer Dalld5d81842013-01-20 18:28:07 -05001137{
1138 unsigned long end = hva + PAGE_SIZE;
Marc Zyngier694556d2018-08-23 09:58:27 +01001139 kvm_pfn_t pfn = pte_pfn(pte);
Christoffer Dalld5d81842013-01-20 18:28:07 -05001140
Will Deacone9edb172020-09-11 14:25:16 +01001141 if (!kvm->arch.mmu.pgt)
Lan Tianyu748c0e32018-12-06 21:21:10 +08001142 return 0;
Christoffer Dalld5d81842013-01-20 18:28:07 -05001143
1144 trace_kvm_set_spte_hva(hva);
Marc Zyngier694556d2018-08-23 09:58:27 +01001145
1146 /*
1147 * We've moved a page around, probably through CoW, so let's treat it
1148 * just like a translation fault and clean the cache to the PoC.
1149 */
1150 clean_dcache_guest_page(pfn, PAGE_SIZE);
Will Deacone9edb172020-09-11 14:25:16 +01001151 handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn);
Lan Tianyu748c0e32018-12-06 21:21:10 +08001152 return 0;
Christoffer Dalld5d81842013-01-20 18:28:07 -05001153}
1154
Suzuki K Poulose056aad62017-03-20 18:26:42 +00001155static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
Marc Zyngier35307b92015-03-12 18:16:51 +00001156{
Will Deaconee8efad2020-09-11 14:25:19 +01001157 pte_t pte;
1158 kvm_pte_t kpte;
Marc Zyngier35307b92015-03-12 18:16:51 +00001159
Punit Agrawal35a63962018-12-11 17:10:40 +00001160 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
Will Deaconee8efad2020-09-11 14:25:19 +01001161 kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa);
1162 pte = __pte(kpte);
1163 return pte_valid(pte) && pte_young(pte);
Marc Zyngier35307b92015-03-12 18:16:51 +00001164}
1165
Suzuki K Poulose056aad62017-03-20 18:26:42 +00001166static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
Marc Zyngier35307b92015-03-12 18:16:51 +00001167{
Punit Agrawal35a63962018-12-11 17:10:40 +00001168 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
Will Deaconee8efad2020-09-11 14:25:19 +01001169 return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa);
Marc Zyngier35307b92015-03-12 18:16:51 +00001170}
1171
1172int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
1173{
Will Deacon063deeb2020-09-11 14:25:26 +01001174 if (!kvm->arch.mmu.pgt)
Suzuki K Poulose7e5a6722017-07-05 09:57:00 +01001175 return 0;
Marc Zyngier35307b92015-03-12 18:16:51 +00001176 trace_kvm_age_hva(start, end);
1177 return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
1178}
1179
1180int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
1181{
Will Deacon063deeb2020-09-11 14:25:26 +01001182 if (!kvm->arch.mmu.pgt)
Suzuki K Poulose7e5a6722017-07-05 09:57:00 +01001183 return 0;
Marc Zyngier35307b92015-03-12 18:16:51 +00001184 trace_kvm_test_age_hva(hva);
Gavin Shancf2d23e2020-01-21 16:56:59 +11001185 return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
1186 kvm_test_age_hva_handler, NULL);
Marc Zyngier35307b92015-03-12 18:16:51 +00001187}
1188
Christoffer Dall342cd0a2013-01-20 18:28:06 -05001189phys_addr_t kvm_mmu_get_httbr(void)
1190{
Will Deacon0f9d09b2020-09-11 14:25:12 +01001191 return __pa(hyp_pgtable->pgd);
Christoffer Dall342cd0a2013-01-20 18:28:06 -05001192}
1193
Marc Zyngier5a677ce2013-04-12 19:12:06 +01001194phys_addr_t kvm_get_idmap_vector(void)
1195{
1196 return hyp_idmap_vector;
1197}
1198
Will Deacon0f9d09b2020-09-11 14:25:12 +01001199static int kvm_map_idmap_text(void)
Marc Zyngier0535a3e2016-06-30 18:40:43 +01001200{
Will Deacon0f9d09b2020-09-11 14:25:12 +01001201 unsigned long size = hyp_idmap_end - hyp_idmap_start;
1202 int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
1203 PAGE_HYP_EXEC);
Marc Zyngier0535a3e2016-06-30 18:40:43 +01001204 if (err)
1205 kvm_err("Failed to idmap %lx-%lx\n",
1206 hyp_idmap_start, hyp_idmap_end);
1207
1208 return err;
1209}
1210
Christoffer Dall342cd0a2013-01-20 18:28:06 -05001211int kvm_mmu_init(void)
1212{
Marc Zyngier2fb41052013-04-12 19:12:03 +01001213 int err;
Will Deacon0f9d09b2020-09-11 14:25:12 +01001214 u32 hyp_va_bits;
Marc Zyngier2fb41052013-04-12 19:12:03 +01001215
Andrew Scull0a787912020-05-19 11:40:36 +01001216 hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
Marc Zyngier46fef152018-03-12 14:25:10 +00001217 hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
Andrew Scull0a787912020-05-19 11:40:36 +01001218 hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
Marc Zyngier46fef152018-03-12 14:25:10 +00001219 hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
Andrew Scull0a787912020-05-19 11:40:36 +01001220 hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);
Marc Zyngier5a677ce2013-04-12 19:12:06 +01001221
Ard Biesheuvel06f75a12015-03-19 16:42:26 +00001222 /*
1223 * We rely on the linker script to ensure at build time that the HYP
1224 * init code does not cross a page boundary.
1225 */
1226 BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
Marc Zyngier5a677ce2013-04-12 19:12:06 +01001227
Will Deacon0f9d09b2020-09-11 14:25:12 +01001228 hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
1229 kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
Marc Zyngierb4ef0492017-12-03 20:04:51 +00001230 kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
1231 kvm_debug("HYP VA range: %lx:%lx\n",
1232 kern_hyp_va(PAGE_OFFSET),
1233 kern_hyp_va((unsigned long)high_memory - 1));
Marc Zyngiereac378a2016-06-30 18:40:50 +01001234
Marc Zyngier6c41a412016-06-30 18:40:51 +01001235 if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
Marc Zyngiered57cac2017-12-03 18:22:49 +00001236 hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) &&
Marc Zyngierd2896d42016-08-22 09:01:17 +01001237 hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
Marc Zyngiereac378a2016-06-30 18:40:50 +01001238 /*
1239 * The idmap page is intersecting with the VA space,
1240 * it is not safe to continue further.
1241 */
1242 kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
1243 err = -EINVAL;
1244 goto out;
1245 }
1246
Will Deacon0f9d09b2020-09-11 14:25:12 +01001247 hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
1248 if (!hyp_pgtable) {
1249 kvm_err("Hyp mode page-table not allocated\n");
Marc Zyngier2fb41052013-04-12 19:12:03 +01001250 err = -ENOMEM;
1251 goto out;
1252 }
1253
Will Deacon0f9d09b2020-09-11 14:25:12 +01001254 err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits);
1255 if (err)
1256 goto out_free_pgtable;
Marc Zyngier0535a3e2016-06-30 18:40:43 +01001257
Will Deacon0f9d09b2020-09-11 14:25:12 +01001258 err = kvm_map_idmap_text();
1259 if (err)
1260 goto out_destroy_pgtable;
Marc Zyngier5a677ce2013-04-12 19:12:06 +01001261
Marc Zyngiere3f019b2017-12-04 17:04:38 +00001262 io_map_base = hyp_idmap_start;
Christoffer Dalld5d81842013-01-20 18:28:07 -05001263 return 0;
Will Deacon0f9d09b2020-09-11 14:25:12 +01001264
1265out_destroy_pgtable:
1266 kvm_pgtable_hyp_destroy(hyp_pgtable);
1267out_free_pgtable:
1268 kfree(hyp_pgtable);
1269 hyp_pgtable = NULL;
Marc Zyngier2fb41052013-04-12 19:12:03 +01001270out:
Marc Zyngier2fb41052013-04-12 19:12:03 +01001271 return err;
Christoffer Dall342cd0a2013-01-20 18:28:06 -05001272}
Eric Augerdf6ce242014-06-06 11:10:23 +02001273
1274void kvm_arch_commit_memory_region(struct kvm *kvm,
Paolo Bonzini09170a42015-05-18 13:59:39 +02001275 const struct kvm_userspace_memory_region *mem,
Sean Christopherson9d4c1972020-02-18 13:07:24 -08001276 struct kvm_memory_slot *old,
Paolo Bonzinif36f3f22015-05-18 13:20:23 +02001277 const struct kvm_memory_slot *new,
Eric Augerdf6ce242014-06-06 11:10:23 +02001278 enum kvm_mr_change change)
1279{
Mario Smarduchc6473552015-01-15 15:58:56 -08001280 /*
1281 * At this point memslot has been committed and there is an
Fuad Tabba656012c2020-04-01 15:03:10 +01001282 * allocated dirty_bitmap[], dirty pages will be tracked while the
Mario Smarduchc6473552015-01-15 15:58:56 -08001283 * memory slot is write protected.
1284 */
Keqian Zhuc8626262020-04-13 20:20:23 +08001285 if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1286 /*
1287 * If we're with initial-all-set, we don't need to write
1288 * protect any pages because they're all reported as dirty.
1289 * Huge pages and normal pages will be write protect gradually.
1290 */
1291 if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
1292 kvm_mmu_wp_memory_region(kvm, mem->slot);
1293 }
1294 }
Eric Augerdf6ce242014-06-06 11:10:23 +02001295}
1296
1297int kvm_arch_prepare_memory_region(struct kvm *kvm,
1298 struct kvm_memory_slot *memslot,
Paolo Bonzini09170a42015-05-18 13:59:39 +02001299 const struct kvm_userspace_memory_region *mem,
Eric Augerdf6ce242014-06-06 11:10:23 +02001300 enum kvm_mr_change change)
1301{
Ard Biesheuvel8eef9122014-10-10 17:00:32 +02001302 hva_t hva = mem->userspace_addr;
1303 hva_t reg_end = hva + mem->memory_size;
1304 bool writable = !(mem->flags & KVM_MEM_READONLY);
1305 int ret = 0;
1306
Mario Smarduch15a49a42015-01-15 15:58:58 -08001307 if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
1308 change != KVM_MR_FLAGS_ONLY)
Ard Biesheuvel8eef9122014-10-10 17:00:32 +02001309 return 0;
1310
1311 /*
Christoffer Dallc3058d52014-10-10 12:14:29 +02001312 * Prevent userspace from creating a memory region outside of the IPA
1313 * space addressable by the KVM guest IPA space.
1314 */
1315 if (memslot->base_gfn + memslot->npages >=
Suzuki K Poulosee55cac52018-09-26 17:32:44 +01001316 (kvm_phys_size(kvm) >> PAGE_SHIFT))
Christoffer Dallc3058d52014-10-10 12:14:29 +02001317 return -EFAULT;
1318
Michel Lespinasse89154dd2020-06-08 21:33:29 -07001319 mmap_read_lock(current->mm);
Christoffer Dallc3058d52014-10-10 12:14:29 +02001320 /*
Ard Biesheuvel8eef9122014-10-10 17:00:32 +02001321 * A memory region could potentially cover multiple VMAs, and any holes
1322 * between them, so iterate over all of them to find out if we can map
1323 * any of them right now.
1324 *
1325 * +--------------------------------------------+
1326 * +---------------+----------------+ +----------------+
1327 * | : VMA 1 | VMA 2 | | VMA 3 : |
1328 * +---------------+----------------+ +----------------+
1329 * | memory region |
1330 * +--------------------------------------------+
1331 */
1332 do {
1333 struct vm_area_struct *vma = find_vma(current->mm, hva);
1334 hva_t vm_start, vm_end;
1335
1336 if (!vma || vma->vm_start >= reg_end)
1337 break;
1338
1339 /*
Ard Biesheuvel8eef9122014-10-10 17:00:32 +02001340 * Take the intersection of this VMA with the memory region
1341 */
1342 vm_start = max(hva, vma->vm_start);
1343 vm_end = min(reg_end, vma->vm_end);
1344
1345 if (vma->vm_flags & VM_PFNMAP) {
1346 gpa_t gpa = mem->guest_phys_addr +
1347 (vm_start - mem->userspace_addr);
Marek Majtykaca09f022015-09-16 12:04:55 +02001348 phys_addr_t pa;
1349
1350 pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
1351 pa += vm_start - vma->vm_start;
Ard Biesheuvel8eef9122014-10-10 17:00:32 +02001352
Mario Smarduch15a49a42015-01-15 15:58:58 -08001353 /* IO region dirty page logging not allowed */
Marc Zyngier72f31042017-03-16 18:20:50 +00001354 if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1355 ret = -EINVAL;
1356 goto out;
1357 }
Mario Smarduch15a49a42015-01-15 15:58:58 -08001358
Ard Biesheuvel8eef9122014-10-10 17:00:32 +02001359 ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
1360 vm_end - vm_start,
1361 writable);
1362 if (ret)
1363 break;
1364 }
1365 hva = vm_end;
1366 } while (hva < reg_end);
1367
Mario Smarduch15a49a42015-01-15 15:58:58 -08001368 if (change == KVM_MR_FLAGS_ONLY)
Marc Zyngier72f31042017-03-16 18:20:50 +00001369 goto out;
Mario Smarduch15a49a42015-01-15 15:58:58 -08001370
Ard Biesheuvel849260c2014-11-17 14:58:53 +00001371 spin_lock(&kvm->mmu_lock);
1372 if (ret)
Christoffer Dalla0e50aa2019-01-04 21:09:05 +01001373 unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size);
Alexandru Eliseiada329e2020-09-15 18:04:42 +01001374 else if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
Ard Biesheuvel849260c2014-11-17 14:58:53 +00001375 stage2_flush_memslot(kvm, memslot);
1376 spin_unlock(&kvm->mmu_lock);
Marc Zyngier72f31042017-03-16 18:20:50 +00001377out:
Michel Lespinasse89154dd2020-06-08 21:33:29 -07001378 mmap_read_unlock(current->mm);
Ard Biesheuvel8eef9122014-10-10 17:00:32 +02001379 return ret;
Eric Augerdf6ce242014-06-06 11:10:23 +02001380}
1381
/*
 * Arch hook for freeing a memslot. The body is intentionally empty: no
 * arch-specific work is performed here.
 */
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	/* Nothing to do. */
}
1385
Sean Christopherson15248252019-02-05 12:54:17 -08001386void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
Eric Augerdf6ce242014-06-06 11:10:23 +02001387{
1388}
1389
1390void kvm_arch_flush_shadow_all(struct kvm *kvm)
1391{
Christoffer Dalla0e50aa2019-01-04 21:09:05 +01001392 kvm_free_stage2_pgd(&kvm->arch.mmu);
Eric Augerdf6ce242014-06-06 11:10:23 +02001393}
1394
1395void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
1396 struct kvm_memory_slot *slot)
1397{
Ard Biesheuvel8eef9122014-10-10 17:00:32 +02001398 gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
1399 phys_addr_t size = slot->npages << PAGE_SHIFT;
1400
1401 spin_lock(&kvm->mmu_lock);
Christoffer Dalla0e50aa2019-01-04 21:09:05 +01001402 unmap_stage2_range(&kvm->arch.mmu, gpa, size);
Ard Biesheuvel8eef9122014-10-10 17:00:32 +02001403 spin_unlock(&kvm->mmu_lock);
Eric Augerdf6ce242014-06-06 11:10:23 +02001404}
Marc Zyngier3c1e7162014-12-19 16:05:31 +00001405
1406/*
1407 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
1408 *
1409 * Main problems:
1410 * - S/W ops are local to a CPU (not broadcast)
1411 * - We have line migration behind our back (speculation)
1412 * - System caches don't support S/W at all (damn!)
1413 *
1414 * In the face of the above, the best we can do is to try and convert
1415 * S/W ops to VA ops. Because the guest is not allowed to infer the
1416 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
1417 * which is a rather good thing for us.
1418 *
1419 * Also, it is only used when turning caches on/off ("The expected
1420 * usage of the cache maintenance instructions that operate by set/way
1421 * is associated with the cache maintenance instructions associated
1422 * with the powerdown and powerup of caches, if this is required by
1423 * the implementation.").
1424 *
1425 * We use the following policy:
1426 *
1427 * - If we trap a S/W operation, we enable VM trapping to detect
1428 * caches being turned on/off, and do a full clean.
1429 *
1430 * - We flush the caches on both caches being turned on and off.
1431 *
1432 * - Once the caches are enabled, we stop trapping VM ops.
1433 */
1434void kvm_set_way_flush(struct kvm_vcpu *vcpu)
1435{
Christoffer Dall3df59d82017-08-03 12:09:05 +02001436 unsigned long hcr = *vcpu_hcr(vcpu);
Marc Zyngier3c1e7162014-12-19 16:05:31 +00001437
1438 /*
1439 * If this is the first time we do a S/W operation
1440 * (i.e. HCR_TVM not set) flush the whole memory, and set the
1441 * VM trapping.
1442 *
1443 * Otherwise, rely on the VM trapping to wait for the MMU +
1444 * Caches to be turned off. At that point, we'll be able to
1445 * clean the caches again.
1446 */
1447 if (!(hcr & HCR_TVM)) {
1448 trace_kvm_set_way_flush(*vcpu_pc(vcpu),
1449 vcpu_has_cache_enabled(vcpu));
1450 stage2_flush_vm(vcpu->kvm);
Christoffer Dall3df59d82017-08-03 12:09:05 +02001451 *vcpu_hcr(vcpu) = hcr | HCR_TVM;
Marc Zyngier3c1e7162014-12-19 16:05:31 +00001452 }
1453}
1454
1455void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
1456{
1457 bool now_enabled = vcpu_has_cache_enabled(vcpu);
1458
1459 /*
1460 * If switching the MMU+caches on, need to invalidate the caches.
1461 * If switching it off, need to clean the caches.
1462 * Clean + invalidate does the trick always.
1463 */
1464 if (now_enabled != was_enabled)
1465 stage2_flush_vm(vcpu->kvm);
1466
1467 /* Caches are now on, stop trapping VM ops (until a S/W op) */
1468 if (now_enabled)
Christoffer Dall3df59d82017-08-03 12:09:05 +02001469 *vcpu_hcr(vcpu) &= ~HCR_TVM;
Marc Zyngier3c1e7162014-12-19 16:05:31 +00001470
1471 trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
1472}