// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */

#include <kvm/iodev.h>

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>
#include <linux/io.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>
#include <linux/suspend.h>

#include <asm/processor.h>
#include <asm/ioctl.h>
#include <linux/uaccess.h>

#include "coalesced_mmio.h"
#include "async_pf.h"
#include "kvm_mm.h"
#include "vfio.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

#include <linux/kvm_dirty_ring.h>

/* Worst case buffer size needed for holding an integer. */
#define ITOA_MAX_LEN 12

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/* Architectures should define their poll value according to the halt latency */
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns);

/* Default doubles per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

/* The start value to grow halt_poll_ns from */
unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);

/* Default resets per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_shrink;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);

/*
 * Ordering of locks:
 *
 *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

DEFINE_MUTEX(kvm_lock);
static DEFINE_RAW_SPINLOCK(kvm_count_lock);
LIST_HEAD(vm_list);

static cpumask_var_t cpus_hardware_enabled;
static int kvm_usage_count;
static atomic_t hardware_enable_failed;

static struct kmem_cache *kvm_vcpu_cache;

static __read_mostly struct preempt_ops kvm_preempt_ops;
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);

struct dentry *kvm_debugfs_dir;
EXPORT_SYMBOL_GPL(kvm_debugfs_dir);

static const struct file_operations stat_fops_per_vm;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
#ifdef CONFIG_KVM_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				  unsigned long arg);
#define KVM_COMPAT(c)	.compat_ioctl	= (c)
#else
/*
 * For architectures that don't implement a compat infrastructure,
 * adopt a double line of defense:
 * - Prevent a compat task from opening /dev/kvm
 * - If the open has been done by a 64bit task and the KVM fd is then
 *   passed to a compat task, let the ioctls fail.
 */
static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
				unsigned long arg) { return -EINVAL; }

static int kvm_no_compat_open(struct inode *inode, struct file *file)
{
	return is_compat_task() ? -ENODEV : 0;
}
#define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
			.open		= kvm_no_compat_open
#endif
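
/*
 * Illustrative sketch, not part of the original file: KVM_COMPAT() is meant
 * to be expanded inside a struct file_operations initializer, so the compat
 * path is wired either to a real handler or to the stubs above, roughly:
 *
 *	static struct file_operations kvm_vcpu_fops = {
 *		.release	= kvm_vcpu_release,
 *		.unlocked_ioctl	= kvm_vcpu_ioctl,
 *		.mmap		= kvm_vcpu_mmap,
 *		.llseek		= noop_llseek,
 *		KVM_COMPAT(kvm_vcpu_compat_ioctl),
 *	};
 *
 * The handler names above are assumptions used for illustration only.
 */
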
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

__visible bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);

#define KVM_EVENT_CREATE_VM 0
#define KVM_EVENT_DESTROY_VM 1
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;

static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);

__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
						   unsigned long start, unsigned long end)
{
}

bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
{
	/*
	 * The metadata used by is_zone_device_page() to determine whether or
	 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
	 * the device has been pinned, e.g. by get_user_pages().  WARN if the
	 * page_count() is zero to help detect bad usage of this helper.
	 */
	if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
		return false;

	return is_zone_device_page(pfn_to_page(pfn));
}

bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
{
	/*
	 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
	 * perspective they are "normal" pages, albeit with slightly different
	 * usage rules.
	 */
	if (pfn_valid(pfn))
		return PageReserved(pfn_to_page(pfn)) &&
		       !is_zero_pfn(pfn) &&
		       !kvm_is_zone_device_pfn(pfn);

	return true;
}

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu = get_cpu();

	__this_cpu_write(kvm_running_vcpu, vcpu);
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
}
EXPORT_SYMBOL_GPL(vcpu_load);

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	__this_cpu_write(kvm_running_vcpu, NULL);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(vcpu_put);
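
/*
 * Illustrative sketch, not part of the original file: callers bracket any
 * per-vCPU work that relies on architecture state being loaded on the current
 * CPU with vcpu_load()/vcpu_put(), e.g. an ioctl path looks roughly like:
 *
 *	vcpu_load(vcpu);
 *	r = kvm_arch_vcpu_ioctl_run(vcpu);	// arch state loaded on this CPU
 *	vcpu_put(vcpu);
 *
 * The exact call site is an assumption here; only the load/put pairing is the
 * point being illustrated.
 */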

/* TODO: merge with kvm_arch_vcpu_should_kick */
static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
{
	int mode = kvm_vcpu_exiting_guest_mode(vcpu);

	/*
	 * We need to wait for the VCPU to reenable interrupts and get out of
	 * READING_SHADOW_PAGE_TABLES mode.
	 */
	if (req & KVM_REQUEST_WAIT)
		return mode != OUTSIDE_GUEST_MODE;

	/*
	 * Need to kick a running VCPU, but otherwise there is nothing to do.
	 */
	return mode == IN_GUEST_MODE;
}

static void ack_flush(void *_completed)
{
}

static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
{
	if (cpumask_empty(cpus))
		return false;

	smp_call_function_many(cpus, ack_flush, NULL, wait);
	return true;
}

static void kvm_make_vcpu_request(struct kvm *kvm, struct kvm_vcpu *vcpu,
				  unsigned int req, struct cpumask *tmp,
				  int current_cpu)
{
	int cpu;

	kvm_make_request(req, vcpu);

	if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
		return;

	/*
	 * Note, the vCPU could get migrated to a different pCPU at any point
	 * after kvm_request_needs_ipi(), which could result in sending an IPI
	 * to the previous pCPU.  But, that's OK because the purpose of the IPI
	 * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
	 * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
	 * after this point is also OK, as the requirement is only that KVM wait
	 * for vCPUs that were reading SPTEs _before_ any changes were
	 * finalized. See kvm_vcpu_kick() for more details on handling requests.
	 */
	if (kvm_request_needs_ipi(vcpu, req)) {
		cpu = READ_ONCE(vcpu->cpu);
		if (cpu != -1 && cpu != current_cpu)
			__cpumask_set_cpu(cpu, tmp);
	}
}

bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
				 unsigned long *vcpu_bitmap)
{
	struct kvm_vcpu *vcpu;
	struct cpumask *cpus;
	int i, me;
	bool called;

	me = get_cpu();

	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
	cpumask_clear(cpus);

	for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
		vcpu = kvm_get_vcpu(kvm, i);
		if (!vcpu)
			continue;
		kvm_make_vcpu_request(kvm, vcpu, req, cpus, me);
	}

	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
	put_cpu();

	return called;
}

bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
				      struct kvm_vcpu *except)
{
	struct kvm_vcpu *vcpu;
	struct cpumask *cpus;
	unsigned long i;
	bool called;
	int me;

	me = get_cpu();

	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
	cpumask_clear(cpus);

	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (vcpu == except)
			continue;
		kvm_make_vcpu_request(kvm, vcpu, req, cpus, me);
	}

	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
	put_cpu();

	return called;
}

bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
	return kvm_make_all_cpus_request_except(kvm, req, NULL);
}
EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);

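/*
 * Illustrative sketch, not part of the original file: the consumer side of
 * the request/kick protocol lives in each architecture's vcpu run loop, which
 * checks and clears pending requests before re-entering the guest, roughly:
 *
 *	if (kvm_request_pending(vcpu)) {
 *		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
 *			arch_flush_guest_tlb(vcpu);	// hypothetical arch hook
 *	}
 *
 * arch_flush_guest_tlb() is a placeholder name, not a real KVM function.
 */
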
#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	++kvm->stat.generic.remote_tlb_flush_requests;

	/*
	 * We want to publish modifications to the page tables before reading
	 * mode. Pairs with a memory barrier in arch-specific code.
	 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
	 *   and smp_mb in walk_shadow_page_lockless_begin/end.
	 * - powerpc: smp_mb in kvmppc_prepare_to_enter.
	 *
	 * There is already an smp_mb__after_atomic() before
	 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
	 * barrier here.
	 */
	if (!kvm_arch_flush_remote_tlb(kvm)
	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.generic.remote_tlb_flush;
}
EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
#endif

void kvm_reload_remote_mmus(struct kvm *kvm)
{
	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
}

#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
					       gfp_t gfp_flags)
{
	gfp_flags |= mc->gfp_zero;

	if (mc->kmem_cache)
		return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
	else
		return (void *)__get_free_page(gfp_flags);
}

int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
{
	void *obj;

	if (mc->nobjs >= min)
		return 0;
	while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
		obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
		if (!obj)
			return mc->nobjs >= min ? 0 : -ENOMEM;
		mc->objects[mc->nobjs++] = obj;
	}
	return 0;
}

int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
{
	return mc->nobjs;
}

void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs) {
		if (mc->kmem_cache)
			kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
		else
			free_page((unsigned long)mc->objects[--mc->nobjs]);
	}
}

void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	void *p;

	if (WARN_ON(!mc->nobjs))
		p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
	else
		p = mc->objects[--mc->nobjs];
	BUG_ON(!p);
	return p;
}
#endif

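/*
 * Illustrative sketch, not part of the original file: these caches let MMU
 * code pre-allocate objects in a sleepable context and then consume them
 * while holding mmu_lock, where allocation must not sleep. A page-fault path
 * might look roughly like this (the cache field name is an assumption):
 *
 *	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_cache, min);
 *	if (r)
 *		return r;
 *
 *	spin_lock(&kvm->mmu_lock);
 *	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
 *	...					// install the new entry
 *	spin_unlock(&kvm->mmu_lock);
 */
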
static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
	vcpu->pid = NULL;
#ifndef __KVM_HAVE_ARCH_WQP
	rcuwait_init(&vcpu->wait);
#endif
	kvm_async_pf_vcpu_init(vcpu);

	kvm_vcpu_set_in_spin_loop(vcpu, false);
	kvm_vcpu_set_dy_eligible(vcpu, false);
	vcpu->preempted = false;
	vcpu->ready = false;
	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
	vcpu->last_used_slot = NULL;
}

static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_dirty_ring_free(&vcpu->dirty_ring);
	kvm_arch_vcpu_destroy(vcpu);

	/*
	 * No need for rcu_read_lock as VCPU_RUN is the only place that changes
	 * the vcpu->pid pointer, and at destruction time all file descriptors
	 * are already gone.
	 */
	put_pid(rcu_dereference_protected(vcpu->pid, 1));

	free_page((unsigned long)vcpu->run);
	kmem_cache_free(kvm_vcpu_cache, vcpu);
}

void kvm_destroy_vcpus(struct kvm *kvm)
{
	unsigned long i;
	struct kvm_vcpu *vcpu;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		kvm_vcpu_destroy(vcpu);
		xa_erase(&kvm->vcpu_array, i);
	}

	atomic_set(&kvm->online_vcpus, 0);
}
EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
	return container_of(mn, struct kvm, mmu_notifier);
}

static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long start, unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
	srcu_read_unlock(&kvm->srcu, idx);
}

typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);

typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
			     unsigned long end);

struct kvm_hva_range {
	unsigned long start;
	unsigned long end;
	pte_t pte;
	hva_handler_t handler;
	on_lock_fn_t on_lock;
	bool flush_on_ret;
	bool may_block;
};

/*
 * Use a dedicated stub instead of NULL to indicate that there is no callback
 * function/handler.  The compiler technically can't guarantee that a real
 * function will have a non-zero address, and so it will generate code to
 * check for !NULL, whereas comparing against a stub will be elided at compile
 * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
 */
static void kvm_null_fn(void)
{

}
#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)

/* Iterate over each memslot intersecting [start, last] (inclusive) range */
#define kvm_for_each_memslot_in_hva_range(node, slots, start, last)	     \
	for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
	     node;							     \
	     node = interval_tree_iter_next(node, start, last))	     \

static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
						  const struct kvm_hva_range *range)
{
	bool ret = false, locked = false;
	struct kvm_gfn_range gfn_range;
	struct kvm_memory_slot *slot;
	struct kvm_memslots *slots;
	int i, idx;

	if (WARN_ON_ONCE(range->end <= range->start))
		return 0;

	/* A null handler is allowed if and only if on_lock() is provided. */
	if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
			 IS_KVM_NULL_FN(range->handler)))
		return 0;

	idx = srcu_read_lock(&kvm->srcu);

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		struct interval_tree_node *node;

		slots = __kvm_memslots(kvm, i);
		kvm_for_each_memslot_in_hva_range(node, slots,
						  range->start, range->end - 1) {
			unsigned long hva_start, hva_end;

			slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
			hva_start = max(range->start, slot->userspace_addr);
			hva_end = min(range->end, slot->userspace_addr +
						  (slot->npages << PAGE_SHIFT));

			/*
			 * To optimize for the likely case where the address
			 * range is covered by zero or one memslots, don't
			 * bother making these conditional (to avoid writes on
			 * the second or later invocation of the handler).
			 */
			gfn_range.pte = range->pte;
			gfn_range.may_block = range->may_block;

			/*
			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
			 */
			gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
			gfn_range.slot = slot;

			if (!locked) {
				locked = true;
				KVM_MMU_LOCK(kvm);
				if (!IS_KVM_NULL_FN(range->on_lock))
					range->on_lock(kvm, range->start, range->end);
				if (IS_KVM_NULL_FN(range->handler))
					break;
			}
			ret |= range->handler(kvm, &gfn_range);
		}
	}

	if (range->flush_on_ret && ret)
		kvm_flush_remote_tlbs(kvm);

	if (locked)
		KVM_MMU_UNLOCK(kvm);

	srcu_read_unlock(&kvm->srcu, idx);

	/* The notifiers are averse to booleans. :-( */
	return (int)ret;
}

static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
						unsigned long start,
						unsigned long end,
						pte_t pte,
						hva_handler_t handler)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_hva_range range = {
		.start		= start,
		.end		= end,
		.pte		= pte,
		.handler	= handler,
		.on_lock	= (void *)kvm_null_fn,
		.flush_on_ret	= true,
		.may_block	= false,
	};

	return __kvm_handle_hva_range(kvm, &range);
}

static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
							 unsigned long start,
							 unsigned long end,
							 hva_handler_t handler)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_hva_range range = {
		.start		= start,
		.end		= end,
		.pte		= __pte(0),
		.handler	= handler,
		.on_lock	= (void *)kvm_null_fn,
		.flush_on_ret	= false,
		.may_block	= false,
	};

	return __kvm_handle_hva_range(kvm, &range);
}
static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long address,
					pte_t pte)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);

	trace_kvm_set_spte_hva(address);

	/*
	 * .change_pte() must be surrounded by .invalidate_range_{start,end}().
	 * If mmu_notifier_count is zero, then no in-progress invalidations,
	 * including this one, found a relevant memslot at start(); rechecking
	 * memslots here is unnecessary.  Note, a false positive (count elevated
	 * by a different invalidation) is sub-optimal but functionally ok.
	 */
	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
	if (!READ_ONCE(kvm->mmu_notifier_count))
		return;

	kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
}

void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
			    unsigned long end)
{
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
	kvm->mmu_notifier_count++;
	if (likely(kvm->mmu_notifier_count == 1)) {
		kvm->mmu_notifier_range_start = start;
		kvm->mmu_notifier_range_end = end;
	} else {
		/*
		 * Fully tracking multiple concurrent ranges has diminishing
		 * returns. Keep things simple and just find the minimal range
		 * which includes the current and new ranges. As there won't be
		 * enough information to subtract a range after its invalidate
		 * completes, any ranges invalidated concurrently will
		 * accumulate and persist until all outstanding invalidates
		 * complete.
		 */
		kvm->mmu_notifier_range_start =
			min(kvm->mmu_notifier_range_start, start);
		kvm->mmu_notifier_range_end =
			max(kvm->mmu_notifier_range_end, end);
	}
}

static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_hva_range hva_range = {
		.start		= range->start,
		.end		= range->end,
		.pte		= __pte(0),
		.handler	= kvm_unmap_gfn_range,
		.on_lock	= kvm_inc_notifier_count,
		.flush_on_ret	= true,
		.may_block	= mmu_notifier_range_blockable(range),
	};

	trace_kvm_unmap_hva_range(range->start, range->end);

	/*
	 * Prevent memslot modification between range_start() and range_end()
	 * so that conditionally locking provides the same result in both
	 * functions.  Without that guarantee, the mmu_notifier_count
	 * adjustments will be imbalanced.
	 *
	 * Pairs with the decrement in range_end().
	 */
	spin_lock(&kvm->mn_invalidate_lock);
	kvm->mn_active_invalidate_count++;
	spin_unlock(&kvm->mn_invalidate_lock);

	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
					  hva_range.may_block);

	__kvm_handle_hva_range(kvm, &hva_range);

	return 0;
}

void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
			    unsigned long end)
{
	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
	kvm->mmu_notifier_seq++;
	smp_wmb();
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, which is ensured by the smp_wmb above
	 * in conjunction with the smp_rmb in mmu_notifier_retry().
	 */
	kvm->mmu_notifier_count--;
}

static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_hva_range hva_range = {
		.start		= range->start,
		.end		= range->end,
		.pte		= __pte(0),
		.handler	= (void *)kvm_null_fn,
		.on_lock	= kvm_dec_notifier_count,
		.flush_on_ret	= false,
		.may_block	= mmu_notifier_range_blockable(range),
	};
	bool wake;

	__kvm_handle_hva_range(kvm, &hva_range);

	/* Pairs with the increment in range_start(). */
	spin_lock(&kvm->mn_invalidate_lock);
	wake = (--kvm->mn_active_invalidate_count == 0);
	spin_unlock(&kvm->mn_invalidate_lock);

	/*
	 * There can only be one waiter, since the wait happens under
	 * slots_lock.
	 */
	if (wake)
		rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);

	BUG_ON(kvm->mmu_notifier_count < 0);
}

static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long start,
					      unsigned long end)
{
	trace_kvm_age_hva(start, end);

	return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
}

static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long start,
					unsigned long end)
{
	trace_kvm_age_hva(start, end);

	/*
	 * Even though we do not flush TLB, this will still adversely
	 * affect performance on pre-Haswell Intel EPT, where there is
	 * no EPT Access Bit to clear so that we have to tear down EPT
	 * tables instead. If we find this unacceptable, we can always
	 * add a parameter to kvm_age_hva so that it effectively doesn't
	 * do anything on clear_young.
	 *
	 * Also note that currently we never issue secondary TLB flushes
	 * from clear_young, leaving this job up to the regular system
	 * cadence. If we find this inaccurate, we might come up with a
	 * more sophisticated heuristic later.
	 */
	return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
}

static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long address)
{
	trace_kvm_test_age_hva(address);

	return kvm_handle_hva_range_no_flush(mn, address, address + 1,
					     kvm_test_age_gfn);
}

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_arch_flush_shadow_all(kvm);
	srcu_read_unlock(&kvm->srcu, idx);
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_range	= kvm_mmu_notifier_invalidate_range,
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
	.clear_young		= kvm_mmu_notifier_clear_young,
	.test_young		= kvm_mmu_notifier_test_young,
	.change_pte		= kvm_mmu_notifier_change_pte,
	.release		= kvm_mmu_notifier_release,
};

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}

#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	return 0;
}

#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */

#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
static int kvm_pm_notifier_call(struct notifier_block *bl,
				unsigned long state,
				void *unused)
{
	struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);

	return kvm_arch_pm_notifier(kvm, state);
}

static void kvm_init_pm_notifier(struct kvm *kvm)
{
	kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
	/* Suspend KVM before we suspend ftrace, RCU, etc. */
	kvm->pm_notifier.priority = INT_MAX;
	register_pm_notifier(&kvm->pm_notifier);
}

static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
	unregister_pm_notifier(&kvm->pm_notifier);
}
#else  /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
static void kvm_init_pm_notifier(struct kvm *kvm)
{
}

static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
}
#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */

static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	if (!memslot->dirty_bitmap)
		return;

	kvfree(memslot->dirty_bitmap);
	memslot->dirty_bitmap = NULL;
}

/* This does not remove the slot from struct kvm_memslots data structures */
static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	kvm_destroy_dirty_bitmap(slot);

	kvm_arch_free_memslot(kvm, slot);

	kfree(slot);
}

static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
{
	struct hlist_node *idnode;
	struct kvm_memory_slot *memslot;
	int bkt;

	/*
	 * The same memslot objects live in both active and inactive sets,
	 * arbitrarily free using index '1' so the second invocation of this
	 * function isn't operating over a structure with dangling pointers
	 * (even though this function isn't actually touching them).
	 */
	if (!slots->node_idx)
		return;

	hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1])
		kvm_free_memslot(kvm, memslot);
}

static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
{
	switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
	case KVM_STATS_TYPE_INSTANT:
		return 0444;
	case KVM_STATS_TYPE_CUMULATIVE:
	case KVM_STATS_TYPE_PEAK:
	default:
		return 0644;
	}
}


static void kvm_destroy_vm_debugfs(struct kvm *kvm)
{
	int i;
	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
				      kvm_vcpu_stats_header.num_desc;

	if (!kvm->debugfs_dentry)
		return;

	debugfs_remove_recursive(kvm->debugfs_dentry);

	if (kvm->debugfs_stat_data) {
		for (i = 0; i < kvm_debugfs_num_entries; i++)
			kfree(kvm->debugfs_stat_data[i]);
		kfree(kvm->debugfs_stat_data);
	}
}

static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
{
	static DEFINE_MUTEX(kvm_debugfs_lock);
	struct dentry *dent;
	char dir_name[ITOA_MAX_LEN * 2];
	struct kvm_stat_data *stat_data;
	const struct _kvm_stats_desc *pdesc;
	int i, ret;
	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
				      kvm_vcpu_stats_header.num_desc;

	if (!debugfs_initialized())
		return 0;

	snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
	mutex_lock(&kvm_debugfs_lock);
	dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
	if (dent) {
		pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
		dput(dent);
		mutex_unlock(&kvm_debugfs_lock);
		return 0;
	}
	dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
	mutex_unlock(&kvm_debugfs_lock);
	if (IS_ERR(dent))
		return 0;

	kvm->debugfs_dentry = dent;
	kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
					 sizeof(*kvm->debugfs_stat_data),
					 GFP_KERNEL_ACCOUNT);
	if (!kvm->debugfs_stat_data)
		return -ENOMEM;

	for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
		pdesc = &kvm_vm_stats_desc[i];
		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
		if (!stat_data)
			return -ENOMEM;

		stat_data->kvm = kvm;
		stat_data->desc = pdesc;
		stat_data->kind = KVM_STAT_VM;
		kvm->debugfs_stat_data[i] = stat_data;
		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
				    kvm->debugfs_dentry, stat_data,
				    &stat_fops_per_vm);
	}

	for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
		pdesc = &kvm_vcpu_stats_desc[i];
		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
		if (!stat_data)
			return -ENOMEM;

		stat_data->kvm = kvm;
		stat_data->desc = pdesc;
		stat_data->kind = KVM_STAT_VCPU;
		kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
				    kvm->debugfs_dentry, stat_data,
				    &stat_fops_per_vm);
	}

	ret = kvm_arch_create_vm_debugfs(kvm);
	if (ret) {
		kvm_destroy_vm_debugfs(kvm);
		return i;
	}

	return 0;
}

/*
 * Called after the VM is otherwise initialized, but just before adding it to
 * the vm_list.
 */
int __weak kvm_arch_post_init_vm(struct kvm *kvm)
{
	return 0;
}

/*
 * Called just after removing the VM from the vm_list, but before doing any
 * other destruction.
 */
void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
}

/*
 * Called after the per-vm debugfs directory is created.  When called,
 * kvm->debugfs_dentry should already be set up, so arch-specific debugfs
 * entries can be created under it.  Cleanup is done automatically and
 * recursively in kvm_destroy_vm_debugfs(), so
1045 * a per-arch destroy interface is not needed.
1046 */
1047int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
1048{
1049 return 0;
1050}
1051
Carsten Ottee08b9632012-01-04 10:25:20 +01001052static struct kvm *kvm_create_vm(unsigned long type)
Avi Kivity6aa8b732006-12-10 02:21:36 -08001053{
Jan Kiszkad89f5ef2010-11-09 17:02:49 +01001054 struct kvm *kvm = kvm_arch_alloc_vm();
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001055 struct kvm_memslots *slots;
Jim Mattson91219232019-10-24 16:03:26 -07001056 int r = -ENOMEM;
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001057 int i, j;
Avi Kivity6aa8b732006-12-10 02:21:36 -08001058
Jan Kiszkad89f5ef2010-11-09 17:02:49 +01001059 if (!kvm)
1060 return ERR_PTR(-ENOMEM);
1061
Ben Gardon531810c2021-02-02 10:57:24 -08001062 KVM_MMU_LOCK_INIT(kvm);
Vegard Nossumf1f10072017-02-27 14:30:07 -08001063 mmgrab(current->mm);
Paolo Bonzinie9ad4ec2016-03-21 10:15:25 +01001064 kvm->mm = current->mm;
1065 kvm_eventfd_init(kvm);
1066 mutex_init(&kvm->lock);
1067 mutex_init(&kvm->irq_lock);
1068 mutex_init(&kvm->slots_lock);
Ben Gardonb10a0382021-05-18 10:34:11 -07001069 mutex_init(&kvm->slots_arch_lock);
Paolo Bonzini52ac8b32021-05-27 08:09:15 -04001070 spin_lock_init(&kvm->mn_invalidate_lock);
1071 rcuwait_init(&kvm->mn_memslots_update_rcuwait);
Marc Zyngierc5b07752021-11-16 16:04:01 +00001072 xa_init(&kvm->vcpu_array);
Paolo Bonzini52ac8b32021-05-27 08:09:15 -04001073
David Woodhouse982ed0d2021-12-10 16:36:21 +00001074 INIT_LIST_HEAD(&kvm->gpc_list);
1075 spin_lock_init(&kvm->gpc_lock);
Paolo Bonzinie9ad4ec2016-03-21 10:15:25 +01001076
Paolo Bonzinie9ad4ec2016-03-21 10:15:25 +01001077 INIT_LIST_HEAD(&kvm->devices);
1078
Jim Mattson91219232019-10-24 16:03:26 -07001079 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
1080
Paolo Bonzini8a441192019-11-04 12:16:49 +01001081 if (init_srcu_struct(&kvm->srcu))
1082 goto out_err_no_srcu;
1083 if (init_srcu_struct(&kvm->irq_srcu))
1084 goto out_err_no_irq_srcu;
1085
Paolo Bonzinie2d3fca2019-11-04 13:23:53 +01001086 refcount_set(&kvm->users_count, 1);
Jim Mattson91219232019-10-24 16:03:26 -07001087 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001088 for (j = 0; j < 2; j++) {
1089 slots = &kvm->__memslots[i][j];
Jim Mattson91219232019-10-24 16:03:26 -07001090
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001091 atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
1092 slots->hva_tree = RB_ROOT_CACHED;
1093 slots->gfn_tree = RB_ROOT;
1094 hash_init(slots->id_hash);
1095 slots->node_idx = j;
1096
1097 /* Generations must be different for each address space. */
1098 slots->generation = i;
1099 }
1100
1101 rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
Jim Mattson91219232019-10-24 16:03:26 -07001102 }
1103
1104 for (i = 0; i < KVM_NR_BUSES; i++) {
1105 rcu_assign_pointer(kvm->buses[i],
1106 kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
1107 if (!kvm->buses[i])
Jim Mattsona97b0e72019-10-25 13:34:58 +02001108 goto out_err_no_arch_destroy_vm;
Jim Mattson91219232019-10-24 16:03:26 -07001109 }
1110
David Matlackacd05782020-04-17 15:14:46 -07001111 kvm->max_halt_poll_ns = halt_poll_ns;
1112
Carsten Ottee08b9632012-01-04 10:25:20 +01001113 r = kvm_arch_init_vm(kvm, type);
Jan Kiszkad89f5ef2010-11-09 17:02:49 +01001114 if (r)
Jim Mattsona97b0e72019-10-25 13:34:58 +02001115 goto out_err_no_arch_destroy_vm;
Alexander Graf10474ae2009-09-15 11:37:46 +02001116
1117 r = hardware_enable_all();
1118 if (r)
Christian Borntraeger719d93c2014-01-16 13:44:20 +01001119 goto out_err_no_disable;
Alexander Graf10474ae2009-09-15 11:37:46 +02001120
Paolo Bonzinic77dcac2014-08-06 14:24:45 +02001121#ifdef CONFIG_HAVE_KVM_IRQFD
Gleb Natapov136bdfe2009-08-24 11:54:23 +03001122 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
Avi Kivity75858a82009-01-04 17:10:50 +02001123#endif
Avi Kivity6aa8b732006-12-10 02:21:36 -08001124
Mike Waychison74b5c5b2011-06-03 13:04:53 -07001125 r = kvm_init_mmu_notifier(kvm);
1126 if (r)
Junaid Shahid1aa9b952019-11-04 20:26:00 +01001127 goto out_err_no_mmu_notifier;
1128
1129 r = kvm_arch_post_init_vm(kvm);
1130 if (r)
Mike Waychison74b5c5b2011-06-03 13:04:53 -07001131 goto out_err;
1132
Junaid Shahid0d9ce162019-01-03 17:14:28 -08001133 mutex_lock(&kvm_lock);
Rusty Russell5e58cfe2007-07-23 17:08:21 +10001134 list_add(&kvm->vm_list, &vm_list);
Junaid Shahid0d9ce162019-01-03 17:14:28 -08001135 mutex_unlock(&kvm_lock);
Jan Kiszkad89f5ef2010-11-09 17:02:49 +01001136
Peter Zijlstra2ecd9d22015-07-03 18:53:58 +02001137 preempt_notifier_inc();
Sergey Senozhatsky2fdef3a2021-06-06 11:10:44 +09001138 kvm_init_pm_notifier(kvm);
Peter Zijlstra2ecd9d22015-07-03 18:53:58 +02001139
Avi Kivityf17abe92007-02-21 19:28:04 +02001140 return kvm;
Alexander Graf10474ae2009-09-15 11:37:46 +02001141
1142out_err:
Junaid Shahid1aa9b952019-11-04 20:26:00 +01001143#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1144 if (kvm->mmu_notifier.ops)
1145 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
1146#endif
1147out_err_no_mmu_notifier:
Alexander Graf10474ae2009-09-15 11:37:46 +02001148 hardware_disable_all();
Christian Borntraeger719d93c2014-01-16 13:44:20 +01001149out_err_no_disable:
Jim Mattsona97b0e72019-10-25 13:34:58 +02001150 kvm_arch_destroy_vm(kvm);
Jim Mattsona97b0e72019-10-25 13:34:58 +02001151out_err_no_arch_destroy_vm:
Paolo Bonzinie2d3fca2019-11-04 13:23:53 +01001152 WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
Marcelo Tosattie93f8a02009-12-23 14:35:24 -02001153 for (i = 0; i < KVM_NR_BUSES; i++)
Paolo Bonzini3898da92017-08-02 17:55:54 +02001154 kfree(kvm_get_bus(kvm, i));
Paolo Bonzini8a441192019-11-04 12:16:49 +01001155 cleanup_srcu_struct(&kvm->irq_srcu);
1156out_err_no_irq_srcu:
1157 cleanup_srcu_struct(&kvm->srcu);
1158out_err_no_srcu:
Jan Kiszkad89f5ef2010-11-09 17:02:49 +01001159 kvm_arch_free_vm(kvm);
Paolo Bonzinie9ad4ec2016-03-21 10:15:25 +01001160 mmdrop(current->mm);
Alexander Graf10474ae2009-09-15 11:37:46 +02001161 return ERR_PTR(r);
Avi Kivityf17abe92007-02-21 19:28:04 +02001162}
1163
Scott Wood07f0a7b2013-04-25 14:11:23 +00001164static void kvm_destroy_devices(struct kvm *kvm)
1165{
Geliang Tange6e3b5a2016-01-01 19:47:12 +08001166 struct kvm_device *dev, *tmp;
Scott Wood07f0a7b2013-04-25 14:11:23 +00001167
Christoffer Dalla28ebea2016-08-09 19:13:01 +02001168 /*
1169 * We do not need to take the kvm->lock here, because nobody else
1170 * has a reference to the struct kvm at this point and therefore
1171 * cannot access the devices list anyhow.
1172 */
Geliang Tange6e3b5a2016-01-01 19:47:12 +08001173 list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
1174 list_del(&dev->vm_node);
Scott Wood07f0a7b2013-04-25 14:11:23 +00001175 dev->ops->destroy(dev);
1176 }
1177}
1178
Avi Kivityf17abe92007-02-21 19:28:04 +02001179static void kvm_destroy_vm(struct kvm *kvm)
1180{
Marcelo Tosattie93f8a02009-12-23 14:35:24 -02001181 int i;
Avi Kivity6d4e4c42007-11-21 16:41:05 +02001182 struct mm_struct *mm = kvm->mm;
1183
Sergey Senozhatsky2fdef3a2021-06-06 11:10:44 +09001184 kvm_destroy_pm_notifier(kvm);
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02001185 kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
Janosch Frank536a6f82016-05-18 13:26:23 +02001186 kvm_destroy_vm_debugfs(kvm);
Sheng Yangad8ba2c2009-01-06 10:03:02 +08001187 kvm_arch_sync_events(kvm);
Junaid Shahid0d9ce162019-01-03 17:14:28 -08001188 mutex_lock(&kvm_lock);
Avi Kivity133de902007-02-12 00:54:44 -08001189 list_del(&kvm->vm_list);
Junaid Shahid0d9ce162019-01-03 17:14:28 -08001190 mutex_unlock(&kvm_lock);
Junaid Shahid1aa9b952019-11-04 20:26:00 +01001191 kvm_arch_pre_destroy_vm(kvm);
1192
Avi Kivity399ec802008-11-19 13:58:46 +02001193 kvm_free_irq_routing(kvm);
Peter Xudf630b82017-03-15 16:01:17 +08001194 for (i = 0; i < KVM_NR_BUSES; i++) {
Paolo Bonzini3898da92017-08-02 17:55:54 +02001195 struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
Christian Borntraeger4a12f952017-07-07 10:51:38 +02001196
Christian Borntraeger4a12f952017-07-07 10:51:38 +02001197 if (bus)
1198 kvm_io_bus_destroy(bus);
Peter Xudf630b82017-03-15 16:01:17 +08001199 kvm->buses[i] = NULL;
1200 }
Avi Kivity980da6c2009-12-20 15:13:43 +02001201 kvm_coalesced_mmio_free(kvm);
Andrea Arcangelie930bff2008-07-25 16:24:52 +02001202#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1203 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
Paolo Bonzini52ac8b32021-05-27 08:09:15 -04001204 /*
1205 * At this point, pending calls to invalidate_range_start()
1206 * have completed but no more MMU notifiers will run, so
1207 * mn_active_invalidate_count may remain unbalanced.
1208	 * No threads can be waiting in kvm_swap_active_memslots() as the
1209 * last reference on KVM has been dropped, but freeing
1210 * memslots would deadlock without this manual intervention.
1211 */
1212 WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
1213 kvm->mn_active_invalidate_count = 0;
Gleb Natapovf00be0c2009-03-19 12:20:36 +02001214#else
Marcelo Tosatti2df72e92012-08-24 15:54:57 -03001215 kvm_arch_flush_shadow_all(kvm);
Andrea Arcangelie930bff2008-07-25 16:24:52 +02001216#endif
Zhang Xiantaod19a9cd2007-11-18 18:43:45 +08001217 kvm_arch_destroy_vm(kvm);
Scott Wood07f0a7b2013-04-25 14:11:23 +00001218 kvm_destroy_devices(kvm);
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001219 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1220 kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
1221 kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
1222 }
Paolo Bonzini820b3fc2014-06-03 13:44:17 +02001223 cleanup_srcu_struct(&kvm->irq_srcu);
Jan Kiszkad89f5ef2010-11-09 17:02:49 +01001224 cleanup_srcu_struct(&kvm->srcu);
1225 kvm_arch_free_vm(kvm);
Peter Zijlstra2ecd9d22015-07-03 18:53:58 +02001226 preempt_notifier_dec();
Alexander Graf10474ae2009-09-15 11:37:46 +02001227 hardware_disable_all();
Avi Kivity6d4e4c42007-11-21 16:41:05 +02001228 mmdrop(mm);
Avi Kivityf17abe92007-02-21 19:28:04 +02001229}
1230
Izik Eidusd39f13b2008-03-30 16:01:25 +03001231void kvm_get_kvm(struct kvm *kvm)
1232{
Elena Reshetovae3736c32017-02-20 13:06:21 +02001233 refcount_inc(&kvm->users_count);
Izik Eidusd39f13b2008-03-30 16:01:25 +03001234}
1235EXPORT_SYMBOL_GPL(kvm_get_kvm);
1236
Peter Xu605c7132021-06-25 11:32:07 -04001237/*
1238 * Make sure the VM is not being destroyed; this is a safe version of
1239 * kvm_get_kvm(). Return true if kvm was referenced successfully, false otherwise.
1240 */
1241bool kvm_get_kvm_safe(struct kvm *kvm)
1242{
1243 return refcount_inc_not_zero(&kvm->users_count);
1244}
1245EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
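/*
 * Illustrative sketch (not an in-tree caller): code that only holds a raw
 * pointer to a VM, e.g. an asynchronous worker, should use the _safe
 * variant and bail out when the VM is already being torn down:
 *
 *	if (!kvm_get_kvm_safe(kvm))
 *		return;
 *	... access the VM ...
 *	kvm_put_kvm(kvm);
 *
 * Callers that are guaranteed to already hold a reference can keep using
 * plain kvm_get_kvm().
 */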
1246
Izik Eidusd39f13b2008-03-30 16:01:25 +03001247void kvm_put_kvm(struct kvm *kvm)
1248{
Elena Reshetovae3736c32017-02-20 13:06:21 +02001249 if (refcount_dec_and_test(&kvm->users_count))
Izik Eidusd39f13b2008-03-30 16:01:25 +03001250 kvm_destroy_vm(kvm);
1251}
1252EXPORT_SYMBOL_GPL(kvm_put_kvm);
1253
Sean Christopherson149487b2019-10-21 15:58:42 -07001254/*
1255 * Used to put a reference that was taken on behalf of an object associated
1256 * with a user-visible file descriptor, e.g. a vcpu or device, if installation
1257 * of the new file descriptor fails and the reference cannot be transferred to
1258 * its final owner. In such cases, the caller is still actively using @kvm and
1259 * will fail miserably if the refcount unexpectedly hits zero.
1260 */
1261void kvm_put_kvm_no_destroy(struct kvm *kvm)
1262{
1263 WARN_ON(refcount_dec_and_test(&kvm->users_count));
1264}
1265EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
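/*
 * Hedged example of the intended pattern, mirroring (not copied from) the
 * vCPU/device creation paths: a reference is taken on behalf of a new file
 * descriptor, and only the "no destroy" put is used if installing the fd
 * fails, because the caller still dereferences @kvm afterwards.  The name
 * "kvm-foo" and the foo_* symbols are placeholders:
 *
 *	kvm_get_kvm(kvm);
 *	fd = anon_inode_getfd("kvm-foo", &foo_fops, foo, O_RDWR | O_CLOEXEC);
 *	if (fd < 0)
 *		kvm_put_kvm_no_destroy(kvm);
 */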
Izik Eidusd39f13b2008-03-30 16:01:25 +03001266
Avi Kivityf17abe92007-02-21 19:28:04 +02001267static int kvm_vm_release(struct inode *inode, struct file *filp)
1268{
1269 struct kvm *kvm = filp->private_data;
1270
Gregory Haskins721eecbf2009-05-20 10:30:49 -04001271 kvm_irqfd_release(kvm);
1272
Izik Eidusd39f13b2008-03-30 16:01:25 +03001273 kvm_put_kvm(kvm);
Avi Kivity6aa8b732006-12-10 02:21:36 -08001274 return 0;
1275}
1276
Takuya Yoshikawa515a0122010-10-27 18:23:54 +09001277/*
1278 * Allocation size is twice as large as the actual dirty bitmap size.
Sean Christopherson0dff0842020-02-18 13:07:29 -08001279 * See kvm_vm_ioctl_get_dirty_log() for why this is needed.
Takuya Yoshikawa515a0122010-10-27 18:23:54 +09001280 */
Jay Zhou3c9bd402020-02-27 09:32:27 +08001281static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
Takuya Yoshikawaa36a57b12010-10-27 18:22:19 +09001282{
Takuya Yoshikawa515a0122010-10-27 18:23:54 +09001283 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
Takuya Yoshikawaa36a57b12010-10-27 18:22:19 +09001284
Ben Gardonb12ce362019-02-11 11:02:49 -08001285 memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
Takuya Yoshikawaa36a57b12010-10-27 18:22:19 +09001286 if (!memslot->dirty_bitmap)
1287 return -ENOMEM;
1288
Takuya Yoshikawaa36a57b12010-10-27 18:22:19 +09001289 return 0;
1290}
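/*
 * Rough sizing sketch, assuming kvm_dirty_bitmap_bytes() (kvm_host.h)
 * rounds npages up to a multiple of BITS_PER_LONG before dividing by 8:
 * a 1 GiB memslot of 4 KiB pages has 262144 pages, i.e. a 32 KiB bitmap,
 * so 64 KiB are allocated here and the upper half is what
 * kvm_second_dirty_bitmap() hands out as the scratch buffer used when
 * copying the log to userspace.
 */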
1291
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001292static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
Xiao Guangrongbf3e05b2011-11-24 17:40:57 +08001293{
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001294 struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
1295 int node_idx_inactive = active->node_idx ^ 1;
Xiao Guangrongf85e2cb2011-11-24 17:41:54 +08001296
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001297 return &kvm->__memslots[as_id][node_idx_inactive];
1298}
Igor Mammedov0e60b072014-12-01 17:29:26 +00001299
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001300/*
1301 * Helper to get the address space ID when one of the memslot pointers may be NULL.
1302 * This also serves as a sanity check that at least one of the pointers is non-NULL,
1303 * and that their address space IDs don't diverge.
1304 */
1305static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
1306 struct kvm_memory_slot *b)
1307{
1308 if (WARN_ON_ONCE(!a && !b))
1309 return 0;
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001310
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001311 if (!a)
1312 return b->as_id;
1313 if (!b)
1314 return a->as_id;
Sean Christopherson0774a962020-03-20 13:55:40 -07001315
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001316 WARN_ON_ONCE(a->as_id != b->as_id);
1317 return a->as_id;
1318}
1319
1320static void kvm_insert_gfn_node(struct kvm_memslots *slots,
1321 struct kvm_memory_slot *slot)
1322{
1323 struct rb_root *gfn_tree = &slots->gfn_tree;
1324 struct rb_node **node, *parent;
1325 int idx = slots->node_idx;
1326
1327 parent = NULL;
1328 for (node = &gfn_tree->rb_node; *node; ) {
1329 struct kvm_memory_slot *tmp;
1330
1331 tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]);
1332 parent = *node;
1333 if (slot->base_gfn < tmp->base_gfn)
1334 node = &(*node)->rb_left;
1335 else if (slot->base_gfn > tmp->base_gfn)
1336 node = &(*node)->rb_right;
1337 else
1338 BUG();
Igor Mammedov7f379cf2014-12-01 17:29:24 +00001339 }
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001340
1341 rb_link_node(&slot->gfn_node[idx], parent, node);
1342 rb_insert_color(&slot->gfn_node[idx], gfn_tree);
1343}
1344
1345static void kvm_erase_gfn_node(struct kvm_memslots *slots,
1346 struct kvm_memory_slot *slot)
1347{
1348 rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree);
1349}
1350
1351static void kvm_replace_gfn_node(struct kvm_memslots *slots,
1352 struct kvm_memory_slot *old,
1353 struct kvm_memory_slot *new)
1354{
1355 int idx = slots->node_idx;
1356
1357 WARN_ON_ONCE(old->base_gfn != new->base_gfn);
1358
1359 rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx],
1360 &slots->gfn_tree);
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001361}
1362
1363/*
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001364 * Replace @old with @new in the inactive memslots.
1365 *
1366 * With NULL @old this simply adds @new.
1367 * With NULL @new this simply removes @old.
1368 *
1369 * If @new is non-NULL its hva_node[slots_idx] range has to be set
1370 * appropriately.
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001371 */
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001372static void kvm_replace_memslot(struct kvm *kvm,
Maciej S. Szmigiero26b83452021-12-06 20:54:27 +01001373 struct kvm_memory_slot *old,
1374 struct kvm_memory_slot *new)
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001375{
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001376 int as_id = kvm_memslots_get_as_id(old, new);
1377 struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1378 int idx = slots->node_idx;
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001379
Maciej S. Szmigiero26b83452021-12-06 20:54:27 +01001380 if (old) {
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001381 hash_del(&old->id_node[idx]);
1382 interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001383
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001384 if ((long)old == atomic_long_read(&slots->last_used_slot))
1385 atomic_long_set(&slots->last_used_slot, (long)new);
1386
1387 if (!new) {
1388 kvm_erase_gfn_node(slots, old);
Maciej S. Szmigiero26b83452021-12-06 20:54:27 +01001389 return;
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001390 }
1391 }
Paolo Bonziniefbeec72014-12-27 18:01:00 +01001392
1393 /*
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001394 * Initialize @new's hva range. Do this even when replacing an @old
1395	 * slot, as kvm_copy_memslot() deliberately does not touch node data.
Paolo Bonziniefbeec72014-12-27 18:01:00 +01001396 */
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001397 new->hva_node[idx].start = new->userspace_addr;
1398 new->hva_node[idx].last = new->userspace_addr +
1399 (new->npages << PAGE_SHIFT) - 1;
Xiao Guangrongf85e2cb2011-11-24 17:41:54 +08001400
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001401 /*
1402	 * (Re)Add the new memslot.  There is no O(1) interval_tree_replace(), so
1403	 * hva_node needs to be swapped with remove+insert even though the hva can't
1404 * change when replacing an existing slot.
1405 */
1406 hash_add(slots->id_hash, &new->id_node[idx], new->id);
1407 interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001408
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001409 /*
1410 * If the memslot gfn is unchanged, rb_replace_node() can be used to
1411 * switch the node in the gfn tree instead of removing the old and
1412 * inserting the new as two separate operations. Replacement is a
1413 * single O(1) operation versus two O(log(n)) operations for
1414 * remove+insert.
1415 */
1416 if (old && old->base_gfn == new->base_gfn) {
1417 kvm_replace_gfn_node(slots, old, new);
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001418 } else {
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001419 if (old)
1420 kvm_erase_gfn_node(slots, old);
1421 kvm_insert_gfn_node(slots, new);
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001422 }
Xiao Guangrongbf3e05b2011-11-24 17:40:57 +08001423}
1424
Paolo Bonzini09170a42015-05-18 13:59:39 +02001425static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
Xiao Guangronga50d64d2012-08-21 10:58:13 +08001426{
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08001427 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1428
Christoffer Dall0f8a4de2014-08-26 14:00:37 +02001429#ifdef __KVM_HAVE_READONLY_MEM
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08001430 valid_flags |= KVM_MEM_READONLY;
1431#endif
1432
1433 if (mem->flags & ~valid_flags)
Xiao Guangronga50d64d2012-08-21 10:58:13 +08001434 return -EINVAL;
1435
1436 return 0;
1437}
1438
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001439static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
Gleb Natapov7ec4fb42012-12-24 17:49:30 +02001440{
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001441 struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1442
1443	/* Grab the generation from the currently active memslots. */
1444 u64 gen = __kvm_memslots(kvm, as_id)->generation;
Gleb Natapov7ec4fb42012-12-24 17:49:30 +02001445
Sean Christopherson361209e2019-02-05 13:01:14 -08001446 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1447 slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
David Matlackee3d1572014-08-18 15:46:06 -07001448
Paolo Bonzini52ac8b32021-05-27 08:09:15 -04001449 /*
1450 * Do not store the new memslots while there are invalidations in
Paolo Bonzini071064f2021-08-03 03:45:41 -04001451 * progress, otherwise the locking in invalidate_range_start and
1452 * invalidate_range_end will be unbalanced.
Paolo Bonzini52ac8b32021-05-27 08:09:15 -04001453 */
1454 spin_lock(&kvm->mn_invalidate_lock);
1455 prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
1456 while (kvm->mn_active_invalidate_count) {
1457 set_current_state(TASK_UNINTERRUPTIBLE);
1458 spin_unlock(&kvm->mn_invalidate_lock);
1459 schedule();
1460 spin_lock(&kvm->mn_invalidate_lock);
1461 }
1462 finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
Paolo Bonzinif481b062015-05-17 17:30:37 +02001463 rcu_assign_pointer(kvm->memslots[as_id], slots);
Paolo Bonzini52ac8b32021-05-27 08:09:15 -04001464 spin_unlock(&kvm->mn_invalidate_lock);
Ben Gardonb10a0382021-05-18 10:34:11 -07001465
1466 /*
1467	 * Acquired in kvm_set_memslot.  Must be released before the SRCU
1468	 * synchronization below in order to avoid deadlock with another thread
1469 * acquiring the slots_arch_lock in an srcu critical section.
1470 */
1471 mutex_unlock(&kvm->slots_arch_lock);
1472
Gleb Natapov7ec4fb42012-12-24 17:49:30 +02001473 synchronize_srcu_expedited(&kvm->srcu);
Takuya Yoshikawae59dbe02013-07-04 13:40:29 +09001474
David Matlackee3d1572014-08-18 15:46:06 -07001475 /*
Sean Christopherson361209e2019-02-05 13:01:14 -08001476 * Increment the new memslot generation a second time, dropping the
Miaohe Lin00116792019-12-11 14:26:23 +08001477 * update in-progress flag and incrementing the generation based on
Sean Christopherson361209e2019-02-05 13:01:14 -08001478 * the number of address spaces. This provides a unique and easily
1479 * identifiable generation number while the memslots are in flux.
1480 */
1481 gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1482
1483 /*
Paolo Bonzini4bd518f2017-02-03 20:44:51 -08001484 * Generations must be unique even across address spaces. We do not need
1485	 * a global counter for that; instead, the generation space is evenly split
1486 * across address spaces. For example, with two address spaces, address
Sean Christopherson164bf7e2019-02-05 13:01:18 -08001487 * space 0 will use generations 0, 2, 4, ... while address space 1 will
1488 * use generations 1, 3, 5, ...
David Matlackee3d1572014-08-18 15:46:06 -07001489 */
Sean Christopherson164bf7e2019-02-05 13:01:18 -08001490 gen += KVM_ADDRESS_SPACE_NUM;
David Matlackee3d1572014-08-18 15:46:06 -07001491
Sean Christopherson15248252019-02-05 12:54:17 -08001492 kvm_arch_memslots_updated(kvm, gen);
1493
1494 slots->generation = gen;
Gleb Natapov7ec4fb42012-12-24 17:49:30 +02001495}
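/*
 * Worked example of the generation arithmetic above, assuming
 * KVM_ADDRESS_SPACE_NUM == 2 and the update-in-progress flag occupying the
 * top bit of the 64-bit generation: if address space 0 is currently at
 * generation 4, the freshly activated set is visible as
 * (4 | UPDATE_IN_PROGRESS) for the duration of the swap and is published
 * as generation 6 once the swap completes, so any consumer that cached a
 * translation against generation 4 (e.g. an MMIO cache) sees a mismatch
 * and revalidates.
 */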
1496
Sean Christopherson07921662021-12-06 20:54:19 +01001497static int kvm_prepare_memory_region(struct kvm *kvm,
1498 const struct kvm_memory_slot *old,
1499 struct kvm_memory_slot *new,
1500 enum kvm_mr_change change)
Ben Gardonddc12f22021-05-18 10:34:10 -07001501{
Sean Christophersoncf47f502020-02-18 13:07:23 -08001502 int r;
1503
Ben Gardonb10a0382021-05-18 10:34:11 -07001504 /*
Sean Christopherson07921662021-12-06 20:54:19 +01001505 * If dirty logging is disabled, nullify the bitmap; the old bitmap
1506 * will be freed on "commit". If logging is enabled in both old and
1507 * new, reuse the existing bitmap. If logging is enabled only in the
1508 * new and KVM isn't using a ring buffer, allocate and initialize a
1509 * new bitmap.
1510 */
Sean Christopherson244893f2021-12-06 20:54:35 +01001511 if (change != KVM_MR_DELETE) {
1512 if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
1513 new->dirty_bitmap = NULL;
1514 else if (old && old->dirty_bitmap)
1515 new->dirty_bitmap = old->dirty_bitmap;
1516 else if (!kvm->dirty_ring_size) {
1517 r = kvm_alloc_dirty_bitmap(new);
1518 if (r)
1519 return r;
Sean Christopherson07921662021-12-06 20:54:19 +01001520
Sean Christopherson244893f2021-12-06 20:54:35 +01001521 if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1522 bitmap_set(new->dirty_bitmap, 0, new->npages);
1523 }
Sean Christopherson07921662021-12-06 20:54:19 +01001524 }
1525
1526 r = kvm_arch_prepare_memory_region(kvm, old, new, change);
1527
1528 /* Free the bitmap on failure if it was allocated above. */
Sean Christopherson244893f2021-12-06 20:54:35 +01001529 if (r && new && new->dirty_bitmap && old && !old->dirty_bitmap)
Sean Christopherson07921662021-12-06 20:54:19 +01001530 kvm_destroy_dirty_bitmap(new);
1531
1532 return r;
1533}
1534
1535static void kvm_commit_memory_region(struct kvm *kvm,
1536 struct kvm_memory_slot *old,
1537 const struct kvm_memory_slot *new,
1538 enum kvm_mr_change change)
1539{
1540 /*
1541 * Update the total number of memslot pages before calling the arch
1542 * hook so that architectures can consume the result directly.
1543 */
1544 if (change == KVM_MR_DELETE)
1545 kvm->nr_memslot_pages -= old->npages;
1546 else if (change == KVM_MR_CREATE)
1547 kvm->nr_memslot_pages += new->npages;
1548
1549 kvm_arch_commit_memory_region(kvm, old, new, change);
1550
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001551 switch (change) {
1552 case KVM_MR_CREATE:
1553 /* Nothing more to do. */
1554 break;
1555 case KVM_MR_DELETE:
1556 /* Free the old memslot and all its metadata. */
Sean Christopherson07921662021-12-06 20:54:19 +01001557 kvm_free_memslot(kvm, old);
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001558 break;
1559 case KVM_MR_MOVE:
1560 case KVM_MR_FLAGS_ONLY:
1561 /*
1562 * Free the dirty bitmap as needed; the below check encompasses
1563	 * both the flags and whether a ring buffer is being used.
1564 */
1565 if (old->dirty_bitmap && !new->dirty_bitmap)
1566 kvm_destroy_dirty_bitmap(old);
1567
1568 /*
1569 * The final quirk. Free the detached, old slot, but only its
1570 * memory, not any metadata. Metadata, including arch specific
1571 * data, may be reused by @new.
1572 */
1573 kfree(old);
1574 break;
1575 default:
1576 BUG();
1577 }
1578}
1579
1580/*
1581 * Activate @new, which must be installed in the inactive slots by the caller,
1582 * by swapping the active slots and then propagating @new to @old once @old is
1583 * unreachable and can be safely modified.
1584 *
1585 * With NULL @old this simply adds @new to @active (while swapping the sets).
1586 * With NULL @new this simply removes @old from @active and frees it
1587 * (while also swapping the sets).
1588 */
1589static void kvm_activate_memslot(struct kvm *kvm,
1590 struct kvm_memory_slot *old,
1591 struct kvm_memory_slot *new)
1592{
1593 int as_id = kvm_memslots_get_as_id(old, new);
1594
1595 kvm_swap_active_memslots(kvm, as_id);
1596
1597 /* Propagate the new memslot to the now inactive memslots. */
1598 kvm_replace_memslot(kvm, old, new);
1599}
1600
1601static void kvm_copy_memslot(struct kvm_memory_slot *dest,
1602 const struct kvm_memory_slot *src)
1603{
1604 dest->base_gfn = src->base_gfn;
1605 dest->npages = src->npages;
1606 dest->dirty_bitmap = src->dirty_bitmap;
1607 dest->arch = src->arch;
1608 dest->userspace_addr = src->userspace_addr;
1609 dest->flags = src->flags;
1610 dest->id = src->id;
1611 dest->as_id = src->as_id;
1612}
1613
1614static void kvm_invalidate_memslot(struct kvm *kvm,
1615 struct kvm_memory_slot *old,
Sean Christopherson244893f2021-12-06 20:54:35 +01001616 struct kvm_memory_slot *invalid_slot)
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001617{
1618 /*
1619 * Mark the current slot INVALID. As with all memslot modifications,
1620 * this must be done on an unreachable slot to avoid modifying the
1621 * current slot in the active tree.
1622 */
Sean Christopherson244893f2021-12-06 20:54:35 +01001623 kvm_copy_memslot(invalid_slot, old);
1624 invalid_slot->flags |= KVM_MEMSLOT_INVALID;
1625 kvm_replace_memslot(kvm, old, invalid_slot);
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001626
1627 /*
1628 * Activate the slot that is now marked INVALID, but don't propagate
1629 * the slot to the now inactive slots. The slot is either going to be
1630 * deleted or recreated as a new slot.
1631 */
1632 kvm_swap_active_memslots(kvm, old->as_id);
1633
1634 /*
1635 * From this point no new shadow pages pointing to a deleted, or moved,
1636 * memslot will be created. Validation of sp->gfn happens in:
1637 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1638 * - kvm_is_visible_gfn (mmu_check_root)
1639 */
Maciej S. Szmigierobcb63dc2021-12-06 20:54:31 +01001640 kvm_arch_flush_shadow_memslot(kvm, old);
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001641
1642 /* Was released by kvm_swap_active_memslots, reacquire. */
1643 mutex_lock(&kvm->slots_arch_lock);
1644
1645 /*
1646 * Copy the arch-specific field of the newly-installed slot back to the
1647 * old slot as the arch data could have changed between releasing
1648	 * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock
1649 * above. Writers are required to retrieve memslots *after* acquiring
1650 * slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
1651 */
Sean Christopherson244893f2021-12-06 20:54:35 +01001652 old->arch = invalid_slot->arch;
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001653}
1654
1655static void kvm_create_memslot(struct kvm *kvm,
Sean Christopherson244893f2021-12-06 20:54:35 +01001656 struct kvm_memory_slot *new)
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001657{
Sean Christopherson244893f2021-12-06 20:54:35 +01001658 /* Add the new memslot to the inactive set and activate. */
1659 kvm_replace_memslot(kvm, NULL, new);
1660 kvm_activate_memslot(kvm, NULL, new);
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001661}
1662
1663static void kvm_delete_memslot(struct kvm *kvm,
1664 struct kvm_memory_slot *old,
1665 struct kvm_memory_slot *invalid_slot)
1666{
1667 /*
1668 * Remove the old memslot (in the inactive memslots) by passing NULL as
Sean Christopherson244893f2021-12-06 20:54:35 +01001669	 * the "new" slot, then do the same for the invalid version in the active slots.
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001670 */
1671 kvm_replace_memslot(kvm, old, NULL);
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001672 kvm_activate_memslot(kvm, invalid_slot, NULL);
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001673}
1674
Sean Christopherson244893f2021-12-06 20:54:35 +01001675static void kvm_move_memslot(struct kvm *kvm,
1676 struct kvm_memory_slot *old,
1677 struct kvm_memory_slot *new,
1678 struct kvm_memory_slot *invalid_slot)
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001679{
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001680 /*
Sean Christopherson244893f2021-12-06 20:54:35 +01001681 * Replace the old memslot in the inactive slots, and then swap slots
1682 * and replace the current INVALID with the new as well.
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001683 */
Sean Christopherson244893f2021-12-06 20:54:35 +01001684 kvm_replace_memslot(kvm, old, new);
1685 kvm_activate_memslot(kvm, invalid_slot, new);
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001686}
1687
1688static void kvm_update_flags_memslot(struct kvm *kvm,
1689 struct kvm_memory_slot *old,
Sean Christopherson244893f2021-12-06 20:54:35 +01001690 struct kvm_memory_slot *new)
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001691{
1692 /*
1693 * Similar to the MOVE case, but the slot doesn't need to be zapped as
1694 * an intermediate step. Instead, the old memslot is simply replaced
1695 * with a new, updated copy in both memslot sets.
1696 */
Sean Christopherson244893f2021-12-06 20:54:35 +01001697 kvm_replace_memslot(kvm, old, new);
1698 kvm_activate_memslot(kvm, old, new);
Sean Christopherson07921662021-12-06 20:54:19 +01001699}
1700
Sean Christophersoncf47f502020-02-18 13:07:23 -08001701static int kvm_set_memslot(struct kvm *kvm,
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001702 struct kvm_memory_slot *old,
Sean Christophersonce5f0212021-12-06 20:54:10 +01001703 struct kvm_memory_slot *new,
Sean Christophersoncf47f502020-02-18 13:07:23 -08001704 enum kvm_mr_change change)
1705{
Sean Christopherson244893f2021-12-06 20:54:35 +01001706 struct kvm_memory_slot *invalid_slot;
Sean Christophersoncf47f502020-02-18 13:07:23 -08001707 int r;
1708
Ben Gardonb10a0382021-05-18 10:34:11 -07001709 /*
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001710 * Released in kvm_swap_active_memslots.
Ben Gardonb10a0382021-05-18 10:34:11 -07001711 *
1712 * Must be held from before the current memslots are copied until
1713 * after the new memslots are installed with rcu_assign_pointer,
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001714 * then released before the synchronize srcu in kvm_swap_active_memslots.
Ben Gardonb10a0382021-05-18 10:34:11 -07001715 *
1716 * When modifying memslots outside of the slots_lock, must be held
1717 * before reading the pointer to the current memslots until after all
1718 * changes to those memslots are complete.
1719 *
1720 * These rules ensure that installing new memslots does not lose
1721 * changes made to the previous memslots.
1722 */
1723 mutex_lock(&kvm->slots_arch_lock);
1724
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001725 /*
1726 * Invalidate the old slot if it's being deleted or moved. This is
1727 * done prior to actually deleting/moving the memslot to allow vCPUs to
1728 * continue running by ensuring there are no mappings or shadow pages
1729 * for the memslot when it is deleted/moved. Without pre-invalidation
1730 * (and without a lock), a window would exist between effecting the
1731 * delete/move and committing the changes in arch code where KVM or a
1732 * guest could access a non-existent memslot.
Sean Christopherson244893f2021-12-06 20:54:35 +01001733 *
1734 * Modifications are done on a temporary, unreachable slot. The old
1735 * slot needs to be preserved in case a later step fails and the
1736 * invalidation needs to be reverted.
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001737 */
Sean Christopherson244893f2021-12-06 20:54:35 +01001738 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1739 invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
1740 if (!invalid_slot) {
1741 mutex_unlock(&kvm->slots_arch_lock);
1742 return -ENOMEM;
1743 }
1744 kvm_invalidate_memslot(kvm, old, invalid_slot);
Ben Gardonb10a0382021-05-18 10:34:11 -07001745 }
Sean Christophersoncf47f502020-02-18 13:07:23 -08001746
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001747 r = kvm_prepare_memory_region(kvm, old, new, change);
1748 if (r) {
Sean Christophersoncf47f502020-02-18 13:07:23 -08001749 /*
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001750 * For DELETE/MOVE, revert the above INVALID change. No
1751 * modifications required since the original slot was preserved
1752 * in the inactive slots. Changing the active memslots also
1753	 * releases slots_arch_lock.
Sean Christophersoncf47f502020-02-18 13:07:23 -08001754 */
Sean Christopherson244893f2021-12-06 20:54:35 +01001755 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1756 kvm_activate_memslot(kvm, invalid_slot, old);
1757 kfree(invalid_slot);
1758 } else {
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001759 mutex_unlock(&kvm->slots_arch_lock);
Sean Christopherson244893f2021-12-06 20:54:35 +01001760 }
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001761 return r;
Sean Christophersoncf47f502020-02-18 13:07:23 -08001762 }
1763
Sean Christophersonbda44d82021-11-04 00:25:02 +00001764 /*
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001765	 * For DELETE and MOVE, the temporary invalid_slot is now active as the
1766	 * INVALID version of the old slot, while the old slot itself remains in
1767	 * the inactive set.  For CREATE, there is no old slot.  For FLAGS_ONLY,
1768	 * the old slot has not been invalidated and is still installed in both
1769	 * memslot sets.
Sean Christophersonbda44d82021-11-04 00:25:02 +00001770 */
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001771 if (change == KVM_MR_CREATE)
Sean Christopherson244893f2021-12-06 20:54:35 +01001772 kvm_create_memslot(kvm, new);
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001773 else if (change == KVM_MR_DELETE)
Sean Christopherson244893f2021-12-06 20:54:35 +01001774 kvm_delete_memslot(kvm, old, invalid_slot);
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001775 else if (change == KVM_MR_MOVE)
Sean Christopherson244893f2021-12-06 20:54:35 +01001776 kvm_move_memslot(kvm, old, new, invalid_slot);
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001777 else if (change == KVM_MR_FLAGS_ONLY)
Sean Christopherson244893f2021-12-06 20:54:35 +01001778 kvm_update_flags_memslot(kvm, old, new);
Maciej S. Szmigiero4e4d30c2021-12-06 20:54:09 +01001779 else
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001780 BUG();
Sean Christophersonbda44d82021-11-04 00:25:02 +00001781
Sean Christopherson244893f2021-12-06 20:54:35 +01001782 /* Free the temporary INVALID slot used for DELETE and MOVE. */
1783 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
1784 kfree(invalid_slot);
Sean Christophersonbda44d82021-11-04 00:25:02 +00001785
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01001786 /*
1787	 * No need to refresh new->arch; changes made after dropping slots_arch_lock
1788	 * will directly hit the final, active memslot.  Architectures are
1789 * responsible for knowing that new->arch may be stale.
1790 */
1791 kvm_commit_memory_region(kvm, old, new, change);
Sean Christophersoncf47f502020-02-18 13:07:23 -08001792
Sean Christophersoncf47f502020-02-18 13:07:23 -08001793 return 0;
Sean Christophersoncf47f502020-02-18 13:07:23 -08001794}
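/*
 * Condensed view of the sequence above for each change type; a reading
 * aid, not an authoritative contract:
 *
 *	CREATE:     prepare -> insert @new into the inactive set -> activate
 *	DELETE:     invalidate @old -> prepare -> drop @old and the INVALID
 *	            placeholder -> commit frees @old
 *	MOVE:       invalidate @old -> prepare -> replace @old with @new and
 *	            retire the INVALID placeholder -> commit
 *	FLAGS_ONLY: prepare -> replace @old with @new in both sets -> commit
 */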
1795
Maciej S. Szmigiero44401a22021-12-06 20:54:33 +01001796static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
1797 gfn_t start, gfn_t end)
Sean Christopherson5c0b4f32020-02-18 13:07:26 -08001798{
Maciej S. Szmigiero44401a22021-12-06 20:54:33 +01001799 struct kvm_memslot_iter iter;
Sean Christopherson5c0b4f32020-02-18 13:07:26 -08001800
Maciej S. Szmigiero44401a22021-12-06 20:54:33 +01001801 kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
1802 if (iter.slot->id != id)
1803 return true;
1804 }
Sean Christopherson5c0b4f32020-02-18 13:07:26 -08001805
Maciej S. Szmigiero44401a22021-12-06 20:54:33 +01001806 return false;
Sean Christopherson5c0b4f32020-02-18 13:07:26 -08001807}
1808
Avi Kivity6aa8b732006-12-10 02:21:36 -08001809/*
Avi Kivity6aa8b732006-12-10 02:21:36 -08001810 * Allocate some memory and give it an address in the guest physical address
1811 * space.
1812 *
1813 * Discontiguous memory is allowed, mostly for framebuffers.
Sheng Yangf78e0e22007-10-29 09:40:42 +08001814 *
Dominik Dingel02d5d552014-10-27 16:22:56 +01001815 * Must be called holding kvm->slots_lock for write.
Avi Kivity6aa8b732006-12-10 02:21:36 -08001816 */
Sheng Yangf78e0e22007-10-29 09:40:42 +08001817int __kvm_set_memory_region(struct kvm *kvm,
Paolo Bonzini09170a42015-05-18 13:59:39 +02001818 const struct kvm_userspace_memory_region *mem)
Avi Kivity6aa8b732006-12-10 02:21:36 -08001819{
Sean Christopherson244893f2021-12-06 20:54:35 +01001820 struct kvm_memory_slot *old, *new;
Maciej S. Szmigiero44401a22021-12-06 20:54:33 +01001821 struct kvm_memslots *slots;
Takuya Yoshikawaf64c0392013-01-29 11:00:07 +09001822 enum kvm_mr_change change;
Sean Christopherson0f9bdef2021-12-06 20:54:34 +01001823 unsigned long npages;
1824 gfn_t base_gfn;
Sean Christopherson163da372020-02-18 13:07:28 -08001825 int as_id, id;
1826 int r;
Avi Kivity6aa8b732006-12-10 02:21:36 -08001827
Xiao Guangronga50d64d2012-08-21 10:58:13 +08001828 r = check_memory_region_flags(mem);
1829 if (r)
Sean Christopherson71a4c302020-02-18 13:07:22 -08001830 return r;
Xiao Guangronga50d64d2012-08-21 10:58:13 +08001831
Paolo Bonzinif481b062015-05-17 17:30:37 +02001832 as_id = mem->slot >> 16;
1833 id = (u16)mem->slot;
1834
Avi Kivity6aa8b732006-12-10 02:21:36 -08001835 /* General sanity checks */
Sean Christopherson6b285a52021-11-04 00:25:03 +00001836 if ((mem->memory_size & (PAGE_SIZE - 1)) ||
1837 (mem->memory_size != (unsigned long)mem->memory_size))
Sean Christopherson71a4c302020-02-18 13:07:22 -08001838 return -EINVAL;
Avi Kivity6aa8b732006-12-10 02:21:36 -08001839 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
Sean Christopherson71a4c302020-02-18 13:07:22 -08001840 return -EINVAL;
Takuya Yoshikawafa3d3152011-05-07 16:35:38 +09001841 /* We can read the guest memory with __xxx_user() later on. */
Paolo Bonzini09d952c2020-06-01 04:17:45 -04001842 if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
Marc Zyngier139bc8a2021-01-21 12:08:15 +00001843 (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
Linus Torvalds96d4f262019-01-03 18:57:57 -08001844 !access_ok((void __user *)(unsigned long)mem->userspace_addr,
Paolo Bonzini09d952c2020-06-01 04:17:45 -04001845 mem->memory_size))
Sean Christopherson71a4c302020-02-18 13:07:22 -08001846 return -EINVAL;
Paolo Bonzinif481b062015-05-17 17:30:37 +02001847 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
Sean Christopherson71a4c302020-02-18 13:07:22 -08001848 return -EINVAL;
Avi Kivity6aa8b732006-12-10 02:21:36 -08001849 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
Sean Christopherson71a4c302020-02-18 13:07:22 -08001850 return -EINVAL;
Sean Christopherson0f9bdef2021-12-06 20:54:34 +01001851 if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
Sean Christopherson163da372020-02-18 13:07:28 -08001852 return -EINVAL;
1853
Maciej S. Szmigiero44401a22021-12-06 20:54:33 +01001854 slots = __kvm_memslots(kvm, as_id);
1855
Avi Kivity6aa8b732006-12-10 02:21:36 -08001856 /*
Sean Christopherson7cd08552021-12-06 20:54:22 +01001857 * Note, the old memslot (and the pointer itself!) may be invalidated
1858 * and/or destroyed by kvm_set_memslot().
Sean Christopherson5c0b4f32020-02-18 13:07:26 -08001859 */
Maciej S. Szmigiero44401a22021-12-06 20:54:33 +01001860 old = id_to_memslot(slots, id);
Sean Christopherson5c0b4f32020-02-18 13:07:26 -08001861
Sean Christopherson47ea7d92021-12-06 20:54:08 +01001862 if (!mem->memory_size) {
Sean Christopherson7cd08552021-12-06 20:54:22 +01001863 if (!old || !old->npages)
Sean Christopherson71a4c302020-02-18 13:07:22 -08001864 return -EINVAL;
Paolo Bonzini09170a42015-05-18 13:59:39 +02001865
Sean Christopherson7cd08552021-12-06 20:54:22 +01001866 if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
Sean Christopherson47ea7d92021-12-06 20:54:08 +01001867 return -EIO;
1868
Sean Christopherson244893f2021-12-06 20:54:35 +01001869 return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);
Sean Christopherson47ea7d92021-12-06 20:54:08 +01001870 }
Takuya Yoshikawa75d61fb2013-01-30 19:40:41 +09001871
Sean Christopherson0f9bdef2021-12-06 20:54:34 +01001872 base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
1873 npages = (mem->memory_size >> PAGE_SHIFT);
Takuya Yoshikawaf64c0392013-01-29 11:00:07 +09001874
Sean Christopherson7cd08552021-12-06 20:54:22 +01001875 if (!old || !old->npages) {
Takuya Yoshikawaf64c0392013-01-29 11:00:07 +09001876 change = KVM_MR_CREATE;
Sean Christophersonafa319a2021-12-06 20:54:07 +01001877
1878 /*
1879 * To simplify KVM internals, the total number of pages across
1880 * all memslots must fit in an unsigned long.
1881 */
Sean Christopherson0f9bdef2021-12-06 20:54:34 +01001882 if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
Sean Christophersonafa319a2021-12-06 20:54:07 +01001883 return -EINVAL;
Takuya Yoshikawaf64c0392013-01-29 11:00:07 +09001884 } else { /* Modify an existing slot. */
Sean Christopherson0f9bdef2021-12-06 20:54:34 +01001885 if ((mem->userspace_addr != old->userspace_addr) ||
1886 (npages != old->npages) ||
1887 ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
Takuya Yoshikawaf64c0392013-01-29 11:00:07 +09001888 return -EINVAL;
1889
Sean Christopherson0f9bdef2021-12-06 20:54:34 +01001890 if (base_gfn != old->base_gfn)
Sean Christopherson5c0b4f32020-02-18 13:07:26 -08001891 change = KVM_MR_MOVE;
Sean Christopherson0f9bdef2021-12-06 20:54:34 +01001892 else if (mem->flags != old->flags)
Sean Christopherson5c0b4f32020-02-18 13:07:26 -08001893 change = KVM_MR_FLAGS_ONLY;
1894 else /* Nothing to change. */
1895 return 0;
Paolo Bonzini09170a42015-05-18 13:59:39 +02001896 }
Avi Kivity6aa8b732006-12-10 02:21:36 -08001897
Maciej S. Szmigiero44401a22021-12-06 20:54:33 +01001898 if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
Sean Christopherson0f9bdef2021-12-06 20:54:34 +01001899 kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
Maciej S. Szmigiero44401a22021-12-06 20:54:33 +01001900 return -EEXIST;
Avi Kivity6aa8b732006-12-10 02:21:36 -08001901
Sean Christopherson244893f2021-12-06 20:54:35 +01001902	/* Allocate a slot that will persist in the memslots. */
1903 new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
1904 if (!new)
1905 return -ENOMEM;
Jay Zhou3c9bd402020-02-27 09:32:27 +08001906
Sean Christopherson244893f2021-12-06 20:54:35 +01001907 new->as_id = as_id;
1908 new->id = id;
1909 new->base_gfn = base_gfn;
1910 new->npages = npages;
1911 new->flags = mem->flags;
1912 new->userspace_addr = mem->userspace_addr;
Avi Kivity6aa8b732006-12-10 02:21:36 -08001913
Sean Christopherson244893f2021-12-06 20:54:35 +01001914 r = kvm_set_memslot(kvm, old, new, change);
Sean Christophersoncf47f502020-02-18 13:07:23 -08001915 if (r)
Sean Christopherson244893f2021-12-06 20:54:35 +01001916 kfree(new);
Avi Kivity6aa8b732006-12-10 02:21:36 -08001917 return r;
Izik Eidus210c7c42007-10-24 23:52:57 +02001918}
Sheng Yangf78e0e22007-10-29 09:40:42 +08001919EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
1920
1921int kvm_set_memory_region(struct kvm *kvm,
Paolo Bonzini09170a42015-05-18 13:59:39 +02001922 const struct kvm_userspace_memory_region *mem)
Sheng Yangf78e0e22007-10-29 09:40:42 +08001923{
1924 int r;
1925
Marcelo Tosatti79fac952009-12-23 14:35:26 -02001926 mutex_lock(&kvm->slots_lock);
Takuya Yoshikawa47ae31e2013-02-27 19:43:00 +09001927 r = __kvm_set_memory_region(kvm, mem);
Marcelo Tosatti79fac952009-12-23 14:35:26 -02001928 mutex_unlock(&kvm->slots_lock);
Sheng Yangf78e0e22007-10-29 09:40:42 +08001929 return r;
1930}
Izik Eidus210c7c42007-10-24 23:52:57 +02001931EXPORT_SYMBOL_GPL(kvm_set_memory_region);
1932
Stephen Hemminger79408762013-12-29 12:12:29 -08001933static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
1934 struct kvm_userspace_memory_region *mem)
Izik Eidus210c7c42007-10-24 23:52:57 +02001935{
Paolo Bonzinif481b062015-05-17 17:30:37 +02001936 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
Izik Eiduse0d62c72007-10-24 23:57:46 +02001937 return -EINVAL;
Paolo Bonzini09170a42015-05-18 13:59:39 +02001938
Takuya Yoshikawa47ae31e2013-02-27 19:43:00 +09001939 return kvm_set_memory_region(kvm, mem);
Avi Kivity6aa8b732006-12-10 02:21:36 -08001940}
1941
Sean Christopherson0dff0842020-02-18 13:07:29 -08001942#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
Sean Christopherson2a49f612020-02-18 13:07:30 -08001943/**
1944 * kvm_get_dirty_log - get a snapshot of dirty pages
1945 * @kvm: pointer to kvm instance
1946 * @log: slot id and address to which we copy the log
1947 * @is_dirty: set to '1' if any dirty pages were found
1948 * @memslot: set to the associated memslot, always valid on success
1949 */
1950int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
1951 int *is_dirty, struct kvm_memory_slot **memslot)
Avi Kivity6aa8b732006-12-10 02:21:36 -08001952{
Paolo Bonzini9f6b8022015-05-17 16:20:07 +02001953 struct kvm_memslots *slots;
Markus Elfring843574a2017-01-22 17:41:07 +01001954 int i, as_id, id;
Takuya Yoshikawa87bf6e72010-04-12 19:35:35 +09001955 unsigned long n;
Avi Kivity6aa8b732006-12-10 02:21:36 -08001956 unsigned long any = 0;
1957
Peter Xub2cc64c2020-09-30 21:22:24 -04001958 /* Dirty ring tracking is exclusive to dirty log tracking */
1959 if (kvm->dirty_ring_size)
1960 return -ENXIO;
1961
Sean Christopherson2a49f612020-02-18 13:07:30 -08001962 *memslot = NULL;
1963 *is_dirty = 0;
1964
Paolo Bonzinif481b062015-05-17 17:30:37 +02001965 as_id = log->slot >> 16;
1966 id = (u16)log->slot;
1967 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
Markus Elfring843574a2017-01-22 17:41:07 +01001968 return -EINVAL;
Avi Kivity6aa8b732006-12-10 02:21:36 -08001969
Paolo Bonzinif481b062015-05-17 17:30:37 +02001970 slots = __kvm_memslots(kvm, as_id);
Sean Christopherson2a49f612020-02-18 13:07:30 -08001971 *memslot = id_to_memslot(slots, id);
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001972 if (!(*memslot) || !(*memslot)->dirty_bitmap)
Markus Elfring843574a2017-01-22 17:41:07 +01001973 return -ENOENT;
Avi Kivity6aa8b732006-12-10 02:21:36 -08001974
Sean Christopherson2a49f612020-02-18 13:07:30 -08001975 kvm_arch_sync_dirty_log(kvm, *memslot);
1976
1977 n = kvm_dirty_bitmap_bytes(*memslot);
Avi Kivity6aa8b732006-12-10 02:21:36 -08001978
Uri Lublincd1a4a92007-02-22 16:43:09 +02001979 for (i = 0; !any && i < n/sizeof(long); ++i)
Sean Christopherson2a49f612020-02-18 13:07:30 -08001980 any = (*memslot)->dirty_bitmap[i];
Avi Kivity6aa8b732006-12-10 02:21:36 -08001981
Sean Christopherson2a49f612020-02-18 13:07:30 -08001982 if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
Markus Elfring843574a2017-01-22 17:41:07 +01001983 return -EFAULT;
Avi Kivity6aa8b732006-12-10 02:21:36 -08001984
Zhang Xiantao5bb064d2007-11-18 20:29:43 +08001985 if (any)
1986 *is_dirty = 1;
Markus Elfring843574a2017-01-22 17:41:07 +01001987 return 0;
Avi Kivity6aa8b732006-12-10 02:21:36 -08001988}
Aneesh Kumar K.V2ba9f0d2013-10-07 22:17:59 +05301989EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
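/*
 * For reference, a hedged sketch of the matching userspace call; the
 * caller sizes the buffer itself, one bit per page in the slot rounded up
 * to a multiple of the long size, and "bitmap_buf" is a placeholder for
 * such an allocation:
 *
 *	struct kvm_dirty_log log = {
 *		.slot         = 0,
 *		.dirty_bitmap = bitmap_buf,
 *	};
 *
 *	ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
 */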
Avi Kivity6aa8b732006-12-10 02:21:36 -08001990
Sean Christopherson0dff0842020-02-18 13:07:29 -08001991#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
Mario Smarduchba0513b2015-01-15 15:58:53 -08001992/**
Jiang Biaob8b00222019-04-23 19:40:30 +08001993 * kvm_get_dirty_log_protect - get a snapshot of dirty pages
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02001994 * and reenable dirty page tracking for the corresponding pages.
Mario Smarduchba0513b2015-01-15 15:58:53 -08001995 * @kvm: pointer to kvm instance
1996 * @log: slot id and address to which we copy the log
Mario Smarduchba0513b2015-01-15 15:58:53 -08001997 *
1998 * Keep in mind that VCPU threads can write to the bitmap
1999 * concurrently, so to avoid losing track of dirty pages we keep the
2000 * following order:
2001 *
2002 * 1. Take a snapshot of the bit and clear it if needed.
2003 * 2. Write protect the corresponding page.
2004 * 3. Copy the snapshot to the userspace.
2005 * 4. Upon return caller flushes TLB's if needed.
2006 *
2007 * Between 2 and 4, the guest may write to the page using the remaining TLB
2008 * entry. This is not a problem because the page is reported dirty using
2009 * the snapshot taken before and step 4 ensures that writes done after
2010 * exiting to userspace will be logged for the next call.
2011 *
2012 */
Sean Christopherson0dff0842020-02-18 13:07:29 -08002013static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
Mario Smarduchba0513b2015-01-15 15:58:53 -08002014{
Paolo Bonzini9f6b8022015-05-17 16:20:07 +02002015 struct kvm_memslots *slots;
Mario Smarduchba0513b2015-01-15 15:58:53 -08002016 struct kvm_memory_slot *memslot;
Markus Elfring58d6db32017-01-22 17:30:16 +01002017 int i, as_id, id;
Mario Smarduchba0513b2015-01-15 15:58:53 -08002018 unsigned long n;
2019 unsigned long *dirty_bitmap;
2020 unsigned long *dirty_bitmap_buffer;
Sean Christopherson0dff0842020-02-18 13:07:29 -08002021 bool flush;
Mario Smarduchba0513b2015-01-15 15:58:53 -08002022
Peter Xub2cc64c2020-09-30 21:22:24 -04002023 /* Dirty ring tracking is exclusive to dirty log tracking */
2024 if (kvm->dirty_ring_size)
2025 return -ENXIO;
2026
Paolo Bonzinif481b062015-05-17 17:30:37 +02002027 as_id = log->slot >> 16;
2028 id = (u16)log->slot;
2029 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
Markus Elfring58d6db32017-01-22 17:30:16 +01002030 return -EINVAL;
Mario Smarduchba0513b2015-01-15 15:58:53 -08002031
Paolo Bonzinif481b062015-05-17 17:30:37 +02002032 slots = __kvm_memslots(kvm, as_id);
2033 memslot = id_to_memslot(slots, id);
Sean Christopherson0577d1a2020-02-18 13:07:31 -08002034 if (!memslot || !memslot->dirty_bitmap)
2035 return -ENOENT;
Mario Smarduchba0513b2015-01-15 15:58:53 -08002036
2037 dirty_bitmap = memslot->dirty_bitmap;
Mario Smarduchba0513b2015-01-15 15:58:53 -08002038
Sean Christopherson0dff0842020-02-18 13:07:29 -08002039 kvm_arch_sync_dirty_log(kvm, memslot);
2040
Mario Smarduchba0513b2015-01-15 15:58:53 -08002041 n = kvm_dirty_bitmap_bytes(memslot);
Sean Christopherson0dff0842020-02-18 13:07:29 -08002042 flush = false;
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02002043 if (kvm->manual_dirty_log_protect) {
2044 /*
2045	 * Unlike kvm_get_dirty_log, we never need to flush here,
2046	 * because no flush is needed until KVM_CLEAR_DIRTY_LOG.  There
2047	 * is some code duplication between this function and
2048	 * kvm_get_dirty_log, but hopefully all architectures will
2049	 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
2050	 * can then be eliminated.
2051 */
2052 dirty_bitmap_buffer = dirty_bitmap;
2053 } else {
2054 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2055 memset(dirty_bitmap_buffer, 0, n);
Mario Smarduchba0513b2015-01-15 15:58:53 -08002056
Ben Gardon531810c2021-02-02 10:57:24 -08002057 KVM_MMU_LOCK(kvm);
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02002058 for (i = 0; i < n / sizeof(long); i++) {
2059 unsigned long mask;
2060 gfn_t offset;
Mario Smarduchba0513b2015-01-15 15:58:53 -08002061
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02002062 if (!dirty_bitmap[i])
2063 continue;
Mario Smarduchba0513b2015-01-15 15:58:53 -08002064
Sean Christopherson0dff0842020-02-18 13:07:29 -08002065 flush = true;
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02002066 mask = xchg(&dirty_bitmap[i], 0);
2067 dirty_bitmap_buffer[i] = mask;
Mario Smarduchba0513b2015-01-15 15:58:53 -08002068
Lan Tianyua67794c2019-02-02 17:20:27 +08002069 offset = i * BITS_PER_LONG;
2070 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2071 offset, mask);
Takuya Yoshikawa58d29302015-03-17 16:19:58 +09002072 }
Ben Gardon531810c2021-02-02 10:57:24 -08002073 KVM_MMU_UNLOCK(kvm);
Mario Smarduchba0513b2015-01-15 15:58:53 -08002074 }
2075
Sean Christopherson0dff0842020-02-18 13:07:29 -08002076 if (flush)
2077 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
2078
Mario Smarduchba0513b2015-01-15 15:58:53 -08002079 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
Markus Elfring58d6db32017-01-22 17:30:16 +01002080 return -EFAULT;
2081 return 0;
Mario Smarduchba0513b2015-01-15 15:58:53 -08002082}
Sean Christopherson0dff0842020-02-18 13:07:29 -08002083
2084
2085/**
2086 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
2087 * @kvm: kvm instance
2088 * @log: slot id and address to which we copy the log
2089 *
2090 * Steps 1-4 below provide general overview of dirty page logging. See
2091 * kvm_get_dirty_log_protect() function description for additional details.
2092 *
2093 * We call kvm_get_dirty_log_protect() to handle steps 1-3; upon return we
2094 * always flush the TLB (step 4) even if a previous step failed and the dirty
2095 * bitmap may be corrupt.  Regardless of the previous outcome, the KVM logging
2096 * API does not preclude a subsequent dirty log read by user space.  Flushing
2097 * the TLB ensures writes will be marked dirty for the next log read.
2098 *
2099 * 1. Take a snapshot of the bit and clear it if needed.
2100 * 2. Write protect the corresponding page.
2101 * 3. Copy the snapshot to the userspace.
2102 * 4. Flush TLB's if needed.
2103 */
2104static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2105 struct kvm_dirty_log *log)
2106{
2107 int r;
2108
2109 mutex_lock(&kvm->slots_lock);
2110
2111 r = kvm_get_dirty_log_protect(kvm, log);
2112
2113 mutex_unlock(&kvm->slots_lock);
2114 return r;
2115}
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02002116
2117/**
2118 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
2119 * and reenable dirty page tracking for the corresponding pages.
2120 * @kvm: pointer to kvm instance
2121 * @log: slot id and address from which to fetch the bitmap of dirty pages
2122 */
Sean Christopherson0dff0842020-02-18 13:07:29 -08002123static int kvm_clear_dirty_log_protect(struct kvm *kvm,
2124 struct kvm_clear_dirty_log *log)
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02002125{
2126 struct kvm_memslots *slots;
2127 struct kvm_memory_slot *memslot;
Tomas Bortoli98938aa2019-01-02 18:29:37 +01002128 int as_id, id;
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02002129 gfn_t offset;
Tomas Bortoli98938aa2019-01-02 18:29:37 +01002130 unsigned long i, n;
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02002131 unsigned long *dirty_bitmap;
2132 unsigned long *dirty_bitmap_buffer;
Sean Christopherson0dff0842020-02-18 13:07:29 -08002133 bool flush;
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02002134
Peter Xub2cc64c2020-09-30 21:22:24 -04002135 /* Dirty ring tracking is exclusive to dirty log tracking */
2136 if (kvm->dirty_ring_size)
2137 return -ENXIO;
2138
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02002139 as_id = log->slot >> 16;
2140 id = (u16)log->slot;
2141 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
2142 return -EINVAL;
2143
Paolo Bonzini76d58e02019-04-17 15:28:44 +02002144 if (log->first_page & 63)
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02002145 return -EINVAL;
2146
2147 slots = __kvm_memslots(kvm, as_id);
2148 memslot = id_to_memslot(slots, id);
Sean Christopherson0577d1a2020-02-18 13:07:31 -08002149 if (!memslot || !memslot->dirty_bitmap)
2150 return -ENOENT;
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02002151
2152 dirty_bitmap = memslot->dirty_bitmap;
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02002153
Peter Xu4ddc9202019-05-08 17:15:45 +08002154 n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
Tomas Bortoli98938aa2019-01-02 18:29:37 +01002155
2156 if (log->first_page > memslot->npages ||
Paolo Bonzini76d58e02019-04-17 15:28:44 +02002157 log->num_pages > memslot->npages - log->first_page ||
2158 (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
2159 return -EINVAL;
Tomas Bortoli98938aa2019-01-02 18:29:37 +01002160
Sean Christopherson0dff0842020-02-18 13:07:29 -08002161 kvm_arch_sync_dirty_log(kvm, memslot);
2162
2163 flush = false;
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02002164 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2165 if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
2166 return -EFAULT;
2167
Ben Gardon531810c2021-02-02 10:57:24 -08002168 KVM_MMU_LOCK(kvm);
Peter Xu53eac7a2019-05-08 17:15:46 +08002169 for (offset = log->first_page, i = offset / BITS_PER_LONG,
2170 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02002171 i++, offset += BITS_PER_LONG) {
2172 unsigned long mask = *dirty_bitmap_buffer++;
2173 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
2174 if (!mask)
2175 continue;
2176
2177 mask &= atomic_long_fetch_andnot(mask, p);
2178
2179 /*
2180 * mask contains the bits that really have been cleared. This
2181 * never includes any bits beyond the length of the memslot (if
2182 * the length is not aligned to 64 pages), therefore it is not
2183 * a problem if userspace sets them in log->dirty_bitmap.
2184 */
2185 if (mask) {
Sean Christopherson0dff0842020-02-18 13:07:29 -08002186 flush = true;
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02002187 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2188 offset, mask);
2189 }
2190 }
Ben Gardon531810c2021-02-02 10:57:24 -08002191 KVM_MMU_UNLOCK(kvm);
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02002192
Sean Christopherson0dff0842020-02-18 13:07:29 -08002193 if (flush)
2194 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
2195
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02002196 return 0;
2197}
Sean Christopherson0dff0842020-02-18 13:07:29 -08002198
2199static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
2200 struct kvm_clear_dirty_log *log)
2201{
2202 int r;
2203
2204 mutex_lock(&kvm->slots_lock);
2205
2206 r = kvm_clear_dirty_log_protect(kvm, log);
2207
2208 mutex_unlock(&kvm->slots_lock);
2209 return r;
2210}
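/*
 * Hedged userspace sketch of the clear path above, assuming the manual
 * dirty-log-protect capability has been enabled on the VM.  first_page
 * must be 64-aligned and num_pages must either reach the end of the slot
 * or be a multiple of 64, matching the checks in
 * kvm_clear_dirty_log_protect(); "bitmap_buf" holds the bits to clear,
 * typically the snapshot just obtained via KVM_GET_DIRTY_LOG:
 *
 *	struct kvm_clear_dirty_log clear = {
 *		.slot         = 0,
 *		.first_page   = 0,
 *		.num_pages    = 512,
 *		.dirty_bitmap = bitmap_buf,
 *	};
 *
 *	ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);
 */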
2211#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
Mario Smarduchba0513b2015-01-15 15:58:53 -08002212
Gleb Natapov49c77542010-10-18 15:22:23 +02002213struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
2214{
2215 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
2216}
Avi Kivitya1f4d3952010-06-21 11:44:20 +03002217EXPORT_SYMBOL_GPL(gfn_to_memslot);
Avi Kivity6aa8b732006-12-10 02:21:36 -08002218
Paolo Bonzini8e734852015-05-17 13:58:53 +02002219struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
2220{
David Matlackfe22ed82021-08-04 22:28:40 +00002221 struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01002222 u64 gen = slots->generation;
David Matlackfe22ed82021-08-04 22:28:40 +00002223 struct kvm_memory_slot *slot;
David Matlackfe22ed82021-08-04 22:28:40 +00002224
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01002225 /*
2226 * This also protects against using a memslot from a different address space,
2227 * since different address spaces have different generation numbers.
2228 */
2229 if (unlikely(gen != vcpu->last_used_slot_gen)) {
2230 vcpu->last_used_slot = NULL;
2231 vcpu->last_used_slot_gen = gen;
2232 }
2233
2234 slot = try_get_memslot(vcpu->last_used_slot, gfn);
David Matlackfe22ed82021-08-04 22:28:40 +00002235 if (slot)
2236 return slot;
2237
2238 /*
2239 * Fall back to searching all memslots. We purposely use
2240 * search_memslots() instead of __gfn_to_memslot() to avoid
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01002241 * thrashing the VM-wide last_used_slot in kvm_memslots.
David Matlackfe22ed82021-08-04 22:28:40 +00002242 */
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01002243 slot = search_memslots(slots, gfn, false);
David Matlackfe22ed82021-08-04 22:28:40 +00002244 if (slot) {
Maciej S. Szmigieroa54d8062021-12-06 20:54:30 +01002245 vcpu->last_used_slot = slot;
David Matlackfe22ed82021-08-04 22:28:40 +00002246 return slot;
2247 }
2248
2249 return NULL;
Paolo Bonzini8e734852015-05-17 13:58:53 +02002250}
2251
Yaowei Bai33e94152015-11-14 11:21:06 +08002252bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
Izik Eiduse0d62c72007-10-24 23:57:46 +02002253{
Xiao Guangrongbf3e05b2011-11-24 17:40:57 +08002254 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
Izik Eiduse0d62c72007-10-24 23:57:46 +02002255
Paolo Bonzinic36b7152020-04-16 09:48:07 -04002256 return kvm_is_visible_memslot(memslot);
Izik Eiduse0d62c72007-10-24 23:57:46 +02002257}
2258EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
2259
Vitaly Kuznetsov995decb2020-07-08 16:00:23 +02002260bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2261{
2262 struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2263
2264 return kvm_is_visible_memslot(memslot);
2265}
2266EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2267
Sean Christophersonf9b84e12020-01-08 12:24:37 -08002268unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
Joerg Roedel8f0b1ab2010-01-28 12:37:56 +01002269{
2270 struct vm_area_struct *vma;
2271 unsigned long addr, size;
2272
2273 size = PAGE_SIZE;
2274
Sean Christopherson42cde482020-01-08 12:24:38 -08002275 addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
Joerg Roedel8f0b1ab2010-01-28 12:37:56 +01002276 if (kvm_is_error_hva(addr))
2277 return PAGE_SIZE;
2278
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07002279 mmap_read_lock(current->mm);
Joerg Roedel8f0b1ab2010-01-28 12:37:56 +01002280 vma = find_vma(current->mm, addr);
2281 if (!vma)
2282 goto out;
2283
2284 size = vma_kernel_pagesize(vma);
2285
2286out:
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07002287 mmap_read_unlock(current->mm);
Joerg Roedel8f0b1ab2010-01-28 12:37:56 +01002288
2289 return size;
2290}
2291
Ben Gardon8283e362021-11-15 15:45:58 -08002292static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002293{
2294 return slot->flags & KVM_MEM_READONLY;
2295}
2296
Ben Gardon8283e362021-11-15 15:45:58 -08002297static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002298 gfn_t *nr_pages, bool write)
Izik Eidus539cb662007-11-11 22:05:04 +02002299{
Marcelo Tosattibc6678a2009-12-23 14:35:21 -02002300 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
Xiao Guangrongca3a4902012-08-21 11:01:50 +08002301 return KVM_HVA_ERR_BAD;
Xiao Guangrong48987782010-08-22 19:11:43 +08002302
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002303 if (memslot_is_readonly(slot) && write)
2304 return KVM_HVA_ERR_RO_BAD;
Xiao Guangrong48987782010-08-22 19:11:43 +08002305
2306 if (nr_pages)
2307 *nr_pages = slot->npages - (gfn - slot->base_gfn);
2308
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002309 return __gfn_to_hva_memslot(slot, gfn);
Izik Eidus539cb662007-11-11 22:05:04 +02002310}
Xiao Guangrong48987782010-08-22 19:11:43 +08002311
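/*
 * Background sketch: the actual gfn -> hva arithmetic used by the helpers
 * above and below is done by __gfn_to_hva_memslot() (a static inline in the
 * kvm_host.h header) and amounts to roughly
 *
 *	hva = slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
 *
 * The wrappers in this file only validate the slot, honour KVM_MEM_READONLY
 * for write accesses and optionally report how many pages remain in the slot
 * via @nr_pages.
 */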
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002312static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2313 gfn_t *nr_pages)
2314{
2315 return __gfn_to_hva_many(slot, gfn, nr_pages, true);
2316}
2317
2318unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
Stephen Hemminger79408762013-12-29 12:12:29 -08002319 gfn_t gfn)
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002320{
2321 return gfn_to_hva_many(slot, gfn, NULL);
2322}
2323EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
2324
Xiao Guangrong48987782010-08-22 19:11:43 +08002325unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2326{
Gleb Natapov49c77542010-10-18 15:22:23 +02002327 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
Xiao Guangrong48987782010-08-22 19:11:43 +08002328}
Sheng Yang0d150292008-04-25 21:44:50 +08002329EXPORT_SYMBOL_GPL(gfn_to_hva);
Izik Eidus539cb662007-11-11 22:05:04 +02002330
Paolo Bonzini8e734852015-05-17 13:58:53 +02002331unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2332{
2333 return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2334}
2335EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
2336
Xiao Guangrong86ab8cf2012-08-21 10:59:53 +08002337/*
Wei Yang970c0d42018-10-09 10:41:15 +08002338 * Return the hva of a @gfn and the R/W attribute if possible.
2339 *
2340 * @slot: the kvm_memory_slot which contains @gfn
2341 * @gfn: the gfn to be translated
2342 * @writable: used to return the read/write attribute of the @slot if the hva
2343 * is valid and @writable is not NULL
Xiao Guangrong86ab8cf2012-08-21 10:59:53 +08002344 */
Christoffer Dall64d83122014-08-19 12:15:00 +02002345unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2346 gfn_t gfn, bool *writable)
Gleb Natapov80300892010-10-19 18:13:41 +02002347{
Gleb Natapova2ac07f2013-10-01 19:58:36 +03002348 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2349
2350 if (!kvm_is_error_hva(hva) && writable)
Paolo Bonziniba6a3542013-09-09 13:52:33 +02002351 *writable = !memslot_is_readonly(slot);
2352
Gleb Natapova2ac07f2013-10-01 19:58:36 +03002353 return hva;
Xiao Guangrong86ab8cf2012-08-21 10:59:53 +08002354}
2355
Christoffer Dall64d83122014-08-19 12:15:00 +02002356unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2357{
2358 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2359
2360 return gfn_to_hva_memslot_prot(slot, gfn, writable);
2361}
2362
Paolo Bonzini8e734852015-05-17 13:58:53 +02002363unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
2364{
2365 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2366
2367 return gfn_to_hva_memslot_prot(slot, gfn, writable);
2368}
2369
Huang Yingfafc3db2011-01-30 11:15:49 +08002370static inline int check_user_page_hwpoison(unsigned long addr)
2371{
Lorenzo Stoakes0d731752016-10-24 10:57:25 +01002372 int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
Huang Yingfafc3db2011-01-30 11:15:49 +08002373
Lorenzo Stoakes0d731752016-10-24 10:57:25 +01002374 rc = get_user_pages(addr, 1, flags, NULL, NULL);
Huang Yingfafc3db2011-01-30 11:15:49 +08002375 return rc == -EHWPOISON;
2376}
2377
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002378/*
Paolo Bonzinib9b33da2018-07-27 17:44:41 +02002379 * The fast path to get the writable pfn which will be stored in @pfn,
2380 * true indicates success, otherwise false is returned. It's also the
Miaohe Lin311497e2019-12-11 14:26:25 +08002381 * only path that can run in atomic context.
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002382 */
Paolo Bonzinib9b33da2018-07-27 17:44:41 +02002383static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2384 bool *writable, kvm_pfn_t *pfn)
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002385{
2386 struct page *page[1];
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002387
Xiao Guangrong12ce13f2012-08-21 11:00:49 +08002388 /*
2389 * Fast pin a writable pfn only if it is a write fault request
 2390	 * or the caller allows mapping a writable pfn for a read fault
2391 * request.
2392 */
2393 if (!(write_fault || writable))
2394 return false;
2395
Souptick Joarderdadbb612020-06-07 21:40:55 -07002396 if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002397 *pfn = page_to_pfn(page[0]);
2398
2399 if (writable)
2400 *writable = true;
2401 return true;
2402 }
2403
2404 return false;
2405}
2406
2407/*
 2408 * The slow path to get the pfn of the specified host virtual address;
 2409 * 1 indicates success, -errno is returned if an error is detected.
2410 */
2411static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
Dan Williamsba049e92016-01-15 16:56:11 -08002412 bool *writable, kvm_pfn_t *pfn)
Avi Kivity954bbbc2007-03-30 14:02:32 +03002413{
Al Viroce530532017-11-19 17:47:33 -05002414 unsigned int flags = FOLL_HWPOISON;
2415 struct page *page;
Gleb Natapovaf585b92010-10-14 11:22:46 +02002416 int npages = 0;
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002417
2418 might_sleep();
2419
2420 if (writable)
2421 *writable = write_fault;
2422
Al Viroce530532017-11-19 17:47:33 -05002423 if (write_fault)
2424 flags |= FOLL_WRITE;
2425 if (async)
2426 flags |= FOLL_NOWAIT;
Lorenzo Stoakesd4944b02016-10-13 01:20:12 +01002427
Al Viroce530532017-11-19 17:47:33 -05002428 npages = get_user_pages_unlocked(addr, 1, &page, flags);
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002429 if (npages != 1)
2430 return npages;
2431
2432 /* map read fault as writable if possible */
Xiao Guangrong12ce13f2012-08-21 11:00:49 +08002433 if (unlikely(!write_fault) && writable) {
Al Viroce530532017-11-19 17:47:33 -05002434 struct page *wpage;
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002435
Souptick Joarderdadbb612020-06-07 21:40:55 -07002436 if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002437 *writable = true;
Al Viroce530532017-11-19 17:47:33 -05002438 put_page(page);
2439 page = wpage;
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002440 }
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002441 }
Al Viroce530532017-11-19 17:47:33 -05002442 *pfn = page_to_pfn(page);
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002443 return npages;
2444}
2445
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002446static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2447{
2448 if (unlikely(!(vma->vm_flags & VM_READ)))
2449 return false;
2450
2451 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2452 return false;
2453
2454 return true;
2455}
2456
Nicholas Pigginf8be1562021-06-24 08:29:04 -04002457static int kvm_try_get_pfn(kvm_pfn_t pfn)
2458{
2459 if (kvm_is_reserved_pfn(pfn))
2460 return 1;
2461 return get_page_unless_zero(pfn_to_page(pfn));
2462}
2463
Paolo Bonzini92176a82016-06-07 16:22:47 +02002464static int hva_to_pfn_remapped(struct vm_area_struct *vma,
Xianting Tian16255662022-01-24 10:04:56 +08002465 unsigned long addr, bool write_fault,
2466 bool *writable, kvm_pfn_t *p_pfn)
Paolo Bonzini92176a82016-06-07 16:22:47 +02002467{
Sean Christophersona9545772021-02-08 12:19:40 -08002468 kvm_pfn_t pfn;
Paolo Bonzinibd2fae82021-02-01 05:12:11 -05002469 pte_t *ptep;
2470 spinlock_t *ptl;
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02002471 int r;
2472
Paolo Bonzini9fd6dad2021-02-05 05:07:11 -05002473 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02002474 if (r) {
2475 /*
2476 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
2477 * not call the fault handler, so do it here.
2478 */
2479 bool unlocked = false;
Peter Xu64019a22020-08-11 18:39:01 -07002480 r = fixup_user_fault(current->mm, addr,
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02002481 (write_fault ? FAULT_FLAG_WRITE : 0),
2482 &unlocked);
Paolo Bonzinia8387d02020-05-29 05:42:55 -04002483 if (unlocked)
2484 return -EAGAIN;
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02002485 if (r)
2486 return r;
2487
Paolo Bonzini9fd6dad2021-02-05 05:07:11 -05002488 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02002489 if (r)
2490 return r;
Paolo Bonzinibd2fae82021-02-01 05:12:11 -05002491 }
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02002492
Paolo Bonzinibd2fae82021-02-01 05:12:11 -05002493 if (write_fault && !pte_write(*ptep)) {
2494 pfn = KVM_PFN_ERR_RO_FAULT;
2495 goto out;
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02002496 }
2497
KarimAllah Ahmeda340b3e2018-01-17 19:18:56 +01002498 if (writable)
Paolo Bonzinibd2fae82021-02-01 05:12:11 -05002499 *writable = pte_write(*ptep);
2500 pfn = pte_pfn(*ptep);
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02002501
2502 /*
2503 * Get a reference here because callers of *hva_to_pfn* and
2504 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
2505 * returned pfn. This is only needed if the VMA has VM_MIXEDMAP
Marc Zyngier36c3ce62021-07-26 16:35:52 +01002506 * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02002507 * simply do nothing for reserved pfns.
2508 *
2509 * Whoever called remap_pfn_range is also going to call e.g.
2510 * unmap_mapping_range before the underlying pages are freed,
2511 * causing a call to our MMU notifier.
Nicholas Pigginf8be1562021-06-24 08:29:04 -04002512 *
2513 * Certain IO or PFNMAP mappings can be backed with valid
2514 * struct pages, but be allocated without refcounting e.g.,
2515 * tail pages of non-compound higher order allocations, which
2516 * would then underflow the refcount when the caller does the
2517 * required put_page. Don't allow those pages here.
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02002518 */
Nicholas Pigginf8be1562021-06-24 08:29:04 -04002519 if (!kvm_try_get_pfn(pfn))
2520 r = -EFAULT;
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02002521
Paolo Bonzinibd2fae82021-02-01 05:12:11 -05002522out:
2523 pte_unmap_unlock(ptep, ptl);
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02002524 *p_pfn = pfn;
Nicholas Pigginf8be1562021-06-24 08:29:04 -04002525
2526 return r;
Paolo Bonzini92176a82016-06-07 16:22:47 +02002527}
2528
Xiao Guangrong12ce13f2012-08-21 11:00:49 +08002529/*
2530 * Pin guest page in memory and return its pfn.
2531 * @addr: host virtual address which maps memory to the guest
 2532 * @atomic: if true, the function must not sleep (only the fast path is tried)
 2533 * @async: whether this function needs to wait for I/O to complete if the
 2534 *         host page is not in memory
 2535 * @write_fault: whether we should get a writable host page
 2536 * @writable: whether mapping a writable host page for !@write_fault is allowed
2537 *
2538 * The function will map a writable host page for these two cases:
2539 * 1): @write_fault = true
2540 * 2): @write_fault = false && @writable, @writable will tell the caller
2541 * whether the mapping is writable.
2542 */
David Woodhouse982ed0d2021-12-10 16:36:21 +00002543kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
2544 bool write_fault, bool *writable)
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002545{
2546 struct vm_area_struct *vma;
Dan Williamsba049e92016-01-15 16:56:11 -08002547 kvm_pfn_t pfn = 0;
Paolo Bonzini92176a82016-06-07 16:22:47 +02002548 int npages, r;
Avi Kivity954bbbc2007-03-30 14:02:32 +03002549
Gleb Natapovaf585b92010-10-14 11:22:46 +02002550 /* we can do it either atomically or asynchronously, not both */
2551 BUG_ON(atomic && async);
2552
Paolo Bonzinib9b33da2018-07-27 17:44:41 +02002553 if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002554 return pfn;
Marcelo Tosatti612819c2010-10-22 14:18:18 -02002555
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002556 if (atomic)
2557 return KVM_PFN_ERR_FAULT;
Marcelo Tosatti612819c2010-10-22 14:18:18 -02002558
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002559 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
2560 if (npages == 1)
2561 return pfn;
Gleb Natapovaf585b92010-10-14 11:22:46 +02002562
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07002563 mmap_read_lock(current->mm);
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002564 if (npages == -EHWPOISON ||
2565 (!async && check_user_page_hwpoison(addr))) {
2566 pfn = KVM_PFN_ERR_HWPOISON;
2567 goto exit;
Xiao Guangrong887c08a2010-08-22 19:10:28 +08002568 }
Izik Eidus539cb662007-11-11 22:05:04 +02002569
Paolo Bonzinia8387d02020-05-29 05:42:55 -04002570retry:
Liam Howlettfc98c032021-06-28 19:39:17 -07002571 vma = vma_lookup(current->mm, addr);
Anthony Liguori8d4e1282007-10-18 09:59:34 -05002572
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002573 if (vma == NULL)
2574 pfn = KVM_PFN_ERR_FAULT;
Paolo Bonzini92176a82016-06-07 16:22:47 +02002575 else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
Xianting Tian16255662022-01-24 10:04:56 +08002576 r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
Paolo Bonzinia8387d02020-05-29 05:42:55 -04002577 if (r == -EAGAIN)
2578 goto retry;
Paolo Bonzini92176a82016-06-07 16:22:47 +02002579 if (r < 0)
2580 pfn = KVM_PFN_ERR_FAULT;
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002581 } else {
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002582 if (async && vma_is_valid(vma, write_fault))
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002583 *async = true;
2584 pfn = KVM_PFN_ERR_FAULT;
2585 }
2586exit:
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07002587 mmap_read_unlock(current->mm);
Anthony Liguori2e2e3732008-04-30 15:37:07 -05002588 return pfn;
Anthony Liguori35149e22008-04-02 14:46:56 -05002589}
2590
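/*
 * Informal overview: __gfn_to_pfn_memslot() below resolves a gfn to a pfn in
 * two steps.  The gfn is first translated to a host virtual address with
 * __gfn_to_hva_many() (rejecting invalid slots and write faults on read-only
 * slots), and the hva is then pinned with hva_to_pfn() above, which tries the
 * fast get_user_page path, falls back to the sleeping slow path, and finally
 * walks the VMA via hva_to_pfn_remapped() for VM_IO/VM_PFNMAP mappings.
 */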
Ben Gardon8283e362021-11-15 15:45:58 -08002591kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
Dan Williamsba049e92016-01-15 16:56:11 -08002592 bool atomic, bool *async, bool write_fault,
David Stevens4a42d842021-02-22 11:45:22 +09002593 bool *writable, hva_t *hva)
Xiao Guangrong887c08a2010-08-22 19:10:28 +08002594{
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002595 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
2596
David Stevens4a42d842021-02-22 11:45:22 +09002597 if (hva)
2598 *hva = addr;
2599
Paolo Bonzinib2740d32016-02-23 15:36:01 +01002600 if (addr == KVM_HVA_ERR_RO_BAD) {
2601 if (writable)
2602 *writable = false;
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002603 return KVM_PFN_ERR_RO_FAULT;
Paolo Bonzinib2740d32016-02-23 15:36:01 +01002604 }
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002605
Paolo Bonzinib2740d32016-02-23 15:36:01 +01002606 if (kvm_is_error_hva(addr)) {
2607 if (writable)
2608 *writable = false;
Xiao Guangrong81c52c52012-10-16 20:10:59 +08002609 return KVM_PFN_NOSLOT;
Paolo Bonzinib2740d32016-02-23 15:36:01 +01002610 }
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002611
 2612	/* Do not map a writable pfn into a read-only memslot. */
2613 if (writable && memslot_is_readonly(slot)) {
2614 *writable = false;
2615 writable = NULL;
2616 }
2617
2618 return hva_to_pfn(addr, atomic, async, write_fault,
2619 writable);
Xiao Guangrong887c08a2010-08-22 19:10:28 +08002620}
Paolo Bonzini35204692015-04-02 11:20:48 +02002621EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
Xiao Guangrong887c08a2010-08-22 19:10:28 +08002622
Dan Williamsba049e92016-01-15 16:56:11 -08002623kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
Marcelo Tosatti612819c2010-10-22 14:18:18 -02002624 bool *writable)
2625{
Paolo Bonzinie37afc62015-05-19 16:09:04 +02002626 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
David Stevens4a42d842021-02-22 11:45:22 +09002627 write_fault, writable, NULL);
Marcelo Tosatti612819c2010-10-22 14:18:18 -02002628}
2629EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
2630
Ben Gardon8283e362021-11-15 15:45:58 -08002631kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
Marcelo Tosatti506f0d62009-12-23 14:35:19 -02002632{
David Stevens4a42d842021-02-22 11:45:22 +09002633 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
Marcelo Tosatti506f0d62009-12-23 14:35:19 -02002634}
Paolo Bonzinie37afc62015-05-19 16:09:04 +02002635EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
Marcelo Tosatti506f0d62009-12-23 14:35:19 -02002636
Ben Gardon8283e362021-11-15 15:45:58 -08002637kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
Xiao Guangrong037d92d2012-08-21 10:59:12 +08002638{
David Stevens4a42d842021-02-22 11:45:22 +09002639 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
Xiao Guangrong037d92d2012-08-21 10:59:12 +08002640}
2641EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
2642
Dan Williamsba049e92016-01-15 16:56:11 -08002643kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
Paolo Bonzini8e734852015-05-17 13:58:53 +02002644{
2645 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2646}
2647EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
2648
Dan Williamsba049e92016-01-15 16:56:11 -08002649kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
Paolo Bonzinie37afc62015-05-19 16:09:04 +02002650{
2651 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
2652}
2653EXPORT_SYMBOL_GPL(gfn_to_pfn);
2654
Dan Williamsba049e92016-01-15 16:56:11 -08002655kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
Paolo Bonzini8e734852015-05-17 13:58:53 +02002656{
2657 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2658}
2659EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
2660
Paolo Bonzinid9ef13c2015-05-19 16:01:50 +02002661int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2662 struct page **pages, int nr_pages)
Xiao Guangrong48987782010-08-22 19:11:43 +08002663{
2664 unsigned long addr;
Arnd Bergmann076b9252017-08-10 14:14:39 +02002665 gfn_t entry = 0;
Xiao Guangrong48987782010-08-22 19:11:43 +08002666
Paolo Bonzinid9ef13c2015-05-19 16:01:50 +02002667 addr = gfn_to_hva_many(slot, gfn, &entry);
Xiao Guangrong48987782010-08-22 19:11:43 +08002668 if (kvm_is_error_hva(addr))
2669 return -1;
2670
2671 if (entry < nr_pages)
2672 return 0;
2673
Souptick Joarderdadbb612020-06-07 21:40:55 -07002674 return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
Xiao Guangrong48987782010-08-22 19:11:43 +08002675}
2676EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
2677
Dan Williamsba049e92016-01-15 16:56:11 -08002678static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
Xiao Guangronga2766322012-07-26 11:58:59 +08002679{
Xiao Guangrong81c52c52012-10-16 20:10:59 +08002680 if (is_error_noslot_pfn(pfn))
Xiao Guangrong6cede2e2012-08-03 15:41:22 +08002681 return KVM_ERR_PTR_BAD_PAGE;
Xiao Guangronga2766322012-07-26 11:58:59 +08002682
Ard Biesheuvelbf4bea82014-11-10 08:33:56 +00002683 if (kvm_is_reserved_pfn(pfn)) {
Xiao Guangrongcb9aaa32012-08-03 15:42:10 +08002684 WARN_ON(1);
2685 return KVM_ERR_PTR_BAD_PAGE;
2686 }
2687
Xiao Guangronga2766322012-07-26 11:58:59 +08002688 return pfn_to_page(pfn);
2689}
2690
Anthony Liguori35149e22008-04-02 14:46:56 -05002691struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
2692{
Dan Williamsba049e92016-01-15 16:56:11 -08002693 kvm_pfn_t pfn;
Anthony Liguori2e2e3732008-04-30 15:37:07 -05002694
2695 pfn = gfn_to_pfn(kvm, gfn);
Anthony Liguori2e2e3732008-04-30 15:37:07 -05002696
Xiao Guangronga2766322012-07-26 11:58:59 +08002697 return kvm_pfn_to_page(pfn);
Avi Kivity954bbbc2007-03-30 14:02:32 +03002698}
2699EXPORT_SYMBOL_GPL(gfn_to_page);
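
/*
 * Note: gfn_to_page() above (via kvm_pfn_to_page()) only works for memory
 * backed by refcounted struct pages; a reserved pfn (e.g. most MMIO
 * mappings) triggers the WARN_ON and yields KVM_ERR_PTR_BAD_PAGE.  Callers
 * that may touch such memory typically use kvm_vcpu_map() below or the
 * pfn-based helpers instead.
 */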
2700
David Woodhouse357a18a2021-11-15 16:50:27 +00002701void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
Boris Ostrovsky91724812019-12-05 01:30:51 +00002702{
2703 if (pfn == 0)
2704 return;
2705
Boris Ostrovsky91724812019-12-05 01:30:51 +00002706 if (dirty)
2707 kvm_release_pfn_dirty(pfn);
2708 else
2709 kvm_release_pfn_clean(pfn);
2710}
2711
David Woodhouse357a18a2021-11-15 16:50:27 +00002712int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
KarimAllah Ahmede45adf62019-01-31 21:24:34 +01002713{
2714 kvm_pfn_t pfn;
2715 void *hva = NULL;
2716 struct page *page = KVM_UNMAPPED_PAGE;
2717
2718 if (!map)
2719 return -EINVAL;
2720
David Woodhouse357a18a2021-11-15 16:50:27 +00002721 pfn = gfn_to_pfn(vcpu->kvm, gfn);
KarimAllah Ahmede45adf62019-01-31 21:24:34 +01002722 if (is_error_noslot_pfn(pfn))
2723 return -EINVAL;
2724
2725 if (pfn_valid(pfn)) {
2726 page = pfn_to_page(pfn);
David Woodhouse357a18a2021-11-15 16:50:27 +00002727 hva = kmap(page);
Paolo Bonzinid30b2142019-05-20 12:06:36 +02002728#ifdef CONFIG_HAS_IOMEM
Boris Ostrovsky91724812019-12-05 01:30:51 +00002729 } else {
David Woodhouse357a18a2021-11-15 16:50:27 +00002730 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
Paolo Bonzinid30b2142019-05-20 12:06:36 +02002731#endif
KarimAllah Ahmede45adf62019-01-31 21:24:34 +01002732 }
2733
2734 if (!hva)
2735 return -EFAULT;
2736
2737 map->page = page;
2738 map->hva = hva;
2739 map->pfn = pfn;
2740 map->gfn = gfn;
2741
2742 return 0;
2743}
KarimAllah Ahmede45adf62019-01-31 21:24:34 +01002744EXPORT_SYMBOL_GPL(kvm_vcpu_map);
2745
David Woodhouse357a18a2021-11-15 16:50:27 +00002746void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
KarimAllah Ahmede45adf62019-01-31 21:24:34 +01002747{
2748 if (!map)
2749 return;
2750
2751 if (!map->hva)
2752 return;
2753
David Woodhouse357a18a2021-11-15 16:50:27 +00002754 if (map->page != KVM_UNMAPPED_PAGE)
2755 kunmap(map->page);
Christian Borntraegereb1f2f32019-05-27 10:28:25 +02002756#ifdef CONFIG_HAS_IOMEM
Boris Ostrovsky91724812019-12-05 01:30:51 +00002757 else
David Woodhouse357a18a2021-11-15 16:50:27 +00002758 memunmap(map->hva);
Christian Borntraegereb1f2f32019-05-27 10:28:25 +02002759#endif
KarimAllah Ahmede45adf62019-01-31 21:24:34 +01002760
Boris Ostrovsky91724812019-12-05 01:30:51 +00002761 if (dirty)
David Woodhouse357a18a2021-11-15 16:50:27 +00002762 kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
Boris Ostrovsky91724812019-12-05 01:30:51 +00002763
David Woodhouse357a18a2021-11-15 16:50:27 +00002764 kvm_release_pfn(map->pfn, dirty);
KarimAllah Ahmede45adf62019-01-31 21:24:34 +01002765
2766 map->hva = NULL;
2767 map->page = NULL;
2768}
2769EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
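
/*
 * Usage sketch (hypothetical caller; @gpa, @data and @len are illustrative,
 * gpa_to_gfn() is the usual gpa >> PAGE_SHIFT helper):
 *
 *	struct kvm_host_map map;
 *
 *	if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
 *		return -EFAULT;
 *	memcpy(map.hva + offset_in_page(gpa), data, len);
 *	kvm_vcpu_unmap(vcpu, &map, true);
 *
 * where the final 'true' marks the page dirty before releasing it.  The pair
 * works both for ordinary memory (kmap) and, with CONFIG_HAS_IOMEM, for
 * memory without a struct page (memremap).
 */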
2770
Paolo Bonzini8e734852015-05-17 13:58:53 +02002771struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2772{
Dan Williamsba049e92016-01-15 16:56:11 -08002773 kvm_pfn_t pfn;
Paolo Bonzini8e734852015-05-17 13:58:53 +02002774
2775 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
2776
2777 return kvm_pfn_to_page(pfn);
2778}
2779EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
2780
Izik Eidusb4231d62007-11-20 11:49:33 +02002781void kvm_release_page_clean(struct page *page)
2782{
Xiao Guangrong32cad842012-08-03 15:42:52 +08002783 WARN_ON(is_error_page(page));
2784
Anthony Liguori35149e22008-04-02 14:46:56 -05002785 kvm_release_pfn_clean(page_to_pfn(page));
Izik Eidusb4231d62007-11-20 11:49:33 +02002786}
2787EXPORT_SYMBOL_GPL(kvm_release_page_clean);
2788
Dan Williamsba049e92016-01-15 16:56:11 -08002789void kvm_release_pfn_clean(kvm_pfn_t pfn)
Anthony Liguori35149e22008-04-02 14:46:56 -05002790{
Ard Biesheuvelbf4bea82014-11-10 08:33:56 +00002791 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
Anthony Liguori2e2e3732008-04-30 15:37:07 -05002792 put_page(pfn_to_page(pfn));
Anthony Liguori35149e22008-04-02 14:46:56 -05002793}
2794EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
2795
Izik Eidusb4231d62007-11-20 11:49:33 +02002796void kvm_release_page_dirty(struct page *page)
Izik Eidus8a7ae052007-10-18 11:09:33 +02002797{
Xiao Guangronga2766322012-07-26 11:58:59 +08002798 WARN_ON(is_error_page(page));
2799
Anthony Liguori35149e22008-04-02 14:46:56 -05002800 kvm_release_pfn_dirty(page_to_pfn(page));
Izik Eidus8a7ae052007-10-18 11:09:33 +02002801}
Izik Eidusb4231d62007-11-20 11:49:33 +02002802EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
Izik Eidus8a7ae052007-10-18 11:09:33 +02002803
David Hildenbrandf7a65092017-09-01 17:11:43 +02002804void kvm_release_pfn_dirty(kvm_pfn_t pfn)
Anthony Liguori35149e22008-04-02 14:46:56 -05002805{
2806 kvm_set_pfn_dirty(pfn);
2807 kvm_release_pfn_clean(pfn);
2808}
David Hildenbrandf7a65092017-09-01 17:11:43 +02002809EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
Anthony Liguori35149e22008-04-02 14:46:56 -05002810
Dan Williamsba049e92016-01-15 16:56:11 -08002811void kvm_set_pfn_dirty(kvm_pfn_t pfn)
Anthony Liguori35149e22008-04-02 14:46:56 -05002812{
Miaohe Lind29c03a2019-12-05 11:05:05 +08002813 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2814 SetPageDirty(pfn_to_page(pfn));
Anthony Liguori35149e22008-04-02 14:46:56 -05002815}
2816EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
2817
Dan Williamsba049e92016-01-15 16:56:11 -08002818void kvm_set_pfn_accessed(kvm_pfn_t pfn)
Anthony Liguori35149e22008-04-02 14:46:56 -05002819{
Sean Christophersona78986a2019-11-11 14:12:27 -08002820 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
Anthony Liguori2e2e3732008-04-30 15:37:07 -05002821 mark_page_accessed(pfn_to_page(pfn));
Anthony Liguori35149e22008-04-02 14:46:56 -05002822}
2823EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
2824
Izik Eidus195aefd2007-10-01 22:14:18 +02002825static int next_segment(unsigned long len, int offset)
2826{
2827 if (len > PAGE_SIZE - offset)
2828 return PAGE_SIZE - offset;
2829 else
2830 return len;
2831}
2832
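/*
 * The guest read/write/clear loops below split a (gpa, len) range into
 * per-page segments using next_segment().  Worked example, assuming 4 KiB
 * pages: for offset_in_page(gpa) == 0xf00 and len == 0x300, the first
 * segment is 0x100 bytes (the rest of the current page); the loop then
 * advances to the next gfn with offset 0 and the second segment covers the
 * remaining 0x200 bytes.
 */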
Paolo Bonzini8e734852015-05-17 13:58:53 +02002833static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
2834 void *data, int offset, int len)
Izik Eidus195aefd2007-10-01 22:14:18 +02002835{
Izik Eiduse0506bc2007-11-11 22:10:22 +02002836 int r;
2837 unsigned long addr;
Izik Eidus195aefd2007-10-01 22:14:18 +02002838
Paolo Bonzini8e734852015-05-17 13:58:53 +02002839 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
Izik Eiduse0506bc2007-11-11 22:10:22 +02002840 if (kvm_is_error_hva(addr))
Izik Eidus195aefd2007-10-01 22:14:18 +02002841 return -EFAULT;
Paolo Bonzini3180a7f2015-04-02 14:08:20 +02002842 r = __copy_from_user(data, (void __user *)addr + offset, len);
Izik Eiduse0506bc2007-11-11 22:10:22 +02002843 if (r)
2844 return -EFAULT;
Izik Eidus195aefd2007-10-01 22:14:18 +02002845 return 0;
2846}
Paolo Bonzini8e734852015-05-17 13:58:53 +02002847
2848int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
2849 int len)
2850{
2851 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2852
2853 return __kvm_read_guest_page(slot, gfn, data, offset, len);
2854}
Izik Eidus195aefd2007-10-01 22:14:18 +02002855EXPORT_SYMBOL_GPL(kvm_read_guest_page);
2856
Paolo Bonzini8e734852015-05-17 13:58:53 +02002857int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
2858 int offset, int len)
2859{
2860 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2861
2862 return __kvm_read_guest_page(slot, gfn, data, offset, len);
2863}
2864EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
2865
Izik Eidus195aefd2007-10-01 22:14:18 +02002866int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
2867{
2868 gfn_t gfn = gpa >> PAGE_SHIFT;
2869 int seg;
2870 int offset = offset_in_page(gpa);
2871 int ret;
2872
2873 while ((seg = next_segment(len, offset)) != 0) {
2874 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
2875 if (ret < 0)
2876 return ret;
2877 offset = 0;
2878 len -= seg;
2879 data += seg;
2880 ++gfn;
2881 }
2882 return 0;
2883}
2884EXPORT_SYMBOL_GPL(kvm_read_guest);
2885
Paolo Bonzini8e734852015-05-17 13:58:53 +02002886int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
2887{
2888 gfn_t gfn = gpa >> PAGE_SHIFT;
2889 int seg;
2890 int offset = offset_in_page(gpa);
2891 int ret;
2892
2893 while ((seg = next_segment(len, offset)) != 0) {
2894 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
2895 if (ret < 0)
2896 return ret;
2897 offset = 0;
2898 len -= seg;
2899 data += seg;
2900 ++gfn;
2901 }
2902 return 0;
2903}
2904EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
2905
2906static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2907 void *data, int offset, unsigned long len)
Marcelo Tosatti7ec54582007-12-20 19:18:23 -05002908{
2909 int r;
2910 unsigned long addr;
Marcelo Tosatti7ec54582007-12-20 19:18:23 -05002911
Paolo Bonzini8e734852015-05-17 13:58:53 +02002912 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
Marcelo Tosatti7ec54582007-12-20 19:18:23 -05002913 if (kvm_is_error_hva(addr))
2914 return -EFAULT;
Andrea Arcangeli0aac03f2008-01-30 19:57:35 +01002915 pagefault_disable();
Paolo Bonzini3180a7f2015-04-02 14:08:20 +02002916 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
Andrea Arcangeli0aac03f2008-01-30 19:57:35 +01002917 pagefault_enable();
Marcelo Tosatti7ec54582007-12-20 19:18:23 -05002918 if (r)
2919 return -EFAULT;
2920 return 0;
2921}
Marcelo Tosatti7ec54582007-12-20 19:18:23 -05002922
Paolo Bonzini8e734852015-05-17 13:58:53 +02002923int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
2924 void *data, unsigned long len)
2925{
2926 gfn_t gfn = gpa >> PAGE_SHIFT;
2927 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2928 int offset = offset_in_page(gpa);
2929
2930 return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
2931}
2932EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
2933
Peter Xu28bd7262020-09-30 21:20:34 -04002934static int __kvm_write_guest_page(struct kvm *kvm,
2935 struct kvm_memory_slot *memslot, gfn_t gfn,
Paolo Bonzini8e734852015-05-17 13:58:53 +02002936 const void *data, int offset, int len)
Izik Eidus195aefd2007-10-01 22:14:18 +02002937{
Izik Eiduse0506bc2007-11-11 22:10:22 +02002938 int r;
2939 unsigned long addr;
Izik Eidus195aefd2007-10-01 22:14:18 +02002940
Radim Krčmář251eb842015-04-10 21:47:27 +02002941 addr = gfn_to_hva_memslot(memslot, gfn);
Izik Eiduse0506bc2007-11-11 22:10:22 +02002942 if (kvm_is_error_hva(addr))
Izik Eidus195aefd2007-10-01 22:14:18 +02002943 return -EFAULT;
Xiao Guangrong8b0cedf2011-05-15 23:22:04 +08002944 r = __copy_to_user((void __user *)addr + offset, data, len);
Izik Eiduse0506bc2007-11-11 22:10:22 +02002945 if (r)
2946 return -EFAULT;
Peter Xu28bd7262020-09-30 21:20:34 -04002947 mark_page_dirty_in_slot(kvm, memslot, gfn);
Izik Eidus195aefd2007-10-01 22:14:18 +02002948 return 0;
2949}
Paolo Bonzini8e734852015-05-17 13:58:53 +02002950
2951int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
2952 const void *data, int offset, int len)
2953{
2954 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2955
Peter Xu28bd7262020-09-30 21:20:34 -04002956 return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
Paolo Bonzini8e734852015-05-17 13:58:53 +02002957}
Izik Eidus195aefd2007-10-01 22:14:18 +02002958EXPORT_SYMBOL_GPL(kvm_write_guest_page);
2959
Paolo Bonzini8e734852015-05-17 13:58:53 +02002960int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
2961 const void *data, int offset, int len)
2962{
2963 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2964
Peter Xu28bd7262020-09-30 21:20:34 -04002965 return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
Paolo Bonzini8e734852015-05-17 13:58:53 +02002966}
2967EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
2968
Izik Eidus195aefd2007-10-01 22:14:18 +02002969int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
2970 unsigned long len)
2971{
2972 gfn_t gfn = gpa >> PAGE_SHIFT;
2973 int seg;
2974 int offset = offset_in_page(gpa);
2975 int ret;
2976
2977 while ((seg = next_segment(len, offset)) != 0) {
2978 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
2979 if (ret < 0)
2980 return ret;
2981 offset = 0;
2982 len -= seg;
2983 data += seg;
2984 ++gfn;
2985 }
2986 return 0;
2987}
Wincy Vanff651cb2014-12-11 08:52:58 +03002988EXPORT_SYMBOL_GPL(kvm_write_guest);
Izik Eidus195aefd2007-10-01 22:14:18 +02002989
Paolo Bonzini8e734852015-05-17 13:58:53 +02002990int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
2991 unsigned long len)
2992{
2993 gfn_t gfn = gpa >> PAGE_SHIFT;
2994 int seg;
2995 int offset = offset_in_page(gpa);
2996 int ret;
2997
2998 while ((seg = next_segment(len, offset)) != 0) {
2999 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
3000 if (ret < 0)
3001 return ret;
3002 offset = 0;
3003 len -= seg;
3004 data += seg;
3005 ++gfn;
3006 }
3007 return 0;
3008}
3009EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
3010
Paolo Bonzini5a2d4362017-02-03 20:32:28 -08003011static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
3012 struct gfn_to_hva_cache *ghc,
3013 gpa_t gpa, unsigned long len)
Gleb Natapov49c77542010-10-18 15:22:23 +02003014{
Gleb Natapov49c77542010-10-18 15:22:23 +02003015 int offset = offset_in_page(gpa);
Andrew Honig8f964522013-03-29 09:35:21 -07003016 gfn_t start_gfn = gpa >> PAGE_SHIFT;
3017 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
3018 gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
3019 gfn_t nr_pages_avail;
Gleb Natapov49c77542010-10-18 15:22:23 +02003020
Sean Christopherson6ad1e292020-01-09 14:58:55 -05003021 /* Update ghc->generation before performing any error checks. */
Gleb Natapov49c77542010-10-18 15:22:23 +02003022 ghc->generation = slots->generation;
Sean Christopherson6ad1e292020-01-09 14:58:55 -05003023
3024 if (start_gfn > end_gfn) {
3025 ghc->hva = KVM_HVA_ERR_BAD;
3026 return -EINVAL;
3027 }
Jim Mattsonf1b9dd52018-12-17 13:53:33 -08003028
3029 /*
3030 * If the requested region crosses two memslots, we still
3031 * verify that the entire region is valid here.
3032 */
Sean Christopherson6ad1e292020-01-09 14:58:55 -05003033 for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
Jim Mattsonf1b9dd52018-12-17 13:53:33 -08003034 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
3035 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
3036 &nr_pages_avail);
3037 if (kvm_is_error_hva(ghc->hva))
Sean Christopherson6ad1e292020-01-09 14:58:55 -05003038 return -EFAULT;
Andrew Honig8f964522013-03-29 09:35:21 -07003039 }
Jim Mattsonf1b9dd52018-12-17 13:53:33 -08003040
3041 /* Use the slow path for cross page reads and writes. */
Sean Christopherson6ad1e292020-01-09 14:58:55 -05003042 if (nr_pages_needed == 1)
Jim Mattsonf1b9dd52018-12-17 13:53:33 -08003043 ghc->hva += offset;
3044 else
3045 ghc->memslot = NULL;
3046
Sean Christopherson6ad1e292020-01-09 14:58:55 -05003047 ghc->gpa = gpa;
3048 ghc->len = len;
3049 return 0;
Gleb Natapov49c77542010-10-18 15:22:23 +02003050}
Paolo Bonzini5a2d4362017-02-03 20:32:28 -08003051
Paolo Bonzini4e335d92017-05-02 16:20:18 +02003052int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
Paolo Bonzini5a2d4362017-02-03 20:32:28 -08003053 gpa_t gpa, unsigned long len)
3054{
Paolo Bonzini4e335d92017-05-02 16:20:18 +02003055 struct kvm_memslots *slots = kvm_memslots(kvm);
Paolo Bonzini5a2d4362017-02-03 20:32:28 -08003056 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
3057}
Paolo Bonzini4e335d92017-05-02 16:20:18 +02003058EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
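
/*
 * Usage sketch (hypothetical caller): a gfn_to_hva_cache is initialized once
 * for a fixed guest buffer and then reused on the hot path, e.g. with an
 * illustrative guest-shared structure @val at guest physical address @gpa:
 *
 *	struct gfn_to_hva_cache ghc;
 *
 *	if (kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa, sizeof(val)))
 *		return -EFAULT;
 *	...
 *	kvm_write_guest_offset_cached(kvm, &ghc, &val, 0, sizeof(val));
 *
 * The cache is revalidated automatically when the memslot generation changes
 * and falls back to kvm_write_guest()/kvm_read_guest() when the cached
 * region spans more than one page.
 */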
Gleb Natapov49c77542010-10-18 15:22:23 +02003059
Paolo Bonzini4e335d92017-05-02 16:20:18 +02003060int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
Jim Mattson7a86dab2018-12-14 14:34:43 -08003061 void *data, unsigned int offset,
3062 unsigned long len)
Gleb Natapov49c77542010-10-18 15:22:23 +02003063{
Paolo Bonzini4e335d92017-05-02 16:20:18 +02003064 struct kvm_memslots *slots = kvm_memslots(kvm);
Gleb Natapov49c77542010-10-18 15:22:23 +02003065 int r;
Pan Xinhui4ec6e862016-11-02 05:08:34 -04003066 gpa_t gpa = ghc->gpa + offset;
Gleb Natapov49c77542010-10-18 15:22:23 +02003067
Paolo Bonzini5f25e712021-11-22 18:24:01 -05003068 if (WARN_ON_ONCE(len + offset > ghc->len))
3069 return -EINVAL;
Andrew Honig8f964522013-03-29 09:35:21 -07003070
Sean Christophersondc9ce712020-01-09 15:56:20 -08003071 if (slots->generation != ghc->generation) {
3072 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3073 return -EFAULT;
3074 }
Andrew Honig8f964522013-03-29 09:35:21 -07003075
Gleb Natapov49c77542010-10-18 15:22:23 +02003076 if (kvm_is_error_hva(ghc->hva))
3077 return -EFAULT;
3078
Sean Christophersonfcfbc612020-01-09 15:56:18 -08003079 if (unlikely(!ghc->memslot))
3080 return kvm_write_guest(kvm, gpa, data, len);
3081
Pan Xinhui4ec6e862016-11-02 05:08:34 -04003082 r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
Gleb Natapov49c77542010-10-18 15:22:23 +02003083 if (r)
3084 return -EFAULT;
Peter Xu28bd7262020-09-30 21:20:34 -04003085 mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
Gleb Natapov49c77542010-10-18 15:22:23 +02003086
3087 return 0;
3088}
Paolo Bonzini4e335d92017-05-02 16:20:18 +02003089EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
Pan Xinhui4ec6e862016-11-02 05:08:34 -04003090
Paolo Bonzini4e335d92017-05-02 16:20:18 +02003091int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3092 void *data, unsigned long len)
Pan Xinhui4ec6e862016-11-02 05:08:34 -04003093{
Paolo Bonzini4e335d92017-05-02 16:20:18 +02003094 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
Pan Xinhui4ec6e862016-11-02 05:08:34 -04003095}
Paolo Bonzini4e335d92017-05-02 16:20:18 +02003096EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
Gleb Natapov49c77542010-10-18 15:22:23 +02003097
Vitaly Kuznetsov0958f0c2020-05-25 16:41:19 +02003098int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3099 void *data, unsigned int offset,
3100 unsigned long len)
Gleb Natapove03b6442011-07-11 15:28:11 -04003101{
Paolo Bonzini4e335d92017-05-02 16:20:18 +02003102 struct kvm_memslots *slots = kvm_memslots(kvm);
Gleb Natapove03b6442011-07-11 15:28:11 -04003103 int r;
Vitaly Kuznetsov0958f0c2020-05-25 16:41:19 +02003104 gpa_t gpa = ghc->gpa + offset;
Gleb Natapove03b6442011-07-11 15:28:11 -04003105
Paolo Bonzini5f25e712021-11-22 18:24:01 -05003106 if (WARN_ON_ONCE(len + offset > ghc->len))
3107 return -EINVAL;
Andrew Honig8f964522013-03-29 09:35:21 -07003108
Sean Christophersondc9ce712020-01-09 15:56:20 -08003109 if (slots->generation != ghc->generation) {
3110 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3111 return -EFAULT;
3112 }
Andrew Honig8f964522013-03-29 09:35:21 -07003113
Gleb Natapove03b6442011-07-11 15:28:11 -04003114 if (kvm_is_error_hva(ghc->hva))
3115 return -EFAULT;
3116
Sean Christophersonfcfbc612020-01-09 15:56:18 -08003117 if (unlikely(!ghc->memslot))
Vitaly Kuznetsov0958f0c2020-05-25 16:41:19 +02003118 return kvm_read_guest(kvm, gpa, data, len);
Sean Christophersonfcfbc612020-01-09 15:56:18 -08003119
Vitaly Kuznetsov0958f0c2020-05-25 16:41:19 +02003120 r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
Gleb Natapove03b6442011-07-11 15:28:11 -04003121 if (r)
3122 return -EFAULT;
3123
3124 return 0;
3125}
Vitaly Kuznetsov0958f0c2020-05-25 16:41:19 +02003126EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
3127
3128int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3129 void *data, unsigned long len)
3130{
3131 return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
3132}
Paolo Bonzini4e335d92017-05-02 16:20:18 +02003133EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
Gleb Natapove03b6442011-07-11 15:28:11 -04003134
Izik Eidus195aefd2007-10-01 22:14:18 +02003135int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
3136{
Paolo Bonzini2f541442020-11-06 05:25:09 -05003137 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
Izik Eidus195aefd2007-10-01 22:14:18 +02003138 gfn_t gfn = gpa >> PAGE_SHIFT;
3139 int seg;
3140 int offset = offset_in_page(gpa);
3141 int ret;
3142
Kevin Mulveybfda0e82015-02-20 08:21:36 -05003143 while ((seg = next_segment(len, offset)) != 0) {
Paolo Bonzini2f541442020-11-06 05:25:09 -05003144		ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
Izik Eidus195aefd2007-10-01 22:14:18 +02003145 if (ret < 0)
3146 return ret;
3147 offset = 0;
3148 len -= seg;
3149 ++gfn;
3150 }
3151 return 0;
3152}
3153EXPORT_SYMBOL_GPL(kvm_clear_guest);
3154
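/*
 * Dirty tracking note: when a dirty ring is in use (kvm->dirty_ring_size is
 * non-zero) the (slot id, relative gfn) pair is pushed onto the running
 * vCPU's dirty ring; otherwise the corresponding bit is set in the memslot's
 * dirty_bitmap.  Nothing is recorded unless dirty tracking is enabled for
 * the slot (kvm_slot_dirty_track_enabled()).
 */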
Peter Xu28bd7262020-09-30 21:20:34 -04003155void mark_page_dirty_in_slot(struct kvm *kvm,
Ben Gardon8283e362021-11-15 15:45:58 -08003156 const struct kvm_memory_slot *memslot,
Peter Xu28bd7262020-09-30 21:20:34 -04003157 gfn_t gfn)
Avi Kivity6aa8b732006-12-10 02:21:36 -08003158{
David Woodhouse2efd61a2021-12-10 16:36:20 +00003159 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
3160
Christian Borntraegere09fccb2022-01-13 13:29:24 +01003161#ifdef CONFIG_HAVE_KVM_DIRTY_RING
David Woodhouse2efd61a2021-12-10 16:36:20 +00003162 if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm))
3163 return;
Christian Borntraegere09fccb2022-01-13 13:29:24 +01003164#endif
David Woodhouse2efd61a2021-12-10 16:36:20 +00003165
Peter Xu044c59c2020-09-30 21:22:26 -04003166 if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
Rusty Russell7e9d6192007-07-31 20:41:14 +10003167 unsigned long rel_gfn = gfn - memslot->base_gfn;
Peter Xufb04a1e2020-09-30 21:22:22 -04003168 u32 slot = (memslot->as_id << 16) | memslot->id;
Avi Kivity6aa8b732006-12-10 02:21:36 -08003169
Peter Xufb04a1e2020-09-30 21:22:22 -04003170 if (kvm->dirty_ring_size)
David Woodhouse2efd61a2021-12-10 16:36:20 +00003171 kvm_dirty_ring_push(&vcpu->dirty_ring,
Peter Xufb04a1e2020-09-30 21:22:22 -04003172 slot, rel_gfn);
3173 else
3174 set_bit_le(rel_gfn, memslot->dirty_bitmap);
Avi Kivity6aa8b732006-12-10 02:21:36 -08003175 }
3176}
Ben Gardona6a0b052020-10-14 11:26:55 -07003177EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
Avi Kivity6aa8b732006-12-10 02:21:36 -08003178
Gleb Natapov49c77542010-10-18 15:22:23 +02003179void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
3180{
3181 struct kvm_memory_slot *memslot;
3182
3183 memslot = gfn_to_memslot(kvm, gfn);
Peter Xu28bd7262020-09-30 21:20:34 -04003184 mark_page_dirty_in_slot(kvm, memslot, gfn);
Gleb Natapov49c77542010-10-18 15:22:23 +02003185}
Aneesh Kumar K.V2ba9f0d2013-10-07 22:17:59 +05303186EXPORT_SYMBOL_GPL(mark_page_dirty);
Gleb Natapov49c77542010-10-18 15:22:23 +02003187
Paolo Bonzini8e734852015-05-17 13:58:53 +02003188void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3189{
3190 struct kvm_memory_slot *memslot;
3191
3192 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
Peter Xu28bd7262020-09-30 21:20:34 -04003193 mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
Paolo Bonzini8e734852015-05-17 13:58:53 +02003194}
3195EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
3196
Jan H. Schönherr20b70352017-11-24 22:39:01 +01003197void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3198{
3199 if (!vcpu->sigset_active)
3200 return;
3201
3202 /*
3203 * This does a lockless modification of ->real_blocked, which is fine
 3204	 * because only current can change ->real_blocked and all readers of
 3205	 * ->real_blocked don't care as long as ->real_blocked is always a subset
3206 * of ->blocked.
3207 */
3208 sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3209}
3210
3211void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3212{
3213 if (!vcpu->sigset_active)
3214 return;
3215
3216 sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
3217 sigemptyset(&current->real_blocked);
3218}
3219
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003220static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3221{
Nir Weinerdee339b2019-01-27 12:17:16 +02003222 unsigned int old, val, grow, grow_start;
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003223
Wanpeng Li2cbd7822015-09-03 22:07:39 +08003224 old = val = vcpu->halt_poll_ns;
Nir Weinerdee339b2019-01-27 12:17:16 +02003225 grow_start = READ_ONCE(halt_poll_ns_grow_start);
Christian Borntraeger6b6de682016-02-09 13:47:55 +01003226 grow = READ_ONCE(halt_poll_ns_grow);
Nir Weiner7fa08e72019-01-27 12:17:14 +02003227 if (!grow)
3228 goto out;
3229
Nir Weinerdee339b2019-01-27 12:17:16 +02003230 val *= grow;
3231 if (val < grow_start)
3232 val = grow_start;
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003233
David Matlack258785e2021-05-06 15:24:43 +00003234 if (val > vcpu->kvm->max_halt_poll_ns)
3235 val = vcpu->kvm->max_halt_poll_ns;
David Matlack313f6362016-03-08 16:19:44 -08003236
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003237 vcpu->halt_poll_ns = val;
Nir Weiner7fa08e72019-01-27 12:17:14 +02003238out:
Wanpeng Li2cbd7822015-09-03 22:07:39 +08003239 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003240}
3241
3242static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3243{
Sergey Senozhatskyae232ea2021-09-02 12:11:00 +09003244 unsigned int old, val, shrink, grow_start;
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003245
Wanpeng Li2cbd7822015-09-03 22:07:39 +08003246 old = val = vcpu->halt_poll_ns;
Christian Borntraeger6b6de682016-02-09 13:47:55 +01003247 shrink = READ_ONCE(halt_poll_ns_shrink);
Sergey Senozhatskyae232ea2021-09-02 12:11:00 +09003248 grow_start = READ_ONCE(halt_poll_ns_grow_start);
Christian Borntraeger6b6de682016-02-09 13:47:55 +01003249 if (shrink == 0)
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003250 val = 0;
3251 else
Christian Borntraeger6b6de682016-02-09 13:47:55 +01003252 val /= shrink;
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003253
Sergey Senozhatskyae232ea2021-09-02 12:11:00 +09003254 if (val < grow_start)
3255 val = 0;
3256
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003257 vcpu->halt_poll_ns = val;
Wanpeng Li2cbd7822015-09-03 22:07:39 +08003258 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003259}
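
/*
 * Worked example (assuming the module parameter defaults in this version:
 * halt_poll_ns_grow = 2, halt_poll_ns_grow_start = 10000 and
 * halt_poll_ns_shrink = 0): a vCPU that repeatedly blocks for less than
 * kvm->max_halt_poll_ns has halt_poll_ns grown 0 -> 10000 -> 20000 ->
 * 40000 ... ns, capped at kvm->max_halt_poll_ns, while an invalid wakeup or
 * a block longer than max_halt_poll_ns resets halt_poll_ns straight back to
 * 0, because a shrink factor of 0 clears the value instead of dividing it.
 */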
3260
Paolo Bonzinif7819512015-02-04 18:20:58 +01003261static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3262{
Junaid Shahid50c28f22018-06-27 14:59:11 -07003263 int ret = -EINTR;
3264 int idx = srcu_read_lock(&vcpu->kvm->srcu);
3265
Paolo Bonzinif7819512015-02-04 18:20:58 +01003266 if (kvm_arch_vcpu_runnable(vcpu)) {
3267 kvm_make_request(KVM_REQ_UNHALT, vcpu);
Junaid Shahid50c28f22018-06-27 14:59:11 -07003268 goto out;
Paolo Bonzinif7819512015-02-04 18:20:58 +01003269 }
3270 if (kvm_cpu_has_pending_timer(vcpu))
Junaid Shahid50c28f22018-06-27 14:59:11 -07003271 goto out;
Paolo Bonzinif7819512015-02-04 18:20:58 +01003272 if (signal_pending(current))
Junaid Shahid50c28f22018-06-27 14:59:11 -07003273 goto out;
Marcelo Tosatti084071d2021-05-25 10:41:17 -03003274 if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3275 goto out;
Paolo Bonzinif7819512015-02-04 18:20:58 +01003276
Junaid Shahid50c28f22018-06-27 14:59:11 -07003277 ret = 0;
3278out:
3279 srcu_read_unlock(&vcpu->kvm->srcu, idx);
3280 return ret;
Paolo Bonzinif7819512015-02-04 18:20:58 +01003281}
3282
Eddie Dongb6958ce2007-07-18 12:15:21 +03003283/*
Sean Christophersonfac42682021-10-08 19:12:07 -07003284 * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is
3285 * pending. This is mostly used when halting a vCPU, but may also be used
3286 * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI.
Eddie Dongb6958ce2007-07-18 12:15:21 +03003287 */
Sean Christophersonfac42682021-10-08 19:12:07 -07003288bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
Eddie Dongb6958ce2007-07-18 12:15:21 +03003289{
Sean Christophersonfac42682021-10-08 19:12:07 -07003290 struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
Paolo Bonzinif7819512015-02-04 18:20:58 +01003291 bool waited = false;
Sean Christophersonfac42682021-10-08 19:12:07 -07003292
Jing Zhangc3858332021-10-08 19:12:08 -07003293 vcpu->stat.generic.blocking = 1;
Paolo Bonzinif7819512015-02-04 18:20:58 +01003294
Marc Zyngier07ab0f82019-08-02 11:37:09 +01003295 kvm_arch_vcpu_blocking(vcpu);
3296
Sean Christophersonfac42682021-10-08 19:12:07 -07003297 prepare_to_rcuwait(wait);
Marcelo Tosattie5c239c2008-05-08 19:47:01 -03003298 for (;;) {
Davidlohr Buesoda4ad882020-04-23 22:48:37 -07003299 set_current_state(TASK_INTERRUPTIBLE);
Eddie Dongb6958ce2007-07-18 12:15:21 +03003300
Paolo Bonzinif7819512015-02-04 18:20:58 +01003301 if (kvm_vcpu_check_block(vcpu) < 0)
Marcelo Tosattie5c239c2008-05-08 19:47:01 -03003302 break;
3303
Paolo Bonzinif7819512015-02-04 18:20:58 +01003304 waited = true;
Eddie Dongb6958ce2007-07-18 12:15:21 +03003305 schedule();
Eddie Dongb6958ce2007-07-18 12:15:21 +03003306 }
Sean Christophersonfac42682021-10-08 19:12:07 -07003307 finish_rcuwait(wait);
3308
3309 kvm_arch_vcpu_unblocking(vcpu);
3310
Jing Zhangc3858332021-10-08 19:12:08 -07003311 vcpu->stat.generic.blocking = 0;
3312
Sean Christophersonfac42682021-10-08 19:12:07 -07003313 return waited;
3314}
3315
Sean Christopherson29e72892021-10-08 19:11:59 -07003316static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
3317 ktime_t end, bool success)
Eddie Dongb6958ce2007-07-18 12:15:21 +03003318{
Sean Christopherson30c943472021-10-08 19:12:00 -07003319 struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic;
Sean Christopherson29e72892021-10-08 19:11:59 -07003320 u64 poll_ns = ktime_to_ns(ktime_sub(end, start));
3321
Sean Christopherson30c943472021-10-08 19:12:00 -07003322 ++vcpu->stat.generic.halt_attempted_poll;
3323
3324 if (success) {
3325 ++vcpu->stat.generic.halt_successful_poll;
3326
3327 if (!vcpu_valid_wakeup(vcpu))
3328 ++vcpu->stat.generic.halt_poll_invalid;
3329
3330 stats->halt_poll_success_ns += poll_ns;
3331 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns);
3332 } else {
3333 stats->halt_poll_fail_ns += poll_ns;
3334 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns);
3335 }
Marcelo Tosattie5c239c2008-05-08 19:47:01 -03003336}
Eddie Dongb6958ce2007-07-18 12:15:21 +03003337
Sean Christophersonfac42682021-10-08 19:12:07 -07003338/*
3339 * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc... If halt
3340 * polling is enabled, busy wait for a short time before blocking to avoid the
3341 * expensive block+unblock sequence if a wake event arrives soon after the vCPU
3342 * is halted.
3343 */
Sean Christopherson91b99ea2021-10-08 19:12:06 -07003344void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
Yaozu Dong3fca0362007-04-25 16:49:19 +03003345{
Sean Christopherson6f390912021-10-08 19:11:56 -07003346 bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
Sean Christopherson8df6a612021-10-08 19:11:58 -07003347 bool do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
Avi Kivity6aa8b732006-12-10 02:21:36 -08003348 ktime_t start, cur, poll_end;
Avi Kivity6aa8b732006-12-10 02:21:36 -08003349 bool waited = false;
Sean Christopherson91b99ea2021-10-08 19:12:06 -07003350 u64 halt_ns;
Avi Kivity6aa8b732006-12-10 02:21:36 -08003351
Avi Kivity6aa8b732006-12-10 02:21:36 -08003352 start = cur = poll_end = ktime_get();
Sean Christopherson8df6a612021-10-08 19:11:58 -07003353 if (do_halt_poll) {
Sean Christopherson109a9822021-10-08 19:12:09 -07003354 ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);
Zhai, Edwind255f4f2009-10-09 18:03:20 +08003355
Zhai, Edwind255f4f2009-10-09 18:03:20 +08003356 do {
3357 /*
3358 * This sets KVM_REQ_UNHALT if an interrupt
3359 * arrives.
3360 */
Sean Christopherson30c943472021-10-08 19:12:00 -07003361 if (kvm_vcpu_check_block(vcpu) < 0)
Avi Kivity6aa8b732006-12-10 02:21:36 -08003362 goto out;
Avi Kivity6aa8b732006-12-10 02:21:36 -08003363 cpu_relax();
3364 poll_end = cur = ktime_get();
3365 } while (kvm_vcpu_can_poll(cur, stop));
Avi Kivity6aa8b732006-12-10 02:21:36 -08003366 }
3367
Sean Christophersonfac42682021-10-08 19:12:07 -07003368 waited = kvm_vcpu_block(vcpu);
Sean Christophersonf6c60d02021-10-08 19:12:04 -07003369
Paolo Bonzinif7819512015-02-04 18:20:58 +01003370 cur = ktime_get();
Jing Zhang87bcc5f2021-08-02 16:56:32 +00003371 if (waited) {
3372 vcpu->stat.generic.halt_wait_ns +=
3373 ktime_to_ns(cur) - ktime_to_ns(poll_end);
Jing Zhang8ccba532021-08-02 16:56:33 +00003374 KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
3375 ktime_to_ns(cur) - ktime_to_ns(poll_end));
Jing Zhang87bcc5f2021-08-02 16:56:32 +00003376 }
Paolo Bonzinif7819512015-02-04 18:20:58 +01003377out:
Sean Christopherson91b99ea2021-10-08 19:12:06 -07003378 /* The total time the vCPU was "halted", including polling time. */
3379 halt_ns = ktime_to_ns(cur) - ktime_to_ns(start);
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003380
Sean Christopherson29e72892021-10-08 19:11:59 -07003381 /*
3382 * Note, halt-polling is considered successful so long as the vCPU was
3383 * never actually scheduled out, i.e. even if the wake event arrived
 3384	 * after the halt-polling loop itself, but before the full wait.
3385 */
Sean Christopherson8df6a612021-10-08 19:11:58 -07003386 if (do_halt_poll)
Sean Christopherson29e72892021-10-08 19:11:59 -07003387 update_halt_poll_stats(vcpu, start, poll_end, !waited);
David Matlackcb953122020-05-08 11:22:40 -07003388
Sean Christopherson6f390912021-10-08 19:11:56 -07003389 if (halt_poll_allowed) {
Wanpeng Li44551b22019-09-29 09:06:56 +08003390 if (!vcpu_valid_wakeup(vcpu)) {
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003391 shrink_halt_poll_ns(vcpu);
David Matlackacd05782020-04-17 15:14:46 -07003392 } else if (vcpu->kvm->max_halt_poll_ns) {
Sean Christopherson91b99ea2021-10-08 19:12:06 -07003393 if (halt_ns <= vcpu->halt_poll_ns)
Wanpeng Li44551b22019-09-29 09:06:56 +08003394 ;
3395 /* we had a long block, shrink polling */
David Matlackacd05782020-04-17 15:14:46 -07003396 else if (vcpu->halt_poll_ns &&
Sean Christopherson91b99ea2021-10-08 19:12:06 -07003397 halt_ns > vcpu->kvm->max_halt_poll_ns)
Wanpeng Li44551b22019-09-29 09:06:56 +08003398 shrink_halt_poll_ns(vcpu);
3399 /* we had a short halt and our poll time is too small */
David Matlackacd05782020-04-17 15:14:46 -07003400 else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns &&
Sean Christopherson91b99ea2021-10-08 19:12:06 -07003401 halt_ns < vcpu->kvm->max_halt_poll_ns)
Wanpeng Li44551b22019-09-29 09:06:56 +08003402 grow_halt_poll_ns(vcpu);
3403 } else {
3404 vcpu->halt_poll_ns = 0;
3405 }
3406 }
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003407
Sean Christopherson91b99ea2021-10-08 19:12:06 -07003408 trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu));
Avi Kivity6aa8b732006-12-10 02:21:36 -08003409}
Sean Christopherson91b99ea2021-10-08 19:12:06 -07003410EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
Avi Kivity6aa8b732006-12-10 02:21:36 -08003411
Radim Krčmář178f02f2017-04-26 22:32:26 +02003412bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
Christoffer Dallb6d33832012-03-08 16:44:24 -05003413{
Sean Christophersond92a5d12021-10-08 19:12:12 -07003414 if (__kvm_vcpu_wake_up(vcpu)) {
Wanpeng Lid73eb572019-07-18 19:39:06 +08003415 WRITE_ONCE(vcpu->ready, true);
Jing Zhang0193cc92021-06-18 22:27:03 +00003416 ++vcpu->stat.generic.halt_wakeup;
Radim Krčmář178f02f2017-04-26 22:32:26 +02003417 return true;
Christoffer Dallb6d33832012-03-08 16:44:24 -05003418 }
3419
Radim Krčmář178f02f2017-04-26 22:32:26 +02003420 return false;
Radim Krčmářdd1a4cc2016-05-04 14:09:44 -05003421}
3422EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3423
Paolo Bonzini0266c892017-05-04 15:14:13 +02003424#ifndef CONFIG_S390
Radim Krčmářdd1a4cc2016-05-04 14:09:44 -05003425/*
3426 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
3427 */
3428void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3429{
Sean Christopherson85b64042021-08-27 11:25:09 +02003430 int me, cpu;
Radim Krčmářdd1a4cc2016-05-04 14:09:44 -05003431
Radim Krčmář178f02f2017-04-26 22:32:26 +02003432 if (kvm_vcpu_wake_up(vcpu))
3433 return;
3434
Paolo Bonziniaefdc2e2021-10-20 06:38:05 -04003435 me = get_cpu();
3436 /*
3437 * The only state change done outside the vcpu mutex is IN_GUEST_MODE
3438 * to EXITING_GUEST_MODE. Therefore the moderately expensive "should
3439 * kick" check does not need atomic operations if kvm_vcpu_kick is used
3440 * within the vCPU thread itself.
3441 */
3442 if (vcpu == __this_cpu_read(kvm_running_vcpu)) {
3443 if (vcpu->mode == IN_GUEST_MODE)
3444 WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE);
3445 goto out;
3446 }
3447
Sean Christopherson85b64042021-08-27 11:25:09 +02003448 /*
3449 * Note, the vCPU could get migrated to a different pCPU at any point
3450 * after kvm_arch_vcpu_should_kick(), which could result in sending an
3451 * IPI to the previous pCPU. But, that's ok because the purpose of the
3452 * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
3453 * vCPU also requires it to leave IN_GUEST_MODE.
3454 */
Sean Christopherson85b64042021-08-27 11:25:09 +02003455 if (kvm_arch_vcpu_should_kick(vcpu)) {
3456 cpu = READ_ONCE(vcpu->cpu);
3457 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
Christoffer Dallb6d33832012-03-08 16:44:24 -05003458 smp_send_reschedule(cpu);
Sean Christopherson85b64042021-08-27 11:25:09 +02003459 }
Paolo Bonziniaefdc2e2021-10-20 06:38:05 -04003460out:
Christoffer Dallb6d33832012-03-08 16:44:24 -05003461 put_cpu();
3462}
Yang Zhanga20ed542013-04-11 19:25:15 +08003463EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
Paolo Bonzini0266c892017-05-04 15:14:13 +02003464#endif /* !CONFIG_S390 */
Christoffer Dallb6d33832012-03-08 16:44:24 -05003465
Dan Carpenterfa933842014-05-23 13:20:42 +03003466int kvm_vcpu_yield_to(struct kvm_vcpu *target)
Konstantin Weitz41628d32012-04-25 15:30:38 +02003467{
3468 struct pid *pid;
3469 struct task_struct *task = NULL;
Dan Carpenterfa933842014-05-23 13:20:42 +03003470 int ret = 0;
Konstantin Weitz41628d32012-04-25 15:30:38 +02003471
3472 rcu_read_lock();
3473 pid = rcu_dereference(target->pid);
3474 if (pid)
Sam Bobroff27fbe64b2014-09-19 09:40:41 +10003475 task = get_pid_task(pid, PIDTYPE_PID);
Konstantin Weitz41628d32012-04-25 15:30:38 +02003476 rcu_read_unlock();
3477 if (!task)
Raghavendra K Tc45c5282013-01-22 13:09:24 +05303478 return ret;
Raghavendra K Tc45c5282013-01-22 13:09:24 +05303479 ret = yield_to(task, 1);
Konstantin Weitz41628d32012-04-25 15:30:38 +02003480 put_task_struct(task);
Raghavendra K Tc45c5282013-01-22 13:09:24 +05303481
3482 return ret;
Konstantin Weitz41628d32012-04-25 15:30:38 +02003483}
3484EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3485
Raghavendra K T06e48c52012-07-19 15:17:52 +05303486/*
3487 * Helper that checks whether a VCPU is eligible for directed yield.
 3488 * The most eligible candidate to yield to is decided by the following heuristics:
3489 *
3490 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently
3491 * (preempted lock holder), indicated by @in_spin_loop.
Fuad Tabba656012c2020-04-01 15:03:10 +01003492 * Set at the beginning and cleared at the end of interception/PLE handler.
Raghavendra K T06e48c52012-07-19 15:17:52 +05303493 *
3494 * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
3495 * chance last time (mostly it has become eligible now since we have probably
3496 * yielded to lockholder in last iteration. This is done by toggling
3497 * @dy_eligible each time a VCPU checked for eligibility.)
3498 *
3499 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
3500 * to preempted lock-holder could result in wrong VCPU selection and CPU
 3501 * burning. Giving priority to a potential lock-holder increases lock
3502 * progress.
3503 *
 3504 * Since the algorithm is based on heuristics, accessing another VCPU's data
 3505 * without locking does no harm. At worst it may result in trying to yield to
 3506 * the same VCPU, failing, and continuing with the next VCPU, and so on.
3507 */
Stephen Hemminger79408762013-12-29 12:12:29 -08003508static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
Raghavendra K T06e48c52012-07-19 15:17:52 +05303509{
Scott Wood4a55dd72014-01-09 18:43:16 -06003510#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
Raghavendra K T06e48c52012-07-19 15:17:52 +05303511 bool eligible;
3512
3513 eligible = !vcpu->spin_loop.in_spin_loop ||
Christian Borntraeger34656112014-09-04 21:13:31 +02003514 vcpu->spin_loop.dy_eligible;
Raghavendra K T06e48c52012-07-19 15:17:52 +05303515
3516 if (vcpu->spin_loop.in_spin_loop)
3517 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
3518
3519 return eligible;
Scott Wood4a55dd72014-01-09 18:43:16 -06003520#else
3521 return true;
Raghavendra K T06e48c52012-07-19 15:17:52 +05303522#endif
Scott Wood4a55dd72014-01-09 18:43:16 -06003523}
Raghavendra K Tc45c5282013-01-22 13:09:24 +05303524
Wanpeng Li17e433b2019-08-05 10:03:19 +08003525/*
3526 * Unlike kvm_arch_vcpu_runnable, this function is called outside
3527 * a vcpu_load/vcpu_put pair. However, for most architectures
3528 * kvm_arch_vcpu_runnable does not require vcpu_load.
3529 */
3530bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
3531{
3532 return kvm_arch_vcpu_runnable(vcpu);
3533}
3534
3535static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
3536{
3537 if (kvm_arch_dy_runnable(vcpu))
3538 return true;
3539
3540#ifdef CONFIG_KVM_ASYNC_PF
3541 if (!list_empty_careful(&vcpu->async_pf.done))
3542 return true;
3543#endif
3544
3545 return false;
3546}
3547
Wanpeng Li52acd222021-04-16 11:08:10 +08003548bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
3549{
3550 return false;
3551}
3552
Longpeng(Mike)199b5762017-08-08 12:05:32 +08003553void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
Zhai, Edwind255f4f2009-10-09 18:03:20 +08003554{
Rik van Riel217ece62011-02-01 09:53:28 -05003555 struct kvm *kvm = me->kvm;
3556 struct kvm_vcpu *vcpu;
3557 int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
Marc Zyngier46808a42021-11-16 16:04:02 +00003558 unsigned long i;
Rik van Riel217ece62011-02-01 09:53:28 -05003559 int yielded = 0;
Raghavendra K Tc45c5282013-01-22 13:09:24 +05303560 int try = 3;
Rik van Riel217ece62011-02-01 09:53:28 -05003561 int pass;
Zhai, Edwind255f4f2009-10-09 18:03:20 +08003562
Raghavendra K T4c088492012-07-18 19:07:46 +05303563 kvm_vcpu_set_in_spin_loop(me, true);
Rik van Riel217ece62011-02-01 09:53:28 -05003564 /*
3565 * We boost the priority of a VCPU that is runnable but not
3566 * currently running, because it got preempted by something
3567 * else and called schedule in __vcpu_run. Hopefully that
3568 * VCPU is holding the lock that we need and will release it.
3569 * We approximate round-robin by starting at the last boosted VCPU.
3570 */
Raghavendra K Tc45c5282013-01-22 13:09:24 +05303571 for (pass = 0; pass < 2 && !yielded && try; pass++) {
Rik van Riel217ece62011-02-01 09:53:28 -05003572 kvm_for_each_vcpu(i, vcpu, kvm) {
Rik van Riel5cfc2aa2012-06-19 16:51:04 -04003573 if (!pass && i <= last_boosted_vcpu) {
Rik van Riel217ece62011-02-01 09:53:28 -05003574 i = last_boosted_vcpu;
3575 continue;
3576 } else if (pass && i > last_boosted_vcpu)
3577 break;
Wanpeng Lid73eb572019-07-18 19:39:06 +08003578 if (!READ_ONCE(vcpu->ready))
Raghavendra K T7bc7ae22013-03-04 23:32:27 +05303579 continue;
Rik van Riel217ece62011-02-01 09:53:28 -05003580 if (vcpu == me)
3581 continue;
Sean Christophersond92a5d12021-10-08 19:12:12 -07003582 if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
Rik van Riel217ece62011-02-01 09:53:28 -05003583 continue;
Wanpeng Li046ddee2019-08-01 11:30:14 +08003584 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
Wanpeng Li52acd222021-04-16 11:08:10 +08003585 !kvm_arch_dy_has_pending_interrupt(vcpu) &&
3586 !kvm_arch_vcpu_in_kernel(vcpu))
Longpeng(Mike)199b5762017-08-08 12:05:32 +08003587 continue;
Raghavendra K T06e48c52012-07-19 15:17:52 +05303588 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
3589 continue;
Raghavendra K Tc45c5282013-01-22 13:09:24 +05303590
3591 yielded = kvm_vcpu_yield_to(vcpu);
3592 if (yielded > 0) {
Rik van Riel217ece62011-02-01 09:53:28 -05003593 kvm->last_boosted_vcpu = i;
Rik van Riel217ece62011-02-01 09:53:28 -05003594 break;
Raghavendra K Tc45c5282013-01-22 13:09:24 +05303595 } else if (yielded < 0) {
3596 try--;
3597 if (!try)
3598 break;
Rik van Riel217ece62011-02-01 09:53:28 -05003599 }
Rik van Riel217ece62011-02-01 09:53:28 -05003600 }
3601 }
Raghavendra K T4c088492012-07-18 19:07:46 +05303602 kvm_vcpu_set_in_spin_loop(me, false);
Raghavendra K T06e48c52012-07-19 15:17:52 +05303603
3604 /* Ensure vcpu is not eligible during next spinloop */
3605 kvm_vcpu_set_dy_eligible(me, false);
Zhai, Edwind255f4f2009-10-09 18:03:20 +08003606}
3607EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
3608
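/*
 * Check whether an mmap offset into a vCPU fd falls within the pages that
 * back that vCPU's dirty ring.
 */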
Peter Xufb04a1e2020-09-30 21:22:22 -04003609static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
3610{
David Woodhousedc70ec22021-11-21 12:54:40 +00003611#ifdef CONFIG_HAVE_KVM_DIRTY_RING
Peter Xufb04a1e2020-09-30 21:22:22 -04003612 return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
3613 (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
3614 kvm->dirty_ring_size / PAGE_SIZE);
3615#else
3616 return false;
3617#endif
3618}
3619
Souptick Joarder1499fa82018-04-19 00:49:58 +05303620static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
Avi Kivity9a2bb7f2007-02-22 12:58:31 +02003621{
Dave Jiang11bac802017-02-24 14:56:41 -08003622 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
Avi Kivity9a2bb7f2007-02-22 12:58:31 +02003623 struct page *page;
3624
npiggin@suse.dee4a533a2007-12-05 18:15:52 +11003625 if (vmf->pgoff == 0)
Avi Kivity039576c2007-03-20 12:46:50 +02003626 page = virt_to_page(vcpu->run);
Avi Kivity09566762008-01-23 18:14:23 +02003627#ifdef CONFIG_X86
npiggin@suse.dee4a533a2007-12-05 18:15:52 +11003628 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
Zhang Xiantaoad312c72007-12-13 23:50:52 +08003629 page = virt_to_page(vcpu->arch.pio_data);
Avi Kivity09566762008-01-23 18:14:23 +02003630#endif
Paolo Bonzini4b4357e2017-03-31 13:53:23 +02003631#ifdef CONFIG_KVM_MMIO
Laurent Vivier5f94c172008-05-30 16:05:54 +02003632 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
3633 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
3634#endif
Peter Xufb04a1e2020-09-30 21:22:22 -04003635 else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
3636 page = kvm_dirty_ring_get_page(
3637 &vcpu->dirty_ring,
3638 vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
Avi Kivity039576c2007-03-20 12:46:50 +02003639 else
Carsten Otte5b1c1492012-01-04 10:25:23 +01003640 return kvm_arch_vcpu_fault(vcpu, vmf);
Avi Kivity9a2bb7f2007-02-22 12:58:31 +02003641 get_page(page);
npiggin@suse.dee4a533a2007-12-05 18:15:52 +11003642 vmf->page = page;
3643 return 0;
Avi Kivity9a2bb7f2007-02-22 12:58:31 +02003644}
3645
Alexey Dobriyanf0f37e2f2009-09-27 22:29:37 +04003646static const struct vm_operations_struct kvm_vcpu_vm_ops = {
npiggin@suse.dee4a533a2007-12-05 18:15:52 +11003647 .fault = kvm_vcpu_fault,
Avi Kivity9a2bb7f2007-02-22 12:58:31 +02003648};
3649
3650static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
3651{
Peter Xufb04a1e2020-09-30 21:22:22 -04003652 struct kvm_vcpu *vcpu = file->private_data;
Yang Li11476d22021-09-29 15:28:46 +08003653 unsigned long pages = vma_pages(vma);
Peter Xufb04a1e2020-09-30 21:22:22 -04003654
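	/* Dirty ring pages may only be mapped shared and non-executable. */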
3655 if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
3656 kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
3657 ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
3658 return -EINVAL;
3659
Avi Kivity9a2bb7f2007-02-22 12:58:31 +02003660 vma->vm_ops = &kvm_vcpu_vm_ops;
3661 return 0;
3662}
3663
Avi Kivitybccf2152007-02-21 18:04:26 +02003664static int kvm_vcpu_release(struct inode *inode, struct file *filp)
3665{
3666 struct kvm_vcpu *vcpu = filp->private_data;
3667
Al Viro66c0b392008-04-19 20:33:56 +01003668 kvm_put_kvm(vcpu->kvm);
Avi Kivitybccf2152007-02-21 18:04:26 +02003669 return 0;
3670}
3671
Christian Borntraeger3d3aab12008-12-02 11:17:32 +01003672static struct file_operations kvm_vcpu_fops = {
Avi Kivitybccf2152007-02-21 18:04:26 +02003673 .release = kvm_vcpu_release,
3674 .unlocked_ioctl = kvm_vcpu_ioctl,
Avi Kivity9a2bb7f2007-02-22 12:58:31 +02003675 .mmap = kvm_vcpu_mmap,
Arnd Bergmann6038f372010-08-15 18:52:59 +02003676 .llseek = noop_llseek,
Marc Zyngier7ddfd3e2018-06-17 10:16:21 +01003677 KVM_COMPAT(kvm_vcpu_compat_ioctl),
Avi Kivitybccf2152007-02-21 18:04:26 +02003678};
3679
3680/*
3681 * Allocates an inode for the vcpu.
3682 */
3683static int create_vcpu_fd(struct kvm_vcpu *vcpu)
3684{
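	/* "kvm-vcpu" + ':' + decimal vcpu_id + '\0' */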
Masatake YAMATOe46b4692018-01-20 04:04:22 +09003685 char name[8 + 1 + ITOA_MAX_LEN + 1];
3686
3687 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
3688 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
Avi Kivitybccf2152007-02-21 18:04:26 +02003689}
3690
Greg KH3e7093d2019-07-31 20:56:20 +02003691static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
Luiz Capitulino45b59392016-09-16 10:27:35 -04003692{
Paolo Bonzini741cbba2019-08-03 08:14:25 +02003693#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
Paolo Bonzinid56f5132020-06-04 15:16:52 +02003694 struct dentry *debugfs_dentry;
Luiz Capitulino45b59392016-09-16 10:27:35 -04003695 char dir_name[ITOA_MAX_LEN * 2];
Luiz Capitulino45b59392016-09-16 10:27:35 -04003696
Luiz Capitulino45b59392016-09-16 10:27:35 -04003697 if (!debugfs_initialized())
Greg KH3e7093d2019-07-31 20:56:20 +02003698 return;
Luiz Capitulino45b59392016-09-16 10:27:35 -04003699
3700 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
Paolo Bonzinid56f5132020-06-04 15:16:52 +02003701 debugfs_dentry = debugfs_create_dir(dir_name,
3702 vcpu->kvm->debugfs_dentry);
Luiz Capitulino45b59392016-09-16 10:27:35 -04003703
Paolo Bonzinid56f5132020-06-04 15:16:52 +02003704 kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
Paolo Bonzini741cbba2019-08-03 08:14:25 +02003705#endif
Luiz Capitulino45b59392016-09-16 10:27:35 -04003706}
3707
Avi Kivityc5ea7662007-02-20 18:41:05 +02003708/*
3709 * Creates some virtual cpus. Good luck creating more than one.
3710 */
Gleb Natapov73880c82009-06-09 15:56:28 +03003711static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
Avi Kivityc5ea7662007-02-20 18:41:05 +02003712{
3713 int r;
David Hildenbrande09fefd2015-11-05 09:03:50 +01003714 struct kvm_vcpu *vcpu;
Sean Christopherson8bd826d2019-12-18 13:55:30 -08003715 struct page *page;
Avi Kivityc5ea7662007-02-20 18:41:05 +02003716
Juergen Grossa1c42dd2021-09-13 15:57:44 +02003717 if (id >= KVM_MAX_VCPU_IDS)
Andy Honig338c7db2013-11-18 16:09:22 -08003718 return -EINVAL;
3719
Paolo Bonzini6c7caeb2016-06-13 14:48:25 +02003720 mutex_lock(&kvm->lock);
3721 if (kvm->created_vcpus == KVM_MAX_VCPUS) {
3722 mutex_unlock(&kvm->lock);
3723 return -EINVAL;
3724 }
3725
3726 kvm->created_vcpus++;
3727 mutex_unlock(&kvm->lock);
3728
Sean Christopherson897cc382019-12-18 13:55:09 -08003729 r = kvm_arch_vcpu_precreate(kvm, id);
3730 if (r)
3731 goto vcpu_decrement;
3732
Sean Christopherson85f47932021-04-06 12:07:40 -07003733 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
Sean Christophersone529ef62019-12-18 13:55:15 -08003734 if (!vcpu) {
3735 r = -ENOMEM;
Paolo Bonzini6c7caeb2016-06-13 14:48:25 +02003736 goto vcpu_decrement;
3737 }
Avi Kivityc5ea7662007-02-20 18:41:05 +02003738
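	/* struct kvm_run must fit in the single page mapped to userspace at mmap offset 0. */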
Peter Xufcd97ad2020-01-09 09:57:12 -05003739 BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
Shakeel Butt93bb59c2020-12-18 14:01:38 -08003740 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
Sean Christopherson8bd826d2019-12-18 13:55:30 -08003741 if (!page) {
3742 r = -ENOMEM;
Sean Christophersone529ef62019-12-18 13:55:15 -08003743 goto vcpu_free;
Sean Christopherson8bd826d2019-12-18 13:55:30 -08003744 }
3745 vcpu->run = page_address(page);
3746
3747 kvm_vcpu_init(vcpu, kvm, id);
Sean Christophersone529ef62019-12-18 13:55:15 -08003748
3749 r = kvm_arch_vcpu_create(vcpu);
3750 if (r)
Sean Christopherson8bd826d2019-12-18 13:55:30 -08003751 goto vcpu_free_run_page;
Sean Christophersone529ef62019-12-18 13:55:15 -08003752
Peter Xufb04a1e2020-09-30 21:22:22 -04003753 if (kvm->dirty_ring_size) {
3754 r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
3755 id, kvm->dirty_ring_size);
3756 if (r)
3757 goto arch_vcpu_destroy;
3758 }
3759
Shaohua Li11ec2802007-07-23 14:51:37 +08003760 mutex_lock(&kvm->lock);
David Hildenbrande09fefd2015-11-05 09:03:50 +01003761 if (kvm_get_vcpu_by_id(kvm, id)) {
3762 r = -EEXIST;
3763 goto unlock_vcpu_destroy;
3764 }
Gleb Natapov73880c82009-06-09 15:56:28 +03003765
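	/*
	 * kvm->lock is held and online_vcpus is only incremented under it, so
	 * the slot at vcpu_idx must be free; -EBUSY from xa_insert() would
	 * indicate a bug.
	 */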
Radim Krčmář8750e722019-11-07 07:53:42 -05003766 vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
Marc Zyngierc5b07752021-11-16 16:04:01 +00003767 r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT);
3768 BUG_ON(r == -EBUSY);
3769 if (r)
3770 goto unlock_vcpu_destroy;
Rusty Russellfb3f0f52007-07-27 17:16:56 +10003771
Jing Zhangce55c042021-06-18 22:27:06 +00003772 /* Fill the stats id string for the vcpu */
3773 snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
3774 task_pid_nr(current), id);
3775
Rusty Russellfb3f0f52007-07-27 17:16:56 +10003776 /* Now it's all set up, let userspace reach it */
Al Viro66c0b392008-04-19 20:33:56 +01003777 kvm_get_kvm(kvm);
Avi Kivitybccf2152007-02-21 18:04:26 +02003778 r = create_vcpu_fd(vcpu);
Gleb Natapov73880c82009-06-09 15:56:28 +03003779 if (r < 0) {
Marc Zyngierc5b07752021-11-16 16:04:01 +00003780 xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx);
Sean Christopherson149487b2019-10-21 15:58:42 -07003781 kvm_put_kvm_no_destroy(kvm);
Jan Kiszkad7805922011-05-23 10:33:05 +02003782 goto unlock_vcpu_destroy;
Gleb Natapov73880c82009-06-09 15:56:28 +03003783 }
3784
Paolo Bonzinidd489242015-07-29 11:32:20 +02003785 /*
Marc Zyngierc5b07752021-11-16 16:04:01 +00003786 * Pairs with smp_rmb() in kvm_get_vcpu. Ensure the vcpu pointer is
 3787 * stored before kvm->online_vcpus is incremented.
Paolo Bonzinidd489242015-07-29 11:32:20 +02003788 */
Gleb Natapov73880c82009-06-09 15:56:28 +03003789 smp_wmb();
3790 atomic_inc(&kvm->online_vcpus);
3791
Gleb Natapov73880c82009-06-09 15:56:28 +03003792 mutex_unlock(&kvm->lock);
Marcelo Tosatti42897d82012-11-27 23:29:02 -02003793 kvm_arch_vcpu_postcreate(vcpu);
Paolo Bonzini63d04342020-04-01 00:42:22 +02003794 kvm_create_vcpu_debugfs(vcpu);
Avi Kivitybccf2152007-02-21 18:04:26 +02003795 return r;
Avi Kivityc5ea7662007-02-20 18:41:05 +02003796
Jan Kiszkad7805922011-05-23 10:33:05 +02003797unlock_vcpu_destroy:
Glauber Costa7d8fece2008-09-17 23:16:59 -03003798 mutex_unlock(&kvm->lock);
Peter Xufb04a1e2020-09-30 21:22:22 -04003799 kvm_dirty_ring_free(&vcpu->dirty_ring);
3800arch_vcpu_destroy:
Hollis Blanchardd40ccc62007-11-19 14:04:43 -06003801 kvm_arch_vcpu_destroy(vcpu);
Sean Christopherson8bd826d2019-12-18 13:55:30 -08003802vcpu_free_run_page:
3803 free_page((unsigned long)vcpu->run);
Sean Christophersone529ef62019-12-18 13:55:15 -08003804vcpu_free:
3805 kmem_cache_free(kvm_vcpu_cache, vcpu);
Paolo Bonzini6c7caeb2016-06-13 14:48:25 +02003806vcpu_decrement:
3807 mutex_lock(&kvm->lock);
3808 kvm->created_vcpus--;
3809 mutex_unlock(&kvm->lock);
Avi Kivityc5ea7662007-02-20 18:41:05 +02003810 return r;
3811}
3812
Avi Kivity1961d272007-03-05 19:46:05 +02003813static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
3814{
3815 if (sigset) {
3816 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
3817 vcpu->sigset_active = 1;
3818 vcpu->sigset = *sigset;
3819 } else
3820 vcpu->sigset_active = 0;
3821 return 0;
3822}
3823
Jing Zhangce55c042021-06-18 22:27:06 +00003824static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
3825 size_t size, loff_t *offset)
3826{
3827 struct kvm_vcpu *vcpu = file->private_data;
3828
3829 return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
3830 &kvm_vcpu_stats_desc[0], &vcpu->stat,
3831 sizeof(vcpu->stat), user_buffer, size, offset);
3832}
3833
3834static const struct file_operations kvm_vcpu_stats_fops = {
3835 .read = kvm_vcpu_stats_read,
3836 .llseek = noop_llseek,
3837};
3838
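/* Create a read-only file descriptor for reading this vCPU's binary stats. */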
3839static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
3840{
3841 int fd;
3842 struct file *file;
3843 char name[15 + ITOA_MAX_LEN + 1];
3844
3845 snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
3846
3847 fd = get_unused_fd_flags(O_CLOEXEC);
3848 if (fd < 0)
3849 return fd;
3850
3851 file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
3852 if (IS_ERR(file)) {
3853 put_unused_fd(fd);
3854 return PTR_ERR(file);
3855 }
3856 file->f_mode |= FMODE_PREAD;
3857 fd_install(fd, file);
3858
3859 return fd;
3860}
3861
Avi Kivitybccf2152007-02-21 18:04:26 +02003862static long kvm_vcpu_ioctl(struct file *filp,
3863 unsigned int ioctl, unsigned long arg)
Avi Kivity6aa8b732006-12-10 02:21:36 -08003864{
Avi Kivitybccf2152007-02-21 18:04:26 +02003865 struct kvm_vcpu *vcpu = filp->private_data;
Al Viro2f3669872007-02-09 16:38:35 +00003866 void __user *argp = (void __user *)arg;
Carsten Otte313a3dc2007-10-11 19:16:52 +02003867 int r;
Dave Hansenfa3795a2008-08-11 10:01:46 -07003868 struct kvm_fpu *fpu = NULL;
3869 struct kvm_sregs *kvm_sregs = NULL;
Avi Kivity6aa8b732006-12-10 02:21:36 -08003870
Paolo Bonzinif4d31652021-11-11 10:13:38 -05003871 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
Avi Kivity6d4e4c42007-11-21 16:41:05 +02003872 return -EIO;
Avi Kivity2122ff52010-05-13 11:25:04 +03003873
David Matlack2ea75be2014-09-19 16:03:25 -07003874 if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
3875 return -EINVAL;
3876
Avi Kivity2122ff52010-05-13 11:25:04 +03003877 /*
Paolo Bonzini5cb09442017-12-12 17:41:34 +01003878 * Some architectures have vcpu ioctls that are asynchronous to vcpu
3879 * execution; mutex_lock() would break them.
Avi Kivity2122ff52010-05-13 11:25:04 +03003880 */
Paolo Bonzini5cb09442017-12-12 17:41:34 +01003881 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
3882 if (r != -ENOIOCTLCMD)
Michael S. Tsirkin9fc77442012-09-16 11:50:30 +03003883 return r;
Avi Kivity2122ff52010-05-13 11:25:04 +03003884
Christoffer Dallec7660c2017-12-04 21:35:23 +01003885 if (mutex_lock_killable(&vcpu->mutex))
3886 return -EINTR;
Avi Kivity6aa8b732006-12-10 02:21:36 -08003887 switch (ioctl) {
Christian Borntraeger0e4524a2017-07-06 14:44:28 +02003888 case KVM_RUN: {
3889 struct pid *oldpid;
Avi Kivityf0fe5102007-03-07 13:11:17 +02003890 r = -EINVAL;
3891 if (arg)
3892 goto out;
Christian Borntraeger0e4524a2017-07-06 14:44:28 +02003893 oldpid = rcu_access_pointer(vcpu->pid);
Eric W. Biederman71dbc8a2017-07-16 21:39:32 -05003894 if (unlikely(oldpid != task_pid(current))) {
Christian Borntraeger7a72f7a2014-08-05 16:44:14 +02003895 /* The thread running this VCPU changed. */
Christoffer Dallbd2a6392018-02-23 17:23:57 +01003896 struct pid *newpid;
Xiubo Lif95ef0cd2015-02-26 14:58:23 +08003897
Christoffer Dallbd2a6392018-02-23 17:23:57 +01003898 r = kvm_arch_vcpu_run_pid_change(vcpu);
3899 if (r)
3900 break;
3901
3902 newpid = get_task_pid(current, PIDTYPE_PID);
Christian Borntraeger7a72f7a2014-08-05 16:44:14 +02003903 rcu_assign_pointer(vcpu->pid, newpid);
3904 if (oldpid)
3905 synchronize_rcu();
3906 put_pid(oldpid);
3907 }
Tianjia Zhang1b94f6f2020-04-16 13:10:57 +08003908 r = kvm_arch_vcpu_ioctl_run(vcpu);
Gleb Natapov64be5002010-10-24 16:49:08 +02003909 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
Avi Kivity6aa8b732006-12-10 02:21:36 -08003910 break;
Christian Borntraeger0e4524a2017-07-06 14:44:28 +02003911 }
Avi Kivity6aa8b732006-12-10 02:21:36 -08003912 case KVM_GET_REGS: {
Xiantao Zhang3e4bb3a2008-02-25 18:52:20 +08003913 struct kvm_regs *kvm_regs;
Avi Kivity6aa8b732006-12-10 02:21:36 -08003914
Xiantao Zhang3e4bb3a2008-02-25 18:52:20 +08003915 r = -ENOMEM;
Ben Gardonb12ce362019-02-11 11:02:49 -08003916 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
Xiantao Zhang3e4bb3a2008-02-25 18:52:20 +08003917 if (!kvm_regs)
3918 goto out;
3919 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
Avi Kivity6aa8b732006-12-10 02:21:36 -08003920 if (r)
Xiantao Zhang3e4bb3a2008-02-25 18:52:20 +08003921 goto out_free1;
Avi Kivity6aa8b732006-12-10 02:21:36 -08003922 r = -EFAULT;
Xiantao Zhang3e4bb3a2008-02-25 18:52:20 +08003923 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
3924 goto out_free1;
Avi Kivity6aa8b732006-12-10 02:21:36 -08003925 r = 0;
Xiantao Zhang3e4bb3a2008-02-25 18:52:20 +08003926out_free1:
3927 kfree(kvm_regs);
Avi Kivity6aa8b732006-12-10 02:21:36 -08003928 break;
3929 }
3930 case KVM_SET_REGS: {
Xiantao Zhang3e4bb3a2008-02-25 18:52:20 +08003931 struct kvm_regs *kvm_regs;
Avi Kivity6aa8b732006-12-10 02:21:36 -08003932
Sasha Levinff5c2c02011-12-04 19:36:29 +02003933 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
3934 if (IS_ERR(kvm_regs)) {
3935 r = PTR_ERR(kvm_regs);
Xiantao Zhang3e4bb3a2008-02-25 18:52:20 +08003936 goto out;
Sasha Levinff5c2c02011-12-04 19:36:29 +02003937 }
Xiantao Zhang3e4bb3a2008-02-25 18:52:20 +08003938 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
Xiantao Zhang3e4bb3a2008-02-25 18:52:20 +08003939 kfree(kvm_regs);
Avi Kivity6aa8b732006-12-10 02:21:36 -08003940 break;
3941 }
3942 case KVM_GET_SREGS: {
Ben Gardonb12ce362019-02-11 11:02:49 -08003943 kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
3944 GFP_KERNEL_ACCOUNT);
Dave Hansenfa3795a2008-08-11 10:01:46 -07003945 r = -ENOMEM;
3946 if (!kvm_sregs)
3947 goto out;
3948 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
Avi Kivity6aa8b732006-12-10 02:21:36 -08003949 if (r)
3950 goto out;
3951 r = -EFAULT;
Dave Hansenfa3795a2008-08-11 10:01:46 -07003952 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
Avi Kivity6aa8b732006-12-10 02:21:36 -08003953 goto out;
3954 r = 0;
3955 break;
3956 }
3957 case KVM_SET_SREGS: {
Sasha Levinff5c2c02011-12-04 19:36:29 +02003958 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
3959 if (IS_ERR(kvm_sregs)) {
3960 r = PTR_ERR(kvm_sregs);
Guo Chao18595412012-11-02 18:33:21 +08003961 kvm_sregs = NULL;
Avi Kivity6aa8b732006-12-10 02:21:36 -08003962 goto out;
Sasha Levinff5c2c02011-12-04 19:36:29 +02003963 }
Dave Hansenfa3795a2008-08-11 10:01:46 -07003964 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
Avi Kivity6aa8b732006-12-10 02:21:36 -08003965 break;
3966 }
Marcelo Tosatti62d9f0d2008-04-11 13:24:45 -03003967 case KVM_GET_MP_STATE: {
3968 struct kvm_mp_state mp_state;
3969
3970 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
3971 if (r)
3972 goto out;
3973 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08003974 if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
Marcelo Tosatti62d9f0d2008-04-11 13:24:45 -03003975 goto out;
3976 r = 0;
3977 break;
3978 }
3979 case KVM_SET_MP_STATE: {
3980 struct kvm_mp_state mp_state;
3981
3982 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08003983 if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
Marcelo Tosatti62d9f0d2008-04-11 13:24:45 -03003984 goto out;
3985 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
Marcelo Tosatti62d9f0d2008-04-11 13:24:45 -03003986 break;
3987 }
Avi Kivity6aa8b732006-12-10 02:21:36 -08003988 case KVM_TRANSLATE: {
3989 struct kvm_translation tr;
3990
3991 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08003992 if (copy_from_user(&tr, argp, sizeof(tr)))
Avi Kivity6aa8b732006-12-10 02:21:36 -08003993 goto out;
Zhang Xiantao8b006792007-11-16 13:05:55 +08003994 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
Avi Kivity6aa8b732006-12-10 02:21:36 -08003995 if (r)
3996 goto out;
3997 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08003998 if (copy_to_user(argp, &tr, sizeof(tr)))
Avi Kivity6aa8b732006-12-10 02:21:36 -08003999 goto out;
4000 r = 0;
4001 break;
4002 }
Jan Kiszkad0bfb942008-12-15 13:52:10 +01004003 case KVM_SET_GUEST_DEBUG: {
4004 struct kvm_guest_debug dbg;
Avi Kivity6aa8b732006-12-10 02:21:36 -08004005
4006 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08004007 if (copy_from_user(&dbg, argp, sizeof(dbg)))
Avi Kivity6aa8b732006-12-10 02:21:36 -08004008 goto out;
Jan Kiszkad0bfb942008-12-15 13:52:10 +01004009 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
Avi Kivity6aa8b732006-12-10 02:21:36 -08004010 break;
4011 }
Avi Kivity1961d272007-03-05 19:46:05 +02004012 case KVM_SET_SIGNAL_MASK: {
4013 struct kvm_signal_mask __user *sigmask_arg = argp;
4014 struct kvm_signal_mask kvm_sigmask;
4015 sigset_t sigset, *p;
4016
4017 p = NULL;
4018 if (argp) {
4019 r = -EFAULT;
4020 if (copy_from_user(&kvm_sigmask, argp,
Xiubo Li893bdbf2015-02-26 14:58:19 +08004021 sizeof(kvm_sigmask)))
Avi Kivity1961d272007-03-05 19:46:05 +02004022 goto out;
4023 r = -EINVAL;
Xiubo Li893bdbf2015-02-26 14:58:19 +08004024 if (kvm_sigmask.len != sizeof(sigset))
Avi Kivity1961d272007-03-05 19:46:05 +02004025 goto out;
4026 r = -EFAULT;
4027 if (copy_from_user(&sigset, sigmask_arg->sigset,
Xiubo Li893bdbf2015-02-26 14:58:19 +08004028 sizeof(sigset)))
Avi Kivity1961d272007-03-05 19:46:05 +02004029 goto out;
4030 p = &sigset;
4031 }
Andi Kleen376d41f2010-06-10 13:10:47 +02004032 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
Avi Kivity1961d272007-03-05 19:46:05 +02004033 break;
4034 }
Avi Kivityb8836732007-04-01 16:34:31 +03004035 case KVM_GET_FPU: {
Ben Gardonb12ce362019-02-11 11:02:49 -08004036 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
Dave Hansenfa3795a2008-08-11 10:01:46 -07004037 r = -ENOMEM;
4038 if (!fpu)
4039 goto out;
4040 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
Avi Kivityb8836732007-04-01 16:34:31 +03004041 if (r)
4042 goto out;
4043 r = -EFAULT;
Dave Hansenfa3795a2008-08-11 10:01:46 -07004044 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
Avi Kivityb8836732007-04-01 16:34:31 +03004045 goto out;
4046 r = 0;
4047 break;
4048 }
4049 case KVM_SET_FPU: {
Sasha Levinff5c2c02011-12-04 19:36:29 +02004050 fpu = memdup_user(argp, sizeof(*fpu));
4051 if (IS_ERR(fpu)) {
4052 r = PTR_ERR(fpu);
Guo Chao18595412012-11-02 18:33:21 +08004053 fpu = NULL;
Avi Kivityb8836732007-04-01 16:34:31 +03004054 goto out;
Sasha Levinff5c2c02011-12-04 19:36:29 +02004055 }
Dave Hansenfa3795a2008-08-11 10:01:46 -07004056 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
Avi Kivityb8836732007-04-01 16:34:31 +03004057 break;
4058 }
Jing Zhangce55c042021-06-18 22:27:06 +00004059 case KVM_GET_STATS_FD: {
4060 r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
4061 break;
4062 }
Avi Kivitybccf2152007-02-21 18:04:26 +02004063 default:
Carsten Otte313a3dc2007-10-11 19:16:52 +02004064 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
Avi Kivitybccf2152007-02-21 18:04:26 +02004065 }
4066out:
Christoffer Dallec7660c2017-12-04 21:35:23 +01004067 mutex_unlock(&vcpu->mutex);
Dave Hansenfa3795a2008-08-11 10:01:46 -07004068 kfree(fpu);
4069 kfree(kvm_sregs);
Avi Kivitybccf2152007-02-21 18:04:26 +02004070 return r;
4071}
4072
Christian Borntraegerde8e5d72015-02-03 09:35:15 +01004073#ifdef CONFIG_KVM_COMPAT
Alexander Graf1dda6062011-06-08 02:45:37 +02004074static long kvm_vcpu_compat_ioctl(struct file *filp,
4075 unsigned int ioctl, unsigned long arg)
4076{
4077 struct kvm_vcpu *vcpu = filp->private_data;
4078 void __user *argp = compat_ptr(arg);
4079 int r;
4080
Paolo Bonzinif4d31652021-11-11 10:13:38 -05004081 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
Alexander Graf1dda6062011-06-08 02:45:37 +02004082 return -EIO;
4083
4084 switch (ioctl) {
4085 case KVM_SET_SIGNAL_MASK: {
4086 struct kvm_signal_mask __user *sigmask_arg = argp;
4087 struct kvm_signal_mask kvm_sigmask;
Alexander Graf1dda6062011-06-08 02:45:37 +02004088 sigset_t sigset;
4089
4090 if (argp) {
4091 r = -EFAULT;
4092 if (copy_from_user(&kvm_sigmask, argp,
Xiubo Li893bdbf2015-02-26 14:58:19 +08004093 sizeof(kvm_sigmask)))
Alexander Graf1dda6062011-06-08 02:45:37 +02004094 goto out;
4095 r = -EINVAL;
Al Viro3968cf62017-09-03 21:45:17 -04004096 if (kvm_sigmask.len != sizeof(compat_sigset_t))
Alexander Graf1dda6062011-06-08 02:45:37 +02004097 goto out;
4098 r = -EFAULT;
Paolo Bonzini1393b4a2020-07-02 05:39:31 -04004099 if (get_compat_sigset(&sigset,
4100 (compat_sigset_t __user *)sigmask_arg->sigset))
Alexander Graf1dda6062011-06-08 02:45:37 +02004101 goto out;
Alan Cox760a9a32012-08-22 14:34:11 +01004102 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
4103 } else
4104 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
Alexander Graf1dda6062011-06-08 02:45:37 +02004105 break;
4106 }
4107 default:
4108 r = kvm_vcpu_ioctl(filp, ioctl, arg);
4109 }
4110
4111out:
4112 return r;
4113}
4114#endif
4115
Cédric Le Goatera1cd3f02019-04-18 12:39:36 +02004116static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
4117{
4118 struct kvm_device *dev = filp->private_data;
4119
4120 if (dev->ops->mmap)
4121 return dev->ops->mmap(dev, vma);
4122
4123 return -ENODEV;
4124}
4125
Scott Wood852b6d52013-04-12 14:08:42 +00004126static int kvm_device_ioctl_attr(struct kvm_device *dev,
4127 int (*accessor)(struct kvm_device *dev,
4128 struct kvm_device_attr *attr),
4129 unsigned long arg)
4130{
4131 struct kvm_device_attr attr;
4132
4133 if (!accessor)
4134 return -EPERM;
4135
4136 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4137 return -EFAULT;
4138
4139 return accessor(dev, &attr);
4140}
4141
4142static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
4143 unsigned long arg)
4144{
4145 struct kvm_device *dev = filp->private_data;
4146
Paolo Bonzinif4d31652021-11-11 10:13:38 -05004147 if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
Sean Christophersonddba9182019-02-15 12:48:39 -08004148 return -EIO;
4149
Scott Wood852b6d52013-04-12 14:08:42 +00004150 switch (ioctl) {
4151 case KVM_SET_DEVICE_ATTR:
4152 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
4153 case KVM_GET_DEVICE_ATTR:
4154 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
4155 case KVM_HAS_DEVICE_ATTR:
4156 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
4157 default:
4158 if (dev->ops->ioctl)
4159 return dev->ops->ioctl(dev, ioctl, arg);
4160
4161 return -ENOTTY;
4162 }
4163}
4164
Scott Wood852b6d52013-04-12 14:08:42 +00004165static int kvm_device_release(struct inode *inode, struct file *filp)
4166{
4167 struct kvm_device *dev = filp->private_data;
4168 struct kvm *kvm = dev->kvm;
4169
Cédric Le Goater2bde9b32019-04-18 12:39:41 +02004170 if (dev->ops->release) {
4171 mutex_lock(&kvm->lock);
4172 list_del(&dev->vm_node);
4173 dev->ops->release(dev);
4174 mutex_unlock(&kvm->lock);
4175 }
4176
Scott Wood852b6d52013-04-12 14:08:42 +00004177 kvm_put_kvm(kvm);
4178 return 0;
4179}
4180
4181static const struct file_operations kvm_device_fops = {
4182 .unlocked_ioctl = kvm_device_ioctl,
4183 .release = kvm_device_release,
Marc Zyngier7ddfd3e2018-06-17 10:16:21 +01004184 KVM_COMPAT(kvm_device_ioctl),
Cédric Le Goatera1cd3f02019-04-18 12:39:36 +02004185 .mmap = kvm_device_mmap,
Scott Wood852b6d52013-04-12 14:08:42 +00004186};
4187
4188struct kvm_device *kvm_device_from_filp(struct file *filp)
4189{
4190 if (filp->f_op != &kvm_device_fops)
4191 return NULL;
4192
4193 return filp->private_data;
4194}
4195
Steven Price8538cb22019-10-21 16:28:19 +01004196static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
Will Deacond60eacb2014-09-02 10:27:33 +01004197#ifdef CONFIG_KVM_MPIC
4198 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
4199 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
4200#endif
Will Deacond60eacb2014-09-02 10:27:33 +01004201};
4202
Steven Price8538cb22019-10-21 16:28:19 +01004203int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
Will Deacond60eacb2014-09-02 10:27:33 +01004204{
4205 if (type >= ARRAY_SIZE(kvm_device_ops_table))
4206 return -ENOSPC;
4207
4208 if (kvm_device_ops_table[type] != NULL)
4209 return -EEXIST;
4210
4211 kvm_device_ops_table[type] = ops;
4212 return 0;
4213}
4214
Wanpeng Li571ee1b2014-10-09 18:30:08 +08004215void kvm_unregister_device_ops(u32 type)
4216{
4217 if (kvm_device_ops_table[type] != NULL)
4218 kvm_device_ops_table[type] = NULL;
4219}
4220
Scott Wood852b6d52013-04-12 14:08:42 +00004221static int kvm_ioctl_create_device(struct kvm *kvm,
4222 struct kvm_create_device *cd)
4223{
Steven Price8538cb22019-10-21 16:28:19 +01004224 const struct kvm_device_ops *ops = NULL;
Scott Wood852b6d52013-04-12 14:08:42 +00004225 struct kvm_device *dev;
4226 bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
Paolo Bonzini1d487e92019-04-11 11:16:47 +02004227 int type;
Scott Wood852b6d52013-04-12 14:08:42 +00004228 int ret;
4229
Will Deacond60eacb2014-09-02 10:27:33 +01004230 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
Scott Wood852b6d52013-04-12 14:08:42 +00004231 return -ENODEV;
Will Deacond60eacb2014-09-02 10:27:33 +01004232
Paolo Bonzini1d487e92019-04-11 11:16:47 +02004233 type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
4234 ops = kvm_device_ops_table[type];
Will Deacond60eacb2014-09-02 10:27:33 +01004235 if (ops == NULL)
4236 return -ENODEV;
Scott Wood852b6d52013-04-12 14:08:42 +00004237
4238 if (test)
4239 return 0;
4240
Ben Gardonb12ce362019-02-11 11:02:49 -08004241 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
Scott Wood852b6d52013-04-12 14:08:42 +00004242 if (!dev)
4243 return -ENOMEM;
4244
4245 dev->ops = ops;
4246 dev->kvm = kvm;
Scott Wood852b6d52013-04-12 14:08:42 +00004247
Christoffer Dalla28ebea2016-08-09 19:13:01 +02004248 mutex_lock(&kvm->lock);
Paolo Bonzini1d487e92019-04-11 11:16:47 +02004249 ret = ops->create(dev, type);
Scott Wood852b6d52013-04-12 14:08:42 +00004250 if (ret < 0) {
Christoffer Dalla28ebea2016-08-09 19:13:01 +02004251 mutex_unlock(&kvm->lock);
Scott Wood852b6d52013-04-12 14:08:42 +00004252 kfree(dev);
4253 return ret;
4254 }
Christoffer Dalla28ebea2016-08-09 19:13:01 +02004255 list_add(&dev->vm_node, &kvm->devices);
4256 mutex_unlock(&kvm->lock);
Scott Wood852b6d52013-04-12 14:08:42 +00004257
Christoffer Dall023e9fd2016-08-09 19:13:00 +02004258 if (ops->init)
4259 ops->init(dev);
4260
Jann Horncfa39382019-01-26 01:54:33 +01004261 kvm_get_kvm(kvm);
Yann Droneaud24009b02013-08-24 22:14:07 +02004262 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
Scott Wood852b6d52013-04-12 14:08:42 +00004263 if (ret < 0) {
Sean Christopherson149487b2019-10-21 15:58:42 -07004264 kvm_put_kvm_no_destroy(kvm);
Christoffer Dalla28ebea2016-08-09 19:13:01 +02004265 mutex_lock(&kvm->lock);
4266 list_del(&dev->vm_node);
4267 mutex_unlock(&kvm->lock);
Dan Carpentera0f1d212016-11-30 22:21:05 +03004268 ops->destroy(dev);
Scott Wood852b6d52013-04-12 14:08:42 +00004269 return ret;
4270 }
4271
Scott Wood852b6d52013-04-12 14:08:42 +00004272 cd->fd = ret;
4273 return 0;
4274}
4275
Alexander Graf92b591a2014-07-14 18:33:08 +02004276static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
4277{
4278 switch (arg) {
4279 case KVM_CAP_USER_MEMORY:
4280 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
4281 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
Alexander Graf92b591a2014-07-14 18:33:08 +02004282 case KVM_CAP_INTERNAL_ERROR_DATA:
4283#ifdef CONFIG_HAVE_KVM_MSI
4284 case KVM_CAP_SIGNAL_MSI:
4285#endif
Paul Mackerras297e2102014-06-30 20:51:13 +10004286#ifdef CONFIG_HAVE_KVM_IRQFD
Paolo Bonzinidc9be0f2015-03-05 11:54:46 +01004287 case KVM_CAP_IRQFD:
Alexander Graf92b591a2014-07-14 18:33:08 +02004288 case KVM_CAP_IRQFD_RESAMPLE:
4289#endif
Jason Wange9ea5062015-09-15 14:41:59 +08004290 case KVM_CAP_IOEVENTFD_ANY_LENGTH:
Alexander Graf92b591a2014-07-14 18:33:08 +02004291 case KVM_CAP_CHECK_EXTENSION_VM:
Paolo Bonzinie5d83c72017-02-16 10:40:56 +01004292 case KVM_CAP_ENABLE_CAP_VM:
David Matlackacd05782020-04-17 15:14:46 -07004293 case KVM_CAP_HALT_POLL:
Alexander Graf92b591a2014-07-14 18:33:08 +02004294 return 1;
Paolo Bonzini4b4357e2017-03-31 13:53:23 +02004295#ifdef CONFIG_KVM_MMIO
Paolo Bonzini30422552017-03-31 13:53:22 +02004296 case KVM_CAP_COALESCED_MMIO:
4297 return KVM_COALESCED_MMIO_PAGE_OFFSET;
Peng Hao0804c842018-10-14 07:09:55 +08004298 case KVM_CAP_COALESCED_PIO:
4299 return 1;
Paolo Bonzini30422552017-03-31 13:53:22 +02004300#endif
Jay Zhou3c9bd402020-02-27 09:32:27 +08004301#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4302 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
4303 return KVM_DIRTY_LOG_MANUAL_CAPS;
4304#endif
Alexander Graf92b591a2014-07-14 18:33:08 +02004305#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4306 case KVM_CAP_IRQ_ROUTING:
4307 return KVM_MAX_IRQ_ROUTES;
4308#endif
Paolo Bonzinif481b062015-05-17 17:30:37 +02004309#if KVM_ADDRESS_SPACE_NUM > 1
4310 case KVM_CAP_MULTI_ADDRESS_SPACE:
4311 return KVM_ADDRESS_SPACE_NUM;
4312#endif
Paolo Bonzinic110ae52019-03-28 17:24:03 +01004313 case KVM_CAP_NR_MEMSLOTS:
4314 return KVM_USER_MEM_SLOTS;
Peter Xufb04a1e2020-09-30 21:22:22 -04004315 case KVM_CAP_DIRTY_LOG_RING:
David Woodhousedc70ec22021-11-21 12:54:40 +00004316#ifdef CONFIG_HAVE_KVM_DIRTY_RING
Peter Xufb04a1e2020-09-30 21:22:22 -04004317 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4318#else
4319 return 0;
4320#endif
Jing Zhangce55c042021-06-18 22:27:06 +00004321 case KVM_CAP_BINARY_STATS_FD:
4322 return 1;
Alexander Graf92b591a2014-07-14 18:33:08 +02004323 default:
4324 break;
4325 }
4326 return kvm_vm_ioctl_check_extension(kvm, arg);
4327}
4328
Peter Xufb04a1e2020-09-30 21:22:22 -04004329static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4330{
4331 int r;
4332
4333 if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4334 return -EINVAL;
4335
 4336	/* the size must be a power of 2 */
4337 if (!size || (size & (size - 1)))
4338 return -EINVAL;
4339
 4340	/* Must be at least one page and large enough to hold the reserved entries */
4341 if (size < kvm_dirty_ring_get_rsvd_entries() *
4342 sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4343 return -EINVAL;
4344
4345 if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4346 sizeof(struct kvm_dirty_gfn))
4347 return -E2BIG;
4348
 4349	/* The ring size can only be set once */
4350 if (kvm->dirty_ring_size)
4351 return -EINVAL;
4352
4353 mutex_lock(&kvm->lock);
4354
4355 if (kvm->created_vcpus) {
 4356		/* The size cannot be changed after vCPUs have been created */
4357 r = -EINVAL;
4358 } else {
4359 kvm->dirty_ring_size = size;
4360 r = 0;
4361 }
4362
4363 mutex_unlock(&kvm->lock);
4364 return r;
4365}
4366
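/*
 * Reset the dirty rings of all vCPUs and flush the TLBs if any entries were
 * collected; returns the total number of reset entries.
 */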
4367static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4368{
Marc Zyngier46808a42021-11-16 16:04:02 +00004369 unsigned long i;
Peter Xufb04a1e2020-09-30 21:22:22 -04004370 struct kvm_vcpu *vcpu;
4371 int cleared = 0;
4372
4373 if (!kvm->dirty_ring_size)
4374 return -EINVAL;
4375
4376 mutex_lock(&kvm->slots_lock);
4377
4378 kvm_for_each_vcpu(i, vcpu, kvm)
4379 cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4380
4381 mutex_unlock(&kvm->slots_lock);
4382
4383 if (cleared)
4384 kvm_flush_remote_tlbs(kvm);
4385
4386 return cleared;
4387}
4388
Paolo Bonzinie5d83c72017-02-16 10:40:56 +01004389int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4390 struct kvm_enable_cap *cap)
4391{
4392 return -EINVAL;
4393}
4394
4395static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
4396 struct kvm_enable_cap *cap)
4397{
4398 switch (cap->cap) {
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02004399#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
Jay Zhou3c9bd402020-02-27 09:32:27 +08004400 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
4401 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
4402
4403 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
4404 allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
4405
4406 if (cap->flags || (cap->args[0] & ~allowed_options))
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02004407 return -EINVAL;
4408 kvm->manual_dirty_log_protect = cap->args[0];
4409 return 0;
Jay Zhou3c9bd402020-02-27 09:32:27 +08004410 }
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02004411#endif
David Matlackacd05782020-04-17 15:14:46 -07004412 case KVM_CAP_HALT_POLL: {
4413 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
4414 return -EINVAL;
4415
4416 kvm->max_halt_poll_ns = cap->args[0];
4417 return 0;
4418 }
Peter Xufb04a1e2020-09-30 21:22:22 -04004419 case KVM_CAP_DIRTY_LOG_RING:
4420 return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
Paolo Bonzinie5d83c72017-02-16 10:40:56 +01004421 default:
4422 return kvm_vm_ioctl_enable_cap(kvm, cap);
4423 }
4424}
4425
Jing Zhangfcfe1ba2021-06-18 22:27:05 +00004426static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
4427 size_t size, loff_t *offset)
4428{
4429 struct kvm *kvm = file->private_data;
4430
4431 return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
4432 &kvm_vm_stats_desc[0], &kvm->stat,
4433 sizeof(kvm->stat), user_buffer, size, offset);
4434}
4435
4436static const struct file_operations kvm_vm_stats_fops = {
4437 .read = kvm_vm_stats_read,
4438 .llseek = noop_llseek,
4439};
4440
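/* Create a read-only file descriptor for reading the VM-wide binary stats. */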
4441static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
4442{
4443 int fd;
4444 struct file *file;
4445
4446 fd = get_unused_fd_flags(O_CLOEXEC);
4447 if (fd < 0)
4448 return fd;
4449
4450 file = anon_inode_getfile("kvm-vm-stats",
4451 &kvm_vm_stats_fops, kvm, O_RDONLY);
4452 if (IS_ERR(file)) {
4453 put_unused_fd(fd);
4454 return PTR_ERR(file);
4455 }
4456 file->f_mode |= FMODE_PREAD;
4457 fd_install(fd, file);
4458
4459 return fd;
4460}
4461
Avi Kivitybccf2152007-02-21 18:04:26 +02004462static long kvm_vm_ioctl(struct file *filp,
4463 unsigned int ioctl, unsigned long arg)
4464{
4465 struct kvm *kvm = filp->private_data;
4466 void __user *argp = (void __user *)arg;
Carsten Otte1fe779f2007-10-29 16:08:35 +01004467 int r;
Avi Kivitybccf2152007-02-21 18:04:26 +02004468
Paolo Bonzinif4d31652021-11-11 10:13:38 -05004469 if (kvm->mm != current->mm || kvm->vm_dead)
Avi Kivity6d4e4c42007-11-21 16:41:05 +02004470 return -EIO;
Avi Kivitybccf2152007-02-21 18:04:26 +02004471 switch (ioctl) {
4472 case KVM_CREATE_VCPU:
4473 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
Avi Kivitybccf2152007-02-21 18:04:26 +02004474 break;
Paolo Bonzinie5d83c72017-02-16 10:40:56 +01004475 case KVM_ENABLE_CAP: {
4476 struct kvm_enable_cap cap;
4477
4478 r = -EFAULT;
4479 if (copy_from_user(&cap, argp, sizeof(cap)))
4480 goto out;
4481 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
4482 break;
4483 }
Izik Eidus6fc138d2007-10-09 19:20:39 +02004484 case KVM_SET_USER_MEMORY_REGION: {
4485 struct kvm_userspace_memory_region kvm_userspace_mem;
4486
4487 r = -EFAULT;
4488 if (copy_from_user(&kvm_userspace_mem, argp,
Xiubo Li893bdbf2015-02-26 14:58:19 +08004489 sizeof(kvm_userspace_mem)))
Izik Eidus6fc138d2007-10-09 19:20:39 +02004490 goto out;
4491
Takuya Yoshikawa47ae31e2013-02-27 19:43:00 +09004492 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
Avi Kivity6aa8b732006-12-10 02:21:36 -08004493 break;
4494 }
4495 case KVM_GET_DIRTY_LOG: {
4496 struct kvm_dirty_log log;
4497
4498 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08004499 if (copy_from_user(&log, argp, sizeof(log)))
Avi Kivity6aa8b732006-12-10 02:21:36 -08004500 goto out;
Avi Kivity2c6f5df2007-02-20 18:27:58 +02004501 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
Avi Kivity6aa8b732006-12-10 02:21:36 -08004502 break;
4503 }
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02004504#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4505 case KVM_CLEAR_DIRTY_LOG: {
4506 struct kvm_clear_dirty_log log;
4507
4508 r = -EFAULT;
4509 if (copy_from_user(&log, argp, sizeof(log)))
4510 goto out;
4511 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4512 break;
4513 }
4514#endif
Paolo Bonzini4b4357e2017-03-31 13:53:23 +02004515#ifdef CONFIG_KVM_MMIO
Laurent Vivier5f94c172008-05-30 16:05:54 +02004516 case KVM_REGISTER_COALESCED_MMIO: {
4517 struct kvm_coalesced_mmio_zone zone;
Xiubo Lif95ef0cd2015-02-26 14:58:23 +08004518
Laurent Vivier5f94c172008-05-30 16:05:54 +02004519 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08004520 if (copy_from_user(&zone, argp, sizeof(zone)))
Laurent Vivier5f94c172008-05-30 16:05:54 +02004521 goto out;
Laurent Vivier5f94c172008-05-30 16:05:54 +02004522 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
Laurent Vivier5f94c172008-05-30 16:05:54 +02004523 break;
4524 }
4525 case KVM_UNREGISTER_COALESCED_MMIO: {
4526 struct kvm_coalesced_mmio_zone zone;
Xiubo Lif95ef0cd2015-02-26 14:58:23 +08004527
Laurent Vivier5f94c172008-05-30 16:05:54 +02004528 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08004529 if (copy_from_user(&zone, argp, sizeof(zone)))
Laurent Vivier5f94c172008-05-30 16:05:54 +02004530 goto out;
Laurent Vivier5f94c172008-05-30 16:05:54 +02004531 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
Laurent Vivier5f94c172008-05-30 16:05:54 +02004532 break;
4533 }
4534#endif
Gregory Haskins721eecbf2009-05-20 10:30:49 -04004535 case KVM_IRQFD: {
4536 struct kvm_irqfd data;
4537
4538 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08004539 if (copy_from_user(&data, argp, sizeof(data)))
Gregory Haskins721eecbf2009-05-20 10:30:49 -04004540 goto out;
Alex Williamsond4db2932012-06-29 09:56:08 -06004541 r = kvm_irqfd(kvm, &data);
Gregory Haskins721eecbf2009-05-20 10:30:49 -04004542 break;
4543 }
Gregory Haskinsd34e6b12009-07-07 17:08:49 -04004544 case KVM_IOEVENTFD: {
4545 struct kvm_ioeventfd data;
4546
4547 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08004548 if (copy_from_user(&data, argp, sizeof(data)))
Gregory Haskinsd34e6b12009-07-07 17:08:49 -04004549 goto out;
4550 r = kvm_ioeventfd(kvm, &data);
4551 break;
4552 }
Jan Kiszka07975ad2012-03-29 21:14:12 +02004553#ifdef CONFIG_HAVE_KVM_MSI
4554 case KVM_SIGNAL_MSI: {
4555 struct kvm_msi msi;
4556
4557 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08004558 if (copy_from_user(&msi, argp, sizeof(msi)))
Jan Kiszka07975ad2012-03-29 21:14:12 +02004559 goto out;
4560 r = kvm_send_userspace_msi(kvm, &msi);
4561 break;
4562 }
4563#endif
Christoffer Dall23d43cf2012-07-24 08:51:20 -04004564#ifdef __KVM_HAVE_IRQ_LINE
4565 case KVM_IRQ_LINE_STATUS:
4566 case KVM_IRQ_LINE: {
4567 struct kvm_irq_level irq_event;
4568
4569 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08004570 if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
Christoffer Dall23d43cf2012-07-24 08:51:20 -04004571 goto out;
4572
Yang Zhangaa2fbe62013-04-11 19:21:40 +08004573 r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
4574 ioctl == KVM_IRQ_LINE_STATUS);
Christoffer Dall23d43cf2012-07-24 08:51:20 -04004575 if (r)
4576 goto out;
4577
4578 r = -EFAULT;
4579 if (ioctl == KVM_IRQ_LINE_STATUS) {
Xiubo Li893bdbf2015-02-26 14:58:19 +08004580 if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
Christoffer Dall23d43cf2012-07-24 08:51:20 -04004581 goto out;
4582 }
4583
4584 r = 0;
4585 break;
4586 }
4587#endif
Alexander Grafaa8d5942013-04-15 21:12:53 +02004588#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4589 case KVM_SET_GSI_ROUTING: {
4590 struct kvm_irq_routing routing;
4591 struct kvm_irq_routing __user *urouting;
Paolo Bonzinif8c1b852016-06-01 14:09:22 +02004592 struct kvm_irq_routing_entry *entries = NULL;
Alexander Grafaa8d5942013-04-15 21:12:53 +02004593
4594 r = -EFAULT;
4595 if (copy_from_user(&routing, argp, sizeof(routing)))
4596 goto out;
4597 r = -EINVAL;
David Hildenbrand5c0aea02017-04-28 17:06:20 +02004598 if (!kvm_arch_can_set_irq_routing(kvm))
4599 goto out;
Xiubo Licaf1ff22016-06-15 18:00:33 +08004600 if (routing.nr > KVM_MAX_IRQ_ROUTES)
Alexander Grafaa8d5942013-04-15 21:12:53 +02004601 goto out;
4602 if (routing.flags)
4603 goto out;
Paolo Bonzinif8c1b852016-06-01 14:09:22 +02004604 if (routing.nr) {
Paolo Bonzinif8c1b852016-06-01 14:09:22 +02004605 urouting = argp;
Denis Efremov7ec28e22020-06-03 13:11:31 +03004606 entries = vmemdup_user(urouting->entries,
4607 array_size(sizeof(*entries),
4608 routing.nr));
4609 if (IS_ERR(entries)) {
4610 r = PTR_ERR(entries);
4611 goto out;
4612 }
Paolo Bonzinif8c1b852016-06-01 14:09:22 +02004613 }
Alexander Grafaa8d5942013-04-15 21:12:53 +02004614 r = kvm_set_irq_routing(kvm, entries, routing.nr,
4615 routing.flags);
Denis Efremov7ec28e22020-06-03 13:11:31 +03004616 kvfree(entries);
Alexander Grafaa8d5942013-04-15 21:12:53 +02004617 break;
4618 }
4619#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
Scott Wood852b6d52013-04-12 14:08:42 +00004620 case KVM_CREATE_DEVICE: {
4621 struct kvm_create_device cd;
4622
4623 r = -EFAULT;
4624 if (copy_from_user(&cd, argp, sizeof(cd)))
4625 goto out;
4626
4627 r = kvm_ioctl_create_device(kvm, &cd);
4628 if (r)
4629 goto out;
4630
4631 r = -EFAULT;
4632 if (copy_to_user(argp, &cd, sizeof(cd)))
4633 goto out;
4634
4635 r = 0;
4636 break;
4637 }
Alexander Graf92b591a2014-07-14 18:33:08 +02004638 case KVM_CHECK_EXTENSION:
4639 r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
4640 break;
Peter Xufb04a1e2020-09-30 21:22:22 -04004641 case KVM_RESET_DIRTY_RINGS:
4642 r = kvm_vm_ioctl_reset_dirty_pages(kvm);
4643 break;
Jing Zhangfcfe1ba2021-06-18 22:27:05 +00004644 case KVM_GET_STATS_FD:
4645 r = kvm_vm_ioctl_get_stats_fd(kvm);
4646 break;
Avi Kivityf17abe92007-02-21 19:28:04 +02004647 default:
Carsten Otte1fe779f2007-10-29 16:08:35 +01004648 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
Avi Kivityf17abe92007-02-21 19:28:04 +02004649 }
4650out:
4651 return r;
4652}
4653
Christian Borntraegerde8e5d72015-02-03 09:35:15 +01004654#ifdef CONFIG_KVM_COMPAT
Arnd Bergmann6ff58942009-10-22 14:19:27 +02004655struct compat_kvm_dirty_log {
4656 __u32 slot;
4657 __u32 padding1;
4658 union {
4659 compat_uptr_t dirty_bitmap; /* one bit per page */
4660 __u64 padding2;
4661 };
4662};
4663
Paolo Bonzini8750f9b2021-07-27 08:43:10 -04004664struct compat_kvm_clear_dirty_log {
4665 __u32 slot;
4666 __u32 num_pages;
4667 __u64 first_page;
4668 union {
4669 compat_uptr_t dirty_bitmap; /* one bit per page */
4670 __u64 padding2;
4671 };
4672};
4673
Arnd Bergmann6ff58942009-10-22 14:19:27 +02004674static long kvm_vm_compat_ioctl(struct file *filp,
4675 unsigned int ioctl, unsigned long arg)
4676{
4677 struct kvm *kvm = filp->private_data;
4678 int r;
4679
Paolo Bonzinif4d31652021-11-11 10:13:38 -05004680 if (kvm->mm != current->mm || kvm->vm_dead)
Arnd Bergmann6ff58942009-10-22 14:19:27 +02004681 return -EIO;
4682 switch (ioctl) {
Paolo Bonzini8750f9b2021-07-27 08:43:10 -04004683#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4684 case KVM_CLEAR_DIRTY_LOG: {
4685 struct compat_kvm_clear_dirty_log compat_log;
4686 struct kvm_clear_dirty_log log;
4687
4688 if (copy_from_user(&compat_log, (void __user *)arg,
4689 sizeof(compat_log)))
4690 return -EFAULT;
4691 log.slot = compat_log.slot;
4692 log.num_pages = compat_log.num_pages;
4693 log.first_page = compat_log.first_page;
4694 log.padding2 = compat_log.padding2;
4695 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4696
4697 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4698 break;
4699 }
4700#endif
Arnd Bergmann6ff58942009-10-22 14:19:27 +02004701 case KVM_GET_DIRTY_LOG: {
4702 struct compat_kvm_dirty_log compat_log;
4703 struct kvm_dirty_log log;
4704
Arnd Bergmann6ff58942009-10-22 14:19:27 +02004705 if (copy_from_user(&compat_log, (void __user *)arg,
4706 sizeof(compat_log)))
Markus Elfringf6a3b162017-01-22 11:30:21 +01004707 return -EFAULT;
Arnd Bergmann6ff58942009-10-22 14:19:27 +02004708 log.slot = compat_log.slot;
4709 log.padding1 = compat_log.padding1;
4710 log.padding2 = compat_log.padding2;
4711 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4712
4713 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
Arnd Bergmann6ff58942009-10-22 14:19:27 +02004714 break;
4715 }
4716 default:
4717 r = kvm_vm_ioctl(filp, ioctl, arg);
4718 }
Arnd Bergmann6ff58942009-10-22 14:19:27 +02004719 return r;
4720}
4721#endif
4722
Christian Borntraeger3d3aab12008-12-02 11:17:32 +01004723static struct file_operations kvm_vm_fops = {
Avi Kivityf17abe92007-02-21 19:28:04 +02004724 .release = kvm_vm_release,
4725 .unlocked_ioctl = kvm_vm_ioctl,
Arnd Bergmann6038f372010-08-15 18:52:59 +02004726 .llseek = noop_llseek,
Marc Zyngier7ddfd3e2018-06-17 10:16:21 +01004727 KVM_COMPAT(kvm_vm_compat_ioctl),
Avi Kivityf17abe92007-02-21 19:28:04 +02004728};
4729
Nathan Tempelman54526d12021-04-08 22:32:14 +00004730bool file_is_kvm(struct file *file)
4731{
4732 return file && file->f_op == &kvm_vm_fops;
4733}
4734EXPORT_SYMBOL_GPL(file_is_kvm);
4735
Carsten Ottee08b9632012-01-04 10:25:20 +01004736static int kvm_dev_ioctl_create_vm(unsigned long type)
Avi Kivityf17abe92007-02-21 19:28:04 +02004737{
Heiko Carstensaac87632010-10-27 17:22:10 +02004738 int r;
Avi Kivityf17abe92007-02-21 19:28:04 +02004739 struct kvm *kvm;
Al Viro506cfba2016-07-14 18:54:17 +02004740 struct file *file;
Avi Kivityf17abe92007-02-21 19:28:04 +02004741
Carsten Ottee08b9632012-01-04 10:25:20 +01004742 kvm = kvm_create_vm(type);
Avi Kivityd6d28162007-06-28 08:38:16 -04004743 if (IS_ERR(kvm))
4744 return PTR_ERR(kvm);
Paolo Bonzini4b4357e2017-03-31 13:53:23 +02004745#ifdef CONFIG_KVM_MMIO
Takuya Yoshikawa6ce5a092010-03-15 22:13:30 +09004746 r = kvm_coalesced_mmio_init(kvm);
Markus Elfring78588332017-11-21 13:40:17 +01004747 if (r < 0)
4748 goto put_kvm;
Takuya Yoshikawa6ce5a092010-03-15 22:13:30 +09004749#endif
Al Viro506cfba2016-07-14 18:54:17 +02004750 r = get_unused_fd_flags(O_CLOEXEC);
Markus Elfring78588332017-11-21 13:40:17 +01004751 if (r < 0)
4752 goto put_kvm;
4753
Jing Zhangfcfe1ba2021-06-18 22:27:05 +00004754 snprintf(kvm->stats_id, sizeof(kvm->stats_id),
4755 "kvm-%d", task_pid_nr(current));
4756
Al Viro506cfba2016-07-14 18:54:17 +02004757 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
4758 if (IS_ERR(file)) {
4759 put_unused_fd(r);
Markus Elfring78588332017-11-21 13:40:17 +01004760 r = PTR_ERR(file);
4761 goto put_kvm;
Al Viro506cfba2016-07-14 18:54:17 +02004762 }
Janosch Frank536a6f82016-05-18 13:26:23 +02004763
Paolo Bonzini525df862017-06-27 15:45:09 +02004764 /*
4765 * Don't call kvm_put_kvm anymore at this point; file->f_op is
4766 * already set, with ->release() being kvm_vm_release(). In error
4767 * cases it will be called by the final fput(file) and will take
4768 * care of doing kvm_put_kvm(kvm).
4769 */
Janosch Frank536a6f82016-05-18 13:26:23 +02004770 if (kvm_create_vm_debugfs(kvm, r) < 0) {
Al Viro506cfba2016-07-14 18:54:17 +02004771 put_unused_fd(r);
4772 fput(file);
Janosch Frank536a6f82016-05-18 13:26:23 +02004773 return -ENOMEM;
4774 }
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02004775 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
Avi Kivityf17abe92007-02-21 19:28:04 +02004776
Al Viro506cfba2016-07-14 18:54:17 +02004777 fd_install(r, file);
Heiko Carstensaac87632010-10-27 17:22:10 +02004778 return r;
Markus Elfring78588332017-11-21 13:40:17 +01004779
4780put_kvm:
4781 kvm_put_kvm(kvm);
4782 return r;
Avi Kivityf17abe92007-02-21 19:28:04 +02004783}
4784
4785static long kvm_dev_ioctl(struct file *filp,
4786 unsigned int ioctl, unsigned long arg)
4787{
Avi Kivity07c45a32007-03-07 13:05:38 +02004788 long r = -EINVAL;
Avi Kivityf17abe92007-02-21 19:28:04 +02004789
4790 switch (ioctl) {
4791 case KVM_GET_API_VERSION:
Avi Kivityf0fe5102007-03-07 13:11:17 +02004792 if (arg)
4793 goto out;
Avi Kivityf17abe92007-02-21 19:28:04 +02004794 r = KVM_API_VERSION;
4795 break;
4796 case KVM_CREATE_VM:
Carsten Ottee08b9632012-01-04 10:25:20 +01004797 r = kvm_dev_ioctl_create_vm(arg);
Avi Kivityf17abe92007-02-21 19:28:04 +02004798 break;
Zhang Xiantao018d00d2007-11-15 23:07:47 +08004799 case KVM_CHECK_EXTENSION:
Alexander Graf784aa3d2014-07-14 18:27:35 +02004800 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
Avi Kivity5d308f42007-03-01 17:56:20 +02004801 break;
Avi Kivity07c45a32007-03-07 13:05:38 +02004802 case KVM_GET_VCPU_MMAP_SIZE:
Avi Kivity07c45a32007-03-07 13:05:38 +02004803 if (arg)
4804 goto out;
Avi Kivityadb1ff42008-01-24 15:13:08 +02004805 r = PAGE_SIZE; /* struct kvm_run */
4806#ifdef CONFIG_X86
4807 r += PAGE_SIZE; /* pio data page */
4808#endif
Paolo Bonzini4b4357e2017-03-31 13:53:23 +02004809#ifdef CONFIG_KVM_MMIO
Laurent Vivier5f94c172008-05-30 16:05:54 +02004810 r += PAGE_SIZE; /* coalesced mmio ring page */
4811#endif
Avi Kivity07c45a32007-03-07 13:05:38 +02004812 break;
Feng(Eric) Liud4c9ff22008-04-10 08:47:53 -04004813 case KVM_TRACE_ENABLE:
4814 case KVM_TRACE_PAUSE:
4815 case KVM_TRACE_DISABLE:
Marcelo Tosatti2023a292009-06-18 11:47:28 -03004816 r = -EOPNOTSUPP;
Feng(Eric) Liud4c9ff22008-04-10 08:47:53 -04004817 break;
Avi Kivity6aa8b732006-12-10 02:21:36 -08004818 default:
Carsten Otte043405e2007-10-10 17:16:19 +02004819 return kvm_arch_dev_ioctl(filp, ioctl, arg);
Avi Kivity6aa8b732006-12-10 02:21:36 -08004820 }
4821out:
4822 return r;
4823}
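/*
 * Illustrative userspace sketch (not part of this file; error handling
 * omitted, machine type 0 assumed for KVM_CREATE_VM): the system ioctls
 * handled above are typically driven as
 *
 *	int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
 *	if (ioctl(kvm, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
 *		exit(1);
 *	int vm = ioctl(kvm, KVM_CREATE_VM, 0);
 *	long run_sz = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0);
 *
 * The fd returned by KVM_CREATE_VM is what all further VM-scoped ioctls
 * (handled by kvm_vm_fops above) are issued against.
 */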
4824
Avi Kivity6aa8b732006-12-10 02:21:36 -08004825static struct file_operations kvm_chardev_ops = {
Avi Kivity6aa8b732006-12-10 02:21:36 -08004826 .unlocked_ioctl = kvm_dev_ioctl,
Arnd Bergmann6038f372010-08-15 18:52:59 +02004827 .llseek = noop_llseek,
Marc Zyngier7ddfd3e2018-06-17 10:16:21 +01004828 KVM_COMPAT(kvm_dev_ioctl),
Avi Kivity6aa8b732006-12-10 02:21:36 -08004829};
4830
4831static struct miscdevice kvm_dev = {
Avi Kivitybbe44322007-03-04 13:27:36 +02004832 KVM_MINOR,
Avi Kivity6aa8b732006-12-10 02:21:36 -08004833 "kvm",
4834 &kvm_chardev_ops,
4835};
4836
Takuya Yoshikawa75b71272010-11-16 17:37:41 +09004837static void hardware_enable_nolock(void *junk)
Avi Kivity1b6c0162007-05-24 13:03:52 +03004838{
4839 int cpu = raw_smp_processor_id();
Alexander Graf10474ae2009-09-15 11:37:46 +02004840 int r;
Avi Kivity1b6c0162007-05-24 13:03:52 +03004841
Rusty Russell7f59f492008-12-07 21:25:45 +10304842 if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
Avi Kivity1b6c0162007-05-24 13:03:52 +03004843 return;
Alexander Graf10474ae2009-09-15 11:37:46 +02004844
Rusty Russell7f59f492008-12-07 21:25:45 +10304845 cpumask_set_cpu(cpu, cpus_hardware_enabled);
Alexander Graf10474ae2009-09-15 11:37:46 +02004846
Radim Krčmář13a34e02014-08-28 15:13:03 +02004847 r = kvm_arch_hardware_enable();
Alexander Graf10474ae2009-09-15 11:37:46 +02004848
4849 if (r) {
4850 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4851 atomic_inc(&hardware_enable_failed);
Xiubo Li1170adc2015-02-26 14:58:26 +08004852 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
Alexander Graf10474ae2009-09-15 11:37:46 +02004853 }
Avi Kivity1b6c0162007-05-24 13:03:52 +03004854}
4855
Thomas Gleixner8c18b2d2016-07-13 17:16:37 +00004856static int kvm_starting_cpu(unsigned int cpu)
Takuya Yoshikawa75b71272010-11-16 17:37:41 +09004857{
Paolo Bonzini4a937f92013-09-10 12:58:35 +02004858 raw_spin_lock(&kvm_count_lock);
Paolo Bonzini4fa92fb2013-09-10 12:57:17 +02004859 if (kvm_usage_count)
4860 hardware_enable_nolock(NULL);
Paolo Bonzini4a937f92013-09-10 12:58:35 +02004861 raw_spin_unlock(&kvm_count_lock);
Thomas Gleixner8c18b2d2016-07-13 17:16:37 +00004862 return 0;
Takuya Yoshikawa75b71272010-11-16 17:37:41 +09004863}
4864
4865static void hardware_disable_nolock(void *junk)
Avi Kivity1b6c0162007-05-24 13:03:52 +03004866{
4867 int cpu = raw_smp_processor_id();
4868
Rusty Russell7f59f492008-12-07 21:25:45 +10304869 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
Avi Kivity1b6c0162007-05-24 13:03:52 +03004870 return;
Rusty Russell7f59f492008-12-07 21:25:45 +10304871 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
Radim Krčmář13a34e02014-08-28 15:13:03 +02004872 kvm_arch_hardware_disable();
Avi Kivity1b6c0162007-05-24 13:03:52 +03004873}
4874
Thomas Gleixner8c18b2d2016-07-13 17:16:37 +00004875static int kvm_dying_cpu(unsigned int cpu)
Takuya Yoshikawa75b71272010-11-16 17:37:41 +09004876{
Paolo Bonzini4a937f92013-09-10 12:58:35 +02004877 raw_spin_lock(&kvm_count_lock);
Paolo Bonzini4fa92fb2013-09-10 12:57:17 +02004878 if (kvm_usage_count)
4879 hardware_disable_nolock(NULL);
Paolo Bonzini4a937f92013-09-10 12:58:35 +02004880 raw_spin_unlock(&kvm_count_lock);
Thomas Gleixner8c18b2d2016-07-13 17:16:37 +00004881 return 0;
Takuya Yoshikawa75b71272010-11-16 17:37:41 +09004882}
4883
Alexander Graf10474ae2009-09-15 11:37:46 +02004884static void hardware_disable_all_nolock(void)
4885{
4886 BUG_ON(!kvm_usage_count);
4887
4888 kvm_usage_count--;
4889 if (!kvm_usage_count)
Takuya Yoshikawa75b71272010-11-16 17:37:41 +09004890 on_each_cpu(hardware_disable_nolock, NULL, 1);
Alexander Graf10474ae2009-09-15 11:37:46 +02004891}
4892
4893static void hardware_disable_all(void)
4894{
Paolo Bonzini4a937f92013-09-10 12:58:35 +02004895 raw_spin_lock(&kvm_count_lock);
Alexander Graf10474ae2009-09-15 11:37:46 +02004896 hardware_disable_all_nolock();
Paolo Bonzini4a937f92013-09-10 12:58:35 +02004897 raw_spin_unlock(&kvm_count_lock);
Alexander Graf10474ae2009-09-15 11:37:46 +02004898}
4899
4900static int hardware_enable_all(void)
4901{
4902 int r = 0;
4903
Paolo Bonzini4a937f92013-09-10 12:58:35 +02004904 raw_spin_lock(&kvm_count_lock);
Alexander Graf10474ae2009-09-15 11:37:46 +02004905
4906 kvm_usage_count++;
4907 if (kvm_usage_count == 1) {
4908 atomic_set(&hardware_enable_failed, 0);
Takuya Yoshikawa75b71272010-11-16 17:37:41 +09004909 on_each_cpu(hardware_enable_nolock, NULL, 1);
Alexander Graf10474ae2009-09-15 11:37:46 +02004910
4911 if (atomic_read(&hardware_enable_failed)) {
4912 hardware_disable_all_nolock();
4913 r = -EBUSY;
4914 }
4915 }
4916
Paolo Bonzini4a937f92013-09-10 12:58:35 +02004917 raw_spin_unlock(&kvm_count_lock);
Alexander Graf10474ae2009-09-15 11:37:46 +02004918
4919 return r;
4920}
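/*
 * hardware_enable_all()/hardware_disable_all() reference-count VM creation:
 * when kvm_usage_count goes 0->1, virtualization is switched on (e.g. VMXON
 * on Intel) on every online CPU via the nolock helpers above, and it is
 * switched off again when the last VM goes away.  kvm_starting_cpu() and
 * kvm_dying_cpu() keep CPUs hotplugged in the meantime consistent with that
 * count.
 */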
4921
Rusty Russell9a2b85c2007-07-17 23:17:55 +10004922static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
Mike Dayd77c26f2007-10-08 09:02:08 -04004923 void *v)
Rusty Russell9a2b85c2007-07-17 23:17:55 +10004924{
Sheng Yang8e1c1812009-04-29 11:09:04 +08004925 /*
4926 * Some (well, at least mine) BIOSes hang on reboot if
4927 * in vmx root mode.
4928 *
4929	 * Intel TXT also requires VMX to be off on all CPUs when the system shuts down.
4930 */
Xiubo Li1170adc2015-02-26 14:58:26 +08004931 pr_info("kvm: exiting hardware virtualization\n");
Sheng Yang8e1c1812009-04-29 11:09:04 +08004932 kvm_rebooting = true;
Takuya Yoshikawa75b71272010-11-16 17:37:41 +09004933 on_each_cpu(hardware_disable_nolock, NULL, 1);
Rusty Russell9a2b85c2007-07-17 23:17:55 +10004934 return NOTIFY_OK;
4935}
4936
4937static struct notifier_block kvm_reboot_notifier = {
4938 .notifier_call = kvm_reboot,
4939 .priority = 0,
4940};
4941
Marcelo Tosattie93f8a02009-12-23 14:35:24 -02004942static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
Gregory Haskins2eeb2e92007-05-31 14:08:53 -04004943{
4944 int i;
4945
4946 for (i = 0; i < bus->dev_count; i++) {
Sasha Levin743eeb02011-07-27 16:00:48 +03004947 struct kvm_io_device *pos = bus->range[i].dev;
Gregory Haskins2eeb2e92007-05-31 14:08:53 -04004948
4949 kvm_iodevice_destructor(pos);
4950 }
Marcelo Tosattie93f8a02009-12-23 14:35:24 -02004951 kfree(bus);
Gregory Haskins2eeb2e92007-05-31 14:08:53 -04004952}
4953
Paolo Bonzinic21fbff2013-08-27 15:41:41 +02004954static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
Xiubo Li20e87b72015-02-26 14:58:25 +08004955 const struct kvm_io_range *r2)
Sasha Levin743eeb02011-07-27 16:00:48 +03004956{
Jason Wang8f4216c72015-09-15 14:41:57 +08004957 gpa_t addr1 = r1->addr;
4958 gpa_t addr2 = r2->addr;
4959
4960 if (addr1 < addr2)
Sasha Levin743eeb02011-07-27 16:00:48 +03004961 return -1;
Jason Wang8f4216c72015-09-15 14:41:57 +08004962
4963 /* If r2->len == 0, match the exact address. If r2->len != 0,
4964 * accept any overlapping write. Any order is acceptable for
4965 * overlapping ranges, because kvm_io_bus_get_first_dev ensures
4966 * we process all of them.
4967 */
4968 if (r2->len) {
4969 addr1 += r1->len;
4970 addr2 += r2->len;
4971 }
4972
4973 if (addr1 > addr2)
Sasha Levin743eeb02011-07-27 16:00:48 +03004974 return 1;
Jason Wang8f4216c72015-09-15 14:41:57 +08004975
Sasha Levin743eeb02011-07-27 16:00:48 +03004976 return 0;
4977}
4978
Paolo Bonzinia343c9b2013-07-16 13:03:29 +02004979static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
4980{
Paolo Bonzinic21fbff2013-08-27 15:41:41 +02004981 return kvm_io_bus_cmp(p1, p2);
Paolo Bonzinia343c9b2013-07-16 13:03:29 +02004982}
4983
Geoff Levand39369f72013-04-05 19:20:30 +00004984static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
Sasha Levin743eeb02011-07-27 16:00:48 +03004985 gpa_t addr, int len)
4986{
4987 struct kvm_io_range *range, key;
4988 int off;
4989
4990 key = (struct kvm_io_range) {
4991 .addr = addr,
4992 .len = len,
4993 };
4994
4995 range = bsearch(&key, bus->range, bus->dev_count,
4996 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
4997 if (range == NULL)
4998 return -ENOENT;
4999
5000 off = range - bus->range;
5001
Paolo Bonzinic21fbff2013-08-27 15:41:41 +02005002 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
Sasha Levin743eeb02011-07-27 16:00:48 +03005003 off--;
5004
5005 return off;
5006}
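/*
 * Note on the walk-back loop above: several entries may compare equal for
 * the same address (e.g. a zero-length ioeventfd registered alongside a
 * regular MMIO device), and bsearch() may land on any of them.  Stepping
 * back to the first matching entry lets __kvm_io_bus_write() and
 * __kvm_io_bus_read() below iterate forward over every device whose range
 * matches the access.
 */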
5007
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00005008static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
Cornelia Huck126a5af2013-07-03 16:30:53 +02005009 struct kvm_io_range *range, const void *val)
5010{
5011 int idx;
5012
5013 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5014 if (idx < 0)
5015 return -EOPNOTSUPP;
5016
5017 while (idx < bus->dev_count &&
Paolo Bonzinic21fbff2013-08-27 15:41:41 +02005018 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00005019 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
Cornelia Huck126a5af2013-07-03 16:30:53 +02005020 range->len, val))
5021 return idx;
5022 idx++;
5023 }
5024
5025 return -EOPNOTSUPP;
5026}
5027
Michael S. Tsirkinbda90202009-06-29 22:24:32 +03005028/* kvm_io_bus_write - called under kvm->slots_lock */
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00005029int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
Michael S. Tsirkinbda90202009-06-29 22:24:32 +03005030 int len, const void *val)
Gregory Haskins2eeb2e92007-05-31 14:08:53 -04005031{
Cornelia Huck126a5af2013-07-03 16:30:53 +02005032 struct kvm_io_bus *bus;
5033 struct kvm_io_range range;
5034 int r;
5035
5036 range = (struct kvm_io_range) {
5037 .addr = addr,
5038 .len = len,
5039 };
5040
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00005041 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
David Hildenbrand90db1042017-03-23 18:24:19 +01005042 if (!bus)
5043 return -ENOMEM;
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00005044 r = __kvm_io_bus_write(vcpu, bus, &range, val);
Cornelia Huck126a5af2013-07-03 16:30:53 +02005045 return r < 0 ? r : 0;
5046}
Leo Yana2420102019-02-22 16:10:09 +08005047EXPORT_SYMBOL_GPL(kvm_io_bus_write);
Cornelia Huck126a5af2013-07-03 16:30:53 +02005048
5049/* kvm_io_bus_write_cookie - called under kvm->slots_lock */
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00005050int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
5051 gpa_t addr, int len, const void *val, long cookie)
Cornelia Huck126a5af2013-07-03 16:30:53 +02005052{
Lai Jiangshan90d83dc2010-04-19 17:41:23 +08005053 struct kvm_io_bus *bus;
Sasha Levin743eeb02011-07-27 16:00:48 +03005054 struct kvm_io_range range;
5055
5056 range = (struct kvm_io_range) {
5057 .addr = addr,
5058 .len = len,
5059 };
Lai Jiangshan90d83dc2010-04-19 17:41:23 +08005060
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00005061 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
David Hildenbrand90db1042017-03-23 18:24:19 +01005062 if (!bus)
5063 return -ENOMEM;
Cornelia Huck126a5af2013-07-03 16:30:53 +02005064
5065 /* First try the device referenced by cookie. */
5066 if ((cookie >= 0) && (cookie < bus->dev_count) &&
Paolo Bonzinic21fbff2013-08-27 15:41:41 +02005067 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00005068 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
Cornelia Huck126a5af2013-07-03 16:30:53 +02005069 val))
5070 return cookie;
5071
5072 /*
5073 * cookie contained garbage; fall back to search and return the
5074 * correct cookie value.
5075 */
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00005076 return __kvm_io_bus_write(vcpu, bus, &range, val);
Cornelia Huck126a5af2013-07-03 16:30:53 +02005077}
5078
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00005079static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5080 struct kvm_io_range *range, void *val)
Cornelia Huck126a5af2013-07-03 16:30:53 +02005081{
5082 int idx;
5083
5084 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
Sasha Levin743eeb02011-07-27 16:00:48 +03005085 if (idx < 0)
5086 return -EOPNOTSUPP;
5087
5088 while (idx < bus->dev_count &&
Paolo Bonzinic21fbff2013-08-27 15:41:41 +02005089 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00005090 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
Cornelia Huck126a5af2013-07-03 16:30:53 +02005091 range->len, val))
5092 return idx;
Sasha Levin743eeb02011-07-27 16:00:48 +03005093 idx++;
5094 }
5095
Michael S. Tsirkinbda90202009-06-29 22:24:32 +03005096 return -EOPNOTSUPP;
5097}
Gregory Haskins2eeb2e92007-05-31 14:08:53 -04005098
Michael S. Tsirkinbda90202009-06-29 22:24:32 +03005099/* kvm_io_bus_read - called under kvm->slots_lock */
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00005100int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
Marcelo Tosattie93f8a02009-12-23 14:35:24 -02005101 int len, void *val)
Michael S. Tsirkinbda90202009-06-29 22:24:32 +03005102{
Cornelia Huck126a5af2013-07-03 16:30:53 +02005103 struct kvm_io_bus *bus;
5104 struct kvm_io_range range;
5105 int r;
5106
5107 range = (struct kvm_io_range) {
5108 .addr = addr,
5109 .len = len,
5110 };
5111
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00005112 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
David Hildenbrand90db1042017-03-23 18:24:19 +01005113 if (!bus)
5114 return -ENOMEM;
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00005115 r = __kvm_io_bus_read(vcpu, bus, &range, val);
Cornelia Huck126a5af2013-07-03 16:30:53 +02005116 return r < 0 ? r : 0;
5117}
5118
Marcelo Tosatti79fac952009-12-23 14:35:26 -02005119/* Caller must hold slots_lock. */
Sasha Levin743eeb02011-07-27 16:00:48 +03005120int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
5121 int len, struct kvm_io_device *dev)
Michael S. Tsirkin6c474692009-06-29 22:24:26 +03005122{
Gal Hammerd4c67a72018-01-16 15:34:41 +02005123 int i;
Marcelo Tosattie93f8a02009-12-23 14:35:24 -02005124 struct kvm_io_bus *new_bus, *bus;
Gal Hammerd4c67a72018-01-16 15:34:41 +02005125 struct kvm_io_range range;
Gregory Haskins090b7af2009-07-07 17:08:44 -04005126
Christian Borntraeger4a12f952017-07-07 10:51:38 +02005127 bus = kvm_get_bus(kvm, bus_idx);
David Hildenbrand90db1042017-03-23 18:24:19 +01005128 if (!bus)
5129 return -ENOMEM;
5130
Amos Kong6ea34c92013-05-25 06:44:15 +08005131 /* exclude ioeventfd which is limited by maximum fd */
5132 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
Gregory Haskins090b7af2009-07-07 17:08:44 -04005133 return -ENOSPC;
5134
Gustavo A. R. Silva90952cd2019-01-30 17:07:47 +01005135 new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
Ben Gardonb12ce362019-02-11 11:02:49 -08005136 GFP_KERNEL_ACCOUNT);
Marcelo Tosattie93f8a02009-12-23 14:35:24 -02005137 if (!new_bus)
5138 return -ENOMEM;
Gal Hammerd4c67a72018-01-16 15:34:41 +02005139
5140 range = (struct kvm_io_range) {
5141 .addr = addr,
5142 .len = len,
5143 .dev = dev,
5144 };
5145
5146 for (i = 0; i < bus->dev_count; i++)
5147 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
5148 break;
5149
5150 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
5151 new_bus->dev_count++;
5152 new_bus->range[i] = range;
5153 memcpy(new_bus->range + i + 1, bus->range + i,
5154 (bus->dev_count - i) * sizeof(struct kvm_io_range));
Marcelo Tosattie93f8a02009-12-23 14:35:24 -02005155 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5156 synchronize_srcu_expedited(&kvm->srcu);
5157 kfree(bus);
Gregory Haskins090b7af2009-07-07 17:08:44 -04005158
5159 return 0;
5160}
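/*
 * Illustrative registration sketch (the foo_* names are hypothetical, not
 * from this file): a device embeds a struct kvm_io_device, points it at its
 * kvm_io_device_ops and attaches itself while holding kvm->slots_lock:
 *
 *	static const struct kvm_io_device_ops foo_mmio_ops = {
 *		.read  = foo_mmio_read,
 *		.write = foo_mmio_write,
 *	};
 *
 *	kvm_iodevice_init(&foo->dev, &foo_mmio_ops);
 *	mutex_lock(&kvm->slots_lock);
 *	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, gpa, len, &foo->dev);
 *	mutex_unlock(&kvm->slots_lock);
 */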
5161
Sean Christopherson5d3c4c7932021-04-12 15:20:49 -07005162int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5163 struct kvm_io_device *dev)
Gregory Haskins090b7af2009-07-07 17:08:44 -04005164{
Rustam Kovhaevf6588662020-09-07 11:55:35 -07005165 int i, j;
Marcelo Tosattie93f8a02009-12-23 14:35:24 -02005166 struct kvm_io_bus *new_bus, *bus;
Michael S. Tsirkin6c474692009-06-29 22:24:26 +03005167
Sean Christopherson7c896d32021-04-12 15:20:50 -07005168 lockdep_assert_held(&kvm->slots_lock);
5169
Christian Borntraeger4a12f952017-07-07 10:51:38 +02005170 bus = kvm_get_bus(kvm, bus_idx);
Peter Xudf630b82017-03-15 16:01:17 +08005171 if (!bus)
Sean Christopherson5d3c4c7932021-04-12 15:20:49 -07005172 return 0;
Peter Xudf630b82017-03-15 16:01:17 +08005173
Sean Christopherson7c896d32021-04-12 15:20:50 -07005174 for (i = 0; i < bus->dev_count; i++) {
Amos Konga13007162012-03-09 12:17:32 +08005175 if (bus->range[i].dev == dev) {
Gregory Haskins090b7af2009-07-07 17:08:44 -04005176 break;
5177 }
Sean Christopherson7c896d32021-04-12 15:20:50 -07005178 }
Marcelo Tosattie93f8a02009-12-23 14:35:24 -02005179
David Hildenbrand90db1042017-03-23 18:24:19 +01005180 if (i == bus->dev_count)
Sean Christopherson5d3c4c7932021-04-12 15:20:49 -07005181 return 0;
Amos Konga13007162012-03-09 12:17:32 +08005182
Gustavo A. R. Silva90952cd2019-01-30 17:07:47 +01005183 new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
Ben Gardonb12ce362019-02-11 11:02:49 -08005184 GFP_KERNEL_ACCOUNT);
Rustam Kovhaevf6588662020-09-07 11:55:35 -07005185 if (new_bus) {
Rustam Kovhaev871c4332020-09-18 05:05:00 -07005186 memcpy(new_bus, bus, struct_size(bus, range, i));
Rustam Kovhaevf6588662020-09-07 11:55:35 -07005187 new_bus->dev_count--;
5188 memcpy(new_bus->range + i, bus->range + i + 1,
Rustam Kovhaev871c4332020-09-18 05:05:00 -07005189 flex_array_size(new_bus, range, new_bus->dev_count - i));
Sean Christopherson2ee37572021-04-12 15:20:48 -07005190 }
5191
5192 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5193 synchronize_srcu_expedited(&kvm->srcu);
5194
5195 /* Destroy the old bus _after_ installing the (null) bus. */
5196 if (!new_bus) {
David Hildenbrand90db1042017-03-23 18:24:19 +01005197 pr_err("kvm: failed to shrink bus, removing it completely\n");
Rustam Kovhaevf6588662020-09-07 11:55:35 -07005198 for (j = 0; j < bus->dev_count; j++) {
5199 if (j == i)
5200 continue;
5201 kvm_iodevice_destructor(bus->range[j].dev);
5202 }
David Hildenbrand90db1042017-03-23 18:24:19 +01005203 }
Amos Konga13007162012-03-09 12:17:32 +08005204
Marcelo Tosattie93f8a02009-12-23 14:35:24 -02005205 kfree(bus);
Sean Christopherson5d3c4c7932021-04-12 15:20:49 -07005206 return new_bus ? 0 : -ENOMEM;
Gregory Haskins2eeb2e92007-05-31 14:08:53 -04005207}
5208
Andre Przywara8a39d002016-07-15 12:43:26 +01005209struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5210 gpa_t addr)
5211{
5212 struct kvm_io_bus *bus;
5213 int dev_idx, srcu_idx;
5214 struct kvm_io_device *iodev = NULL;
5215
5216 srcu_idx = srcu_read_lock(&kvm->srcu);
5217
5218 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
David Hildenbrand90db1042017-03-23 18:24:19 +01005219 if (!bus)
5220 goto out_unlock;
Andre Przywara8a39d002016-07-15 12:43:26 +01005221
5222 dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
5223 if (dev_idx < 0)
5224 goto out_unlock;
5225
5226 iodev = bus->range[dev_idx].dev;
5227
5228out_unlock:
5229 srcu_read_unlock(&kvm->srcu, srcu_idx);
5230
5231 return iodev;
5232}
5233EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
5234
Janosch Frank536a6f82016-05-18 13:26:23 +02005235static int kvm_debugfs_open(struct inode *inode, struct file *file,
5236 int (*get)(void *, u64 *), int (*set)(void *, u64),
5237 const char *fmt)
5238{
5239 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
5240 inode->i_private;
5241
Peter Xu605c7132021-06-25 11:32:07 -04005242 /*
5243 * The debugfs files are a reference to the kvm struct which
5244 * is still valid when kvm_destroy_vm is called. kvm_get_kvm_safe
5245 * avoids the race between open and the removal of the debugfs directory.
Janosch Frank536a6f82016-05-18 13:26:23 +02005246 */
Peter Xu605c7132021-06-25 11:32:07 -04005247 if (!kvm_get_kvm_safe(stat_data->kvm))
Janosch Frank536a6f82016-05-18 13:26:23 +02005248 return -ENOENT;
5249
Paolo Bonzini833b45d2019-09-30 18:48:44 +02005250 if (simple_attr_open(inode, file, get,
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005251 kvm_stats_debugfs_mode(stat_data->desc) & 0222
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005252 ? set : NULL,
5253 fmt)) {
Janosch Frank536a6f82016-05-18 13:26:23 +02005254 kvm_put_kvm(stat_data->kvm);
5255 return -ENOMEM;
5256 }
5257
5258 return 0;
5259}
5260
5261static int kvm_debugfs_release(struct inode *inode, struct file *file)
5262{
5263 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
5264 inode->i_private;
5265
5266 simple_attr_release(inode, file);
5267 kvm_put_kvm(stat_data->kvm);
5268
5269 return 0;
5270}
5271
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005272static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
Janosch Frank536a6f82016-05-18 13:26:23 +02005273{
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005274 *val = *(u64 *)((void *)(&kvm->stat) + offset);
Janosch Frank536a6f82016-05-18 13:26:23 +02005275
5276 return 0;
5277}
5278
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005279static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005280{
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005281 *(u64 *)((void *)(&kvm->stat) + offset) = 0;
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005282
5283 return 0;
5284}
5285
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005286static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
Janosch Frank536a6f82016-05-18 13:26:23 +02005287{
Marc Zyngier46808a42021-11-16 16:04:02 +00005288 unsigned long i;
Janosch Frank536a6f82016-05-18 13:26:23 +02005289 struct kvm_vcpu *vcpu;
5290
5291 *val = 0;
5292
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005293 kvm_for_each_vcpu(i, vcpu, kvm)
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005294 *val += *(u64 *)((void *)(&vcpu->stat) + offset);
Janosch Frank536a6f82016-05-18 13:26:23 +02005295
5296 return 0;
5297}
5298
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005299static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005300{
Marc Zyngier46808a42021-11-16 16:04:02 +00005301 unsigned long i;
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005302 struct kvm_vcpu *vcpu;
5303
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005304 kvm_for_each_vcpu(i, vcpu, kvm)
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005305 *(u64 *)((void *)(&vcpu->stat) + offset) = 0;
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005306
5307 return 0;
5308}
5309
5310static int kvm_stat_data_get(void *data, u64 *val)
5311{
5312 int r = -EFAULT;
5313 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
5314
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005315 switch (stat_data->kind) {
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005316 case KVM_STAT_VM:
5317 r = kvm_get_stat_per_vm(stat_data->kvm,
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005318 stat_data->desc->desc.offset, val);
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005319 break;
5320 case KVM_STAT_VCPU:
5321 r = kvm_get_stat_per_vcpu(stat_data->kvm,
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005322 stat_data->desc->desc.offset, val);
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005323 break;
5324 }
5325
5326 return r;
5327}
5328
5329static int kvm_stat_data_clear(void *data, u64 val)
5330{
5331 int r = -EFAULT;
5332 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
5333
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005334 if (val)
5335 return -EINVAL;
5336
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005337 switch (stat_data->kind) {
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005338 case KVM_STAT_VM:
5339 r = kvm_clear_stat_per_vm(stat_data->kvm,
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005340 stat_data->desc->desc.offset);
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005341 break;
5342 case KVM_STAT_VCPU:
5343 r = kvm_clear_stat_per_vcpu(stat_data->kvm,
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005344 stat_data->desc->desc.offset);
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005345 break;
5346 }
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005347
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005348 return r;
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005349}
5350
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005351static int kvm_stat_data_open(struct inode *inode, struct file *file)
Janosch Frank536a6f82016-05-18 13:26:23 +02005352{
5353 __simple_attr_check_format("%llu\n", 0ull);
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005354 return kvm_debugfs_open(inode, file, kvm_stat_data_get,
5355 kvm_stat_data_clear, "%llu\n");
Janosch Frank536a6f82016-05-18 13:26:23 +02005356}
5357
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005358static const struct file_operations stat_fops_per_vm = {
5359 .owner = THIS_MODULE,
5360 .open = kvm_stat_data_open,
Janosch Frank536a6f82016-05-18 13:26:23 +02005361 .release = kvm_debugfs_release,
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005362 .read = simple_attr_read,
5363 .write = simple_attr_write,
5364 .llseek = no_llseek,
Janosch Frank536a6f82016-05-18 13:26:23 +02005365};
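/*
 * These fops back the per-VM debugfs stat files installed by
 * kvm_create_vm_debugfs(): kvm_stat_data_get()/kvm_stat_data_clear()
 * dispatch on stat_data->kind and either access the VM-wide counter
 * directly or sum (respectively zero) the counter across all vCPUs of
 * that VM.
 */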
5366
Christoph Hellwig8b88b092008-02-08 04:20:26 -08005367static int vm_stat_get(void *_offset, u64 *val)
Avi Kivityba1389b2007-11-18 16:24:12 +02005368{
5369 unsigned offset = (long)_offset;
Avi Kivityba1389b2007-11-18 16:24:12 +02005370 struct kvm *kvm;
Janosch Frank536a6f82016-05-18 13:26:23 +02005371 u64 tmp_val;
Avi Kivityba1389b2007-11-18 16:24:12 +02005372
Christoph Hellwig8b88b092008-02-08 04:20:26 -08005373 *val = 0;
Junaid Shahid0d9ce162019-01-03 17:14:28 -08005374 mutex_lock(&kvm_lock);
Janosch Frank536a6f82016-05-18 13:26:23 +02005375 list_for_each_entry(kvm, &vm_list, vm_list) {
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005376 kvm_get_stat_per_vm(kvm, offset, &tmp_val);
Janosch Frank536a6f82016-05-18 13:26:23 +02005377 *val += tmp_val;
5378 }
Junaid Shahid0d9ce162019-01-03 17:14:28 -08005379 mutex_unlock(&kvm_lock);
Christoph Hellwig8b88b092008-02-08 04:20:26 -08005380 return 0;
Avi Kivityba1389b2007-11-18 16:24:12 +02005381}
5382
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005383static int vm_stat_clear(void *_offset, u64 val)
5384{
5385 unsigned offset = (long)_offset;
5386 struct kvm *kvm;
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005387
5388 if (val)
5389 return -EINVAL;
5390
Junaid Shahid0d9ce162019-01-03 17:14:28 -08005391 mutex_lock(&kvm_lock);
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005392 list_for_each_entry(kvm, &vm_list, vm_list) {
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005393 kvm_clear_stat_per_vm(kvm, offset);
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005394 }
Junaid Shahid0d9ce162019-01-03 17:14:28 -08005395 mutex_unlock(&kvm_lock);
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005396
5397 return 0;
5398}
5399
5400DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005401DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
Avi Kivityba1389b2007-11-18 16:24:12 +02005402
Christoph Hellwig8b88b092008-02-08 04:20:26 -08005403static int vcpu_stat_get(void *_offset, u64 *val)
Avi Kivity1165f5f2007-04-19 17:27:43 +03005404{
5405 unsigned offset = (long)_offset;
Avi Kivity1165f5f2007-04-19 17:27:43 +03005406 struct kvm *kvm;
Janosch Frank536a6f82016-05-18 13:26:23 +02005407 u64 tmp_val;
Avi Kivity1165f5f2007-04-19 17:27:43 +03005408
Christoph Hellwig8b88b092008-02-08 04:20:26 -08005409 *val = 0;
Junaid Shahid0d9ce162019-01-03 17:14:28 -08005410 mutex_lock(&kvm_lock);
Janosch Frank536a6f82016-05-18 13:26:23 +02005411 list_for_each_entry(kvm, &vm_list, vm_list) {
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005412 kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
Janosch Frank536a6f82016-05-18 13:26:23 +02005413 *val += tmp_val;
5414 }
Junaid Shahid0d9ce162019-01-03 17:14:28 -08005415 mutex_unlock(&kvm_lock);
Christoph Hellwig8b88b092008-02-08 04:20:26 -08005416 return 0;
Avi Kivity1165f5f2007-04-19 17:27:43 +03005417}
5418
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005419static int vcpu_stat_clear(void *_offset, u64 val)
5420{
5421 unsigned offset = (long)_offset;
5422 struct kvm *kvm;
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005423
5424 if (val)
5425 return -EINVAL;
5426
Junaid Shahid0d9ce162019-01-03 17:14:28 -08005427 mutex_lock(&kvm_lock);
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005428 list_for_each_entry(kvm, &vm_list, vm_list) {
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005429 kvm_clear_stat_per_vcpu(kvm, offset);
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005430 }
Junaid Shahid0d9ce162019-01-03 17:14:28 -08005431 mutex_unlock(&kvm_lock);
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005432
5433 return 0;
5434}
5435
5436DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
5437 "%llu\n");
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005438DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
Avi Kivity1165f5f2007-04-19 17:27:43 +03005439
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02005440static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
5441{
5442 struct kobj_uevent_env *env;
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02005443 unsigned long long created, active;
5444
5445 if (!kvm_dev.this_device || !kvm)
5446 return;
5447
Junaid Shahid0d9ce162019-01-03 17:14:28 -08005448 mutex_lock(&kvm_lock);
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02005449 if (type == KVM_EVENT_CREATE_VM) {
5450 kvm_createvm_count++;
5451 kvm_active_vms++;
5452 } else if (type == KVM_EVENT_DESTROY_VM) {
5453 kvm_active_vms--;
5454 }
5455 created = kvm_createvm_count;
5456 active = kvm_active_vms;
Junaid Shahid0d9ce162019-01-03 17:14:28 -08005457 mutex_unlock(&kvm_lock);
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02005458
Ben Gardonb12ce362019-02-11 11:02:49 -08005459 env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02005460 if (!env)
5461 return;
5462
5463 add_uevent_var(env, "CREATED=%llu", created);
5464 add_uevent_var(env, "COUNT=%llu", active);
5465
Claudio Imbrendafdeaf7e2017-07-24 13:40:03 +02005466 if (type == KVM_EVENT_CREATE_VM) {
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02005467 add_uevent_var(env, "EVENT=create");
Claudio Imbrendafdeaf7e2017-07-24 13:40:03 +02005468 kvm->userspace_pid = task_pid_nr(current);
5469 } else if (type == KVM_EVENT_DESTROY_VM) {
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02005470 add_uevent_var(env, "EVENT=destroy");
Claudio Imbrendafdeaf7e2017-07-24 13:40:03 +02005471 }
5472 add_uevent_var(env, "PID=%d", kvm->userspace_pid);
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02005473
Paolo Bonzini85cd39a2021-08-04 05:28:52 -04005474 if (kvm->debugfs_dentry) {
Ben Gardonb12ce362019-02-11 11:02:49 -08005475 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02005476
Claudio Imbrendafdeaf7e2017-07-24 13:40:03 +02005477 if (p) {
5478 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
5479 if (!IS_ERR(tmp))
5480 add_uevent_var(env, "STATS_PATH=%s", tmp);
5481 kfree(p);
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02005482 }
5483 }
5484	 /* no need for checks, since we add at most 5 keys */
5485 env->envp[env->envp_idx++] = NULL;
5486 kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
5487 kfree(env);
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02005488}
5489
Greg Kroah-Hartman929f45e2018-05-29 18:22:04 +02005490static void kvm_init_debug(void)
Avi Kivity6aa8b732006-12-10 02:21:36 -08005491{
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005492 const struct file_operations *fops;
5493 const struct _kvm_stats_desc *pdesc;
5494 int i;
Avi Kivity6aa8b732006-12-10 02:21:36 -08005495
Hollis Blanchard76f7c872008-04-15 16:05:42 -05005496 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
Hamo4f69b682011-12-15 14:23:16 +08005497
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005498 for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
5499 pdesc = &kvm_vm_stats_desc[i];
5500 if (kvm_stats_debugfs_mode(pdesc) & 0222)
5501 fops = &vm_stat_fops;
5502 else
5503 fops = &vm_stat_readonly_fops;
5504 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5505 kvm_debugfs_dir,
5506 (void *)(long)pdesc->desc.offset, fops);
5507 }
5508
5509 for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
5510 pdesc = &kvm_vcpu_stats_desc[i];
5511 if (kvm_stats_debugfs_mode(pdesc) & 0222)
5512 fops = &vcpu_stat_fops;
5513 else
5514 fops = &vcpu_stat_readonly_fops;
5515 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5516 kvm_debugfs_dir,
5517 (void *)(long)pdesc->desc.offset, fops);
Hamo4f69b682011-12-15 14:23:16 +08005518 }
Avi Kivity6aa8b732006-12-10 02:21:36 -08005519}
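/*
 * The files created above are the system-wide aggregates, normally visible
 * under /sys/kernel/debug/kvm/ once debugfs is mounted; the matching per-VM
 * files backed by stat_fops_per_vm are created by kvm_create_vm_debugfs().
 */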
5520
Rafael J. Wysockifb3600c2011-03-23 22:16:23 +01005521static int kvm_suspend(void)
Avi Kivity59ae6c62007-02-12 00:54:48 -08005522{
Alexander Graf10474ae2009-09-15 11:37:46 +02005523 if (kvm_usage_count)
Takuya Yoshikawa75b71272010-11-16 17:37:41 +09005524 hardware_disable_nolock(NULL);
Avi Kivity59ae6c62007-02-12 00:54:48 -08005525 return 0;
5526}
5527
Rafael J. Wysockifb3600c2011-03-23 22:16:23 +01005528static void kvm_resume(void)
Avi Kivity59ae6c62007-02-12 00:54:48 -08005529{
Zachary Amsdenca84d1a2010-08-19 22:07:28 -10005530 if (kvm_usage_count) {
Wanpeng Li2eb06c32019-05-17 16:49:49 +08005531#ifdef CONFIG_LOCKDEP
5532 WARN_ON(lockdep_is_held(&kvm_count_lock));
5533#endif
Takuya Yoshikawa75b71272010-11-16 17:37:41 +09005534 hardware_enable_nolock(NULL);
Zachary Amsdenca84d1a2010-08-19 22:07:28 -10005535 }
Avi Kivity59ae6c62007-02-12 00:54:48 -08005536}
5537
Rafael J. Wysockifb3600c2011-03-23 22:16:23 +01005538static struct syscore_ops kvm_syscore_ops = {
Avi Kivity59ae6c62007-02-12 00:54:48 -08005539 .suspend = kvm_suspend,
5540 .resume = kvm_resume,
5541};
5542
Avi Kivity15ad7142007-07-11 18:17:21 +03005543static inline
5544struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
5545{
5546 return container_of(pn, struct kvm_vcpu, preempt_notifier);
5547}
5548
5549static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
5550{
5551 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
Xiubo Lif95ef0cd2015-02-26 14:58:23 +08005552
Wanpeng Li046ddee2019-08-01 11:30:14 +08005553 WRITE_ONCE(vcpu->preempted, false);
Wanpeng Lid73eb572019-07-18 19:39:06 +08005554 WRITE_ONCE(vcpu->ready, false);
Avi Kivity15ad7142007-07-11 18:17:21 +03005555
Paolo Bonzini7495e222020-01-09 09:57:19 -05005556 __this_cpu_write(kvm_running_vcpu, vcpu);
Radim Krčmáře790d9e2014-08-21 18:08:05 +02005557 kvm_arch_sched_in(vcpu, cpu);
Zhang Xiantaoe9b11c12007-11-14 20:38:21 +08005558 kvm_arch_vcpu_load(vcpu, cpu);
Avi Kivity15ad7142007-07-11 18:17:21 +03005559}
5560
5561static void kvm_sched_out(struct preempt_notifier *pn,
5562 struct task_struct *next)
5563{
5564 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5565
Peter Zijlstra3ba9f932021-06-11 10:28:13 +02005566 if (current->on_rq) {
Wanpeng Li046ddee2019-08-01 11:30:14 +08005567 WRITE_ONCE(vcpu->preempted, true);
Wanpeng Lid73eb572019-07-18 19:39:06 +08005568 WRITE_ONCE(vcpu->ready, true);
5569 }
Zhang Xiantaoe9b11c12007-11-14 20:38:21 +08005570 kvm_arch_vcpu_put(vcpu);
Paolo Bonzini7495e222020-01-09 09:57:19 -05005571 __this_cpu_write(kvm_running_vcpu, NULL);
5572}
5573
5574/**
5575 * kvm_get_running_vcpu - get the vcpu running on the current CPU.
Marc Zyngier1f03b2b2020-02-07 16:34:10 +00005576 *
5577 * We can disable preemption locally around accessing the per-CPU variable,
5578 * and use the resolved vcpu pointer after enabling preemption again,
5579 * because even if the current thread is migrated to another CPU, reading
5580 * the per-CPU value later will give us the same value as we update the
5581 * per-CPU variable in the preempt notifier handlers.
Paolo Bonzini7495e222020-01-09 09:57:19 -05005582 */
5583struct kvm_vcpu *kvm_get_running_vcpu(void)
5584{
Marc Zyngier1f03b2b2020-02-07 16:34:10 +00005585 struct kvm_vcpu *vcpu;
5586
5587 preempt_disable();
5588 vcpu = __this_cpu_read(kvm_running_vcpu);
5589 preempt_enable();
5590
5591 return vcpu;
Paolo Bonzini7495e222020-01-09 09:57:19 -05005592}
Wanpeng Li379a3c82020-04-28 14:23:27 +08005593EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
Paolo Bonzini7495e222020-01-09 09:57:19 -05005594
5595/**
5596 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
5597 */
5598struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
5599{
5600 return &kvm_running_vcpu;
Avi Kivity15ad7142007-07-11 18:17:21 +03005601}
5602
Sean Christophersone1bfc242021-11-11 02:07:33 +00005603#ifdef CONFIG_GUEST_PERF_EVENTS
5604static unsigned int kvm_guest_state(void)
5605{
5606 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
5607 unsigned int state;
5608
5609 if (!kvm_arch_pmi_in_guest(vcpu))
5610 return 0;
5611
5612 state = PERF_GUEST_ACTIVE;
5613 if (!kvm_arch_vcpu_in_kernel(vcpu))
5614 state |= PERF_GUEST_USER;
5615
5616 return state;
5617}
5618
5619static unsigned long kvm_guest_get_ip(void)
5620{
5621 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
5622
5623 /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */
5624 if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)))
5625 return 0;
5626
5627 return kvm_arch_vcpu_get_ip(vcpu);
5628}
5629
5630static struct perf_guest_info_callbacks kvm_guest_cbs = {
5631 .state = kvm_guest_state,
5632 .get_ip = kvm_guest_get_ip,
5633 .handle_intel_pt_intr = NULL,
5634};
5635
5636void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void))
5637{
5638 kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
5639 perf_register_guest_info_callbacks(&kvm_guest_cbs);
5640}
5641void kvm_unregister_perf_callbacks(void)
5642{
5643 perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
5644}
5645#endif
5646
Sean Christophersonb9904082020-03-21 13:25:55 -07005647struct kvm_cpu_compat_check {
5648 void *opaque;
5649 int *ret;
5650};
5651
5652static void check_processor_compat(void *data)
Sean Christophersonf257d6d2019-04-19 22:18:17 -07005653{
Sean Christophersonb9904082020-03-21 13:25:55 -07005654 struct kvm_cpu_compat_check *c = data;
5655
5656 *c->ret = kvm_arch_check_processor_compat(c->opaque);
Sean Christophersonf257d6d2019-04-19 22:18:17 -07005657}
5658
Avi Kivity0ee75be2010-04-28 15:39:01 +03005659int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
Rusty Russellc16f8622007-07-30 21:12:19 +10005660 struct module *module)
Avi Kivity6aa8b732006-12-10 02:21:36 -08005661{
Sean Christophersonb9904082020-03-21 13:25:55 -07005662 struct kvm_cpu_compat_check c;
Avi Kivity6aa8b732006-12-10 02:21:36 -08005663 int r;
Yang, Sheng002c7f72007-07-31 14:23:01 +03005664 int cpu;
Avi Kivity6aa8b732006-12-10 02:21:36 -08005665
Zhang Xiantaof8c16bb2007-11-14 20:40:21 +08005666 r = kvm_arch_init(opaque);
5667 if (r)
Zhang Xiantaod23087842007-11-29 15:35:39 +08005668 goto out_fail;
Zhang Xiantaocb498ea2007-11-14 20:39:31 +08005669
Asias He7dac16c2013-05-08 10:57:29 +08005670 /*
5671 * kvm_arch_init makes sure there's at most one caller
5672 * for architectures that support multiple implementations,
5673	 * like Intel and AMD on x86.
Paolo Bonzini36343f62016-10-26 13:35:56 +02005674 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
5675	 * conflicts in case kvm is already set up for another implementation.
Asias He7dac16c2013-05-08 10:57:29 +08005676 */
Paolo Bonzini36343f62016-10-26 13:35:56 +02005677 r = kvm_irqfd_init();
5678 if (r)
5679 goto out_irqfd;
Asias He7dac16c2013-05-08 10:57:29 +08005680
Avi Kivity8437a6172009-06-06 14:52:35 -07005681 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
Rusty Russell7f59f492008-12-07 21:25:45 +10305682 r = -ENOMEM;
5683 goto out_free_0;
5684 }
5685
Sean Christophersonb9904082020-03-21 13:25:55 -07005686 r = kvm_arch_hardware_setup(opaque);
Avi Kivity6aa8b732006-12-10 02:21:36 -08005687 if (r < 0)
Miaohe Linfaf0be22019-11-23 10:45:50 +08005688 goto out_free_1;
Avi Kivity6aa8b732006-12-10 02:21:36 -08005689
Sean Christophersonb9904082020-03-21 13:25:55 -07005690 c.ret = &r;
5691 c.opaque = opaque;
Yang, Sheng002c7f72007-07-31 14:23:01 +03005692 for_each_online_cpu(cpu) {
Sean Christophersonb9904082020-03-21 13:25:55 -07005693 smp_call_function_single(cpu, check_processor_compat, &c, 1);
Yang, Sheng002c7f72007-07-31 14:23:01 +03005694 if (r < 0)
Miaohe Linfaf0be22019-11-23 10:45:50 +08005695 goto out_free_2;
Yang, Sheng002c7f72007-07-31 14:23:01 +03005696 }
5697
Thomas Gleixner73c1b412016-12-21 20:19:54 +01005698 r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
Thomas Gleixner8c18b2d2016-07-13 17:16:37 +00005699 kvm_starting_cpu, kvm_dying_cpu);
Avi Kivity774c47f2007-02-12 00:54:47 -08005700 if (r)
Zhang Xiantaod23087842007-11-29 15:35:39 +08005701 goto out_free_2;
Avi Kivity6aa8b732006-12-10 02:21:36 -08005702 register_reboot_notifier(&kvm_reboot_notifier);
5703
Rusty Russellc16f8622007-07-30 21:12:19 +10005704 /* A kmem cache lets us meet the alignment requirements of fx_save. */
Avi Kivity0ee75be2010-04-28 15:39:01 +03005705 if (!vcpu_align)
5706 vcpu_align = __alignof__(struct kvm_vcpu);
Paolo Bonzini46515732017-10-26 15:45:46 +02005707 kvm_vcpu_cache =
5708 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
5709 SLAB_ACCOUNT,
5710 offsetof(struct kvm_vcpu, arch),
Jing Zhangce55c042021-06-18 22:27:06 +00005711 offsetofend(struct kvm_vcpu, stats_id)
5712 - offsetof(struct kvm_vcpu, arch),
Paolo Bonzini46515732017-10-26 15:45:46 +02005713 NULL);
Rusty Russellc16f8622007-07-30 21:12:19 +10005714 if (!kvm_vcpu_cache) {
5715 r = -ENOMEM;
Rafael J. Wysockifb3600c2011-03-23 22:16:23 +01005716 goto out_free_3;
Rusty Russellc16f8622007-07-30 21:12:19 +10005717 }
5718
Vitaly Kuznetsovbaff59c2021-09-03 09:51:40 +02005719 for_each_possible_cpu(cpu) {
5720 if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
5721 GFP_KERNEL, cpu_to_node(cpu))) {
5722 r = -ENOMEM;
5723 goto out_free_4;
5724 }
5725 }
5726
Gleb Natapovaf585b92010-10-14 11:22:46 +02005727 r = kvm_async_pf_init();
5728 if (r)
Vitaly Kuznetsovbaff59c2021-09-03 09:51:40 +02005729 goto out_free_5;
Gleb Natapovaf585b92010-10-14 11:22:46 +02005730
Avi Kivity6aa8b732006-12-10 02:21:36 -08005731 kvm_chardev_ops.owner = module;
Christian Borntraeger3d3aab12008-12-02 11:17:32 +01005732 kvm_vm_fops.owner = module;
5733 kvm_vcpu_fops.owner = module;
Avi Kivity6aa8b732006-12-10 02:21:36 -08005734
5735 r = misc_register(&kvm_dev);
5736 if (r) {
Xiubo Li1170adc2015-02-26 14:58:26 +08005737 pr_err("kvm: misc device register failed\n");
Gleb Natapovaf585b92010-10-14 11:22:46 +02005738 goto out_unreg;
Avi Kivity6aa8b732006-12-10 02:21:36 -08005739 }
5740
Rafael J. Wysockifb3600c2011-03-23 22:16:23 +01005741 register_syscore_ops(&kvm_syscore_ops);
5742
Avi Kivity15ad7142007-07-11 18:17:21 +03005743 kvm_preempt_ops.sched_in = kvm_sched_in;
5744 kvm_preempt_ops.sched_out = kvm_sched_out;
5745
Greg Kroah-Hartman929f45e2018-05-29 18:22:04 +02005746 kvm_init_debug();
Darrick J. Wong0ea4ed82009-10-14 16:21:00 -07005747
Paolo Bonzini3c3c29f2014-09-24 13:02:46 +02005748 r = kvm_vfio_ops_init();
5749 WARN_ON(r);
5750
Avi Kivityc7addb92007-09-16 18:58:32 +02005751 return 0;
Avi Kivity6aa8b732006-12-10 02:21:36 -08005752
Gleb Natapovaf585b92010-10-14 11:22:46 +02005753out_unreg:
5754 kvm_async_pf_deinit();
Vitaly Kuznetsovbaff59c2021-09-03 09:51:40 +02005755out_free_5:
5756 for_each_possible_cpu(cpu)
5757 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
5758out_free_4:
Rusty Russellc16f8622007-07-30 21:12:19 +10005759 kmem_cache_destroy(kvm_vcpu_cache);
Zhang Xiantaod23087842007-11-29 15:35:39 +08005760out_free_3:
Avi Kivity6aa8b732006-12-10 02:21:36 -08005761 unregister_reboot_notifier(&kvm_reboot_notifier);
Thomas Gleixner8c18b2d2016-07-13 17:16:37 +00005762 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
Zhang Xiantaod23087842007-11-29 15:35:39 +08005763out_free_2:
Zhang Xiantaoe9b11c12007-11-14 20:38:21 +08005764 kvm_arch_hardware_unsetup();
Miaohe Linfaf0be22019-11-23 10:45:50 +08005765out_free_1:
Rusty Russell7f59f492008-12-07 21:25:45 +10305766 free_cpumask_var(cpus_hardware_enabled);
Zhang Xiantaod23087842007-11-29 15:35:39 +08005767out_free_0:
Cornelia Hucka0f155e2013-02-28 12:33:18 +01005768 kvm_irqfd_exit();
Paolo Bonzini36343f62016-10-26 13:35:56 +02005769out_irqfd:
Asias He7dac16c2013-05-08 10:57:29 +08005770 kvm_arch_exit();
5771out_fail:
Avi Kivity6aa8b732006-12-10 02:21:36 -08005772 return r;
5773}
Zhang Xiantaocb498ea2007-11-14 20:39:31 +08005774EXPORT_SYMBOL_GPL(kvm_init);
Avi Kivity6aa8b732006-12-10 02:21:36 -08005775
Zhang Xiantaocb498ea2007-11-14 20:39:31 +08005776void kvm_exit(void)
Avi Kivity6aa8b732006-12-10 02:21:36 -08005777{
Vitaly Kuznetsovbaff59c2021-09-03 09:51:40 +02005778 int cpu;
5779
Janosch Frank4bd33b52015-10-14 12:37:35 +02005780 debugfs_remove_recursive(kvm_debugfs_dir);
Avi Kivity6aa8b732006-12-10 02:21:36 -08005781 misc_deregister(&kvm_dev);
Vitaly Kuznetsovbaff59c2021-09-03 09:51:40 +02005782 for_each_possible_cpu(cpu)
5783 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
Rusty Russellc16f8622007-07-30 21:12:19 +10005784 kmem_cache_destroy(kvm_vcpu_cache);
Gleb Natapovaf585b92010-10-14 11:22:46 +02005785 kvm_async_pf_deinit();
Rafael J. Wysockifb3600c2011-03-23 22:16:23 +01005786 unregister_syscore_ops(&kvm_syscore_ops);
Avi Kivity6aa8b732006-12-10 02:21:36 -08005787 unregister_reboot_notifier(&kvm_reboot_notifier);
Thomas Gleixner8c18b2d2016-07-13 17:16:37 +00005788 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
Takuya Yoshikawa75b71272010-11-16 17:37:41 +09005789 on_each_cpu(hardware_disable_nolock, NULL, 1);
Zhang Xiantaoe9b11c12007-11-14 20:38:21 +08005790 kvm_arch_hardware_unsetup();
Zhang Xiantaof8c16bb2007-11-14 20:40:21 +08005791 kvm_arch_exit();
Cornelia Hucka0f155e2013-02-28 12:33:18 +01005792 kvm_irqfd_exit();
Rusty Russell7f59f492008-12-07 21:25:45 +10305793 free_cpumask_var(cpus_hardware_enabled);
Wanpeng Li571ee1b2014-10-09 18:30:08 +08005794 kvm_vfio_ops_exit();
Avi Kivity6aa8b732006-12-10 02:21:36 -08005795}
Zhang Xiantaocb498ea2007-11-14 20:39:31 +08005796EXPORT_SYMBOL_GPL(kvm_exit);
Junaid Shahidc57c8042019-11-04 12:22:02 +01005797
5798struct kvm_vm_worker_thread_context {
5799 struct kvm *kvm;
5800 struct task_struct *parent;
5801 struct completion init_done;
5802 kvm_vm_thread_fn_t thread_fn;
5803 uintptr_t data;
5804 int err;
5805};
5806
5807static int kvm_vm_worker_thread(void *context)
5808{
5809 /*
5810 * The init_context is allocated on the stack of the parent thread, so
5811	 * we have to locally copy anything that is needed beyond initialization.
5812 */
5813 struct kvm_vm_worker_thread_context *init_context = context;
5814 struct kvm *kvm = init_context->kvm;
5815 kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
5816 uintptr_t data = init_context->data;
5817 int err;
5818
5819 err = kthread_park(current);
5820 /* kthread_park(current) is never supposed to return an error */
5821 WARN_ON(err != 0);
5822 if (err)
5823 goto init_complete;
5824
5825 err = cgroup_attach_task_all(init_context->parent, current);
5826 if (err) {
5827 kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
5828 __func__, err);
5829 goto init_complete;
5830 }
5831
5832 set_user_nice(current, task_nice(init_context->parent));
5833
5834init_complete:
5835 init_context->err = err;
5836 complete(&init_context->init_done);
5837 init_context = NULL;
5838
5839 if (err)
5840 return err;
5841
5842 /* Wait to be woken up by the spawner before proceeding. */
5843 kthread_parkme();
5844
5845 if (!kthread_should_stop())
5846 err = thread_fn(kvm, data);
5847
5848 return err;
5849}
5850
5851int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
5852 uintptr_t data, const char *name,
5853 struct task_struct **thread_ptr)
5854{
5855 struct kvm_vm_worker_thread_context init_context = {};
5856 struct task_struct *thread;
5857
5858 *thread_ptr = NULL;
5859 init_context.kvm = kvm;
5860 init_context.parent = current;
5861 init_context.thread_fn = thread_fn;
5862 init_context.data = data;
5863 init_completion(&init_context.init_done);
5864
5865 thread = kthread_run(kvm_vm_worker_thread, &init_context,
5866 "%s-%d", name, task_pid_nr(current));
5867 if (IS_ERR(thread))
5868 return PTR_ERR(thread);
5869
5870 /* kthread_run is never supposed to return NULL */
5871 WARN_ON(thread == NULL);
5872
5873 wait_for_completion(&init_context.init_done);
5874
5875 if (!init_context.err)
5876 *thread_ptr = thread;
5877
5878 return init_context.err;
5879}
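/*
 * Illustrative caller sketch (the foo_* names are hypothetical and
 * schedule_timeout_idle() merely stands in for the real periodic work):
 * the worker parks itself after initialization, so the owner unparks it
 * once its own setup is complete:
 *
 *	static int foo_recovery_worker(struct kvm *kvm, uintptr_t data)
 *	{
 *		while (!kthread_should_stop())
 *			schedule_timeout_idle(HZ);
 *		return 0;
 *	}
 *
 *	struct task_struct *foo_thread;
 *
 *	err = kvm_vm_create_worker_thread(kvm, foo_recovery_worker, 0,
 *					  "kvm-foo-recovery", &foo_thread);
 *	if (!err)
 *		kthread_unpark(foo_thread);
 */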