blob: e6fc579bb454e6ed94e0644dcbcbbb66fbfc8c9c [file] [log] [blame]
Thomas Gleixner20c8ccb2019-06-04 10:11:32 +02001// SPDX-License-Identifier: GPL-2.0-only
Avi Kivity6aa8b732006-12-10 02:21:36 -08002/*
3 * Kernel-based Virtual Machine driver for Linux
4 *
5 * This module enables machines with Intel VT-x extensions to run virtual
6 * machines without emulation or binary translation.
7 *
8 * Copyright (C) 2006 Qumranet, Inc.
Nicolas Kaiser9611c182010-10-06 14:23:22 +02009 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
Avi Kivity6aa8b732006-12-10 02:21:36 -080010 *
11 * Authors:
12 * Avi Kivity <avi@qumranet.com>
13 * Yaniv Kamay <yaniv@qumranet.com>
Avi Kivity6aa8b732006-12-10 02:21:36 -080014 */
15
Andre Przywaraaf669ac2015-03-26 14:39:29 +000016#include <kvm/iodev.h>
Avi Kivity6aa8b732006-12-10 02:21:36 -080017
Avi Kivityedf88412007-12-16 11:02:48 +020018#include <linux/kvm_host.h>
Avi Kivity6aa8b732006-12-10 02:21:36 -080019#include <linux/kvm.h>
20#include <linux/module.h>
21#include <linux/errno.h>
Avi Kivity6aa8b732006-12-10 02:21:36 -080022#include <linux/percpu.h>
Avi Kivity6aa8b732006-12-10 02:21:36 -080023#include <linux/mm.h>
24#include <linux/miscdevice.h>
25#include <linux/vmalloc.h>
Avi Kivity6aa8b732006-12-10 02:21:36 -080026#include <linux/reboot.h>
Avi Kivity6aa8b732006-12-10 02:21:36 -080027#include <linux/debugfs.h>
28#include <linux/highmem.h>
29#include <linux/file.h>
Rafael J. Wysockifb3600c2011-03-23 22:16:23 +010030#include <linux/syscore_ops.h>
Avi Kivity774c47f2007-02-12 00:54:47 -080031#include <linux/cpu.h>
Ingo Molnar174cd4b2017-02-02 19:15:33 +010032#include <linux/sched/signal.h>
Ingo Molnar6e84f312017-02-08 18:51:29 +010033#include <linux/sched/mm.h>
Ingo Molnar03441a32017-02-08 18:51:35 +010034#include <linux/sched/stat.h>
Avi Kivityd9e368d2007-06-07 19:18:30 +030035#include <linux/cpumask.h>
36#include <linux/smp.h>
Avi Kivityd6d28162007-06-28 08:38:16 -040037#include <linux/anon_inodes.h>
Avi Kivity04d2cc72007-09-10 18:10:54 +030038#include <linux/profile.h>
Anthony Liguori7aa81cc2007-09-17 14:57:50 -050039#include <linux/kvm_para.h>
Izik Eidus6fc138d2007-10-09 19:20:39 +020040#include <linux/pagemap.h>
Anthony Liguori8d4e1282007-10-18 09:59:34 -050041#include <linux/mman.h>
Anthony Liguori35149e22008-04-02 14:46:56 -050042#include <linux/swap.h>
Sheng Yange56d5322009-03-12 21:45:39 +080043#include <linux/bitops.h>
Marcelo Tosatti547de292009-05-07 17:55:13 -030044#include <linux/spinlock.h>
Arnd Bergmann6ff58942009-10-22 14:19:27 +020045#include <linux/compat.h>
Marcelo Tosattibc6678a2009-12-23 14:35:21 -020046#include <linux/srcu.h>
Joerg Roedel8f0b1ab2010-01-28 12:37:56 +010047#include <linux/hugetlb.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090048#include <linux/slab.h>
Sasha Levin743eeb02011-07-27 16:00:48 +030049#include <linux/sort.h>
50#include <linux/bsearch.h>
Paolo Bonzinic011d232019-05-17 14:08:53 +020051#include <linux/io.h>
Wanpeng Li2eb06c32019-05-17 16:49:49 +080052#include <linux/lockdep.h>
Junaid Shahidc57c8042019-11-04 12:22:02 +010053#include <linux/kthread.h>
Sergey Senozhatsky2fdef3a2021-06-06 11:10:44 +090054#include <linux/suspend.h>
Avi Kivity6aa8b732006-12-10 02:21:36 -080055
Avi Kivitye4956062007-06-28 14:15:57 -040056#include <asm/processor.h>
David Matlack2ea75be2014-09-19 16:03:25 -070057#include <asm/ioctl.h>
Linus Torvalds7c0f6ba2016-12-24 11:46:01 -080058#include <linux/uaccess.h>
Avi Kivity6aa8b732006-12-10 02:21:36 -080059
Laurent Vivier5f94c172008-05-30 16:05:54 +020060#include "coalesced_mmio.h"
Gleb Natapovaf585b92010-10-14 11:22:46 +020061#include "async_pf.h"
Ben Gardon531810c2021-02-02 10:57:24 -080062#include "mmu_lock.h"
Paolo Bonzini3c3c29f2014-09-24 13:02:46 +020063#include "vfio.h"
Laurent Vivier5f94c172008-05-30 16:05:54 +020064
Marcelo Tosatti229456f2009-06-17 09:22:14 -030065#define CREATE_TRACE_POINTS
66#include <trace/events/kvm.h>
67
Peter Xufb04a1e2020-09-30 21:22:22 -040068#include <linux/kvm_dirty_ring.h>
69
Janosch Frank536a6f82016-05-18 13:26:23 +020070/* Worst case buffer size needed for holding an integer. */
71#define ITOA_MAX_LEN 12
72
Avi Kivity6aa8b732006-12-10 02:21:36 -080073MODULE_AUTHOR("Qumranet");
74MODULE_LICENSE("GPL");
75
David Hildenbrand920552b2015-09-18 12:34:53 +020076/* Architectures should define their poll value according to the halt latency */
Suraj Jitindar Singhec76d812016-10-14 11:53:19 +110077unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
Roman Storozhenko039c5d12017-06-27 12:51:18 +030078module_param(halt_poll_ns, uint, 0644);
Suraj Jitindar Singhec76d812016-10-14 11:53:19 +110079EXPORT_SYMBOL_GPL(halt_poll_ns);
Paolo Bonzinif7819512015-02-04 18:20:58 +010080
Wanpeng Liaca6ff22015-09-03 22:07:38 +080081/* Default doubles per-vcpu halt_poll_ns. */
Suraj Jitindar Singhec76d812016-10-14 11:53:19 +110082unsigned int halt_poll_ns_grow = 2;
Roman Storozhenko039c5d12017-06-27 12:51:18 +030083module_param(halt_poll_ns_grow, uint, 0644);
Suraj Jitindar Singhec76d812016-10-14 11:53:19 +110084EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
Wanpeng Liaca6ff22015-09-03 22:07:38 +080085
Nir Weiner49113d32019-01-27 12:17:15 +020086/* The start value to grow halt_poll_ns from */
87unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
88module_param(halt_poll_ns_grow_start, uint, 0644);
89EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
90
Wanpeng Liaca6ff22015-09-03 22:07:38 +080091/* Default resets per-vcpu halt_poll_ns . */
Suraj Jitindar Singhec76d812016-10-14 11:53:19 +110092unsigned int halt_poll_ns_shrink;
Roman Storozhenko039c5d12017-06-27 12:51:18 +030093module_param(halt_poll_ns_shrink, uint, 0644);
Suraj Jitindar Singhec76d812016-10-14 11:53:19 +110094EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
Wanpeng Liaca6ff22015-09-03 22:07:38 +080095
Marcelo Tosattifa40a822009-06-04 15:08:24 -030096/*
97 * Ordering of locks:
98 *
Xiubo Lib7d409d2015-02-26 14:58:24 +080099 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock
Marcelo Tosattifa40a822009-06-04 15:08:24 -0300100 */
101
Junaid Shahid0d9ce162019-01-03 17:14:28 -0800102DEFINE_MUTEX(kvm_lock);
Paolo Bonzini4a937f92013-09-10 12:58:35 +0200103static DEFINE_RAW_SPINLOCK(kvm_count_lock);
Zhang Xiantaoe9b11c12007-11-14 20:38:21 +0800104LIST_HEAD(vm_list);
Avi Kivity133de902007-02-12 00:54:44 -0800105
Rusty Russell7f59f492008-12-07 21:25:45 +1030106static cpumask_var_t cpus_hardware_enabled;
Xiubo Lif4fee932015-02-26 14:58:21 +0800107static int kvm_usage_count;
Alexander Graf10474ae2009-09-15 11:37:46 +0200108static atomic_t hardware_enable_failed;
Avi Kivity1b6c0162007-05-24 13:03:52 +0300109
Sean Christophersonaaba2982019-12-18 13:55:16 -0800110static struct kmem_cache *kvm_vcpu_cache;
Avi Kivity1165f5f2007-04-19 17:27:43 +0300111
Avi Kivity15ad7142007-07-11 18:17:21 +0300112static __read_mostly struct preempt_ops kvm_preempt_ops;
Paolo Bonzini7495e222020-01-09 09:57:19 -0500113static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
Avi Kivity15ad7142007-07-11 18:17:21 +0300114
Hollis Blanchard76f7c872008-04-15 16:05:42 -0500115struct dentry *kvm_debugfs_dir;
Paul Mackerrase23a8082015-03-28 14:21:01 +1100116EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
Avi Kivity6aa8b732006-12-10 02:21:36 -0800117
Milan Pandurov09cbcef2019-12-13 14:07:21 +0100118static const struct file_operations stat_fops_per_vm;
Janosch Frank536a6f82016-05-18 13:26:23 +0200119
Avi Kivitybccf2152007-02-21 18:04:26 +0200120static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
121 unsigned long arg);
Christian Borntraegerde8e5d72015-02-03 09:35:15 +0100122#ifdef CONFIG_KVM_COMPAT
Alexander Graf1dda6062011-06-08 02:45:37 +0200123static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
124 unsigned long arg);
Marc Zyngier7ddfd3e2018-06-17 10:16:21 +0100125#define KVM_COMPAT(c) .compat_ioctl = (c)
126#else
Marc Zyngier9cb09e72019-11-14 13:17:39 +0000127/*
128 * For architectures that don't implement a compat infrastructure,
129 * adopt a double line of defense:
130 * - Prevent a compat task from opening /dev/kvm
131 * - If the open has been done by a 64bit task, and the KVM fd
132 * passed to a compat task, let the ioctls fail.
133 */
Marc Zyngier7ddfd3e2018-06-17 10:16:21 +0100134static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
135 unsigned long arg) { return -EINVAL; }
Marc Zyngierb9876e62019-11-13 16:05:23 +0000136
137static int kvm_no_compat_open(struct inode *inode, struct file *file)
138{
139 return is_compat_task() ? -ENODEV : 0;
140}
141#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \
142 .open = kvm_no_compat_open
Alexander Graf1dda6062011-06-08 02:45:37 +0200143#endif
Alexander Graf10474ae2009-09-15 11:37:46 +0200144static int hardware_enable_all(void);
145static void hardware_disable_all(void);
Avi Kivitybccf2152007-02-21 18:04:26 +0200146
Marcelo Tosattie93f8a02009-12-23 14:35:24 -0200147static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
Stephen Hemminger79408762013-12-29 12:12:29 -0800148
Andi Kleen52480132014-02-08 08:51:57 +0100149__visible bool kvm_rebooting;
Avi Kivityb7c41452010-12-02 17:52:50 +0200150EXPORT_SYMBOL_GPL(kvm_rebooting);
Avi Kivity4ecac3f2008-05-13 13:23:38 +0300151
Claudio Imbrenda286de8f2017-07-12 17:56:44 +0200152#define KVM_EVENT_CREATE_VM 0
153#define KVM_EVENT_DESTROY_VM 1
154static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
155static unsigned long long kvm_createvm_count;
156static unsigned long long kvm_active_vms;
157
Eiichi Tsukatae649b3f2020-06-06 13:26:27 +0900158__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
159 unsigned long start, unsigned long end)
Radim Krčmářb1394e72017-11-30 19:05:45 +0100160{
161}
162
Sean Christophersona78986a2019-11-11 14:12:27 -0800163bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
164{
165 /*
166 * The metadata used by is_zone_device_page() to determine whether or
167 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
168 * the device has been pinned, e.g. by get_user_pages(). WARN if the
169 * page_count() is zero to help detect bad usage of this helper.
170 */
171 if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
172 return false;
173
174 return is_zone_device_page(pfn_to_page(pfn));
175}
176
Dan Williamsba049e92016-01-15 16:56:11 -0800177bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
Ben-Ami Yassourcbff90a2008-07-28 19:26:24 +0300178{
Sean Christophersona78986a2019-11-11 14:12:27 -0800179 /*
180 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
181 * perspective they are "normal" pages, albeit with slightly different
182 * usage rules.
183 */
Andrea Arcangeli11feeb42013-07-25 03:04:38 +0200184 if (pfn_valid(pfn))
Sean Christophersona78986a2019-11-11 14:12:27 -0800185 return PageReserved(pfn_to_page(pfn)) &&
Zhuang Yanying7df003c2019-10-12 11:37:31 +0800186 !is_zero_pfn(pfn) &&
Sean Christophersona78986a2019-11-11 14:12:27 -0800187 !kvm_is_zone_device_pfn(pfn);
Ben-Ami Yassourcbff90a2008-07-28 19:26:24 +0300188
189 return true;
190}
191
Sean Christopherson005ba372020-01-08 12:24:36 -0800192bool kvm_is_transparent_hugepage(kvm_pfn_t pfn)
193{
194 struct page *page = pfn_to_page(pfn);
195
196 if (!PageTransCompoundMap(page))
197 return false;
198
199 return is_transparent_hugepage(compound_head(page));
200}
201
Avi Kivity6aa8b732006-12-10 02:21:36 -0800202/*
203 * Switches to specified vcpu, until a matching vcpu_put()
204 */
Christoffer Dallec7660c2017-12-04 21:35:23 +0100205void vcpu_load(struct kvm_vcpu *vcpu)
Avi Kivity6aa8b732006-12-10 02:21:36 -0800206{
Christoffer Dallec7660c2017-12-04 21:35:23 +0100207 int cpu = get_cpu();
Paolo Bonzini7495e222020-01-09 09:57:19 -0500208
209 __this_cpu_write(kvm_running_vcpu, vcpu);
Avi Kivity15ad7142007-07-11 18:17:21 +0300210 preempt_notifier_register(&vcpu->preempt_notifier);
Carsten Otte313a3dc2007-10-11 19:16:52 +0200211 kvm_arch_vcpu_load(vcpu, cpu);
Avi Kivity15ad7142007-07-11 18:17:21 +0300212 put_cpu();
Avi Kivitybccf2152007-02-21 18:04:26 +0200213}
Jim Mattson2f1fe812016-07-08 15:36:06 -0700214EXPORT_SYMBOL_GPL(vcpu_load);
Avi Kivitybccf2152007-02-21 18:04:26 +0200215
Carsten Otte313a3dc2007-10-11 19:16:52 +0200216void vcpu_put(struct kvm_vcpu *vcpu)
Avi Kivity6aa8b732006-12-10 02:21:36 -0800217{
Avi Kivity15ad7142007-07-11 18:17:21 +0300218 preempt_disable();
Carsten Otte313a3dc2007-10-11 19:16:52 +0200219 kvm_arch_vcpu_put(vcpu);
Avi Kivity15ad7142007-07-11 18:17:21 +0300220 preempt_notifier_unregister(&vcpu->preempt_notifier);
Paolo Bonzini7495e222020-01-09 09:57:19 -0500221 __this_cpu_write(kvm_running_vcpu, NULL);
Avi Kivity15ad7142007-07-11 18:17:21 +0300222 preempt_enable();
Avi Kivity6aa8b732006-12-10 02:21:36 -0800223}
Jim Mattson2f1fe812016-07-08 15:36:06 -0700224EXPORT_SYMBOL_GPL(vcpu_put);
Avi Kivity6aa8b732006-12-10 02:21:36 -0800225
Paolo Bonzini7a97cec2017-04-27 14:33:43 +0200226/* TODO: merge with kvm_arch_vcpu_should_kick */
227static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
228{
229 int mode = kvm_vcpu_exiting_guest_mode(vcpu);
230
231 /*
232 * We need to wait for the VCPU to reenable interrupts and get out of
233 * READING_SHADOW_PAGE_TABLES mode.
234 */
235 if (req & KVM_REQUEST_WAIT)
236 return mode != OUTSIDE_GUEST_MODE;
237
238 /*
239 * Need to kick a running VCPU, but otherwise there is nothing to do.
240 */
241 return mode == IN_GUEST_MODE;
242}
243
Avi Kivityd9e368d2007-06-07 19:18:30 +0300244static void ack_flush(void *_completed)
245{
Avi Kivityd9e368d2007-06-07 19:18:30 +0300246}
247
Paolo Bonzinib49defe2017-06-30 13:25:45 +0200248static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait)
249{
250 if (unlikely(!cpus))
251 cpus = cpu_online_mask;
252
253 if (cpumask_empty(cpus))
254 return false;
255
256 smp_call_function_many(cpus, ack_flush, NULL, wait);
257 return true;
258}
259
Vitaly Kuznetsov7053df42018-05-16 17:21:28 +0200260bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
Suravee Suthikulpanit54163a32020-05-06 08:17:53 -0500261 struct kvm_vcpu *except,
Vitaly Kuznetsov7053df42018-05-16 17:21:28 +0200262 unsigned long *vcpu_bitmap, cpumask_var_t tmp)
Avi Kivityd9e368d2007-06-07 19:18:30 +0300263{
Avi Kivity597a5f52008-07-20 14:24:22 +0300264 int i, cpu, me;
Avi Kivityd9e368d2007-06-07 19:18:30 +0300265 struct kvm_vcpu *vcpu;
Vitaly Kuznetsov7053df42018-05-16 17:21:28 +0200266 bool called;
Rusty Russell6ef7a1b2008-12-08 20:28:04 +1030267
Xiao Guangrong3cba4132011-01-12 15:41:22 +0800268 me = get_cpu();
Vitaly Kuznetsov7053df42018-05-16 17:21:28 +0200269
Gleb Natapov988a2ca2009-06-09 15:56:29 +0300270 kvm_for_each_vcpu(i, vcpu, kvm) {
Suravee Suthikulpanit54163a32020-05-06 08:17:53 -0500271 if ((vcpu_bitmap && !test_bit(i, vcpu_bitmap)) ||
272 vcpu == except)
Vitaly Kuznetsov7053df42018-05-16 17:21:28 +0200273 continue;
274
Xiao Guangrong3cba4132011-01-12 15:41:22 +0800275 kvm_make_request(req, vcpu);
Avi Kivityd9e368d2007-06-07 19:18:30 +0300276 cpu = vcpu->cpu;
Xiao Guangrong6b7e2d02011-01-12 15:40:31 +0800277
Radim Krčmář178f02f2017-04-26 22:32:26 +0200278 if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
279 continue;
Radim Krčmář6c6e8362017-04-26 22:32:23 +0200280
Vitaly Kuznetsov7053df42018-05-16 17:21:28 +0200281 if (tmp != NULL && cpu != -1 && cpu != me &&
Paolo Bonzini7a97cec2017-04-27 14:33:43 +0200282 kvm_request_needs_ipi(vcpu, req))
Vitaly Kuznetsov7053df42018-05-16 17:21:28 +0200283 __cpumask_set_cpu(cpu, tmp);
Avi Kivityd9e368d2007-06-07 19:18:30 +0300284 }
Vitaly Kuznetsov7053df42018-05-16 17:21:28 +0200285
286 called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT));
Xiao Guangrong3cba4132011-01-12 15:41:22 +0800287 put_cpu();
Vitaly Kuznetsov7053df42018-05-16 17:21:28 +0200288
289 return called;
290}
291
Suravee Suthikulpanit54163a32020-05-06 08:17:53 -0500292bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
293 struct kvm_vcpu *except)
Vitaly Kuznetsov7053df42018-05-16 17:21:28 +0200294{
295 cpumask_var_t cpus;
296 bool called;
Vitaly Kuznetsov7053df42018-05-16 17:21:28 +0200297
298 zalloc_cpumask_var(&cpus, GFP_ATOMIC);
299
Suravee Suthikulpanit54163a32020-05-06 08:17:53 -0500300 called = kvm_make_vcpus_request_mask(kvm, req, except, NULL, cpus);
Vitaly Kuznetsov7053df42018-05-16 17:21:28 +0200301
Rusty Russell6ef7a1b2008-12-08 20:28:04 +1030302 free_cpumask_var(cpus);
Rusty Russell49846892008-12-08 20:26:24 +1030303 return called;
304}
305
Suravee Suthikulpanit54163a32020-05-06 08:17:53 -0500306bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
307{
308 return kvm_make_all_cpus_request_except(kvm, req, NULL);
309}
Marcelo Tosattia2486022021-05-26 14:20:14 -0300310EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
Suravee Suthikulpanit54163a32020-05-06 08:17:53 -0500311
Mario Smarducha6d51012015-01-15 15:58:52 -0800312#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
Rusty Russell49846892008-12-08 20:26:24 +1030313void kvm_flush_remote_tlbs(struct kvm *kvm)
314{
Lan Tianyu4ae3cb32016-03-13 11:10:28 +0800315 /*
316 * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in
317 * kvm_make_all_cpus_request.
318 */
319 long dirty_count = smp_load_acquire(&kvm->tlbs_dirty);
Xiao Guangronga086f6a2014-04-17 17:06:12 +0800320
Lan Tianyu4ae3cb32016-03-13 11:10:28 +0800321 /*
322 * We want to publish modifications to the page tables before reading
323 * mode. Pairs with a memory barrier in arch-specific code.
324 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
325 * and smp_mb in walk_shadow_page_lockless_begin/end.
326 * - powerpc: smp_mb in kvmppc_prepare_to_enter.
327 *
328 * There is already an smp_mb__after_atomic() before
329 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
330 * barrier here.
331 */
Tianyu Lanb08660e2018-07-19 08:40:17 +0000332 if (!kvm_arch_flush_remote_tlb(kvm)
333 || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
Jing Zhang0193cc92021-06-18 22:27:03 +0000334 ++kvm->stat.generic.remote_tlb_flush;
Xiao Guangronga086f6a2014-04-17 17:06:12 +0800335 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
Avi Kivityd9e368d2007-06-07 19:18:30 +0300336}
Aneesh Kumar K.V2ba9f0d2013-10-07 22:17:59 +0530337EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
Mario Smarducha6d51012015-01-15 15:58:52 -0800338#endif
Avi Kivityd9e368d2007-06-07 19:18:30 +0300339
Marcelo Tosatti2e53d632008-02-20 14:47:24 -0500340void kvm_reload_remote_mmus(struct kvm *kvm)
341{
Tang Chen445b8232014-09-24 15:57:55 +0800342 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
Marcelo Tosatti2e53d632008-02-20 14:47:24 -0500343}
344
Sean Christopherson6926f952020-07-02 19:35:39 -0700345#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
346static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
347 gfp_t gfp_flags)
348{
349 gfp_flags |= mc->gfp_zero;
350
351 if (mc->kmem_cache)
352 return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
353 else
354 return (void *)__get_free_page(gfp_flags);
355}
356
357int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
358{
359 void *obj;
360
361 if (mc->nobjs >= min)
362 return 0;
363 while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
364 obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
365 if (!obj)
366 return mc->nobjs >= min ? 0 : -ENOMEM;
367 mc->objects[mc->nobjs++] = obj;
368 }
369 return 0;
370}
371
372int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
373{
374 return mc->nobjs;
375}
376
377void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
378{
379 while (mc->nobjs) {
380 if (mc->kmem_cache)
381 kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
382 else
383 free_page((unsigned long)mc->objects[--mc->nobjs]);
384 }
385}
386
387void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
388{
389 void *p;
390
391 if (WARN_ON(!mc->nobjs))
392 p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
393 else
394 p = mc->objects[--mc->nobjs];
395 BUG_ON(!p);
396 return p;
397}
398#endif
399
Sean Christopherson8bd826d2019-12-18 13:55:30 -0800400static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
Rusty Russellfb3f0f52007-07-27 17:16:56 +1000401{
Rusty Russellfb3f0f52007-07-27 17:16:56 +1000402 mutex_init(&vcpu->mutex);
403 vcpu->cpu = -1;
Rusty Russellfb3f0f52007-07-27 17:16:56 +1000404 vcpu->kvm = kvm;
405 vcpu->vcpu_id = id;
Rik van Riel34bb10b2011-02-01 09:52:41 -0500406 vcpu->pid = NULL;
Davidlohr Buesoda4ad882020-04-23 22:48:37 -0700407 rcuwait_init(&vcpu->wait);
Gleb Natapovaf585b92010-10-14 11:22:46 +0200408 kvm_async_pf_vcpu_init(vcpu);
Rusty Russellfb3f0f52007-07-27 17:16:56 +1000409
Feng Wubf9f6ac2015-09-18 22:29:55 +0800410 vcpu->pre_pcpu = -1;
411 INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
412
Raghavendra K T4c088492012-07-18 19:07:46 +0530413 kvm_vcpu_set_in_spin_loop(vcpu, false);
414 kvm_vcpu_set_dy_eligible(vcpu, false);
Raghavendra K T3a08a8f2013-03-04 23:32:07 +0530415 vcpu->preempted = false;
Wanpeng Lid73eb572019-07-18 19:39:06 +0800416 vcpu->ready = false;
Sean Christophersond5c48de2019-12-18 13:55:17 -0800417 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
David Matlackfe22ed82021-08-04 22:28:40 +0000418 vcpu->last_used_slot = 0;
Rusty Russellfb3f0f52007-07-27 17:16:56 +1000419}
Rusty Russellfb3f0f52007-07-27 17:16:56 +1000420
Sean Christopherson4543bdc2019-12-18 13:55:14 -0800421void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
422{
Peter Xufb04a1e2020-09-30 21:22:22 -0400423 kvm_dirty_ring_free(&vcpu->dirty_ring);
Sean Christopherson4543bdc2019-12-18 13:55:14 -0800424 kvm_arch_vcpu_destroy(vcpu);
Sean Christophersone529ef62019-12-18 13:55:15 -0800425
Sean Christopherson9941d222019-12-18 13:55:29 -0800426 /*
427 * No need for rcu_read_lock as VCPU_RUN is the only place that changes
428 * the vcpu->pid pointer, and at destruction time all file descriptors
429 * are already gone.
430 */
431 put_pid(rcu_dereference_protected(vcpu->pid, 1));
432
Sean Christopherson8bd826d2019-12-18 13:55:30 -0800433 free_page((unsigned long)vcpu->run);
Sean Christophersone529ef62019-12-18 13:55:15 -0800434 kmem_cache_free(kvm_vcpu_cache, vcpu);
Sean Christopherson4543bdc2019-12-18 13:55:14 -0800435}
436EXPORT_SYMBOL_GPL(kvm_vcpu_destroy);
437
Andrea Arcangelie930bff2008-07-25 16:24:52 +0200438#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
439static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
440{
441 return container_of(mn, struct kvm, mmu_notifier);
442}
443
Eiichi Tsukatae649b3f2020-06-06 13:26:27 +0900444static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
445 struct mm_struct *mm,
446 unsigned long start, unsigned long end)
447{
448 struct kvm *kvm = mmu_notifier_to_kvm(mn);
449 int idx;
450
451 idx = srcu_read_lock(&kvm->srcu);
452 kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
453 srcu_read_unlock(&kvm->srcu, idx);
454}
455
Sean Christopherson3039bcc2021-04-01 17:56:50 -0700456typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
457
Sean Christophersonf922bd92021-04-01 17:56:55 -0700458typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
459 unsigned long end);
460
Sean Christopherson3039bcc2021-04-01 17:56:50 -0700461struct kvm_hva_range {
462 unsigned long start;
463 unsigned long end;
464 pte_t pte;
465 hva_handler_t handler;
Sean Christophersonf922bd92021-04-01 17:56:55 -0700466 on_lock_fn_t on_lock;
Sean Christopherson3039bcc2021-04-01 17:56:50 -0700467 bool flush_on_ret;
468 bool may_block;
469};
470
Sean Christophersonf922bd92021-04-01 17:56:55 -0700471/*
472 * Use a dedicated stub instead of NULL to indicate that there is no callback
473 * function/handler. The compiler technically can't guarantee that a real
474 * function will have a non-zero address, and so it will generate code to
475 * check for !NULL, whereas comparing against a stub will be elided at compile
476 * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
477 */
478static void kvm_null_fn(void)
479{
480
481}
482#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
483
Sean Christopherson3039bcc2021-04-01 17:56:50 -0700484static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
485 const struct kvm_hva_range *range)
486{
Sean Christopherson8931a452021-04-01 17:56:56 -0700487 bool ret = false, locked = false;
Sean Christophersonf922bd92021-04-01 17:56:55 -0700488 struct kvm_gfn_range gfn_range;
Sean Christopherson3039bcc2021-04-01 17:56:50 -0700489 struct kvm_memory_slot *slot;
490 struct kvm_memslots *slots;
Sean Christopherson3039bcc2021-04-01 17:56:50 -0700491 int i, idx;
492
Sean Christophersonf922bd92021-04-01 17:56:55 -0700493 /* A null handler is allowed if and only if on_lock() is provided. */
494 if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
495 IS_KVM_NULL_FN(range->handler)))
496 return 0;
497
Sean Christopherson3039bcc2021-04-01 17:56:50 -0700498 idx = srcu_read_lock(&kvm->srcu);
499
500 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
501 slots = __kvm_memslots(kvm, i);
502 kvm_for_each_memslot(slot, slots) {
503 unsigned long hva_start, hva_end;
504
505 hva_start = max(range->start, slot->userspace_addr);
506 hva_end = min(range->end, slot->userspace_addr +
507 (slot->npages << PAGE_SHIFT));
508 if (hva_start >= hva_end)
509 continue;
510
511 /*
512 * To optimize for the likely case where the address
513 * range is covered by zero or one memslots, don't
514 * bother making these conditional (to avoid writes on
515 * the second or later invocation of the handler).
516 */
517 gfn_range.pte = range->pte;
518 gfn_range.may_block = range->may_block;
519
520 /*
521 * {gfn(page) | page intersects with [hva_start, hva_end)} =
522 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
523 */
524 gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
525 gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
526 gfn_range.slot = slot;
527
Sean Christopherson8931a452021-04-01 17:56:56 -0700528 if (!locked) {
529 locked = true;
530 KVM_MMU_LOCK(kvm);
Paolo Bonzini071064f2021-08-03 03:45:41 -0400531 if (!IS_KVM_NULL_FN(range->on_lock))
532 range->on_lock(kvm, range->start, range->end);
533 if (IS_KVM_NULL_FN(range->handler))
534 break;
Sean Christopherson8931a452021-04-01 17:56:56 -0700535 }
Sean Christopherson3039bcc2021-04-01 17:56:50 -0700536 ret |= range->handler(kvm, &gfn_range);
537 }
538 }
539
540 if (range->flush_on_ret && (ret || kvm->tlbs_dirty))
541 kvm_flush_remote_tlbs(kvm);
542
Sean Christopherson8931a452021-04-01 17:56:56 -0700543 if (locked)
544 KVM_MMU_UNLOCK(kvm);
Sean Christophersonf922bd92021-04-01 17:56:55 -0700545
Sean Christopherson3039bcc2021-04-01 17:56:50 -0700546 srcu_read_unlock(&kvm->srcu, idx);
547
548 /* The notifiers are averse to booleans. :-( */
549 return (int)ret;
550}
551
552static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
553 unsigned long start,
554 unsigned long end,
555 pte_t pte,
556 hva_handler_t handler)
557{
558 struct kvm *kvm = mmu_notifier_to_kvm(mn);
559 const struct kvm_hva_range range = {
560 .start = start,
561 .end = end,
562 .pte = pte,
563 .handler = handler,
Sean Christophersonf922bd92021-04-01 17:56:55 -0700564 .on_lock = (void *)kvm_null_fn,
Sean Christopherson3039bcc2021-04-01 17:56:50 -0700565 .flush_on_ret = true,
566 .may_block = false,
567 };
Sean Christopherson3039bcc2021-04-01 17:56:50 -0700568
Sean Christophersonf922bd92021-04-01 17:56:55 -0700569 return __kvm_handle_hva_range(kvm, &range);
Sean Christopherson3039bcc2021-04-01 17:56:50 -0700570}
571
572static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
573 unsigned long start,
574 unsigned long end,
575 hva_handler_t handler)
576{
577 struct kvm *kvm = mmu_notifier_to_kvm(mn);
578 const struct kvm_hva_range range = {
579 .start = start,
580 .end = end,
581 .pte = __pte(0),
582 .handler = handler,
Sean Christophersonf922bd92021-04-01 17:56:55 -0700583 .on_lock = (void *)kvm_null_fn,
Sean Christopherson3039bcc2021-04-01 17:56:50 -0700584 .flush_on_ret = false,
585 .may_block = false,
586 };
Sean Christopherson3039bcc2021-04-01 17:56:50 -0700587
Sean Christophersonf922bd92021-04-01 17:56:55 -0700588 return __kvm_handle_hva_range(kvm, &range);
Sean Christopherson3039bcc2021-04-01 17:56:50 -0700589}
Izik Eidus3da0dd42009-09-23 21:47:18 +0300590static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
591 struct mm_struct *mm,
592 unsigned long address,
593 pte_t pte)
594{
595 struct kvm *kvm = mmu_notifier_to_kvm(mn);
596
Sean Christopherson501b9182021-03-25 19:19:48 -0700597 trace_kvm_set_spte_hva(address);
598
Sean Christophersonc13fda22021-04-02 02:56:49 +0200599 /*
Paolo Bonzini52ac8b32021-05-27 08:09:15 -0400600 * .change_pte() must be surrounded by .invalidate_range_{start,end}().
Paolo Bonzini071064f2021-08-03 03:45:41 -0400601 * If mmu_notifier_count is zero, then no in-progress invalidations,
602 * including this one, found a relevant memslot at start(); rechecking
603 * memslots here is unnecessary. Note, a false positive (count elevated
604 * by a different invalidation) is sub-optimal but functionally ok.
Sean Christophersonc13fda22021-04-02 02:56:49 +0200605 */
Paolo Bonzini52ac8b32021-05-27 08:09:15 -0400606 WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
Paolo Bonzini071064f2021-08-03 03:45:41 -0400607 if (!READ_ONCE(kvm->mmu_notifier_count))
608 return;
Sean Christophersonc13fda22021-04-02 02:56:49 +0200609
Sean Christopherson3039bcc2021-04-01 17:56:50 -0700610 kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
Izik Eidus3da0dd42009-09-23 21:47:18 +0300611}
612
Maxim Levitskyedb298c2021-08-10 23:52:39 +0300613void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
Sean Christophersonf922bd92021-04-01 17:56:55 -0700614 unsigned long end)
Andrea Arcangelie930bff2008-07-25 16:24:52 +0200615{
Andrea Arcangelie930bff2008-07-25 16:24:52 +0200616 /*
617 * The count increase must become visible at unlock time as no
618 * spte can be established without taking the mmu_lock and
619 * count is also read inside the mmu_lock critical section.
620 */
621 kvm->mmu_notifier_count++;
David Stevens4a42d842021-02-22 11:45:22 +0900622 if (likely(kvm->mmu_notifier_count == 1)) {
Sean Christophersonf922bd92021-04-01 17:56:55 -0700623 kvm->mmu_notifier_range_start = start;
624 kvm->mmu_notifier_range_end = end;
David Stevens4a42d842021-02-22 11:45:22 +0900625 } else {
626 /*
627 * Fully tracking multiple concurrent ranges has dimishing
628 * returns. Keep things simple and just find the minimal range
629 * which includes the current and new ranges. As there won't be
630 * enough information to subtract a range after its invalidate
631 * completes, any ranges invalidated concurrently will
632 * accumulate and persist until all outstanding invalidates
633 * complete.
634 */
635 kvm->mmu_notifier_range_start =
Sean Christophersonf922bd92021-04-01 17:56:55 -0700636 min(kvm->mmu_notifier_range_start, start);
David Stevens4a42d842021-02-22 11:45:22 +0900637 kvm->mmu_notifier_range_end =
Sean Christophersonf922bd92021-04-01 17:56:55 -0700638 max(kvm->mmu_notifier_range_end, end);
David Stevens4a42d842021-02-22 11:45:22 +0900639 }
Sean Christophersonf922bd92021-04-01 17:56:55 -0700640}
Maxim Levitskyedb298c2021-08-10 23:52:39 +0300641EXPORT_SYMBOL_GPL(kvm_inc_notifier_count);
Sean Christophersonf922bd92021-04-01 17:56:55 -0700642
643static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
644 const struct mmu_notifier_range *range)
645{
646 struct kvm *kvm = mmu_notifier_to_kvm(mn);
647 const struct kvm_hva_range hva_range = {
648 .start = range->start,
649 .end = range->end,
650 .pte = __pte(0),
651 .handler = kvm_unmap_gfn_range,
652 .on_lock = kvm_inc_notifier_count,
653 .flush_on_ret = true,
654 .may_block = mmu_notifier_range_blockable(range),
655 };
656
657 trace_kvm_unmap_hva_range(range->start, range->end);
Sean Christopherson3039bcc2021-04-01 17:56:50 -0700658
Paolo Bonzini52ac8b32021-05-27 08:09:15 -0400659 /*
660 * Prevent memslot modification between range_start() and range_end()
661 * so that conditionally locking provides the same result in both
662 * functions. Without that guarantee, the mmu_notifier_count
663 * adjustments will be imbalanced.
664 *
665 * Pairs with the decrement in range_end().
666 */
667 spin_lock(&kvm->mn_invalidate_lock);
668 kvm->mn_active_invalidate_count++;
669 spin_unlock(&kvm->mn_invalidate_lock);
670
Sean Christopherson3039bcc2021-04-01 17:56:50 -0700671 __kvm_handle_hva_range(kvm, &hva_range);
Takuya Yoshikawa565f3be2012-02-10 15:28:31 +0900672
Eiichi Tsukatae649b3f2020-06-06 13:26:27 +0900673 return 0;
Andrea Arcangelie930bff2008-07-25 16:24:52 +0200674}
675
Maxim Levitskyedb298c2021-08-10 23:52:39 +0300676void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
Sean Christophersonf922bd92021-04-01 17:56:55 -0700677 unsigned long end)
Andrea Arcangelie930bff2008-07-25 16:24:52 +0200678{
Andrea Arcangelie930bff2008-07-25 16:24:52 +0200679 /*
680 * This sequence increase will notify the kvm page fault that
681 * the page that is going to be mapped in the spte could have
682 * been freed.
683 */
684 kvm->mmu_notifier_seq++;
Paul Mackerrasa355aa52011-12-12 12:37:21 +0000685 smp_wmb();
Andrea Arcangelie930bff2008-07-25 16:24:52 +0200686 /*
687 * The above sequence increase must be visible before the
Paul Mackerrasa355aa52011-12-12 12:37:21 +0000688 * below count decrease, which is ensured by the smp_wmb above
689 * in conjunction with the smp_rmb in mmu_notifier_retry().
Andrea Arcangelie930bff2008-07-25 16:24:52 +0200690 */
691 kvm->mmu_notifier_count--;
Sean Christophersonf922bd92021-04-01 17:56:55 -0700692}
Maxim Levitskyedb298c2021-08-10 23:52:39 +0300693EXPORT_SYMBOL_GPL(kvm_dec_notifier_count);
694
Sean Christophersonf922bd92021-04-01 17:56:55 -0700695
696static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
697 const struct mmu_notifier_range *range)
698{
699 struct kvm *kvm = mmu_notifier_to_kvm(mn);
700 const struct kvm_hva_range hva_range = {
701 .start = range->start,
702 .end = range->end,
703 .pte = __pte(0),
704 .handler = (void *)kvm_null_fn,
705 .on_lock = kvm_dec_notifier_count,
706 .flush_on_ret = false,
707 .may_block = mmu_notifier_range_blockable(range),
708 };
Paolo Bonzini52ac8b32021-05-27 08:09:15 -0400709 bool wake;
Sean Christophersonf922bd92021-04-01 17:56:55 -0700710
711 __kvm_handle_hva_range(kvm, &hva_range);
Andrea Arcangelie930bff2008-07-25 16:24:52 +0200712
Paolo Bonzini52ac8b32021-05-27 08:09:15 -0400713 /* Pairs with the increment in range_start(). */
714 spin_lock(&kvm->mn_invalidate_lock);
715 wake = (--kvm->mn_active_invalidate_count == 0);
716 spin_unlock(&kvm->mn_invalidate_lock);
717
718 /*
719 * There can only be one waiter, since the wait happens under
720 * slots_lock.
721 */
722 if (wake)
723 rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
724
Andrea Arcangelie930bff2008-07-25 16:24:52 +0200725 BUG_ON(kvm->mmu_notifier_count < 0);
726}
727
728static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
729 struct mm_struct *mm,
Andres Lagar-Cavilla57128462014-09-22 14:54:42 -0700730 unsigned long start,
731 unsigned long end)
Andrea Arcangelie930bff2008-07-25 16:24:52 +0200732{
Sean Christopherson501b9182021-03-25 19:19:48 -0700733 trace_kvm_age_hva(start, end);
734
Sean Christopherson3039bcc2021-04-01 17:56:50 -0700735 return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
Andrea Arcangelie930bff2008-07-25 16:24:52 +0200736}
737
Vladimir Davydov1d7715c2015-09-09 15:35:41 -0700738static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
739 struct mm_struct *mm,
740 unsigned long start,
741 unsigned long end)
742{
Sean Christopherson501b9182021-03-25 19:19:48 -0700743 trace_kvm_age_hva(start, end);
744
Vladimir Davydov1d7715c2015-09-09 15:35:41 -0700745 /*
746 * Even though we do not flush TLB, this will still adversely
747 * affect performance on pre-Haswell Intel EPT, where there is
748 * no EPT Access Bit to clear so that we have to tear down EPT
749 * tables instead. If we find this unacceptable, we can always
750 * add a parameter to kvm_age_hva so that it effectively doesn't
751 * do anything on clear_young.
752 *
753 * Also note that currently we never issue secondary TLB flushes
754 * from clear_young, leaving this job up to the regular system
755 * cadence. If we find this inaccurate, we might come up with a
756 * more sophisticated heuristic later.
757 */
Sean Christopherson3039bcc2021-04-01 17:56:50 -0700758 return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
Vladimir Davydov1d7715c2015-09-09 15:35:41 -0700759}
760
Andrea Arcangeli8ee53822011-01-13 15:47:10 -0800761static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
762 struct mm_struct *mm,
763 unsigned long address)
764{
Sean Christopherson501b9182021-03-25 19:19:48 -0700765 trace_kvm_test_age_hva(address);
766
Sean Christopherson3039bcc2021-04-01 17:56:50 -0700767 return kvm_handle_hva_range_no_flush(mn, address, address + 1,
768 kvm_test_age_gfn);
Andrea Arcangeli8ee53822011-01-13 15:47:10 -0800769}
770
Marcelo Tosatti85db06e2008-12-10 21:23:26 +0100771static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
772 struct mm_struct *mm)
773{
774 struct kvm *kvm = mmu_notifier_to_kvm(mn);
Lai Jiangshaneda2bed2010-04-20 14:29:29 +0800775 int idx;
776
777 idx = srcu_read_lock(&kvm->srcu);
Marcelo Tosatti2df72e92012-08-24 15:54:57 -0300778 kvm_arch_flush_shadow_all(kvm);
Lai Jiangshaneda2bed2010-04-20 14:29:29 +0800779 srcu_read_unlock(&kvm->srcu, idx);
Marcelo Tosatti85db06e2008-12-10 21:23:26 +0100780}
781
Andrea Arcangelie930bff2008-07-25 16:24:52 +0200782static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
Eiichi Tsukatae649b3f2020-06-06 13:26:27 +0900783 .invalidate_range = kvm_mmu_notifier_invalidate_range,
Andrea Arcangelie930bff2008-07-25 16:24:52 +0200784 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
785 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
786 .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
Vladimir Davydov1d7715c2015-09-09 15:35:41 -0700787 .clear_young = kvm_mmu_notifier_clear_young,
Andrea Arcangeli8ee53822011-01-13 15:47:10 -0800788 .test_young = kvm_mmu_notifier_test_young,
Izik Eidus3da0dd42009-09-23 21:47:18 +0300789 .change_pte = kvm_mmu_notifier_change_pte,
Marcelo Tosatti85db06e2008-12-10 21:23:26 +0100790 .release = kvm_mmu_notifier_release,
Andrea Arcangelie930bff2008-07-25 16:24:52 +0200791};
Avi Kivity4c07b0a2009-12-20 14:54:04 +0200792
793static int kvm_init_mmu_notifier(struct kvm *kvm)
794{
795 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
796 return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
797}
798
799#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
800
801static int kvm_init_mmu_notifier(struct kvm *kvm)
802{
803 return 0;
804}
805
Andrea Arcangelie930bff2008-07-25 16:24:52 +0200806#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
807
Sergey Senozhatsky2fdef3a2021-06-06 11:10:44 +0900808#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
809static int kvm_pm_notifier_call(struct notifier_block *bl,
810 unsigned long state,
811 void *unused)
812{
813 struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
814
815 return kvm_arch_pm_notifier(kvm, state);
816}
817
818static void kvm_init_pm_notifier(struct kvm *kvm)
819{
820 kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
821 /* Suspend KVM before we suspend ftrace, RCU, etc. */
822 kvm->pm_notifier.priority = INT_MAX;
823 register_pm_notifier(&kvm->pm_notifier);
824}
825
826static void kvm_destroy_pm_notifier(struct kvm *kvm)
827{
828 unregister_pm_notifier(&kvm->pm_notifier);
829}
830#else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
831static void kvm_init_pm_notifier(struct kvm *kvm)
832{
833}
834
835static void kvm_destroy_pm_notifier(struct kvm *kvm)
836{
837}
838#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
839
Paolo Bonzinia47d2b02015-05-17 11:41:37 +0200840static struct kvm_memslots *kvm_alloc_memslots(void)
Xiao Guangrongbf3e05b2011-11-24 17:40:57 +0800841{
842 int i;
Paolo Bonzinia47d2b02015-05-17 11:41:37 +0200843 struct kvm_memslots *slots;
Xiao Guangrongbf3e05b2011-11-24 17:40:57 +0800844
Ben Gardonb12ce362019-02-11 11:02:49 -0800845 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
Paolo Bonzinia47d2b02015-05-17 11:41:37 +0200846 if (!slots)
847 return NULL;
848
Xiao Guangrongbf3e05b2011-11-24 17:40:57 +0800849 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
Sean Christopherson36947252020-02-18 13:07:32 -0800850 slots->id_to_index[i] = -1;
Paolo Bonzinia47d2b02015-05-17 11:41:37 +0200851
852 return slots;
853}
854
855static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
856{
857 if (!memslot->dirty_bitmap)
858 return;
859
860 kvfree(memslot->dirty_bitmap);
861 memslot->dirty_bitmap = NULL;
862}
863
Sean Christophersone96c81e2020-02-18 13:07:27 -0800864static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
Paolo Bonzinia47d2b02015-05-17 11:41:37 +0200865{
Sean Christophersone96c81e2020-02-18 13:07:27 -0800866 kvm_destroy_dirty_bitmap(slot);
Paolo Bonzinia47d2b02015-05-17 11:41:37 +0200867
Sean Christophersone96c81e2020-02-18 13:07:27 -0800868 kvm_arch_free_memslot(kvm, slot);
Paolo Bonzinia47d2b02015-05-17 11:41:37 +0200869
Sean Christophersone96c81e2020-02-18 13:07:27 -0800870 slot->flags = 0;
871 slot->npages = 0;
Paolo Bonzinia47d2b02015-05-17 11:41:37 +0200872}
873
874static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
875{
876 struct kvm_memory_slot *memslot;
877
878 if (!slots)
879 return;
880
881 kvm_for_each_memslot(memslot, slots)
Sean Christophersone96c81e2020-02-18 13:07:27 -0800882 kvm_free_memslot(kvm, memslot);
Paolo Bonzinia47d2b02015-05-17 11:41:37 +0200883
884 kvfree(slots);
Xiao Guangrongbf3e05b2011-11-24 17:40:57 +0800885}
886
Jing Zhangbc9e9e62021-06-23 17:28:46 -0400887static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
888{
889 switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
890 case KVM_STATS_TYPE_INSTANT:
891 return 0444;
892 case KVM_STATS_TYPE_CUMULATIVE:
893 case KVM_STATS_TYPE_PEAK:
894 default:
895 return 0644;
896 }
897}
898
899
Janosch Frank536a6f82016-05-18 13:26:23 +0200900static void kvm_destroy_vm_debugfs(struct kvm *kvm)
901{
902 int i;
Jing Zhangbc9e9e62021-06-23 17:28:46 -0400903 int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
904 kvm_vcpu_stats_header.num_desc;
Janosch Frank536a6f82016-05-18 13:26:23 +0200905
906 if (!kvm->debugfs_dentry)
907 return;
908
909 debugfs_remove_recursive(kvm->debugfs_dentry);
910
Luiz Capitulino9d5a1dc2016-09-07 14:47:21 -0400911 if (kvm->debugfs_stat_data) {
912 for (i = 0; i < kvm_debugfs_num_entries; i++)
913 kfree(kvm->debugfs_stat_data[i]);
914 kfree(kvm->debugfs_stat_data);
915 }
Janosch Frank536a6f82016-05-18 13:26:23 +0200916}
917
918static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
919{
Paolo Bonzini85cd39a2021-08-04 05:28:52 -0400920 static DEFINE_MUTEX(kvm_debugfs_lock);
921 struct dentry *dent;
Janosch Frank536a6f82016-05-18 13:26:23 +0200922 char dir_name[ITOA_MAX_LEN * 2];
923 struct kvm_stat_data *stat_data;
Jing Zhangbc9e9e62021-06-23 17:28:46 -0400924 const struct _kvm_stats_desc *pdesc;
Peter Xu3165af72021-07-30 18:04:49 -0400925 int i, ret;
Jing Zhangbc9e9e62021-06-23 17:28:46 -0400926 int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
927 kvm_vcpu_stats_header.num_desc;
Janosch Frank536a6f82016-05-18 13:26:23 +0200928
929 if (!debugfs_initialized())
930 return 0;
931
932 snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
Paolo Bonzini85cd39a2021-08-04 05:28:52 -0400933 mutex_lock(&kvm_debugfs_lock);
934 dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
935 if (dent) {
936 pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
937 dput(dent);
938 mutex_unlock(&kvm_debugfs_lock);
939 return 0;
940 }
941 dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
942 mutex_unlock(&kvm_debugfs_lock);
943 if (IS_ERR(dent))
944 return 0;
Janosch Frank536a6f82016-05-18 13:26:23 +0200945
Paolo Bonzini85cd39a2021-08-04 05:28:52 -0400946 kvm->debugfs_dentry = dent;
Janosch Frank536a6f82016-05-18 13:26:23 +0200947 kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
948 sizeof(*kvm->debugfs_stat_data),
Ben Gardonb12ce362019-02-11 11:02:49 -0800949 GFP_KERNEL_ACCOUNT);
Janosch Frank536a6f82016-05-18 13:26:23 +0200950 if (!kvm->debugfs_stat_data)
951 return -ENOMEM;
952
Jing Zhangbc9e9e62021-06-23 17:28:46 -0400953 for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
954 pdesc = &kvm_vm_stats_desc[i];
Ben Gardonb12ce362019-02-11 11:02:49 -0800955 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
Janosch Frank536a6f82016-05-18 13:26:23 +0200956 if (!stat_data)
957 return -ENOMEM;
958
959 stat_data->kvm = kvm;
Jing Zhangbc9e9e62021-06-23 17:28:46 -0400960 stat_data->desc = pdesc;
961 stat_data->kind = KVM_STAT_VM;
962 kvm->debugfs_stat_data[i] = stat_data;
963 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
964 kvm->debugfs_dentry, stat_data,
965 &stat_fops_per_vm);
966 }
967
968 for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
969 pdesc = &kvm_vcpu_stats_desc[i];
970 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
971 if (!stat_data)
972 return -ENOMEM;
973
974 stat_data->kvm = kvm;
975 stat_data->desc = pdesc;
976 stat_data->kind = KVM_STAT_VCPU;
Pavel Skripkin004d62e2021-07-01 22:55:00 +0300977 kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
Jing Zhangbc9e9e62021-06-23 17:28:46 -0400978 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
Milan Pandurov09cbcef2019-12-13 14:07:21 +0100979 kvm->debugfs_dentry, stat_data,
980 &stat_fops_per_vm);
Janosch Frank536a6f82016-05-18 13:26:23 +0200981 }
Peter Xu3165af72021-07-30 18:04:49 -0400982
983 ret = kvm_arch_create_vm_debugfs(kvm);
984 if (ret) {
985 kvm_destroy_vm_debugfs(kvm);
986 return i;
987 }
988
Janosch Frank536a6f82016-05-18 13:26:23 +0200989 return 0;
990}
991
Junaid Shahid1aa9b952019-11-04 20:26:00 +0100992/*
993 * Called after the VM is otherwise initialized, but just before adding it to
994 * the vm_list.
995 */
996int __weak kvm_arch_post_init_vm(struct kvm *kvm)
997{
998 return 0;
999}
1000
1001/*
1002 * Called just after removing the VM from the vm_list, but before doing any
1003 * other destruction.
1004 */
1005void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
1006{
1007}
1008
Peter Xu3165af72021-07-30 18:04:49 -04001009/*
1010 * Called after per-vm debugfs created. When called kvm->debugfs_dentry should
1011 * be setup already, so we can create arch-specific debugfs entries under it.
1012 * Cleanup should be automatic done in kvm_destroy_vm_debugfs() recursively, so
1013 * a per-arch destroy interface is not needed.
1014 */
1015int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
1016{
1017 return 0;
1018}
1019
Carsten Ottee08b9632012-01-04 10:25:20 +01001020static struct kvm *kvm_create_vm(unsigned long type)
Avi Kivity6aa8b732006-12-10 02:21:36 -08001021{
Jan Kiszkad89f5ef2010-11-09 17:02:49 +01001022 struct kvm *kvm = kvm_arch_alloc_vm();
Jim Mattson91219232019-10-24 16:03:26 -07001023 int r = -ENOMEM;
1024 int i;
Avi Kivity6aa8b732006-12-10 02:21:36 -08001025
Jan Kiszkad89f5ef2010-11-09 17:02:49 +01001026 if (!kvm)
1027 return ERR_PTR(-ENOMEM);
1028
Ben Gardon531810c2021-02-02 10:57:24 -08001029 KVM_MMU_LOCK_INIT(kvm);
Vegard Nossumf1f10072017-02-27 14:30:07 -08001030 mmgrab(current->mm);
Paolo Bonzinie9ad4ec2016-03-21 10:15:25 +01001031 kvm->mm = current->mm;
1032 kvm_eventfd_init(kvm);
1033 mutex_init(&kvm->lock);
1034 mutex_init(&kvm->irq_lock);
1035 mutex_init(&kvm->slots_lock);
Ben Gardonb10a0382021-05-18 10:34:11 -07001036 mutex_init(&kvm->slots_arch_lock);
Paolo Bonzini52ac8b32021-05-27 08:09:15 -04001037 spin_lock_init(&kvm->mn_invalidate_lock);
1038 rcuwait_init(&kvm->mn_memslots_update_rcuwait);
1039
Paolo Bonzinie9ad4ec2016-03-21 10:15:25 +01001040 INIT_LIST_HEAD(&kvm->devices);
1041
Jim Mattson91219232019-10-24 16:03:26 -07001042 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
1043
Paolo Bonzini8a441192019-11-04 12:16:49 +01001044 if (init_srcu_struct(&kvm->srcu))
1045 goto out_err_no_srcu;
1046 if (init_srcu_struct(&kvm->irq_srcu))
1047 goto out_err_no_irq_srcu;
1048
Paolo Bonzinie2d3fca2019-11-04 13:23:53 +01001049 refcount_set(&kvm->users_count, 1);
Jim Mattson91219232019-10-24 16:03:26 -07001050 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1051 struct kvm_memslots *slots = kvm_alloc_memslots();
1052
1053 if (!slots)
Jim Mattsona97b0e72019-10-25 13:34:58 +02001054 goto out_err_no_arch_destroy_vm;
Jim Mattson91219232019-10-24 16:03:26 -07001055 /* Generations must be different for each address space. */
1056 slots->generation = i;
1057 rcu_assign_pointer(kvm->memslots[i], slots);
1058 }
1059
1060 for (i = 0; i < KVM_NR_BUSES; i++) {
1061 rcu_assign_pointer(kvm->buses[i],
1062 kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
1063 if (!kvm->buses[i])
Jim Mattsona97b0e72019-10-25 13:34:58 +02001064 goto out_err_no_arch_destroy_vm;
Jim Mattson91219232019-10-24 16:03:26 -07001065 }
1066
David Matlackacd05782020-04-17 15:14:46 -07001067 kvm->max_halt_poll_ns = halt_poll_ns;
1068
Carsten Ottee08b9632012-01-04 10:25:20 +01001069 r = kvm_arch_init_vm(kvm, type);
Jan Kiszkad89f5ef2010-11-09 17:02:49 +01001070 if (r)
Jim Mattsona97b0e72019-10-25 13:34:58 +02001071 goto out_err_no_arch_destroy_vm;
Alexander Graf10474ae2009-09-15 11:37:46 +02001072
1073 r = hardware_enable_all();
1074 if (r)
Christian Borntraeger719d93c2014-01-16 13:44:20 +01001075 goto out_err_no_disable;
Alexander Graf10474ae2009-09-15 11:37:46 +02001076
Paolo Bonzinic77dcac2014-08-06 14:24:45 +02001077#ifdef CONFIG_HAVE_KVM_IRQFD
Gleb Natapov136bdfe2009-08-24 11:54:23 +03001078 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
Avi Kivity75858a82009-01-04 17:10:50 +02001079#endif
Avi Kivity6aa8b732006-12-10 02:21:36 -08001080
Mike Waychison74b5c5b2011-06-03 13:04:53 -07001081 r = kvm_init_mmu_notifier(kvm);
1082 if (r)
Junaid Shahid1aa9b952019-11-04 20:26:00 +01001083 goto out_err_no_mmu_notifier;
1084
1085 r = kvm_arch_post_init_vm(kvm);
1086 if (r)
Mike Waychison74b5c5b2011-06-03 13:04:53 -07001087 goto out_err;
1088
Junaid Shahid0d9ce162019-01-03 17:14:28 -08001089 mutex_lock(&kvm_lock);
Rusty Russell5e58cfe2007-07-23 17:08:21 +10001090 list_add(&kvm->vm_list, &vm_list);
Junaid Shahid0d9ce162019-01-03 17:14:28 -08001091 mutex_unlock(&kvm_lock);
Jan Kiszkad89f5ef2010-11-09 17:02:49 +01001092
Peter Zijlstra2ecd9d22015-07-03 18:53:58 +02001093 preempt_notifier_inc();
Sergey Senozhatsky2fdef3a2021-06-06 11:10:44 +09001094 kvm_init_pm_notifier(kvm);
Peter Zijlstra2ecd9d22015-07-03 18:53:58 +02001095
Avi Kivityf17abe92007-02-21 19:28:04 +02001096 return kvm;
Alexander Graf10474ae2009-09-15 11:37:46 +02001097
1098out_err:
Junaid Shahid1aa9b952019-11-04 20:26:00 +01001099#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1100 if (kvm->mmu_notifier.ops)
1101 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
1102#endif
1103out_err_no_mmu_notifier:
Alexander Graf10474ae2009-09-15 11:37:46 +02001104 hardware_disable_all();
Christian Borntraeger719d93c2014-01-16 13:44:20 +01001105out_err_no_disable:
Jim Mattsona97b0e72019-10-25 13:34:58 +02001106 kvm_arch_destroy_vm(kvm);
Jim Mattsona97b0e72019-10-25 13:34:58 +02001107out_err_no_arch_destroy_vm:
Paolo Bonzinie2d3fca2019-11-04 13:23:53 +01001108 WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
Marcelo Tosattie93f8a02009-12-23 14:35:24 -02001109 for (i = 0; i < KVM_NR_BUSES; i++)
Paolo Bonzini3898da92017-08-02 17:55:54 +02001110 kfree(kvm_get_bus(kvm, i));
Paolo Bonzinif481b062015-05-17 17:30:37 +02001111 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
Paolo Bonzini3898da92017-08-02 17:55:54 +02001112 kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
Paolo Bonzini8a441192019-11-04 12:16:49 +01001113 cleanup_srcu_struct(&kvm->irq_srcu);
1114out_err_no_irq_srcu:
1115 cleanup_srcu_struct(&kvm->srcu);
1116out_err_no_srcu:
Jan Kiszkad89f5ef2010-11-09 17:02:49 +01001117 kvm_arch_free_vm(kvm);
Paolo Bonzinie9ad4ec2016-03-21 10:15:25 +01001118 mmdrop(current->mm);
Alexander Graf10474ae2009-09-15 11:37:46 +02001119 return ERR_PTR(r);
Avi Kivityf17abe92007-02-21 19:28:04 +02001120}
1121
Scott Wood07f0a7b2013-04-25 14:11:23 +00001122static void kvm_destroy_devices(struct kvm *kvm)
1123{
Geliang Tange6e3b5a2016-01-01 19:47:12 +08001124 struct kvm_device *dev, *tmp;
Scott Wood07f0a7b2013-04-25 14:11:23 +00001125
Christoffer Dalla28ebea2016-08-09 19:13:01 +02001126 /*
1127 * We do not need to take the kvm->lock here, because nobody else
1128 * has a reference to the struct kvm at this point and therefore
1129 * cannot access the devices list anyhow.
1130 */
Geliang Tange6e3b5a2016-01-01 19:47:12 +08001131 list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
1132 list_del(&dev->vm_node);
Scott Wood07f0a7b2013-04-25 14:11:23 +00001133 dev->ops->destroy(dev);
1134 }
1135}
1136
Avi Kivityf17abe92007-02-21 19:28:04 +02001137static void kvm_destroy_vm(struct kvm *kvm)
1138{
Marcelo Tosattie93f8a02009-12-23 14:35:24 -02001139 int i;
Avi Kivity6d4e4c42007-11-21 16:41:05 +02001140 struct mm_struct *mm = kvm->mm;
1141
Sergey Senozhatsky2fdef3a2021-06-06 11:10:44 +09001142 kvm_destroy_pm_notifier(kvm);
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02001143 kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
Janosch Frank536a6f82016-05-18 13:26:23 +02001144 kvm_destroy_vm_debugfs(kvm);
Sheng Yangad8ba2c2009-01-06 10:03:02 +08001145 kvm_arch_sync_events(kvm);
Junaid Shahid0d9ce162019-01-03 17:14:28 -08001146 mutex_lock(&kvm_lock);
Avi Kivity133de902007-02-12 00:54:44 -08001147 list_del(&kvm->vm_list);
Junaid Shahid0d9ce162019-01-03 17:14:28 -08001148 mutex_unlock(&kvm_lock);
Junaid Shahid1aa9b952019-11-04 20:26:00 +01001149 kvm_arch_pre_destroy_vm(kvm);
1150
Avi Kivity399ec802008-11-19 13:58:46 +02001151 kvm_free_irq_routing(kvm);
Peter Xudf630b82017-03-15 16:01:17 +08001152 for (i = 0; i < KVM_NR_BUSES; i++) {
Paolo Bonzini3898da92017-08-02 17:55:54 +02001153 struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
Christian Borntraeger4a12f952017-07-07 10:51:38 +02001154
Christian Borntraeger4a12f952017-07-07 10:51:38 +02001155 if (bus)
1156 kvm_io_bus_destroy(bus);
Peter Xudf630b82017-03-15 16:01:17 +08001157 kvm->buses[i] = NULL;
1158 }
Avi Kivity980da6c2009-12-20 15:13:43 +02001159 kvm_coalesced_mmio_free(kvm);
Andrea Arcangelie930bff2008-07-25 16:24:52 +02001160#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1161 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
Paolo Bonzini52ac8b32021-05-27 08:09:15 -04001162 /*
1163 * At this point, pending calls to invalidate_range_start()
1164 * have completed but no more MMU notifiers will run, so
1165 * mn_active_invalidate_count may remain unbalanced.
1166 * No threads can be waiting in install_new_memslots as the
1167 * last reference on KVM has been dropped, but freeing
1168 * memslots would deadlock without this manual intervention.
1169 */
1170 WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
1171 kvm->mn_active_invalidate_count = 0;
Gleb Natapovf00be0c2009-03-19 12:20:36 +02001172#else
Marcelo Tosatti2df72e92012-08-24 15:54:57 -03001173 kvm_arch_flush_shadow_all(kvm);
Andrea Arcangelie930bff2008-07-25 16:24:52 +02001174#endif
Zhang Xiantaod19a9cd2007-11-18 18:43:45 +08001175 kvm_arch_destroy_vm(kvm);
Scott Wood07f0a7b2013-04-25 14:11:23 +00001176 kvm_destroy_devices(kvm);
Paolo Bonzinif481b062015-05-17 17:30:37 +02001177 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
Paolo Bonzini3898da92017-08-02 17:55:54 +02001178 kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
Paolo Bonzini820b3fc2014-06-03 13:44:17 +02001179 cleanup_srcu_struct(&kvm->irq_srcu);
Jan Kiszkad89f5ef2010-11-09 17:02:49 +01001180 cleanup_srcu_struct(&kvm->srcu);
1181 kvm_arch_free_vm(kvm);
Peter Zijlstra2ecd9d22015-07-03 18:53:58 +02001182 preempt_notifier_dec();
Alexander Graf10474ae2009-09-15 11:37:46 +02001183 hardware_disable_all();
Avi Kivity6d4e4c42007-11-21 16:41:05 +02001184 mmdrop(mm);
Avi Kivityf17abe92007-02-21 19:28:04 +02001185}
1186
Izik Eidusd39f13b2008-03-30 16:01:25 +03001187void kvm_get_kvm(struct kvm *kvm)
1188{
Elena Reshetovae3736c32017-02-20 13:06:21 +02001189 refcount_inc(&kvm->users_count);
Izik Eidusd39f13b2008-03-30 16:01:25 +03001190}
1191EXPORT_SYMBOL_GPL(kvm_get_kvm);
1192
Peter Xu605c7132021-06-25 11:32:07 -04001193/*
1194 * Make sure the vm is not during destruction, which is a safe version of
1195 * kvm_get_kvm(). Return true if kvm referenced successfully, false otherwise.
1196 */
1197bool kvm_get_kvm_safe(struct kvm *kvm)
1198{
1199 return refcount_inc_not_zero(&kvm->users_count);
1200}
1201EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
1202
Izik Eidusd39f13b2008-03-30 16:01:25 +03001203void kvm_put_kvm(struct kvm *kvm)
1204{
Elena Reshetovae3736c32017-02-20 13:06:21 +02001205 if (refcount_dec_and_test(&kvm->users_count))
Izik Eidusd39f13b2008-03-30 16:01:25 +03001206 kvm_destroy_vm(kvm);
1207}
1208EXPORT_SYMBOL_GPL(kvm_put_kvm);
1209
Sean Christopherson149487b2019-10-21 15:58:42 -07001210/*
1211 * Used to put a reference that was taken on behalf of an object associated
1212 * with a user-visible file descriptor, e.g. a vcpu or device, if installation
1213 * of the new file descriptor fails and the reference cannot be transferred to
1214 * its final owner. In such cases, the caller is still actively using @kvm and
1215 * will fail miserably if the refcount unexpectedly hits zero.
1216 */
1217void kvm_put_kvm_no_destroy(struct kvm *kvm)
1218{
1219 WARN_ON(refcount_dec_and_test(&kvm->users_count));
1220}
1221EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
Izik Eidusd39f13b2008-03-30 16:01:25 +03001222
Avi Kivityf17abe92007-02-21 19:28:04 +02001223static int kvm_vm_release(struct inode *inode, struct file *filp)
1224{
1225 struct kvm *kvm = filp->private_data;
1226
Gregory Haskins721eecbf2009-05-20 10:30:49 -04001227 kvm_irqfd_release(kvm);
1228
Izik Eidusd39f13b2008-03-30 16:01:25 +03001229 kvm_put_kvm(kvm);
Avi Kivity6aa8b732006-12-10 02:21:36 -08001230 return 0;
1231}
1232
Takuya Yoshikawa515a0122010-10-27 18:23:54 +09001233/*
1234 * Allocation size is twice as large as the actual dirty bitmap size.
Sean Christopherson0dff0842020-02-18 13:07:29 -08001235 * See kvm_vm_ioctl_get_dirty_log() why this is needed.
Takuya Yoshikawa515a0122010-10-27 18:23:54 +09001236 */
Jay Zhou3c9bd402020-02-27 09:32:27 +08001237static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
Takuya Yoshikawaa36a57b12010-10-27 18:22:19 +09001238{
Takuya Yoshikawa515a0122010-10-27 18:23:54 +09001239 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
Takuya Yoshikawaa36a57b12010-10-27 18:22:19 +09001240
Ben Gardonb12ce362019-02-11 11:02:49 -08001241 memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
Takuya Yoshikawaa36a57b12010-10-27 18:22:19 +09001242 if (!memslot->dirty_bitmap)
1243 return -ENOMEM;
1244
Takuya Yoshikawaa36a57b12010-10-27 18:22:19 +09001245 return 0;
1246}
1247
Xiao Guangrongbf3e05b2011-11-24 17:40:57 +08001248/*
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001249 * Delete a memslot by decrementing the number of used slots and shifting all
1250 * other entries in the array forward one spot.
Xiao Guangrongbf3e05b2011-11-24 17:40:57 +08001251 */
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001252static inline void kvm_memslot_delete(struct kvm_memslots *slots,
1253 struct kvm_memory_slot *memslot)
Xiao Guangrongbf3e05b2011-11-24 17:40:57 +08001254{
Igor Mammedov063584d2014-11-13 23:00:13 +00001255 struct kvm_memory_slot *mslots = slots->memslots;
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001256 int i;
Xiao Guangrongf85e2cb2011-11-24 17:41:54 +08001257
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001258 if (WARN_ON(slots->id_to_index[memslot->id] == -1))
1259 return;
Igor Mammedov0e60b072014-12-01 17:29:26 +00001260
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001261 slots->used_slots--;
1262
David Matlack87689272021-08-04 22:28:38 +00001263 if (atomic_read(&slots->last_used_slot) >= slots->used_slots)
1264 atomic_set(&slots->last_used_slot, 0);
Sean Christopherson0774a962020-03-20 13:55:40 -07001265
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001266 for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) {
Igor Mammedov7f379cf2014-12-01 17:29:24 +00001267 mslots[i] = mslots[i + 1];
1268 slots->id_to_index[mslots[i].id] = i;
Igor Mammedov7f379cf2014-12-01 17:29:24 +00001269 }
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001270 mslots[i] = *memslot;
1271 slots->id_to_index[memslot->id] = -1;
1272}
1273
1274/*
1275 * "Insert" a new memslot by incrementing the number of used slots. Returns
1276 * the new slot's initial index into the memslots array.
1277 */
1278static inline int kvm_memslot_insert_back(struct kvm_memslots *slots)
1279{
1280 return slots->used_slots++;
1281}
1282
1283/*
1284 * Move a changed memslot backwards in the array by shifting existing slots
1285 * with a higher GFN toward the front of the array. Note, the changed memslot
1286 * itself is not preserved in the array, i.e. not swapped at this time, only
1287 * its new index into the array is tracked. Returns the changed memslot's
1288 * current index into the memslots array.
1289 */
1290static inline int kvm_memslot_move_backward(struct kvm_memslots *slots,
1291 struct kvm_memory_slot *memslot)
1292{
1293 struct kvm_memory_slot *mslots = slots->memslots;
1294 int i;
1295
1296 if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) ||
1297 WARN_ON_ONCE(!slots->used_slots))
1298 return -1;
Paolo Bonziniefbeec72014-12-27 18:01:00 +01001299
1300 /*
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001301 * Move the target memslot backward in the array by shifting existing
1302 * memslots with a higher GFN (than the target memslot) towards the
1303 * front of the array.
Paolo Bonziniefbeec72014-12-27 18:01:00 +01001304 */
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001305 for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) {
1306 if (memslot->base_gfn > mslots[i + 1].base_gfn)
1307 break;
Xiao Guangrongf85e2cb2011-11-24 17:41:54 +08001308
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001309 WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn);
1310
1311 /* Shift the next memslot forward one and update its index. */
1312 mslots[i] = mslots[i + 1];
1313 slots->id_to_index[mslots[i].id] = i;
1314 }
1315 return i;
1316}
1317
1318/*
1319 * Move a changed memslot forwards in the array by shifting existing slots with
1320 * a lower GFN toward the back of the array. Note, the changed memslot itself
1321 * is not preserved in the array, i.e. not swapped at this time, only its new
1322 * index into the array is tracked. Returns the changed memslot's final index
1323 * into the memslots array.
1324 */
1325static inline int kvm_memslot_move_forward(struct kvm_memslots *slots,
1326 struct kvm_memory_slot *memslot,
1327 int start)
1328{
1329 struct kvm_memory_slot *mslots = slots->memslots;
1330 int i;
1331
1332 for (i = start; i > 0; i--) {
1333 if (memslot->base_gfn < mslots[i - 1].base_gfn)
1334 break;
1335
1336 WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn);
1337
1338 /* Shift the next memslot back one and update its index. */
1339 mslots[i] = mslots[i - 1];
1340 slots->id_to_index[mslots[i].id] = i;
1341 }
1342 return i;
1343}
1344
1345/*
1346 * Re-sort memslots based on their GFN to account for an added, deleted, or
1347 * moved memslot. Sorting memslots by GFN allows using a binary search during
1348 * memslot lookup.
1349 *
1350 * IMPORTANT: Slots are sorted from highest GFN to lowest GFN! I.e. the entry
1351 * at memslots[0] has the highest GFN.
1352 *
1353 * The sorting algorithm takes advantage of having initially sorted memslots
1354 * and knowing the position of the changed memslot. Sorting is also optimized
1355 * by not swapping the updated memslot and instead only shifting other memslots
1356 * and tracking the new index for the update memslot. Only once its final
1357 * index is known is the updated memslot copied into its position in the array.
1358 *
1359 * - When deleting a memslot, the deleted memslot simply needs to be moved to
1360 * the end of the array.
1361 *
1362 * - When creating a memslot, the algorithm "inserts" the new memslot at the
1363 * end of the array and then it forward to its correct location.
1364 *
1365 * - When moving a memslot, the algorithm first moves the updated memslot
1366 * backward to handle the scenario where the memslot's GFN was changed to a
1367 * lower value. update_memslots() then falls through and runs the same flow
1368 * as creating a memslot to move the memslot forward to handle the scenario
1369 * where its GFN was changed to a higher value.
1370 *
1371 * Note, slots are sorted from highest->lowest instead of lowest->highest for
1372 * historical reasons. Originally, invalid memslots where denoted by having
1373 * GFN=0, thus sorting from highest->lowest naturally sorted invalid memslots
1374 * to the end of the array. The current algorithm uses dedicated logic to
1375 * delete a memslot and thus does not rely on invalid memslots having GFN=0.
1376 *
1377 * The other historical motiviation for highest->lowest was to improve the
1378 * performance of memslot lookup. KVM originally used a linear search starting
1379 * at memslots[0]. On x86, the largest memslot usually has one of the highest,
1380 * if not *the* highest, GFN, as the bulk of the guest's RAM is located in a
1381 * single memslot above the 4gb boundary. As the largest memslot is also the
1382 * most likely to be referenced, sorting it to the front of the array was
1383 * advantageous. The current binary search starts from the middle of the array
1384 * and uses an LRU pointer to improve performance for all memslots and GFNs.
1385 */
1386static void update_memslots(struct kvm_memslots *slots,
1387 struct kvm_memory_slot *memslot,
1388 enum kvm_mr_change change)
1389{
1390 int i;
1391
1392 if (change == KVM_MR_DELETE) {
1393 kvm_memslot_delete(slots, memslot);
1394 } else {
1395 if (change == KVM_MR_CREATE)
1396 i = kvm_memslot_insert_back(slots);
1397 else
1398 i = kvm_memslot_move_backward(slots, memslot);
1399 i = kvm_memslot_move_forward(slots, memslot, i);
1400
1401 /*
1402 * Copy the memslot to its new position in memslots and update
1403 * its index accordingly.
1404 */
1405 slots->memslots[i] = *memslot;
1406 slots->id_to_index[memslot->id] = i;
1407 }
Xiao Guangrongbf3e05b2011-11-24 17:40:57 +08001408}
1409
Paolo Bonzini09170a42015-05-18 13:59:39 +02001410static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
Xiao Guangronga50d64d2012-08-21 10:58:13 +08001411{
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08001412 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1413
Christoffer Dall0f8a4de2014-08-26 14:00:37 +02001414#ifdef __KVM_HAVE_READONLY_MEM
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08001415 valid_flags |= KVM_MEM_READONLY;
1416#endif
1417
1418 if (mem->flags & ~valid_flags)
Xiao Guangronga50d64d2012-08-21 10:58:13 +08001419 return -EINVAL;
1420
1421 return 0;
1422}
1423
Gleb Natapov7ec4fb42012-12-24 17:49:30 +02001424static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
Paolo Bonzinif481b062015-05-17 17:30:37 +02001425 int as_id, struct kvm_memslots *slots)
Gleb Natapov7ec4fb42012-12-24 17:49:30 +02001426{
Paolo Bonzinif481b062015-05-17 17:30:37 +02001427 struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
Sean Christopherson361209e2019-02-05 13:01:14 -08001428 u64 gen = old_memslots->generation;
Gleb Natapov7ec4fb42012-12-24 17:49:30 +02001429
Sean Christopherson361209e2019-02-05 13:01:14 -08001430 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1431 slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
David Matlackee3d1572014-08-18 15:46:06 -07001432
Paolo Bonzini52ac8b32021-05-27 08:09:15 -04001433 /*
1434 * Do not store the new memslots while there are invalidations in
Paolo Bonzini071064f2021-08-03 03:45:41 -04001435 * progress, otherwise the locking in invalidate_range_start and
1436 * invalidate_range_end will be unbalanced.
Paolo Bonzini52ac8b32021-05-27 08:09:15 -04001437 */
1438 spin_lock(&kvm->mn_invalidate_lock);
1439 prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
1440 while (kvm->mn_active_invalidate_count) {
1441 set_current_state(TASK_UNINTERRUPTIBLE);
1442 spin_unlock(&kvm->mn_invalidate_lock);
1443 schedule();
1444 spin_lock(&kvm->mn_invalidate_lock);
1445 }
1446 finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
Paolo Bonzinif481b062015-05-17 17:30:37 +02001447 rcu_assign_pointer(kvm->memslots[as_id], slots);
Paolo Bonzini52ac8b32021-05-27 08:09:15 -04001448 spin_unlock(&kvm->mn_invalidate_lock);
Ben Gardonb10a0382021-05-18 10:34:11 -07001449
1450 /*
1451 * Acquired in kvm_set_memslot. Must be released before synchronize
1452 * SRCU below in order to avoid deadlock with another thread
1453 * acquiring the slots_arch_lock in an srcu critical section.
1454 */
1455 mutex_unlock(&kvm->slots_arch_lock);
1456
Gleb Natapov7ec4fb42012-12-24 17:49:30 +02001457 synchronize_srcu_expedited(&kvm->srcu);
Takuya Yoshikawae59dbe02013-07-04 13:40:29 +09001458
David Matlackee3d1572014-08-18 15:46:06 -07001459 /*
Sean Christopherson361209e2019-02-05 13:01:14 -08001460 * Increment the new memslot generation a second time, dropping the
Miaohe Lin00116792019-12-11 14:26:23 +08001461 * update in-progress flag and incrementing the generation based on
Sean Christopherson361209e2019-02-05 13:01:14 -08001462 * the number of address spaces. This provides a unique and easily
1463 * identifiable generation number while the memslots are in flux.
1464 */
1465 gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1466
1467 /*
Paolo Bonzini4bd518f2017-02-03 20:44:51 -08001468 * Generations must be unique even across address spaces. We do not need
1469 * a global counter for that, instead the generation space is evenly split
1470 * across address spaces. For example, with two address spaces, address
Sean Christopherson164bf7e2019-02-05 13:01:18 -08001471 * space 0 will use generations 0, 2, 4, ... while address space 1 will
1472 * use generations 1, 3, 5, ...
David Matlackee3d1572014-08-18 15:46:06 -07001473 */
Sean Christopherson164bf7e2019-02-05 13:01:18 -08001474 gen += KVM_ADDRESS_SPACE_NUM;
David Matlackee3d1572014-08-18 15:46:06 -07001475
Sean Christopherson15248252019-02-05 12:54:17 -08001476 kvm_arch_memslots_updated(kvm, gen);
1477
1478 slots->generation = gen;
Takuya Yoshikawae59dbe02013-07-04 13:40:29 +09001479
1480 return old_memslots;
Gleb Natapov7ec4fb42012-12-24 17:49:30 +02001481}
1482
Ben Gardonddc12f22021-05-18 10:34:10 -07001483static size_t kvm_memslots_size(int slots)
1484{
1485 return sizeof(struct kvm_memslots) +
1486 (sizeof(struct kvm_memory_slot) * slots);
1487}
1488
1489static void kvm_copy_memslots(struct kvm_memslots *to,
1490 struct kvm_memslots *from)
1491{
1492 memcpy(to, from, kvm_memslots_size(from->used_slots));
1493}
1494
Sean Christopherson36947252020-02-18 13:07:32 -08001495/*
1496 * Note, at a minimum, the current number of used slots must be allocated, even
1497 * when deleting a memslot, as we need a complete duplicate of the memslots for
1498 * use when invalidating a memslot prior to deleting/moving the memslot.
1499 */
1500static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old,
1501 enum kvm_mr_change change)
1502{
1503 struct kvm_memslots *slots;
Ben Gardonddc12f22021-05-18 10:34:10 -07001504 size_t new_size;
Sean Christopherson36947252020-02-18 13:07:32 -08001505
1506 if (change == KVM_MR_CREATE)
Ben Gardonddc12f22021-05-18 10:34:10 -07001507 new_size = kvm_memslots_size(old->used_slots + 1);
Sean Christopherson36947252020-02-18 13:07:32 -08001508 else
Ben Gardonddc12f22021-05-18 10:34:10 -07001509 new_size = kvm_memslots_size(old->used_slots);
Sean Christopherson36947252020-02-18 13:07:32 -08001510
1511 slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT);
1512 if (likely(slots))
Ben Gardonddc12f22021-05-18 10:34:10 -07001513 kvm_copy_memslots(slots, old);
Sean Christopherson36947252020-02-18 13:07:32 -08001514
1515 return slots;
1516}
1517
Sean Christophersoncf47f502020-02-18 13:07:23 -08001518static int kvm_set_memslot(struct kvm *kvm,
1519 const struct kvm_userspace_memory_region *mem,
Sean Christopherson9d4c1972020-02-18 13:07:24 -08001520 struct kvm_memory_slot *old,
Sean Christophersoncf47f502020-02-18 13:07:23 -08001521 struct kvm_memory_slot *new, int as_id,
1522 enum kvm_mr_change change)
1523{
1524 struct kvm_memory_slot *slot;
1525 struct kvm_memslots *slots;
1526 int r;
1527
Ben Gardonb10a0382021-05-18 10:34:11 -07001528 /*
1529 * Released in install_new_memslots.
1530 *
1531 * Must be held from before the current memslots are copied until
1532 * after the new memslots are installed with rcu_assign_pointer,
1533 * then released before the synchronize srcu in install_new_memslots.
1534 *
1535 * When modifying memslots outside of the slots_lock, must be held
1536 * before reading the pointer to the current memslots until after all
1537 * changes to those memslots are complete.
1538 *
1539 * These rules ensure that installing new memslots does not lose
1540 * changes made to the previous memslots.
1541 */
1542 mutex_lock(&kvm->slots_arch_lock);
1543
Sean Christopherson36947252020-02-18 13:07:32 -08001544 slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change);
Ben Gardonb10a0382021-05-18 10:34:11 -07001545 if (!slots) {
1546 mutex_unlock(&kvm->slots_arch_lock);
Sean Christophersoncf47f502020-02-18 13:07:23 -08001547 return -ENOMEM;
Ben Gardonb10a0382021-05-18 10:34:11 -07001548 }
Sean Christophersoncf47f502020-02-18 13:07:23 -08001549
1550 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1551 /*
1552 * Note, the INVALID flag needs to be in the appropriate entry
1553 * in the freshly allocated memslots, not in @old or @new.
1554 */
1555 slot = id_to_memslot(slots, old->id);
1556 slot->flags |= KVM_MEMSLOT_INVALID;
1557
1558 /*
Ben Gardonb10a0382021-05-18 10:34:11 -07001559 * We can re-use the memory from the old memslots.
1560 * It will be overwritten with a copy of the new memslots
1561 * after reacquiring the slots_arch_lock below.
Sean Christophersoncf47f502020-02-18 13:07:23 -08001562 */
1563 slots = install_new_memslots(kvm, as_id, slots);
1564
1565 /* From this point no new shadow pages pointing to a deleted,
1566 * or moved, memslot will be created.
1567 *
1568 * validation of sp->gfn happens in:
1569 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1570 * - kvm_is_visible_gfn (mmu_check_root)
1571 */
1572 kvm_arch_flush_shadow_memslot(kvm, slot);
Ben Gardonb10a0382021-05-18 10:34:11 -07001573
1574 /* Released in install_new_memslots. */
1575 mutex_lock(&kvm->slots_arch_lock);
1576
1577 /*
1578 * The arch-specific fields of the memslots could have changed
1579 * between releasing the slots_arch_lock in
1580 * install_new_memslots and here, so get a fresh copy of the
1581 * slots.
1582 */
1583 kvm_copy_memslots(slots, __kvm_memslots(kvm, as_id));
Sean Christophersoncf47f502020-02-18 13:07:23 -08001584 }
1585
1586 r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
1587 if (r)
1588 goto out_slots;
1589
1590 update_memslots(slots, new, change);
1591 slots = install_new_memslots(kvm, as_id, slots);
1592
1593 kvm_arch_commit_memory_region(kvm, mem, old, new, change);
1594
1595 kvfree(slots);
1596 return 0;
1597
1598out_slots:
Ben Gardonb10a0382021-05-18 10:34:11 -07001599 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1600 slot = id_to_memslot(slots, old->id);
1601 slot->flags &= ~KVM_MEMSLOT_INVALID;
Sean Christophersoncf47f502020-02-18 13:07:23 -08001602 slots = install_new_memslots(kvm, as_id, slots);
Ben Gardonb10a0382021-05-18 10:34:11 -07001603 } else {
1604 mutex_unlock(&kvm->slots_arch_lock);
1605 }
Sean Christophersoncf47f502020-02-18 13:07:23 -08001606 kvfree(slots);
1607 return r;
1608}
1609
Sean Christopherson5c0b4f32020-02-18 13:07:26 -08001610static int kvm_delete_memslot(struct kvm *kvm,
1611 const struct kvm_userspace_memory_region *mem,
1612 struct kvm_memory_slot *old, int as_id)
1613{
1614 struct kvm_memory_slot new;
1615 int r;
1616
1617 if (!old->npages)
1618 return -EINVAL;
1619
1620 memset(&new, 0, sizeof(new));
1621 new.id = old->id;
Peter Xu9e9eb222020-10-14 11:26:46 -07001622 /*
1623 * This is only for debugging purpose; it should never be referenced
1624 * for a removed memslot.
1625 */
1626 new.as_id = as_id;
Sean Christopherson5c0b4f32020-02-18 13:07:26 -08001627
1628 r = kvm_set_memslot(kvm, mem, old, &new, as_id, KVM_MR_DELETE);
1629 if (r)
1630 return r;
1631
Sean Christophersone96c81e2020-02-18 13:07:27 -08001632 kvm_free_memslot(kvm, old);
Sean Christopherson5c0b4f32020-02-18 13:07:26 -08001633 return 0;
1634}
1635
Avi Kivity6aa8b732006-12-10 02:21:36 -08001636/*
Avi Kivity6aa8b732006-12-10 02:21:36 -08001637 * Allocate some memory and give it an address in the guest physical address
1638 * space.
1639 *
1640 * Discontiguous memory is allowed, mostly for framebuffers.
Sheng Yangf78e0e22007-10-29 09:40:42 +08001641 *
Dominik Dingel02d5d552014-10-27 16:22:56 +01001642 * Must be called holding kvm->slots_lock for write.
Avi Kivity6aa8b732006-12-10 02:21:36 -08001643 */
Sheng Yangf78e0e22007-10-29 09:40:42 +08001644int __kvm_set_memory_region(struct kvm *kvm,
Paolo Bonzini09170a42015-05-18 13:59:39 +02001645 const struct kvm_userspace_memory_region *mem)
Avi Kivity6aa8b732006-12-10 02:21:36 -08001646{
Avi Kivity6aa8b732006-12-10 02:21:36 -08001647 struct kvm_memory_slot old, new;
Sean Christopherson163da372020-02-18 13:07:28 -08001648 struct kvm_memory_slot *tmp;
Takuya Yoshikawaf64c0392013-01-29 11:00:07 +09001649 enum kvm_mr_change change;
Sean Christopherson163da372020-02-18 13:07:28 -08001650 int as_id, id;
1651 int r;
Avi Kivity6aa8b732006-12-10 02:21:36 -08001652
Xiao Guangronga50d64d2012-08-21 10:58:13 +08001653 r = check_memory_region_flags(mem);
1654 if (r)
Sean Christopherson71a4c302020-02-18 13:07:22 -08001655 return r;
Xiao Guangronga50d64d2012-08-21 10:58:13 +08001656
Paolo Bonzinif481b062015-05-17 17:30:37 +02001657 as_id = mem->slot >> 16;
1658 id = (u16)mem->slot;
1659
Avi Kivity6aa8b732006-12-10 02:21:36 -08001660 /* General sanity checks */
1661 if (mem->memory_size & (PAGE_SIZE - 1))
Sean Christopherson71a4c302020-02-18 13:07:22 -08001662 return -EINVAL;
Avi Kivity6aa8b732006-12-10 02:21:36 -08001663 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
Sean Christopherson71a4c302020-02-18 13:07:22 -08001664 return -EINVAL;
Takuya Yoshikawafa3d3152011-05-07 16:35:38 +09001665 /* We can read the guest memory with __xxx_user() later on. */
Paolo Bonzini09d952c2020-06-01 04:17:45 -04001666 if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
Marc Zyngier139bc8a2021-01-21 12:08:15 +00001667 (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
Linus Torvalds96d4f262019-01-03 18:57:57 -08001668 !access_ok((void __user *)(unsigned long)mem->userspace_addr,
Paolo Bonzini09d952c2020-06-01 04:17:45 -04001669 mem->memory_size))
Sean Christopherson71a4c302020-02-18 13:07:22 -08001670 return -EINVAL;
Paolo Bonzinif481b062015-05-17 17:30:37 +02001671 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
Sean Christopherson71a4c302020-02-18 13:07:22 -08001672 return -EINVAL;
Avi Kivity6aa8b732006-12-10 02:21:36 -08001673 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
Sean Christopherson71a4c302020-02-18 13:07:22 -08001674 return -EINVAL;
Avi Kivity6aa8b732006-12-10 02:21:36 -08001675
Sean Christopherson5c0b4f32020-02-18 13:07:26 -08001676 /*
1677 * Make a full copy of the old memslot, the pointer will become stale
1678 * when the memslots are re-sorted by update_memslots(), and the old
1679 * memslot needs to be referenced after calling update_memslots(), e.g.
Sean Christopherson0dff0842020-02-18 13:07:29 -08001680 * to free its resources and for arch specific behavior.
Sean Christopherson5c0b4f32020-02-18 13:07:26 -08001681 */
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001682 tmp = id_to_memslot(__kvm_memslots(kvm, as_id), id);
1683 if (tmp) {
1684 old = *tmp;
1685 tmp = NULL;
1686 } else {
1687 memset(&old, 0, sizeof(old));
1688 old.id = id;
1689 }
Sean Christopherson163da372020-02-18 13:07:28 -08001690
Sean Christopherson5c0b4f32020-02-18 13:07:26 -08001691 if (!mem->memory_size)
1692 return kvm_delete_memslot(kvm, mem, &old, as_id);
1693
Peter Xu9e9eb222020-10-14 11:26:46 -07001694 new.as_id = as_id;
Paolo Bonzinif481b062015-05-17 17:30:37 +02001695 new.id = id;
Sean Christopherson163da372020-02-18 13:07:28 -08001696 new.base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1697 new.npages = mem->memory_size >> PAGE_SHIFT;
Avi Kivity6aa8b732006-12-10 02:21:36 -08001698 new.flags = mem->flags;
Sean Christopherson414de7ab2020-02-18 13:07:20 -08001699 new.userspace_addr = mem->userspace_addr;
Avi Kivity6aa8b732006-12-10 02:21:36 -08001700
Sean Christopherson163da372020-02-18 13:07:28 -08001701 if (new.npages > KVM_MEM_MAX_NR_PAGES)
1702 return -EINVAL;
1703
Sean Christopherson5c0b4f32020-02-18 13:07:26 -08001704 if (!old.npages) {
1705 change = KVM_MR_CREATE;
Sean Christopherson163da372020-02-18 13:07:28 -08001706 new.dirty_bitmap = NULL;
1707 memset(&new.arch, 0, sizeof(new.arch));
Sean Christopherson5c0b4f32020-02-18 13:07:26 -08001708 } else { /* Modify an existing slot. */
1709 if ((new.userspace_addr != old.userspace_addr) ||
Sean Christopherson163da372020-02-18 13:07:28 -08001710 (new.npages != old.npages) ||
Sean Christopherson5c0b4f32020-02-18 13:07:26 -08001711 ((new.flags ^ old.flags) & KVM_MEM_READONLY))
Sean Christopherson71a4c302020-02-18 13:07:22 -08001712 return -EINVAL;
Paolo Bonzini09170a42015-05-18 13:59:39 +02001713
Sean Christopherson163da372020-02-18 13:07:28 -08001714 if (new.base_gfn != old.base_gfn)
Sean Christopherson5c0b4f32020-02-18 13:07:26 -08001715 change = KVM_MR_MOVE;
1716 else if (new.flags != old.flags)
1717 change = KVM_MR_FLAGS_ONLY;
1718 else /* Nothing to change. */
1719 return 0;
Sean Christopherson163da372020-02-18 13:07:28 -08001720
1721 /* Copy dirty_bitmap and arch from the current memslot. */
1722 new.dirty_bitmap = old.dirty_bitmap;
1723 memcpy(&new.arch, &old.arch, sizeof(new.arch));
Paolo Bonzini09170a42015-05-18 13:59:39 +02001724 }
Avi Kivity6aa8b732006-12-10 02:21:36 -08001725
Takuya Yoshikawaf64c0392013-01-29 11:00:07 +09001726 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
Takuya Yoshikawa0a706be2013-01-11 18:26:55 +09001727 /* Check for overlaps */
Sean Christopherson163da372020-02-18 13:07:28 -08001728 kvm_for_each_memslot(tmp, __kvm_memslots(kvm, as_id)) {
1729 if (tmp->id == id)
Takuya Yoshikawa0a706be2013-01-11 18:26:55 +09001730 continue;
Sean Christopherson163da372020-02-18 13:07:28 -08001731 if (!((new.base_gfn + new.npages <= tmp->base_gfn) ||
1732 (new.base_gfn >= tmp->base_gfn + tmp->npages)))
Sean Christopherson71a4c302020-02-18 13:07:22 -08001733 return -EEXIST;
Takuya Yoshikawa0a706be2013-01-11 18:26:55 +09001734 }
Avi Kivity6aa8b732006-12-10 02:21:36 -08001735 }
Avi Kivity6aa8b732006-12-10 02:21:36 -08001736
Sean Christopherson414de7ab2020-02-18 13:07:20 -08001737 /* Allocate/free page dirty bitmap as needed */
Avi Kivity6aa8b732006-12-10 02:21:36 -08001738 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
Al Viro8b6d44c2007-02-09 16:38:40 +00001739 new.dirty_bitmap = NULL;
Peter Xu044c59c2020-09-30 21:22:26 -04001740 else if (!new.dirty_bitmap && !kvm->dirty_ring_size) {
Jay Zhou3c9bd402020-02-27 09:32:27 +08001741 r = kvm_alloc_dirty_bitmap(&new);
Sean Christopherson71a4c302020-02-18 13:07:22 -08001742 if (r)
1743 return r;
Jay Zhou3c9bd402020-02-27 09:32:27 +08001744
1745 if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1746 bitmap_set(new.dirty_bitmap, 0, new.npages);
Avi Kivity6aa8b732006-12-10 02:21:36 -08001747 }
1748
Sean Christophersoncf47f502020-02-18 13:07:23 -08001749 r = kvm_set_memslot(kvm, mem, &old, &new, as_id, change);
1750 if (r)
1751 goto out_bitmap;
Zhang Xiantao3ad82a72007-11-20 13:11:38 +08001752
Sean Christopherson5c0b4f32020-02-18 13:07:26 -08001753 if (old.dirty_bitmap && !new.dirty_bitmap)
1754 kvm_destroy_dirty_bitmap(&old);
Avi Kivity6aa8b732006-12-10 02:21:36 -08001755 return 0;
1756
Sean Christophersonbd0e96f2020-02-18 13:07:21 -08001757out_bitmap:
1758 if (new.dirty_bitmap && !old.dirty_bitmap)
1759 kvm_destroy_dirty_bitmap(&new);
Avi Kivity6aa8b732006-12-10 02:21:36 -08001760 return r;
Izik Eidus210c7c42007-10-24 23:52:57 +02001761}
Sheng Yangf78e0e22007-10-29 09:40:42 +08001762EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
1763
1764int kvm_set_memory_region(struct kvm *kvm,
Paolo Bonzini09170a42015-05-18 13:59:39 +02001765 const struct kvm_userspace_memory_region *mem)
Sheng Yangf78e0e22007-10-29 09:40:42 +08001766{
1767 int r;
1768
Marcelo Tosatti79fac952009-12-23 14:35:26 -02001769 mutex_lock(&kvm->slots_lock);
Takuya Yoshikawa47ae31e2013-02-27 19:43:00 +09001770 r = __kvm_set_memory_region(kvm, mem);
Marcelo Tosatti79fac952009-12-23 14:35:26 -02001771 mutex_unlock(&kvm->slots_lock);
Sheng Yangf78e0e22007-10-29 09:40:42 +08001772 return r;
1773}
Izik Eidus210c7c42007-10-24 23:52:57 +02001774EXPORT_SYMBOL_GPL(kvm_set_memory_region);
1775
Stephen Hemminger79408762013-12-29 12:12:29 -08001776static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
1777 struct kvm_userspace_memory_region *mem)
Izik Eidus210c7c42007-10-24 23:52:57 +02001778{
Paolo Bonzinif481b062015-05-17 17:30:37 +02001779 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
Izik Eiduse0d62c72007-10-24 23:57:46 +02001780 return -EINVAL;
Paolo Bonzini09170a42015-05-18 13:59:39 +02001781
Takuya Yoshikawa47ae31e2013-02-27 19:43:00 +09001782 return kvm_set_memory_region(kvm, mem);
Avi Kivity6aa8b732006-12-10 02:21:36 -08001783}
1784
Sean Christopherson0dff0842020-02-18 13:07:29 -08001785#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
Sean Christopherson2a49f612020-02-18 13:07:30 -08001786/**
1787 * kvm_get_dirty_log - get a snapshot of dirty pages
1788 * @kvm: pointer to kvm instance
1789 * @log: slot id and address to which we copy the log
1790 * @is_dirty: set to '1' if any dirty pages were found
1791 * @memslot: set to the associated memslot, always valid on success
1792 */
1793int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
1794 int *is_dirty, struct kvm_memory_slot **memslot)
Avi Kivity6aa8b732006-12-10 02:21:36 -08001795{
Paolo Bonzini9f6b8022015-05-17 16:20:07 +02001796 struct kvm_memslots *slots;
Markus Elfring843574a2017-01-22 17:41:07 +01001797 int i, as_id, id;
Takuya Yoshikawa87bf6e72010-04-12 19:35:35 +09001798 unsigned long n;
Avi Kivity6aa8b732006-12-10 02:21:36 -08001799 unsigned long any = 0;
1800
Peter Xub2cc64c2020-09-30 21:22:24 -04001801 /* Dirty ring tracking is exclusive to dirty log tracking */
1802 if (kvm->dirty_ring_size)
1803 return -ENXIO;
1804
Sean Christopherson2a49f612020-02-18 13:07:30 -08001805 *memslot = NULL;
1806 *is_dirty = 0;
1807
Paolo Bonzinif481b062015-05-17 17:30:37 +02001808 as_id = log->slot >> 16;
1809 id = (u16)log->slot;
1810 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
Markus Elfring843574a2017-01-22 17:41:07 +01001811 return -EINVAL;
Avi Kivity6aa8b732006-12-10 02:21:36 -08001812
Paolo Bonzinif481b062015-05-17 17:30:37 +02001813 slots = __kvm_memslots(kvm, as_id);
Sean Christopherson2a49f612020-02-18 13:07:30 -08001814 *memslot = id_to_memslot(slots, id);
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001815 if (!(*memslot) || !(*memslot)->dirty_bitmap)
Markus Elfring843574a2017-01-22 17:41:07 +01001816 return -ENOENT;
Avi Kivity6aa8b732006-12-10 02:21:36 -08001817
Sean Christopherson2a49f612020-02-18 13:07:30 -08001818 kvm_arch_sync_dirty_log(kvm, *memslot);
1819
1820 n = kvm_dirty_bitmap_bytes(*memslot);
Avi Kivity6aa8b732006-12-10 02:21:36 -08001821
Uri Lublincd1a4a92007-02-22 16:43:09 +02001822 for (i = 0; !any && i < n/sizeof(long); ++i)
Sean Christopherson2a49f612020-02-18 13:07:30 -08001823 any = (*memslot)->dirty_bitmap[i];
Avi Kivity6aa8b732006-12-10 02:21:36 -08001824
Sean Christopherson2a49f612020-02-18 13:07:30 -08001825 if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
Markus Elfring843574a2017-01-22 17:41:07 +01001826 return -EFAULT;
Avi Kivity6aa8b732006-12-10 02:21:36 -08001827
Zhang Xiantao5bb064d2007-11-18 20:29:43 +08001828 if (any)
1829 *is_dirty = 1;
Markus Elfring843574a2017-01-22 17:41:07 +01001830 return 0;
Avi Kivity6aa8b732006-12-10 02:21:36 -08001831}
Aneesh Kumar K.V2ba9f0d2013-10-07 22:17:59 +05301832EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
Avi Kivity6aa8b732006-12-10 02:21:36 -08001833
Sean Christopherson0dff0842020-02-18 13:07:29 -08001834#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
Mario Smarduchba0513b2015-01-15 15:58:53 -08001835/**
Jiang Biaob8b00222019-04-23 19:40:30 +08001836 * kvm_get_dirty_log_protect - get a snapshot of dirty pages
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02001837 * and reenable dirty page tracking for the corresponding pages.
Mario Smarduchba0513b2015-01-15 15:58:53 -08001838 * @kvm: pointer to kvm instance
1839 * @log: slot id and address to which we copy the log
Mario Smarduchba0513b2015-01-15 15:58:53 -08001840 *
1841 * We need to keep it in mind that VCPU threads can write to the bitmap
1842 * concurrently. So, to avoid losing track of dirty pages we keep the
1843 * following order:
1844 *
1845 * 1. Take a snapshot of the bit and clear it if needed.
1846 * 2. Write protect the corresponding page.
1847 * 3. Copy the snapshot to the userspace.
1848 * 4. Upon return caller flushes TLB's if needed.
1849 *
1850 * Between 2 and 4, the guest may write to the page using the remaining TLB
1851 * entry. This is not a problem because the page is reported dirty using
1852 * the snapshot taken before and step 4 ensures that writes done after
1853 * exiting to userspace will be logged for the next call.
1854 *
1855 */
Sean Christopherson0dff0842020-02-18 13:07:29 -08001856static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
Mario Smarduchba0513b2015-01-15 15:58:53 -08001857{
Paolo Bonzini9f6b8022015-05-17 16:20:07 +02001858 struct kvm_memslots *slots;
Mario Smarduchba0513b2015-01-15 15:58:53 -08001859 struct kvm_memory_slot *memslot;
Markus Elfring58d6db32017-01-22 17:30:16 +01001860 int i, as_id, id;
Mario Smarduchba0513b2015-01-15 15:58:53 -08001861 unsigned long n;
1862 unsigned long *dirty_bitmap;
1863 unsigned long *dirty_bitmap_buffer;
Sean Christopherson0dff0842020-02-18 13:07:29 -08001864 bool flush;
Mario Smarduchba0513b2015-01-15 15:58:53 -08001865
Peter Xub2cc64c2020-09-30 21:22:24 -04001866 /* Dirty ring tracking is exclusive to dirty log tracking */
1867 if (kvm->dirty_ring_size)
1868 return -ENXIO;
1869
Paolo Bonzinif481b062015-05-17 17:30:37 +02001870 as_id = log->slot >> 16;
1871 id = (u16)log->slot;
1872 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
Markus Elfring58d6db32017-01-22 17:30:16 +01001873 return -EINVAL;
Mario Smarduchba0513b2015-01-15 15:58:53 -08001874
Paolo Bonzinif481b062015-05-17 17:30:37 +02001875 slots = __kvm_memslots(kvm, as_id);
1876 memslot = id_to_memslot(slots, id);
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001877 if (!memslot || !memslot->dirty_bitmap)
1878 return -ENOENT;
Mario Smarduchba0513b2015-01-15 15:58:53 -08001879
1880 dirty_bitmap = memslot->dirty_bitmap;
Mario Smarduchba0513b2015-01-15 15:58:53 -08001881
Sean Christopherson0dff0842020-02-18 13:07:29 -08001882 kvm_arch_sync_dirty_log(kvm, memslot);
1883
Mario Smarduchba0513b2015-01-15 15:58:53 -08001884 n = kvm_dirty_bitmap_bytes(memslot);
Sean Christopherson0dff0842020-02-18 13:07:29 -08001885 flush = false;
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02001886 if (kvm->manual_dirty_log_protect) {
1887 /*
1888 * Unlike kvm_get_dirty_log, we always return false in *flush,
1889 * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There
1890 * is some code duplication between this function and
1891 * kvm_get_dirty_log, but hopefully all architecture
1892 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
1893 * can be eliminated.
1894 */
1895 dirty_bitmap_buffer = dirty_bitmap;
1896 } else {
1897 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
1898 memset(dirty_bitmap_buffer, 0, n);
Mario Smarduchba0513b2015-01-15 15:58:53 -08001899
Ben Gardon531810c2021-02-02 10:57:24 -08001900 KVM_MMU_LOCK(kvm);
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02001901 for (i = 0; i < n / sizeof(long); i++) {
1902 unsigned long mask;
1903 gfn_t offset;
Mario Smarduchba0513b2015-01-15 15:58:53 -08001904
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02001905 if (!dirty_bitmap[i])
1906 continue;
Mario Smarduchba0513b2015-01-15 15:58:53 -08001907
Sean Christopherson0dff0842020-02-18 13:07:29 -08001908 flush = true;
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02001909 mask = xchg(&dirty_bitmap[i], 0);
1910 dirty_bitmap_buffer[i] = mask;
Mario Smarduchba0513b2015-01-15 15:58:53 -08001911
Lan Tianyua67794c2019-02-02 17:20:27 +08001912 offset = i * BITS_PER_LONG;
1913 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
1914 offset, mask);
Takuya Yoshikawa58d29302015-03-17 16:19:58 +09001915 }
Ben Gardon531810c2021-02-02 10:57:24 -08001916 KVM_MMU_UNLOCK(kvm);
Mario Smarduchba0513b2015-01-15 15:58:53 -08001917 }
1918
Sean Christopherson0dff0842020-02-18 13:07:29 -08001919 if (flush)
1920 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
1921
Mario Smarduchba0513b2015-01-15 15:58:53 -08001922 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
Markus Elfring58d6db32017-01-22 17:30:16 +01001923 return -EFAULT;
1924 return 0;
Mario Smarduchba0513b2015-01-15 15:58:53 -08001925}
Sean Christopherson0dff0842020-02-18 13:07:29 -08001926
1927
1928/**
1929 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
1930 * @kvm: kvm instance
1931 * @log: slot id and address to which we copy the log
1932 *
1933 * Steps 1-4 below provide general overview of dirty page logging. See
1934 * kvm_get_dirty_log_protect() function description for additional details.
1935 *
1936 * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
1937 * always flush the TLB (step 4) even if previous step failed and the dirty
1938 * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
1939 * does not preclude user space subsequent dirty log read. Flushing TLB ensures
1940 * writes will be marked dirty for next log read.
1941 *
1942 * 1. Take a snapshot of the bit and clear it if needed.
1943 * 2. Write protect the corresponding page.
1944 * 3. Copy the snapshot to the userspace.
1945 * 4. Flush TLB's if needed.
1946 */
1947static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1948 struct kvm_dirty_log *log)
1949{
1950 int r;
1951
1952 mutex_lock(&kvm->slots_lock);
1953
1954 r = kvm_get_dirty_log_protect(kvm, log);
1955
1956 mutex_unlock(&kvm->slots_lock);
1957 return r;
1958}
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02001959
1960/**
1961 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
1962 * and reenable dirty page tracking for the corresponding pages.
1963 * @kvm: pointer to kvm instance
1964 * @log: slot id and address from which to fetch the bitmap of dirty pages
1965 */
Sean Christopherson0dff0842020-02-18 13:07:29 -08001966static int kvm_clear_dirty_log_protect(struct kvm *kvm,
1967 struct kvm_clear_dirty_log *log)
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02001968{
1969 struct kvm_memslots *slots;
1970 struct kvm_memory_slot *memslot;
Tomas Bortoli98938aa2019-01-02 18:29:37 +01001971 int as_id, id;
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02001972 gfn_t offset;
Tomas Bortoli98938aa2019-01-02 18:29:37 +01001973 unsigned long i, n;
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02001974 unsigned long *dirty_bitmap;
1975 unsigned long *dirty_bitmap_buffer;
Sean Christopherson0dff0842020-02-18 13:07:29 -08001976 bool flush;
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02001977
Peter Xub2cc64c2020-09-30 21:22:24 -04001978 /* Dirty ring tracking is exclusive to dirty log tracking */
1979 if (kvm->dirty_ring_size)
1980 return -ENXIO;
1981
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02001982 as_id = log->slot >> 16;
1983 id = (u16)log->slot;
1984 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1985 return -EINVAL;
1986
Paolo Bonzini76d58e02019-04-17 15:28:44 +02001987 if (log->first_page & 63)
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02001988 return -EINVAL;
1989
1990 slots = __kvm_memslots(kvm, as_id);
1991 memslot = id_to_memslot(slots, id);
Sean Christopherson0577d1a2020-02-18 13:07:31 -08001992 if (!memslot || !memslot->dirty_bitmap)
1993 return -ENOENT;
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02001994
1995 dirty_bitmap = memslot->dirty_bitmap;
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02001996
Peter Xu4ddc9202019-05-08 17:15:45 +08001997 n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
Tomas Bortoli98938aa2019-01-02 18:29:37 +01001998
1999 if (log->first_page > memslot->npages ||
Paolo Bonzini76d58e02019-04-17 15:28:44 +02002000 log->num_pages > memslot->npages - log->first_page ||
2001 (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
2002 return -EINVAL;
Tomas Bortoli98938aa2019-01-02 18:29:37 +01002003
Sean Christopherson0dff0842020-02-18 13:07:29 -08002004 kvm_arch_sync_dirty_log(kvm, memslot);
2005
2006 flush = false;
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02002007 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2008 if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
2009 return -EFAULT;
2010
Ben Gardon531810c2021-02-02 10:57:24 -08002011 KVM_MMU_LOCK(kvm);
Peter Xu53eac7a2019-05-08 17:15:46 +08002012 for (offset = log->first_page, i = offset / BITS_PER_LONG,
2013 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02002014 i++, offset += BITS_PER_LONG) {
2015 unsigned long mask = *dirty_bitmap_buffer++;
2016 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
2017 if (!mask)
2018 continue;
2019
2020 mask &= atomic_long_fetch_andnot(mask, p);
2021
2022 /*
2023 * mask contains the bits that really have been cleared. This
2024 * never includes any bits beyond the length of the memslot (if
2025 * the length is not aligned to 64 pages), therefore it is not
2026 * a problem if userspace sets them in log->dirty_bitmap.
2027 */
2028 if (mask) {
Sean Christopherson0dff0842020-02-18 13:07:29 -08002029 flush = true;
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02002030 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2031 offset, mask);
2032 }
2033 }
Ben Gardon531810c2021-02-02 10:57:24 -08002034 KVM_MMU_UNLOCK(kvm);
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02002035
Sean Christopherson0dff0842020-02-18 13:07:29 -08002036 if (flush)
2037 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
2038
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02002039 return 0;
2040}
Sean Christopherson0dff0842020-02-18 13:07:29 -08002041
2042static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
2043 struct kvm_clear_dirty_log *log)
2044{
2045 int r;
2046
2047 mutex_lock(&kvm->slots_lock);
2048
2049 r = kvm_clear_dirty_log_protect(kvm, log);
2050
2051 mutex_unlock(&kvm->slots_lock);
2052 return r;
2053}
2054#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
Mario Smarduchba0513b2015-01-15 15:58:53 -08002055
Gleb Natapov49c77542010-10-18 15:22:23 +02002056struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
2057{
2058 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
2059}
Avi Kivitya1f4d3952010-06-21 11:44:20 +03002060EXPORT_SYMBOL_GPL(gfn_to_memslot);
Avi Kivity6aa8b732006-12-10 02:21:36 -08002061
Paolo Bonzini8e734852015-05-17 13:58:53 +02002062struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
2063{
David Matlackfe22ed82021-08-04 22:28:40 +00002064 struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
2065 struct kvm_memory_slot *slot;
2066 int slot_index;
2067
2068 slot = try_get_memslot(slots, vcpu->last_used_slot, gfn);
2069 if (slot)
2070 return slot;
2071
2072 /*
2073 * Fall back to searching all memslots. We purposely use
2074 * search_memslots() instead of __gfn_to_memslot() to avoid
2075 * thrashing the VM-wide last_used_index in kvm_memslots.
2076 */
2077 slot = search_memslots(slots, gfn, &slot_index);
2078 if (slot) {
2079 vcpu->last_used_slot = slot_index;
2080 return slot;
2081 }
2082
2083 return NULL;
Paolo Bonzini8e734852015-05-17 13:58:53 +02002084}
Paolo Bonzinie72436b2020-04-17 12:21:06 -04002085EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);
Paolo Bonzini8e734852015-05-17 13:58:53 +02002086
Yaowei Bai33e94152015-11-14 11:21:06 +08002087bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
Izik Eiduse0d62c72007-10-24 23:57:46 +02002088{
Xiao Guangrongbf3e05b2011-11-24 17:40:57 +08002089 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
Izik Eiduse0d62c72007-10-24 23:57:46 +02002090
Paolo Bonzinic36b7152020-04-16 09:48:07 -04002091 return kvm_is_visible_memslot(memslot);
Izik Eiduse0d62c72007-10-24 23:57:46 +02002092}
2093EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
2094
Vitaly Kuznetsov995decb2020-07-08 16:00:23 +02002095bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2096{
2097 struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2098
2099 return kvm_is_visible_memslot(memslot);
2100}
2101EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2102
Sean Christophersonf9b84e12020-01-08 12:24:37 -08002103unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
Joerg Roedel8f0b1ab2010-01-28 12:37:56 +01002104{
2105 struct vm_area_struct *vma;
2106 unsigned long addr, size;
2107
2108 size = PAGE_SIZE;
2109
Sean Christopherson42cde482020-01-08 12:24:38 -08002110 addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
Joerg Roedel8f0b1ab2010-01-28 12:37:56 +01002111 if (kvm_is_error_hva(addr))
2112 return PAGE_SIZE;
2113
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07002114 mmap_read_lock(current->mm);
Joerg Roedel8f0b1ab2010-01-28 12:37:56 +01002115 vma = find_vma(current->mm, addr);
2116 if (!vma)
2117 goto out;
2118
2119 size = vma_kernel_pagesize(vma);
2120
2121out:
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07002122 mmap_read_unlock(current->mm);
Joerg Roedel8f0b1ab2010-01-28 12:37:56 +01002123
2124 return size;
2125}
2126
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002127static bool memslot_is_readonly(struct kvm_memory_slot *slot)
2128{
2129 return slot->flags & KVM_MEM_READONLY;
2130}
2131
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002132static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2133 gfn_t *nr_pages, bool write)
Izik Eidus539cb662007-11-11 22:05:04 +02002134{
Marcelo Tosattibc6678a2009-12-23 14:35:21 -02002135 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
Xiao Guangrongca3a4902012-08-21 11:01:50 +08002136 return KVM_HVA_ERR_BAD;
Xiao Guangrong48987782010-08-22 19:11:43 +08002137
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002138 if (memslot_is_readonly(slot) && write)
2139 return KVM_HVA_ERR_RO_BAD;
Xiao Guangrong48987782010-08-22 19:11:43 +08002140
2141 if (nr_pages)
2142 *nr_pages = slot->npages - (gfn - slot->base_gfn);
2143
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002144 return __gfn_to_hva_memslot(slot, gfn);
Izik Eidus539cb662007-11-11 22:05:04 +02002145}
Xiao Guangrong48987782010-08-22 19:11:43 +08002146
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002147static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2148 gfn_t *nr_pages)
2149{
2150 return __gfn_to_hva_many(slot, gfn, nr_pages, true);
2151}
2152
2153unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
Stephen Hemminger79408762013-12-29 12:12:29 -08002154 gfn_t gfn)
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002155{
2156 return gfn_to_hva_many(slot, gfn, NULL);
2157}
2158EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
2159
Xiao Guangrong48987782010-08-22 19:11:43 +08002160unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2161{
Gleb Natapov49c77542010-10-18 15:22:23 +02002162 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
Xiao Guangrong48987782010-08-22 19:11:43 +08002163}
Sheng Yang0d150292008-04-25 21:44:50 +08002164EXPORT_SYMBOL_GPL(gfn_to_hva);
Izik Eidus539cb662007-11-11 22:05:04 +02002165
Paolo Bonzini8e734852015-05-17 13:58:53 +02002166unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2167{
2168 return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2169}
2170EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
2171
Xiao Guangrong86ab8cf2012-08-21 10:59:53 +08002172/*
Wei Yang970c0d42018-10-09 10:41:15 +08002173 * Return the hva of a @gfn and the R/W attribute if possible.
2174 *
2175 * @slot: the kvm_memory_slot which contains @gfn
2176 * @gfn: the gfn to be translated
2177 * @writable: used to return the read/write attribute of the @slot if the hva
2178 * is valid and @writable is not NULL
Xiao Guangrong86ab8cf2012-08-21 10:59:53 +08002179 */
Christoffer Dall64d83122014-08-19 12:15:00 +02002180unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2181 gfn_t gfn, bool *writable)
Gleb Natapov80300892010-10-19 18:13:41 +02002182{
Gleb Natapova2ac07f2013-10-01 19:58:36 +03002183 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2184
2185 if (!kvm_is_error_hva(hva) && writable)
Paolo Bonziniba6a3542013-09-09 13:52:33 +02002186 *writable = !memslot_is_readonly(slot);
2187
Gleb Natapova2ac07f2013-10-01 19:58:36 +03002188 return hva;
Xiao Guangrong86ab8cf2012-08-21 10:59:53 +08002189}
2190
Christoffer Dall64d83122014-08-19 12:15:00 +02002191unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2192{
2193 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2194
2195 return gfn_to_hva_memslot_prot(slot, gfn, writable);
2196}
2197
Paolo Bonzini8e734852015-05-17 13:58:53 +02002198unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
2199{
2200 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2201
2202 return gfn_to_hva_memslot_prot(slot, gfn, writable);
2203}
2204
Huang Yingfafc3db2011-01-30 11:15:49 +08002205static inline int check_user_page_hwpoison(unsigned long addr)
2206{
Lorenzo Stoakes0d731752016-10-24 10:57:25 +01002207 int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
Huang Yingfafc3db2011-01-30 11:15:49 +08002208
Lorenzo Stoakes0d731752016-10-24 10:57:25 +01002209 rc = get_user_pages(addr, 1, flags, NULL, NULL);
Huang Yingfafc3db2011-01-30 11:15:49 +08002210 return rc == -EHWPOISON;
2211}
2212
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002213/*
Paolo Bonzinib9b33da2018-07-27 17:44:41 +02002214 * The fast path to get the writable pfn which will be stored in @pfn,
2215 * true indicates success, otherwise false is returned. It's also the
Miaohe Lin311497e2019-12-11 14:26:25 +08002216 * only part that runs if we can in atomic context.
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002217 */
Paolo Bonzinib9b33da2018-07-27 17:44:41 +02002218static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2219 bool *writable, kvm_pfn_t *pfn)
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002220{
2221 struct page *page[1];
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002222
Xiao Guangrong12ce13f2012-08-21 11:00:49 +08002223 /*
2224 * Fast pin a writable pfn only if it is a write fault request
2225 * or the caller allows to map a writable pfn for a read fault
2226 * request.
2227 */
2228 if (!(write_fault || writable))
2229 return false;
2230
Souptick Joarderdadbb612020-06-07 21:40:55 -07002231 if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002232 *pfn = page_to_pfn(page[0]);
2233
2234 if (writable)
2235 *writable = true;
2236 return true;
2237 }
2238
2239 return false;
2240}
2241
2242/*
2243 * The slow path to get the pfn of the specified host virtual address,
2244 * 1 indicates success, -errno is returned if error is detected.
2245 */
2246static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
Dan Williamsba049e92016-01-15 16:56:11 -08002247 bool *writable, kvm_pfn_t *pfn)
Avi Kivity954bbbc2007-03-30 14:02:32 +03002248{
Al Viroce530532017-11-19 17:47:33 -05002249 unsigned int flags = FOLL_HWPOISON;
2250 struct page *page;
Gleb Natapovaf585b92010-10-14 11:22:46 +02002251 int npages = 0;
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002252
2253 might_sleep();
2254
2255 if (writable)
2256 *writable = write_fault;
2257
Al Viroce530532017-11-19 17:47:33 -05002258 if (write_fault)
2259 flags |= FOLL_WRITE;
2260 if (async)
2261 flags |= FOLL_NOWAIT;
Lorenzo Stoakesd4944b02016-10-13 01:20:12 +01002262
Al Viroce530532017-11-19 17:47:33 -05002263 npages = get_user_pages_unlocked(addr, 1, &page, flags);
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002264 if (npages != 1)
2265 return npages;
2266
2267 /* map read fault as writable if possible */
Xiao Guangrong12ce13f2012-08-21 11:00:49 +08002268 if (unlikely(!write_fault) && writable) {
Al Viroce530532017-11-19 17:47:33 -05002269 struct page *wpage;
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002270
Souptick Joarderdadbb612020-06-07 21:40:55 -07002271 if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002272 *writable = true;
Al Viroce530532017-11-19 17:47:33 -05002273 put_page(page);
2274 page = wpage;
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002275 }
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002276 }
Al Viroce530532017-11-19 17:47:33 -05002277 *pfn = page_to_pfn(page);
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002278 return npages;
2279}
2280
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002281static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2282{
2283 if (unlikely(!(vma->vm_flags & VM_READ)))
2284 return false;
2285
2286 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2287 return false;
2288
2289 return true;
2290}
2291
Nicholas Pigginf8be1562021-06-24 08:29:04 -04002292static int kvm_try_get_pfn(kvm_pfn_t pfn)
2293{
2294 if (kvm_is_reserved_pfn(pfn))
2295 return 1;
2296 return get_page_unless_zero(pfn_to_page(pfn));
2297}
2298
Paolo Bonzini92176a82016-06-07 16:22:47 +02002299static int hva_to_pfn_remapped(struct vm_area_struct *vma,
2300 unsigned long addr, bool *async,
KarimAllah Ahmeda340b3e2018-01-17 19:18:56 +01002301 bool write_fault, bool *writable,
2302 kvm_pfn_t *p_pfn)
Paolo Bonzini92176a82016-06-07 16:22:47 +02002303{
Sean Christophersona9545772021-02-08 12:19:40 -08002304 kvm_pfn_t pfn;
Paolo Bonzinibd2fae82021-02-01 05:12:11 -05002305 pte_t *ptep;
2306 spinlock_t *ptl;
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02002307 int r;
2308
Paolo Bonzini9fd6dad2021-02-05 05:07:11 -05002309 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02002310 if (r) {
2311 /*
2312 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
2313 * not call the fault handler, so do it here.
2314 */
2315 bool unlocked = false;
Peter Xu64019a22020-08-11 18:39:01 -07002316 r = fixup_user_fault(current->mm, addr,
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02002317 (write_fault ? FAULT_FLAG_WRITE : 0),
2318 &unlocked);
Paolo Bonzinia8387d02020-05-29 05:42:55 -04002319 if (unlocked)
2320 return -EAGAIN;
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02002321 if (r)
2322 return r;
2323
Paolo Bonzini9fd6dad2021-02-05 05:07:11 -05002324 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02002325 if (r)
2326 return r;
Paolo Bonzinibd2fae82021-02-01 05:12:11 -05002327 }
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02002328
Paolo Bonzinibd2fae82021-02-01 05:12:11 -05002329 if (write_fault && !pte_write(*ptep)) {
2330 pfn = KVM_PFN_ERR_RO_FAULT;
2331 goto out;
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02002332 }
2333
KarimAllah Ahmeda340b3e2018-01-17 19:18:56 +01002334 if (writable)
Paolo Bonzinibd2fae82021-02-01 05:12:11 -05002335 *writable = pte_write(*ptep);
2336 pfn = pte_pfn(*ptep);
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02002337
2338 /*
2339 * Get a reference here because callers of *hva_to_pfn* and
2340 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
2341 * returned pfn. This is only needed if the VMA has VM_MIXEDMAP
2342 * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
2343 * simply do nothing for reserved pfns.
2344 *
2345 * Whoever called remap_pfn_range is also going to call e.g.
2346 * unmap_mapping_range before the underlying pages are freed,
2347 * causing a call to our MMU notifier.
Nicholas Pigginf8be1562021-06-24 08:29:04 -04002348 *
2349 * Certain IO or PFNMAP mappings can be backed with valid
2350 * struct pages, but be allocated without refcounting e.g.,
2351 * tail pages of non-compound higher order allocations, which
2352 * would then underflow the refcount when the caller does the
2353 * required put_page. Don't allow those pages here.
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02002354 */
Nicholas Pigginf8be1562021-06-24 08:29:04 -04002355 if (!kvm_try_get_pfn(pfn))
2356 r = -EFAULT;
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02002357
Paolo Bonzinibd2fae82021-02-01 05:12:11 -05002358out:
2359 pte_unmap_unlock(ptep, ptl);
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02002360 *p_pfn = pfn;
Nicholas Pigginf8be1562021-06-24 08:29:04 -04002361
2362 return r;
Paolo Bonzini92176a82016-06-07 16:22:47 +02002363}
2364
Xiao Guangrong12ce13f2012-08-21 11:00:49 +08002365/*
2366 * Pin guest page in memory and return its pfn.
2367 * @addr: host virtual address which maps memory to the guest
2368 * @atomic: whether this function can sleep
2369 * @async: whether this function need to wait IO complete if the
2370 * host page is not in the memory
2371 * @write_fault: whether we should get a writable host page
2372 * @writable: whether it allows to map a writable host page for !@write_fault
2373 *
2374 * The function will map a writable host page for these two cases:
2375 * 1): @write_fault = true
2376 * 2): @write_fault = false && @writable, @writable will tell the caller
2377 * whether the mapping is writable.
2378 */
Dan Williamsba049e92016-01-15 16:56:11 -08002379static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002380 bool write_fault, bool *writable)
2381{
2382 struct vm_area_struct *vma;
Dan Williamsba049e92016-01-15 16:56:11 -08002383 kvm_pfn_t pfn = 0;
Paolo Bonzini92176a82016-06-07 16:22:47 +02002384 int npages, r;
Avi Kivity954bbbc2007-03-30 14:02:32 +03002385
Gleb Natapovaf585b92010-10-14 11:22:46 +02002386 /* we can do it either atomically or asynchronously, not both */
2387 BUG_ON(atomic && async);
2388
Paolo Bonzinib9b33da2018-07-27 17:44:41 +02002389 if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002390 return pfn;
Marcelo Tosatti612819c2010-10-22 14:18:18 -02002391
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002392 if (atomic)
2393 return KVM_PFN_ERR_FAULT;
Marcelo Tosatti612819c2010-10-22 14:18:18 -02002394
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002395 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
2396 if (npages == 1)
2397 return pfn;
Gleb Natapovaf585b92010-10-14 11:22:46 +02002398
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07002399 mmap_read_lock(current->mm);
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002400 if (npages == -EHWPOISON ||
2401 (!async && check_user_page_hwpoison(addr))) {
2402 pfn = KVM_PFN_ERR_HWPOISON;
2403 goto exit;
Xiao Guangrong887c08a2010-08-22 19:10:28 +08002404 }
Izik Eidus539cb662007-11-11 22:05:04 +02002405
Paolo Bonzinia8387d02020-05-29 05:42:55 -04002406retry:
Liam Howlettfc98c032021-06-28 19:39:17 -07002407 vma = vma_lookup(current->mm, addr);
Anthony Liguori8d4e1282007-10-18 09:59:34 -05002408
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002409 if (vma == NULL)
2410 pfn = KVM_PFN_ERR_FAULT;
Paolo Bonzini92176a82016-06-07 16:22:47 +02002411 else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
KarimAllah Ahmeda340b3e2018-01-17 19:18:56 +01002412 r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
Paolo Bonzinia8387d02020-05-29 05:42:55 -04002413 if (r == -EAGAIN)
2414 goto retry;
Paolo Bonzini92176a82016-06-07 16:22:47 +02002415 if (r < 0)
2416 pfn = KVM_PFN_ERR_FAULT;
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002417 } else {
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002418 if (async && vma_is_valid(vma, write_fault))
Xiao Guangrong2fc84312012-08-21 11:00:22 +08002419 *async = true;
2420 pfn = KVM_PFN_ERR_FAULT;
2421 }
2422exit:
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07002423 mmap_read_unlock(current->mm);
Anthony Liguori2e2e3732008-04-30 15:37:07 -05002424 return pfn;
Anthony Liguori35149e22008-04-02 14:46:56 -05002425}
2426
Dan Williamsba049e92016-01-15 16:56:11 -08002427kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
2428 bool atomic, bool *async, bool write_fault,
David Stevens4a42d842021-02-22 11:45:22 +09002429 bool *writable, hva_t *hva)
Xiao Guangrong887c08a2010-08-22 19:10:28 +08002430{
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002431 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
2432
David Stevens4a42d842021-02-22 11:45:22 +09002433 if (hva)
2434 *hva = addr;
2435
Paolo Bonzinib2740d32016-02-23 15:36:01 +01002436 if (addr == KVM_HVA_ERR_RO_BAD) {
2437 if (writable)
2438 *writable = false;
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002439 return KVM_PFN_ERR_RO_FAULT;
Paolo Bonzinib2740d32016-02-23 15:36:01 +01002440 }
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002441
Paolo Bonzinib2740d32016-02-23 15:36:01 +01002442 if (kvm_is_error_hva(addr)) {
2443 if (writable)
2444 *writable = false;
Xiao Guangrong81c52c52012-10-16 20:10:59 +08002445 return KVM_PFN_NOSLOT;
Paolo Bonzinib2740d32016-02-23 15:36:01 +01002446 }
Xiao Guangrong4d8b81a2012-08-21 11:02:51 +08002447
2448 /* Do not map writable pfn in the readonly memslot. */
2449 if (writable && memslot_is_readonly(slot)) {
2450 *writable = false;
2451 writable = NULL;
2452 }
2453
2454 return hva_to_pfn(addr, atomic, async, write_fault,
2455 writable);
Xiao Guangrong887c08a2010-08-22 19:10:28 +08002456}
Paolo Bonzini35204692015-04-02 11:20:48 +02002457EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
Xiao Guangrong887c08a2010-08-22 19:10:28 +08002458
Dan Williamsba049e92016-01-15 16:56:11 -08002459kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
Marcelo Tosatti612819c2010-10-22 14:18:18 -02002460 bool *writable)
2461{
Paolo Bonzinie37afc62015-05-19 16:09:04 +02002462 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
David Stevens4a42d842021-02-22 11:45:22 +09002463 write_fault, writable, NULL);
Marcelo Tosatti612819c2010-10-22 14:18:18 -02002464}
2465EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
2466
Dan Williamsba049e92016-01-15 16:56:11 -08002467kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
Marcelo Tosatti506f0d62009-12-23 14:35:19 -02002468{
David Stevens4a42d842021-02-22 11:45:22 +09002469 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
Marcelo Tosatti506f0d62009-12-23 14:35:19 -02002470}
Paolo Bonzinie37afc62015-05-19 16:09:04 +02002471EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
Marcelo Tosatti506f0d62009-12-23 14:35:19 -02002472
Dan Williamsba049e92016-01-15 16:56:11 -08002473kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
Xiao Guangrong037d92d2012-08-21 10:59:12 +08002474{
David Stevens4a42d842021-02-22 11:45:22 +09002475 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
Xiao Guangrong037d92d2012-08-21 10:59:12 +08002476}
2477EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
2478
Dan Williamsba049e92016-01-15 16:56:11 -08002479kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
Paolo Bonzini8e734852015-05-17 13:58:53 +02002480{
2481 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2482}
2483EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
2484
Dan Williamsba049e92016-01-15 16:56:11 -08002485kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
Paolo Bonzinie37afc62015-05-19 16:09:04 +02002486{
2487 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
2488}
2489EXPORT_SYMBOL_GPL(gfn_to_pfn);
2490
Dan Williamsba049e92016-01-15 16:56:11 -08002491kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
Paolo Bonzini8e734852015-05-17 13:58:53 +02002492{
2493 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2494}
2495EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
2496
Paolo Bonzinid9ef13c2015-05-19 16:01:50 +02002497int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2498 struct page **pages, int nr_pages)
Xiao Guangrong48987782010-08-22 19:11:43 +08002499{
2500 unsigned long addr;
Arnd Bergmann076b9252017-08-10 14:14:39 +02002501 gfn_t entry = 0;
Xiao Guangrong48987782010-08-22 19:11:43 +08002502
Paolo Bonzinid9ef13c2015-05-19 16:01:50 +02002503 addr = gfn_to_hva_many(slot, gfn, &entry);
Xiao Guangrong48987782010-08-22 19:11:43 +08002504 if (kvm_is_error_hva(addr))
2505 return -1;
2506
2507 if (entry < nr_pages)
2508 return 0;
2509
Souptick Joarderdadbb612020-06-07 21:40:55 -07002510 return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
Xiao Guangrong48987782010-08-22 19:11:43 +08002511}
2512EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
2513
Dan Williamsba049e92016-01-15 16:56:11 -08002514static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
Xiao Guangronga2766322012-07-26 11:58:59 +08002515{
Xiao Guangrong81c52c52012-10-16 20:10:59 +08002516 if (is_error_noslot_pfn(pfn))
Xiao Guangrong6cede2e2012-08-03 15:41:22 +08002517 return KVM_ERR_PTR_BAD_PAGE;
Xiao Guangronga2766322012-07-26 11:58:59 +08002518
Ard Biesheuvelbf4bea82014-11-10 08:33:56 +00002519 if (kvm_is_reserved_pfn(pfn)) {
Xiao Guangrongcb9aaa32012-08-03 15:42:10 +08002520 WARN_ON(1);
2521 return KVM_ERR_PTR_BAD_PAGE;
2522 }
2523
Xiao Guangronga2766322012-07-26 11:58:59 +08002524 return pfn_to_page(pfn);
2525}
2526
Anthony Liguori35149e22008-04-02 14:46:56 -05002527struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
2528{
Dan Williamsba049e92016-01-15 16:56:11 -08002529 kvm_pfn_t pfn;
Anthony Liguori2e2e3732008-04-30 15:37:07 -05002530
2531 pfn = gfn_to_pfn(kvm, gfn);
Anthony Liguori2e2e3732008-04-30 15:37:07 -05002532
Xiao Guangronga2766322012-07-26 11:58:59 +08002533 return kvm_pfn_to_page(pfn);
Avi Kivity954bbbc2007-03-30 14:02:32 +03002534}
2535EXPORT_SYMBOL_GPL(gfn_to_page);
2536
Boris Ostrovsky91724812019-12-05 01:30:51 +00002537void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache)
2538{
2539 if (pfn == 0)
2540 return;
2541
2542 if (cache)
2543 cache->pfn = cache->gfn = 0;
2544
2545 if (dirty)
2546 kvm_release_pfn_dirty(pfn);
2547 else
2548 kvm_release_pfn_clean(pfn);
2549}
2550
2551static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
2552 struct gfn_to_pfn_cache *cache, u64 gen)
2553{
2554 kvm_release_pfn(cache->pfn, cache->dirty, cache);
2555
2556 cache->pfn = gfn_to_pfn_memslot(slot, gfn);
2557 cache->gfn = gfn;
2558 cache->dirty = false;
2559 cache->generation = gen;
2560}
2561
Boris Ostrovsky1eff70a2019-11-12 16:35:06 +00002562static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
Boris Ostrovsky91724812019-12-05 01:30:51 +00002563 struct kvm_host_map *map,
2564 struct gfn_to_pfn_cache *cache,
2565 bool atomic)
KarimAllah Ahmede45adf62019-01-31 21:24:34 +01002566{
2567 kvm_pfn_t pfn;
2568 void *hva = NULL;
2569 struct page *page = KVM_UNMAPPED_PAGE;
Boris Ostrovsky1eff70a2019-11-12 16:35:06 +00002570 struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn);
Boris Ostrovsky91724812019-12-05 01:30:51 +00002571 u64 gen = slots->generation;
KarimAllah Ahmede45adf62019-01-31 21:24:34 +01002572
2573 if (!map)
2574 return -EINVAL;
2575
Boris Ostrovsky91724812019-12-05 01:30:51 +00002576 if (cache) {
2577 if (!cache->pfn || cache->gfn != gfn ||
2578 cache->generation != gen) {
2579 if (atomic)
2580 return -EAGAIN;
2581 kvm_cache_gfn_to_pfn(slot, gfn, cache, gen);
2582 }
2583 pfn = cache->pfn;
2584 } else {
2585 if (atomic)
2586 return -EAGAIN;
2587 pfn = gfn_to_pfn_memslot(slot, gfn);
2588 }
KarimAllah Ahmede45adf62019-01-31 21:24:34 +01002589 if (is_error_noslot_pfn(pfn))
2590 return -EINVAL;
2591
2592 if (pfn_valid(pfn)) {
2593 page = pfn_to_page(pfn);
Boris Ostrovsky91724812019-12-05 01:30:51 +00002594 if (atomic)
2595 hva = kmap_atomic(page);
2596 else
2597 hva = kmap(page);
Paolo Bonzinid30b2142019-05-20 12:06:36 +02002598#ifdef CONFIG_HAS_IOMEM
Boris Ostrovsky91724812019-12-05 01:30:51 +00002599 } else if (!atomic) {
KarimAllah Ahmede45adf62019-01-31 21:24:34 +01002600 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
Boris Ostrovsky91724812019-12-05 01:30:51 +00002601 } else {
2602 return -EINVAL;
Paolo Bonzinid30b2142019-05-20 12:06:36 +02002603#endif
KarimAllah Ahmede45adf62019-01-31 21:24:34 +01002604 }
2605
2606 if (!hva)
2607 return -EFAULT;
2608
2609 map->page = page;
2610 map->hva = hva;
2611 map->pfn = pfn;
2612 map->gfn = gfn;
2613
2614 return 0;
2615}
2616
Boris Ostrovsky91724812019-12-05 01:30:51 +00002617int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
2618 struct gfn_to_pfn_cache *cache, bool atomic)
Boris Ostrovsky1eff70a2019-11-12 16:35:06 +00002619{
Boris Ostrovsky91724812019-12-05 01:30:51 +00002620 return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map,
2621 cache, atomic);
Boris Ostrovsky1eff70a2019-11-12 16:35:06 +00002622}
2623EXPORT_SYMBOL_GPL(kvm_map_gfn);
2624
KarimAllah Ahmede45adf62019-01-31 21:24:34 +01002625int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
2626{
Boris Ostrovsky91724812019-12-05 01:30:51 +00002627 return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map,
2628 NULL, false);
KarimAllah Ahmede45adf62019-01-31 21:24:34 +01002629}
2630EXPORT_SYMBOL_GPL(kvm_vcpu_map);
2631
Peter Xu28bd7262020-09-30 21:20:34 -04002632static void __kvm_unmap_gfn(struct kvm *kvm,
2633 struct kvm_memory_slot *memslot,
Boris Ostrovsky91724812019-12-05 01:30:51 +00002634 struct kvm_host_map *map,
2635 struct gfn_to_pfn_cache *cache,
2636 bool dirty, bool atomic)
KarimAllah Ahmede45adf62019-01-31 21:24:34 +01002637{
2638 if (!map)
2639 return;
2640
2641 if (!map->hva)
2642 return;
2643
Boris Ostrovsky91724812019-12-05 01:30:51 +00002644 if (map->page != KVM_UNMAPPED_PAGE) {
2645 if (atomic)
2646 kunmap_atomic(map->hva);
2647 else
2648 kunmap(map->page);
2649 }
Christian Borntraegereb1f2f32019-05-27 10:28:25 +02002650#ifdef CONFIG_HAS_IOMEM
Boris Ostrovsky91724812019-12-05 01:30:51 +00002651 else if (!atomic)
KarimAllah Ahmede45adf62019-01-31 21:24:34 +01002652 memunmap(map->hva);
Boris Ostrovsky91724812019-12-05 01:30:51 +00002653 else
2654 WARN_ONCE(1, "Unexpected unmapping in atomic context");
Christian Borntraegereb1f2f32019-05-27 10:28:25 +02002655#endif
KarimAllah Ahmede45adf62019-01-31 21:24:34 +01002656
Boris Ostrovsky91724812019-12-05 01:30:51 +00002657 if (dirty)
Peter Xu28bd7262020-09-30 21:20:34 -04002658 mark_page_dirty_in_slot(kvm, memslot, map->gfn);
Boris Ostrovsky91724812019-12-05 01:30:51 +00002659
2660 if (cache)
2661 cache->dirty |= dirty;
2662 else
2663 kvm_release_pfn(map->pfn, dirty, NULL);
KarimAllah Ahmede45adf62019-01-31 21:24:34 +01002664
2665 map->hva = NULL;
2666 map->page = NULL;
2667}
Boris Ostrovsky1eff70a2019-11-12 16:35:06 +00002668
Boris Ostrovsky91724812019-12-05 01:30:51 +00002669int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
2670 struct gfn_to_pfn_cache *cache, bool dirty, bool atomic)
Boris Ostrovsky1eff70a2019-11-12 16:35:06 +00002671{
Peter Xu28bd7262020-09-30 21:20:34 -04002672 __kvm_unmap_gfn(vcpu->kvm, gfn_to_memslot(vcpu->kvm, map->gfn), map,
Boris Ostrovsky91724812019-12-05 01:30:51 +00002673 cache, dirty, atomic);
Boris Ostrovsky1eff70a2019-11-12 16:35:06 +00002674 return 0;
2675}
2676EXPORT_SYMBOL_GPL(kvm_unmap_gfn);
2677
2678void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
2679{
Peter Xu28bd7262020-09-30 21:20:34 -04002680 __kvm_unmap_gfn(vcpu->kvm, kvm_vcpu_gfn_to_memslot(vcpu, map->gfn),
2681 map, NULL, dirty, false);
Boris Ostrovsky1eff70a2019-11-12 16:35:06 +00002682}
KarimAllah Ahmede45adf62019-01-31 21:24:34 +01002683EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
2684
Paolo Bonzini8e734852015-05-17 13:58:53 +02002685struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2686{
Dan Williamsba049e92016-01-15 16:56:11 -08002687 kvm_pfn_t pfn;
Paolo Bonzini8e734852015-05-17 13:58:53 +02002688
2689 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
2690
2691 return kvm_pfn_to_page(pfn);
2692}
2693EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
2694
Izik Eidusb4231d62007-11-20 11:49:33 +02002695void kvm_release_page_clean(struct page *page)
2696{
Xiao Guangrong32cad842012-08-03 15:42:52 +08002697 WARN_ON(is_error_page(page));
2698
Anthony Liguori35149e22008-04-02 14:46:56 -05002699 kvm_release_pfn_clean(page_to_pfn(page));
Izik Eidusb4231d62007-11-20 11:49:33 +02002700}
2701EXPORT_SYMBOL_GPL(kvm_release_page_clean);
2702
Dan Williamsba049e92016-01-15 16:56:11 -08002703void kvm_release_pfn_clean(kvm_pfn_t pfn)
Anthony Liguori35149e22008-04-02 14:46:56 -05002704{
Ard Biesheuvelbf4bea82014-11-10 08:33:56 +00002705 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
Anthony Liguori2e2e3732008-04-30 15:37:07 -05002706 put_page(pfn_to_page(pfn));
Anthony Liguori35149e22008-04-02 14:46:56 -05002707}
2708EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
2709
Izik Eidusb4231d62007-11-20 11:49:33 +02002710void kvm_release_page_dirty(struct page *page)
Izik Eidus8a7ae052007-10-18 11:09:33 +02002711{
Xiao Guangronga2766322012-07-26 11:58:59 +08002712 WARN_ON(is_error_page(page));
2713
Anthony Liguori35149e22008-04-02 14:46:56 -05002714 kvm_release_pfn_dirty(page_to_pfn(page));
Izik Eidus8a7ae052007-10-18 11:09:33 +02002715}
Izik Eidusb4231d62007-11-20 11:49:33 +02002716EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
Izik Eidus8a7ae052007-10-18 11:09:33 +02002717
David Hildenbrandf7a65092017-09-01 17:11:43 +02002718void kvm_release_pfn_dirty(kvm_pfn_t pfn)
Anthony Liguori35149e22008-04-02 14:46:56 -05002719{
2720 kvm_set_pfn_dirty(pfn);
2721 kvm_release_pfn_clean(pfn);
2722}
David Hildenbrandf7a65092017-09-01 17:11:43 +02002723EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
Anthony Liguori35149e22008-04-02 14:46:56 -05002724
Dan Williamsba049e92016-01-15 16:56:11 -08002725void kvm_set_pfn_dirty(kvm_pfn_t pfn)
Anthony Liguori35149e22008-04-02 14:46:56 -05002726{
Miaohe Lind29c03a2019-12-05 11:05:05 +08002727 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2728 SetPageDirty(pfn_to_page(pfn));
Anthony Liguori35149e22008-04-02 14:46:56 -05002729}
2730EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
2731
Dan Williamsba049e92016-01-15 16:56:11 -08002732void kvm_set_pfn_accessed(kvm_pfn_t pfn)
Anthony Liguori35149e22008-04-02 14:46:56 -05002733{
Sean Christophersona78986a2019-11-11 14:12:27 -08002734 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
Anthony Liguori2e2e3732008-04-30 15:37:07 -05002735 mark_page_accessed(pfn_to_page(pfn));
Anthony Liguori35149e22008-04-02 14:46:56 -05002736}
2737EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
2738
Dan Williamsba049e92016-01-15 16:56:11 -08002739void kvm_get_pfn(kvm_pfn_t pfn)
Anthony Liguori35149e22008-04-02 14:46:56 -05002740{
Ard Biesheuvelbf4bea82014-11-10 08:33:56 +00002741 if (!kvm_is_reserved_pfn(pfn))
Anthony Liguori2e2e3732008-04-30 15:37:07 -05002742 get_page(pfn_to_page(pfn));
Anthony Liguori35149e22008-04-02 14:46:56 -05002743}
2744EXPORT_SYMBOL_GPL(kvm_get_pfn);
2745
Izik Eidus195aefd2007-10-01 22:14:18 +02002746static int next_segment(unsigned long len, int offset)
2747{
2748 if (len > PAGE_SIZE - offset)
2749 return PAGE_SIZE - offset;
2750 else
2751 return len;
2752}
2753
Paolo Bonzini8e734852015-05-17 13:58:53 +02002754static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
2755 void *data, int offset, int len)
Izik Eidus195aefd2007-10-01 22:14:18 +02002756{
Izik Eiduse0506bc2007-11-11 22:10:22 +02002757 int r;
2758 unsigned long addr;
Izik Eidus195aefd2007-10-01 22:14:18 +02002759
Paolo Bonzini8e734852015-05-17 13:58:53 +02002760 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
Izik Eiduse0506bc2007-11-11 22:10:22 +02002761 if (kvm_is_error_hva(addr))
Izik Eidus195aefd2007-10-01 22:14:18 +02002762 return -EFAULT;
Paolo Bonzini3180a7f2015-04-02 14:08:20 +02002763 r = __copy_from_user(data, (void __user *)addr + offset, len);
Izik Eiduse0506bc2007-11-11 22:10:22 +02002764 if (r)
2765 return -EFAULT;
Izik Eidus195aefd2007-10-01 22:14:18 +02002766 return 0;
2767}
Paolo Bonzini8e734852015-05-17 13:58:53 +02002768
2769int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
2770 int len)
2771{
2772 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2773
2774 return __kvm_read_guest_page(slot, gfn, data, offset, len);
2775}
Izik Eidus195aefd2007-10-01 22:14:18 +02002776EXPORT_SYMBOL_GPL(kvm_read_guest_page);
2777
Paolo Bonzini8e734852015-05-17 13:58:53 +02002778int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
2779 int offset, int len)
2780{
2781 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2782
2783 return __kvm_read_guest_page(slot, gfn, data, offset, len);
2784}
2785EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
2786
Izik Eidus195aefd2007-10-01 22:14:18 +02002787int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
2788{
2789 gfn_t gfn = gpa >> PAGE_SHIFT;
2790 int seg;
2791 int offset = offset_in_page(gpa);
2792 int ret;
2793
2794 while ((seg = next_segment(len, offset)) != 0) {
2795 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
2796 if (ret < 0)
2797 return ret;
2798 offset = 0;
2799 len -= seg;
2800 data += seg;
2801 ++gfn;
2802 }
2803 return 0;
2804}
2805EXPORT_SYMBOL_GPL(kvm_read_guest);
2806
Paolo Bonzini8e734852015-05-17 13:58:53 +02002807int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
2808{
2809 gfn_t gfn = gpa >> PAGE_SHIFT;
2810 int seg;
2811 int offset = offset_in_page(gpa);
2812 int ret;
2813
2814 while ((seg = next_segment(len, offset)) != 0) {
2815 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
2816 if (ret < 0)
2817 return ret;
2818 offset = 0;
2819 len -= seg;
2820 data += seg;
2821 ++gfn;
2822 }
2823 return 0;
2824}
2825EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
2826
2827static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2828 void *data, int offset, unsigned long len)
Marcelo Tosatti7ec54582007-12-20 19:18:23 -05002829{
2830 int r;
2831 unsigned long addr;
Marcelo Tosatti7ec54582007-12-20 19:18:23 -05002832
Paolo Bonzini8e734852015-05-17 13:58:53 +02002833 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
Marcelo Tosatti7ec54582007-12-20 19:18:23 -05002834 if (kvm_is_error_hva(addr))
2835 return -EFAULT;
Andrea Arcangeli0aac03f2008-01-30 19:57:35 +01002836 pagefault_disable();
Paolo Bonzini3180a7f2015-04-02 14:08:20 +02002837 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
Andrea Arcangeli0aac03f2008-01-30 19:57:35 +01002838 pagefault_enable();
Marcelo Tosatti7ec54582007-12-20 19:18:23 -05002839 if (r)
2840 return -EFAULT;
2841 return 0;
2842}
Marcelo Tosatti7ec54582007-12-20 19:18:23 -05002843
Paolo Bonzini8e734852015-05-17 13:58:53 +02002844int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
2845 void *data, unsigned long len)
2846{
2847 gfn_t gfn = gpa >> PAGE_SHIFT;
2848 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2849 int offset = offset_in_page(gpa);
2850
2851 return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
2852}
2853EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
2854
Peter Xu28bd7262020-09-30 21:20:34 -04002855static int __kvm_write_guest_page(struct kvm *kvm,
2856 struct kvm_memory_slot *memslot, gfn_t gfn,
Paolo Bonzini8e734852015-05-17 13:58:53 +02002857 const void *data, int offset, int len)
Izik Eidus195aefd2007-10-01 22:14:18 +02002858{
Izik Eiduse0506bc2007-11-11 22:10:22 +02002859 int r;
2860 unsigned long addr;
Izik Eidus195aefd2007-10-01 22:14:18 +02002861
Radim Krčmář251eb842015-04-10 21:47:27 +02002862 addr = gfn_to_hva_memslot(memslot, gfn);
Izik Eiduse0506bc2007-11-11 22:10:22 +02002863 if (kvm_is_error_hva(addr))
Izik Eidus195aefd2007-10-01 22:14:18 +02002864 return -EFAULT;
Xiao Guangrong8b0cedf2011-05-15 23:22:04 +08002865 r = __copy_to_user((void __user *)addr + offset, data, len);
Izik Eiduse0506bc2007-11-11 22:10:22 +02002866 if (r)
2867 return -EFAULT;
Peter Xu28bd7262020-09-30 21:20:34 -04002868 mark_page_dirty_in_slot(kvm, memslot, gfn);
Izik Eidus195aefd2007-10-01 22:14:18 +02002869 return 0;
2870}
Paolo Bonzini8e734852015-05-17 13:58:53 +02002871
2872int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
2873 const void *data, int offset, int len)
2874{
2875 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2876
Peter Xu28bd7262020-09-30 21:20:34 -04002877 return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
Paolo Bonzini8e734852015-05-17 13:58:53 +02002878}
Izik Eidus195aefd2007-10-01 22:14:18 +02002879EXPORT_SYMBOL_GPL(kvm_write_guest_page);
2880
Paolo Bonzini8e734852015-05-17 13:58:53 +02002881int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
2882 const void *data, int offset, int len)
2883{
2884 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2885
Peter Xu28bd7262020-09-30 21:20:34 -04002886 return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
Paolo Bonzini8e734852015-05-17 13:58:53 +02002887}
2888EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
2889
Izik Eidus195aefd2007-10-01 22:14:18 +02002890int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
2891 unsigned long len)
2892{
2893 gfn_t gfn = gpa >> PAGE_SHIFT;
2894 int seg;
2895 int offset = offset_in_page(gpa);
2896 int ret;
2897
2898 while ((seg = next_segment(len, offset)) != 0) {
2899 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
2900 if (ret < 0)
2901 return ret;
2902 offset = 0;
2903 len -= seg;
2904 data += seg;
2905 ++gfn;
2906 }
2907 return 0;
2908}
Wincy Vanff651cb2014-12-11 08:52:58 +03002909EXPORT_SYMBOL_GPL(kvm_write_guest);
Izik Eidus195aefd2007-10-01 22:14:18 +02002910
Paolo Bonzini8e734852015-05-17 13:58:53 +02002911int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
2912 unsigned long len)
2913{
2914 gfn_t gfn = gpa >> PAGE_SHIFT;
2915 int seg;
2916 int offset = offset_in_page(gpa);
2917 int ret;
2918
2919 while ((seg = next_segment(len, offset)) != 0) {
2920 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
2921 if (ret < 0)
2922 return ret;
2923 offset = 0;
2924 len -= seg;
2925 data += seg;
2926 ++gfn;
2927 }
2928 return 0;
2929}
2930EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
2931
Paolo Bonzini5a2d4362017-02-03 20:32:28 -08002932static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
2933 struct gfn_to_hva_cache *ghc,
2934 gpa_t gpa, unsigned long len)
Gleb Natapov49c77542010-10-18 15:22:23 +02002935{
Gleb Natapov49c77542010-10-18 15:22:23 +02002936 int offset = offset_in_page(gpa);
Andrew Honig8f964522013-03-29 09:35:21 -07002937 gfn_t start_gfn = gpa >> PAGE_SHIFT;
2938 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
2939 gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
2940 gfn_t nr_pages_avail;
Gleb Natapov49c77542010-10-18 15:22:23 +02002941
Sean Christopherson6ad1e292020-01-09 14:58:55 -05002942 /* Update ghc->generation before performing any error checks. */
Gleb Natapov49c77542010-10-18 15:22:23 +02002943 ghc->generation = slots->generation;
Sean Christopherson6ad1e292020-01-09 14:58:55 -05002944
2945 if (start_gfn > end_gfn) {
2946 ghc->hva = KVM_HVA_ERR_BAD;
2947 return -EINVAL;
2948 }
Jim Mattsonf1b9dd52018-12-17 13:53:33 -08002949
2950 /*
2951 * If the requested region crosses two memslots, we still
2952 * verify that the entire region is valid here.
2953 */
Sean Christopherson6ad1e292020-01-09 14:58:55 -05002954 for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
Jim Mattsonf1b9dd52018-12-17 13:53:33 -08002955 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2956 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
2957 &nr_pages_avail);
2958 if (kvm_is_error_hva(ghc->hva))
Sean Christopherson6ad1e292020-01-09 14:58:55 -05002959 return -EFAULT;
Andrew Honig8f964522013-03-29 09:35:21 -07002960 }
Jim Mattsonf1b9dd52018-12-17 13:53:33 -08002961
2962 /* Use the slow path for cross page reads and writes. */
Sean Christopherson6ad1e292020-01-09 14:58:55 -05002963 if (nr_pages_needed == 1)
Jim Mattsonf1b9dd52018-12-17 13:53:33 -08002964 ghc->hva += offset;
2965 else
2966 ghc->memslot = NULL;
2967
Sean Christopherson6ad1e292020-01-09 14:58:55 -05002968 ghc->gpa = gpa;
2969 ghc->len = len;
2970 return 0;
Gleb Natapov49c77542010-10-18 15:22:23 +02002971}
Paolo Bonzini5a2d4362017-02-03 20:32:28 -08002972
Paolo Bonzini4e335d92017-05-02 16:20:18 +02002973int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
Paolo Bonzini5a2d4362017-02-03 20:32:28 -08002974 gpa_t gpa, unsigned long len)
2975{
Paolo Bonzini4e335d92017-05-02 16:20:18 +02002976 struct kvm_memslots *slots = kvm_memslots(kvm);
Paolo Bonzini5a2d4362017-02-03 20:32:28 -08002977 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
2978}
Paolo Bonzini4e335d92017-05-02 16:20:18 +02002979EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
Gleb Natapov49c77542010-10-18 15:22:23 +02002980
Paolo Bonzini4e335d92017-05-02 16:20:18 +02002981int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
Jim Mattson7a86dab2018-12-14 14:34:43 -08002982 void *data, unsigned int offset,
2983 unsigned long len)
Gleb Natapov49c77542010-10-18 15:22:23 +02002984{
Paolo Bonzini4e335d92017-05-02 16:20:18 +02002985 struct kvm_memslots *slots = kvm_memslots(kvm);
Gleb Natapov49c77542010-10-18 15:22:23 +02002986 int r;
Pan Xinhui4ec6e862016-11-02 05:08:34 -04002987 gpa_t gpa = ghc->gpa + offset;
Gleb Natapov49c77542010-10-18 15:22:23 +02002988
Pan Xinhui4ec6e862016-11-02 05:08:34 -04002989 BUG_ON(len + offset > ghc->len);
Andrew Honig8f964522013-03-29 09:35:21 -07002990
Sean Christophersondc9ce712020-01-09 15:56:20 -08002991 if (slots->generation != ghc->generation) {
2992 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2993 return -EFAULT;
2994 }
Andrew Honig8f964522013-03-29 09:35:21 -07002995
Gleb Natapov49c77542010-10-18 15:22:23 +02002996 if (kvm_is_error_hva(ghc->hva))
2997 return -EFAULT;
2998
Sean Christophersonfcfbc612020-01-09 15:56:18 -08002999 if (unlikely(!ghc->memslot))
3000 return kvm_write_guest(kvm, gpa, data, len);
3001
Pan Xinhui4ec6e862016-11-02 05:08:34 -04003002 r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
Gleb Natapov49c77542010-10-18 15:22:23 +02003003 if (r)
3004 return -EFAULT;
Peter Xu28bd7262020-09-30 21:20:34 -04003005 mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
Gleb Natapov49c77542010-10-18 15:22:23 +02003006
3007 return 0;
3008}
Paolo Bonzini4e335d92017-05-02 16:20:18 +02003009EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
Pan Xinhui4ec6e862016-11-02 05:08:34 -04003010
Paolo Bonzini4e335d92017-05-02 16:20:18 +02003011int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3012 void *data, unsigned long len)
Pan Xinhui4ec6e862016-11-02 05:08:34 -04003013{
Paolo Bonzini4e335d92017-05-02 16:20:18 +02003014 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
Pan Xinhui4ec6e862016-11-02 05:08:34 -04003015}
Paolo Bonzini4e335d92017-05-02 16:20:18 +02003016EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
Gleb Natapov49c77542010-10-18 15:22:23 +02003017
Vitaly Kuznetsov0958f0c2020-05-25 16:41:19 +02003018int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3019 void *data, unsigned int offset,
3020 unsigned long len)
Gleb Natapove03b6442011-07-11 15:28:11 -04003021{
Paolo Bonzini4e335d92017-05-02 16:20:18 +02003022 struct kvm_memslots *slots = kvm_memslots(kvm);
Gleb Natapove03b6442011-07-11 15:28:11 -04003023 int r;
Vitaly Kuznetsov0958f0c2020-05-25 16:41:19 +02003024 gpa_t gpa = ghc->gpa + offset;
Gleb Natapove03b6442011-07-11 15:28:11 -04003025
Vitaly Kuznetsov0958f0c2020-05-25 16:41:19 +02003026 BUG_ON(len + offset > ghc->len);
Andrew Honig8f964522013-03-29 09:35:21 -07003027
Sean Christophersondc9ce712020-01-09 15:56:20 -08003028 if (slots->generation != ghc->generation) {
3029 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3030 return -EFAULT;
3031 }
Andrew Honig8f964522013-03-29 09:35:21 -07003032
Gleb Natapove03b6442011-07-11 15:28:11 -04003033 if (kvm_is_error_hva(ghc->hva))
3034 return -EFAULT;
3035
Sean Christophersonfcfbc612020-01-09 15:56:18 -08003036 if (unlikely(!ghc->memslot))
Vitaly Kuznetsov0958f0c2020-05-25 16:41:19 +02003037 return kvm_read_guest(kvm, gpa, data, len);
Sean Christophersonfcfbc612020-01-09 15:56:18 -08003038
Vitaly Kuznetsov0958f0c2020-05-25 16:41:19 +02003039 r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
Gleb Natapove03b6442011-07-11 15:28:11 -04003040 if (r)
3041 return -EFAULT;
3042
3043 return 0;
3044}
Vitaly Kuznetsov0958f0c2020-05-25 16:41:19 +02003045EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
3046
3047int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3048 void *data, unsigned long len)
3049{
3050 return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
3051}
Paolo Bonzini4e335d92017-05-02 16:20:18 +02003052EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
Gleb Natapove03b6442011-07-11 15:28:11 -04003053
Izik Eidus195aefd2007-10-01 22:14:18 +02003054int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
3055{
Paolo Bonzini2f541442020-11-06 05:25:09 -05003056 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
Izik Eidus195aefd2007-10-01 22:14:18 +02003057 gfn_t gfn = gpa >> PAGE_SHIFT;
3058 int seg;
3059 int offset = offset_in_page(gpa);
3060 int ret;
3061
Kevin Mulveybfda0e82015-02-20 08:21:36 -05003062 while ((seg = next_segment(len, offset)) != 0) {
Paolo Bonzini2f541442020-11-06 05:25:09 -05003063 ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, len);
Izik Eidus195aefd2007-10-01 22:14:18 +02003064 if (ret < 0)
3065 return ret;
3066 offset = 0;
3067 len -= seg;
3068 ++gfn;
3069 }
3070 return 0;
3071}
3072EXPORT_SYMBOL_GPL(kvm_clear_guest);
3073
Peter Xu28bd7262020-09-30 21:20:34 -04003074void mark_page_dirty_in_slot(struct kvm *kvm,
3075 struct kvm_memory_slot *memslot,
3076 gfn_t gfn)
Avi Kivity6aa8b732006-12-10 02:21:36 -08003077{
Peter Xu044c59c2020-09-30 21:22:26 -04003078 if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
Rusty Russell7e9d6192007-07-31 20:41:14 +10003079 unsigned long rel_gfn = gfn - memslot->base_gfn;
Peter Xufb04a1e2020-09-30 21:22:22 -04003080 u32 slot = (memslot->as_id << 16) | memslot->id;
Avi Kivity6aa8b732006-12-10 02:21:36 -08003081
Peter Xufb04a1e2020-09-30 21:22:22 -04003082 if (kvm->dirty_ring_size)
3083 kvm_dirty_ring_push(kvm_dirty_ring_get(kvm),
3084 slot, rel_gfn);
3085 else
3086 set_bit_le(rel_gfn, memslot->dirty_bitmap);
Avi Kivity6aa8b732006-12-10 02:21:36 -08003087 }
3088}
Ben Gardona6a0b052020-10-14 11:26:55 -07003089EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
Avi Kivity6aa8b732006-12-10 02:21:36 -08003090
Gleb Natapov49c77542010-10-18 15:22:23 +02003091void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
3092{
3093 struct kvm_memory_slot *memslot;
3094
3095 memslot = gfn_to_memslot(kvm, gfn);
Peter Xu28bd7262020-09-30 21:20:34 -04003096 mark_page_dirty_in_slot(kvm, memslot, gfn);
Gleb Natapov49c77542010-10-18 15:22:23 +02003097}
Aneesh Kumar K.V2ba9f0d2013-10-07 22:17:59 +05303098EXPORT_SYMBOL_GPL(mark_page_dirty);
Gleb Natapov49c77542010-10-18 15:22:23 +02003099
Paolo Bonzini8e734852015-05-17 13:58:53 +02003100void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3101{
3102 struct kvm_memory_slot *memslot;
3103
3104 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
Peter Xu28bd7262020-09-30 21:20:34 -04003105 mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
Paolo Bonzini8e734852015-05-17 13:58:53 +02003106}
3107EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
3108
Jan H. Schönherr20b70352017-11-24 22:39:01 +01003109void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3110{
3111 if (!vcpu->sigset_active)
3112 return;
3113
3114 /*
3115 * This does a lockless modification of ->real_blocked, which is fine
3116 * because, only current can change ->real_blocked and all readers of
3117 * ->real_blocked don't care as long ->real_blocked is always a subset
3118 * of ->blocked.
3119 */
3120 sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3121}
3122
3123void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3124{
3125 if (!vcpu->sigset_active)
3126 return;
3127
3128 sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
3129 sigemptyset(&current->real_blocked);
3130}
3131
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003132static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3133{
Nir Weinerdee339b2019-01-27 12:17:16 +02003134 unsigned int old, val, grow, grow_start;
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003135
Wanpeng Li2cbd7822015-09-03 22:07:39 +08003136 old = val = vcpu->halt_poll_ns;
Nir Weinerdee339b2019-01-27 12:17:16 +02003137 grow_start = READ_ONCE(halt_poll_ns_grow_start);
Christian Borntraeger6b6de682016-02-09 13:47:55 +01003138 grow = READ_ONCE(halt_poll_ns_grow);
Nir Weiner7fa08e72019-01-27 12:17:14 +02003139 if (!grow)
3140 goto out;
3141
Nir Weinerdee339b2019-01-27 12:17:16 +02003142 val *= grow;
3143 if (val < grow_start)
3144 val = grow_start;
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003145
David Matlack258785e2021-05-06 15:24:43 +00003146 if (val > vcpu->kvm->max_halt_poll_ns)
3147 val = vcpu->kvm->max_halt_poll_ns;
David Matlack313f6362016-03-08 16:19:44 -08003148
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003149 vcpu->halt_poll_ns = val;
Nir Weiner7fa08e72019-01-27 12:17:14 +02003150out:
Wanpeng Li2cbd7822015-09-03 22:07:39 +08003151 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003152}
3153
3154static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3155{
Christian Borntraeger6b6de682016-02-09 13:47:55 +01003156 unsigned int old, val, shrink;
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003157
Wanpeng Li2cbd7822015-09-03 22:07:39 +08003158 old = val = vcpu->halt_poll_ns;
Christian Borntraeger6b6de682016-02-09 13:47:55 +01003159 shrink = READ_ONCE(halt_poll_ns_shrink);
3160 if (shrink == 0)
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003161 val = 0;
3162 else
Christian Borntraeger6b6de682016-02-09 13:47:55 +01003163 val /= shrink;
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003164
3165 vcpu->halt_poll_ns = val;
Wanpeng Li2cbd7822015-09-03 22:07:39 +08003166 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003167}
3168
Paolo Bonzinif7819512015-02-04 18:20:58 +01003169static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3170{
Junaid Shahid50c28f22018-06-27 14:59:11 -07003171 int ret = -EINTR;
3172 int idx = srcu_read_lock(&vcpu->kvm->srcu);
3173
Paolo Bonzinif7819512015-02-04 18:20:58 +01003174 if (kvm_arch_vcpu_runnable(vcpu)) {
3175 kvm_make_request(KVM_REQ_UNHALT, vcpu);
Junaid Shahid50c28f22018-06-27 14:59:11 -07003176 goto out;
Paolo Bonzinif7819512015-02-04 18:20:58 +01003177 }
3178 if (kvm_cpu_has_pending_timer(vcpu))
Junaid Shahid50c28f22018-06-27 14:59:11 -07003179 goto out;
Paolo Bonzinif7819512015-02-04 18:20:58 +01003180 if (signal_pending(current))
Junaid Shahid50c28f22018-06-27 14:59:11 -07003181 goto out;
Marcelo Tosatti084071d2021-05-25 10:41:17 -03003182 if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3183 goto out;
Paolo Bonzinif7819512015-02-04 18:20:58 +01003184
Junaid Shahid50c28f22018-06-27 14:59:11 -07003185 ret = 0;
3186out:
3187 srcu_read_unlock(&vcpu->kvm->srcu, idx);
3188 return ret;
Paolo Bonzinif7819512015-02-04 18:20:58 +01003189}
3190
David Matlackcb953122020-05-08 11:22:40 -07003191static inline void
3192update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited)
3193{
3194 if (waited)
Jing Zhang0193cc92021-06-18 22:27:03 +00003195 vcpu->stat.generic.halt_poll_fail_ns += poll_ns;
David Matlackcb953122020-05-08 11:22:40 -07003196 else
Jing Zhang0193cc92021-06-18 22:27:03 +00003197 vcpu->stat.generic.halt_poll_success_ns += poll_ns;
David Matlackcb953122020-05-08 11:22:40 -07003198}
3199
Eddie Dongb6958ce2007-07-18 12:15:21 +03003200/*
3201 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
3202 */
Hollis Blanchard8776e512007-10-31 17:24:24 -05003203void kvm_vcpu_block(struct kvm_vcpu *vcpu)
Eddie Dongb6958ce2007-07-18 12:15:21 +03003204{
David Matlackcb953122020-05-08 11:22:40 -07003205 ktime_t start, cur, poll_end;
Paolo Bonzinif7819512015-02-04 18:20:58 +01003206 bool waited = false;
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003207 u64 block_ns;
Paolo Bonzinif7819512015-02-04 18:20:58 +01003208
Marc Zyngier07ab0f82019-08-02 11:37:09 +01003209 kvm_arch_vcpu_blocking(vcpu);
3210
David Matlackcb953122020-05-08 11:22:40 -07003211 start = cur = poll_end = ktime_get();
Christian Borntraegercdd6ad32019-03-05 05:30:01 -05003212 if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
Wanpeng Li19020f82015-09-03 22:07:37 +08003213 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
Xiubo Lif95ef0cd2015-02-26 14:58:23 +08003214
Jing Zhang0193cc92021-06-18 22:27:03 +00003215 ++vcpu->stat.generic.halt_attempted_poll;
Paolo Bonzinif7819512015-02-04 18:20:58 +01003216 do {
3217 /*
3218 * This sets KVM_REQ_UNHALT if an interrupt
3219 * arrives.
3220 */
3221 if (kvm_vcpu_check_block(vcpu) < 0) {
Jing Zhang0193cc92021-06-18 22:27:03 +00003222 ++vcpu->stat.generic.halt_successful_poll;
Christian Borntraeger3491caf2016-05-13 12:16:35 +02003223 if (!vcpu_valid_wakeup(vcpu))
Jing Zhang0193cc92021-06-18 22:27:03 +00003224 ++vcpu->stat.generic.halt_poll_invalid;
Paolo Bonzinif7819512015-02-04 18:20:58 +01003225 goto out;
3226 }
Li RongQing74775652021-07-27 19:12:47 +08003227 cpu_relax();
David Matlackcb953122020-05-08 11:22:40 -07003228 poll_end = cur = ktime_get();
Wanpeng Li6bd5b742021-05-18 05:00:31 -07003229 } while (kvm_vcpu_can_poll(cur, stop));
Paolo Bonzinif7819512015-02-04 18:20:58 +01003230 }
Eddie Dongb6958ce2007-07-18 12:15:21 +03003231
Davidlohr Buesoda4ad882020-04-23 22:48:37 -07003232 prepare_to_rcuwait(&vcpu->wait);
Marcelo Tosattie5c239c2008-05-08 19:47:01 -03003233 for (;;) {
Davidlohr Buesoda4ad882020-04-23 22:48:37 -07003234 set_current_state(TASK_INTERRUPTIBLE);
Eddie Dongb6958ce2007-07-18 12:15:21 +03003235
Paolo Bonzinif7819512015-02-04 18:20:58 +01003236 if (kvm_vcpu_check_block(vcpu) < 0)
Marcelo Tosattie5c239c2008-05-08 19:47:01 -03003237 break;
3238
Paolo Bonzinif7819512015-02-04 18:20:58 +01003239 waited = true;
Eddie Dongb6958ce2007-07-18 12:15:21 +03003240 schedule();
Eddie Dongb6958ce2007-07-18 12:15:21 +03003241 }
Davidlohr Buesoda4ad882020-04-23 22:48:37 -07003242 finish_rcuwait(&vcpu->wait);
Paolo Bonzinif7819512015-02-04 18:20:58 +01003243 cur = ktime_get();
Jing Zhang87bcc5f2021-08-02 16:56:32 +00003244 if (waited) {
3245 vcpu->stat.generic.halt_wait_ns +=
3246 ktime_to_ns(cur) - ktime_to_ns(poll_end);
3247 }
Paolo Bonzinif7819512015-02-04 18:20:58 +01003248out:
Marc Zyngier07ab0f82019-08-02 11:37:09 +01003249 kvm_arch_vcpu_unblocking(vcpu);
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003250 block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
3251
David Matlackcb953122020-05-08 11:22:40 -07003252 update_halt_poll_stats(
3253 vcpu, ktime_to_ns(ktime_sub(poll_end, start)), waited);
3254
Wanpeng Li44551b22019-09-29 09:06:56 +08003255 if (!kvm_arch_no_poll(vcpu)) {
3256 if (!vcpu_valid_wakeup(vcpu)) {
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003257 shrink_halt_poll_ns(vcpu);
David Matlackacd05782020-04-17 15:14:46 -07003258 } else if (vcpu->kvm->max_halt_poll_ns) {
Wanpeng Li44551b22019-09-29 09:06:56 +08003259 if (block_ns <= vcpu->halt_poll_ns)
3260 ;
3261 /* we had a long block, shrink polling */
David Matlackacd05782020-04-17 15:14:46 -07003262 else if (vcpu->halt_poll_ns &&
3263 block_ns > vcpu->kvm->max_halt_poll_ns)
Wanpeng Li44551b22019-09-29 09:06:56 +08003264 shrink_halt_poll_ns(vcpu);
3265 /* we had a short halt and our poll time is too small */
David Matlackacd05782020-04-17 15:14:46 -07003266 else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns &&
3267 block_ns < vcpu->kvm->max_halt_poll_ns)
Wanpeng Li44551b22019-09-29 09:06:56 +08003268 grow_halt_poll_ns(vcpu);
3269 } else {
3270 vcpu->halt_poll_ns = 0;
3271 }
3272 }
Wanpeng Liaca6ff22015-09-03 22:07:38 +08003273
Christian Borntraeger3491caf2016-05-13 12:16:35 +02003274 trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
3275 kvm_arch_vcpu_block_finish(vcpu);
Eddie Dongb6958ce2007-07-18 12:15:21 +03003276}
Aneesh Kumar K.V2ba9f0d2013-10-07 22:17:59 +05303277EXPORT_SYMBOL_GPL(kvm_vcpu_block);
Eddie Dongb6958ce2007-07-18 12:15:21 +03003278
Radim Krčmář178f02f2017-04-26 22:32:26 +02003279bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
Christoffer Dallb6d33832012-03-08 16:44:24 -05003280{
Davidlohr Buesoda4ad882020-04-23 22:48:37 -07003281 struct rcuwait *waitp;
Christoffer Dallb6d33832012-03-08 16:44:24 -05003282
Davidlohr Buesoda4ad882020-04-23 22:48:37 -07003283 waitp = kvm_arch_vcpu_get_wait(vcpu);
3284 if (rcuwait_wake_up(waitp)) {
Wanpeng Lid73eb572019-07-18 19:39:06 +08003285 WRITE_ONCE(vcpu->ready, true);
Jing Zhang0193cc92021-06-18 22:27:03 +00003286 ++vcpu->stat.generic.halt_wakeup;
Radim Krčmář178f02f2017-04-26 22:32:26 +02003287 return true;
Christoffer Dallb6d33832012-03-08 16:44:24 -05003288 }
3289
Radim Krčmář178f02f2017-04-26 22:32:26 +02003290 return false;
Radim Krčmářdd1a4cc2016-05-04 14:09:44 -05003291}
3292EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3293
Paolo Bonzini0266c892017-05-04 15:14:13 +02003294#ifndef CONFIG_S390
Radim Krčmářdd1a4cc2016-05-04 14:09:44 -05003295/*
3296 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
3297 */
3298void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3299{
3300 int me;
3301 int cpu = vcpu->cpu;
3302
Radim Krčmář178f02f2017-04-26 22:32:26 +02003303 if (kvm_vcpu_wake_up(vcpu))
3304 return;
3305
Christoffer Dallb6d33832012-03-08 16:44:24 -05003306 me = get_cpu();
3307 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
3308 if (kvm_arch_vcpu_should_kick(vcpu))
3309 smp_send_reschedule(cpu);
3310 put_cpu();
3311}
Yang Zhanga20ed542013-04-11 19:25:15 +08003312EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
Paolo Bonzini0266c892017-05-04 15:14:13 +02003313#endif /* !CONFIG_S390 */
Christoffer Dallb6d33832012-03-08 16:44:24 -05003314
Dan Carpenterfa933842014-05-23 13:20:42 +03003315int kvm_vcpu_yield_to(struct kvm_vcpu *target)
Konstantin Weitz41628d32012-04-25 15:30:38 +02003316{
3317 struct pid *pid;
3318 struct task_struct *task = NULL;
Dan Carpenterfa933842014-05-23 13:20:42 +03003319 int ret = 0;
Konstantin Weitz41628d32012-04-25 15:30:38 +02003320
3321 rcu_read_lock();
3322 pid = rcu_dereference(target->pid);
3323 if (pid)
Sam Bobroff27fbe64b2014-09-19 09:40:41 +10003324 task = get_pid_task(pid, PIDTYPE_PID);
Konstantin Weitz41628d32012-04-25 15:30:38 +02003325 rcu_read_unlock();
3326 if (!task)
Raghavendra K Tc45c5282013-01-22 13:09:24 +05303327 return ret;
Raghavendra K Tc45c5282013-01-22 13:09:24 +05303328 ret = yield_to(task, 1);
Konstantin Weitz41628d32012-04-25 15:30:38 +02003329 put_task_struct(task);
Raghavendra K Tc45c5282013-01-22 13:09:24 +05303330
3331 return ret;
Konstantin Weitz41628d32012-04-25 15:30:38 +02003332}
3333EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3334
Raghavendra K T06e48c52012-07-19 15:17:52 +05303335/*
3336 * Helper that checks whether a VCPU is eligible for directed yield.
3337 * Most eligible candidate to yield is decided by following heuristics:
3338 *
3339 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently
3340 * (preempted lock holder), indicated by @in_spin_loop.
Fuad Tabba656012c2020-04-01 15:03:10 +01003341 * Set at the beginning and cleared at the end of interception/PLE handler.
Raghavendra K T06e48c52012-07-19 15:17:52 +05303342 *
3343 * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
3344 * chance last time (mostly it has become eligible now since we have probably
3345 * yielded to lockholder in last iteration. This is done by toggling
3346 * @dy_eligible each time a VCPU checked for eligibility.)
3347 *
3348 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
3349 * to preempted lock-holder could result in wrong VCPU selection and CPU
3350 * burning. Giving priority for a potential lock-holder increases lock
3351 * progress.
3352 *
3353 * Since algorithm is based on heuristics, accessing another VCPU data without
3354 * locking does not harm. It may result in trying to yield to same VCPU, fail
3355 * and continue with next VCPU and so on.
3356 */
Stephen Hemminger79408762013-12-29 12:12:29 -08003357static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
Raghavendra K T06e48c52012-07-19 15:17:52 +05303358{
Scott Wood4a55dd72014-01-09 18:43:16 -06003359#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
Raghavendra K T06e48c52012-07-19 15:17:52 +05303360 bool eligible;
3361
3362 eligible = !vcpu->spin_loop.in_spin_loop ||
Christian Borntraeger34656112014-09-04 21:13:31 +02003363 vcpu->spin_loop.dy_eligible;
Raghavendra K T06e48c52012-07-19 15:17:52 +05303364
3365 if (vcpu->spin_loop.in_spin_loop)
3366 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
3367
3368 return eligible;
Scott Wood4a55dd72014-01-09 18:43:16 -06003369#else
3370 return true;
Raghavendra K T06e48c52012-07-19 15:17:52 +05303371#endif
Scott Wood4a55dd72014-01-09 18:43:16 -06003372}
Raghavendra K Tc45c5282013-01-22 13:09:24 +05303373
Wanpeng Li17e433b2019-08-05 10:03:19 +08003374/*
3375 * Unlike kvm_arch_vcpu_runnable, this function is called outside
3376 * a vcpu_load/vcpu_put pair. However, for most architectures
3377 * kvm_arch_vcpu_runnable does not require vcpu_load.
3378 */
3379bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
3380{
3381 return kvm_arch_vcpu_runnable(vcpu);
3382}
3383
3384static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
3385{
3386 if (kvm_arch_dy_runnable(vcpu))
3387 return true;
3388
3389#ifdef CONFIG_KVM_ASYNC_PF
3390 if (!list_empty_careful(&vcpu->async_pf.done))
3391 return true;
3392#endif
3393
3394 return false;
3395}
3396
Wanpeng Li52acd222021-04-16 11:08:10 +08003397bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
3398{
3399 return false;
3400}
3401
Longpeng(Mike)199b5762017-08-08 12:05:32 +08003402void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
Zhai, Edwind255f4f2009-10-09 18:03:20 +08003403{
Rik van Riel217ece62011-02-01 09:53:28 -05003404 struct kvm *kvm = me->kvm;
3405 struct kvm_vcpu *vcpu;
3406 int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
3407 int yielded = 0;
Raghavendra K Tc45c5282013-01-22 13:09:24 +05303408 int try = 3;
Rik van Riel217ece62011-02-01 09:53:28 -05003409 int pass;
3410 int i;
Zhai, Edwind255f4f2009-10-09 18:03:20 +08003411
Raghavendra K T4c088492012-07-18 19:07:46 +05303412 kvm_vcpu_set_in_spin_loop(me, true);
Rik van Riel217ece62011-02-01 09:53:28 -05003413 /*
3414 * We boost the priority of a VCPU that is runnable but not
3415 * currently running, because it got preempted by something
3416 * else and called schedule in __vcpu_run. Hopefully that
3417 * VCPU is holding the lock that we need and will release it.
3418 * We approximate round-robin by starting at the last boosted VCPU.
3419 */
Raghavendra K Tc45c5282013-01-22 13:09:24 +05303420 for (pass = 0; pass < 2 && !yielded && try; pass++) {
Rik van Riel217ece62011-02-01 09:53:28 -05003421 kvm_for_each_vcpu(i, vcpu, kvm) {
Rik van Riel5cfc2aa2012-06-19 16:51:04 -04003422 if (!pass && i <= last_boosted_vcpu) {
Rik van Riel217ece62011-02-01 09:53:28 -05003423 i = last_boosted_vcpu;
3424 continue;
3425 } else if (pass && i > last_boosted_vcpu)
3426 break;
Wanpeng Lid73eb572019-07-18 19:39:06 +08003427 if (!READ_ONCE(vcpu->ready))
Raghavendra K T7bc7ae22013-03-04 23:32:27 +05303428 continue;
Rik van Riel217ece62011-02-01 09:53:28 -05003429 if (vcpu == me)
3430 continue;
Davidlohr Buesoda4ad882020-04-23 22:48:37 -07003431 if (rcuwait_active(&vcpu->wait) &&
3432 !vcpu_dy_runnable(vcpu))
Rik van Riel217ece62011-02-01 09:53:28 -05003433 continue;
Wanpeng Li046ddee2019-08-01 11:30:14 +08003434 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
Wanpeng Li52acd222021-04-16 11:08:10 +08003435 !kvm_arch_dy_has_pending_interrupt(vcpu) &&
3436 !kvm_arch_vcpu_in_kernel(vcpu))
Longpeng(Mike)199b5762017-08-08 12:05:32 +08003437 continue;
Raghavendra K T06e48c52012-07-19 15:17:52 +05303438 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
3439 continue;
Raghavendra K Tc45c5282013-01-22 13:09:24 +05303440
3441 yielded = kvm_vcpu_yield_to(vcpu);
3442 if (yielded > 0) {
Rik van Riel217ece62011-02-01 09:53:28 -05003443 kvm->last_boosted_vcpu = i;
Rik van Riel217ece62011-02-01 09:53:28 -05003444 break;
Raghavendra K Tc45c5282013-01-22 13:09:24 +05303445 } else if (yielded < 0) {
3446 try--;
3447 if (!try)
3448 break;
Rik van Riel217ece62011-02-01 09:53:28 -05003449 }
Rik van Riel217ece62011-02-01 09:53:28 -05003450 }
3451 }
Raghavendra K T4c088492012-07-18 19:07:46 +05303452 kvm_vcpu_set_in_spin_loop(me, false);
Raghavendra K T06e48c52012-07-19 15:17:52 +05303453
3454 /* Ensure vcpu is not eligible during next spinloop */
3455 kvm_vcpu_set_dy_eligible(me, false);
Zhai, Edwind255f4f2009-10-09 18:03:20 +08003456}
3457EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
3458
Peter Xufb04a1e2020-09-30 21:22:22 -04003459static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
3460{
3461#if KVM_DIRTY_LOG_PAGE_OFFSET > 0
3462 return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
3463 (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
3464 kvm->dirty_ring_size / PAGE_SIZE);
3465#else
3466 return false;
3467#endif
3468}
3469
Souptick Joarder1499fa82018-04-19 00:49:58 +05303470static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
Avi Kivity9a2bb7f2007-02-22 12:58:31 +02003471{
Dave Jiang11bac802017-02-24 14:56:41 -08003472 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
Avi Kivity9a2bb7f2007-02-22 12:58:31 +02003473 struct page *page;
3474
npiggin@suse.dee4a533a2007-12-05 18:15:52 +11003475 if (vmf->pgoff == 0)
Avi Kivity039576c2007-03-20 12:46:50 +02003476 page = virt_to_page(vcpu->run);
Avi Kivity09566762008-01-23 18:14:23 +02003477#ifdef CONFIG_X86
npiggin@suse.dee4a533a2007-12-05 18:15:52 +11003478 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
Zhang Xiantaoad312c72007-12-13 23:50:52 +08003479 page = virt_to_page(vcpu->arch.pio_data);
Avi Kivity09566762008-01-23 18:14:23 +02003480#endif
Paolo Bonzini4b4357e2017-03-31 13:53:23 +02003481#ifdef CONFIG_KVM_MMIO
Laurent Vivier5f94c172008-05-30 16:05:54 +02003482 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
3483 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
3484#endif
Peter Xufb04a1e2020-09-30 21:22:22 -04003485 else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
3486 page = kvm_dirty_ring_get_page(
3487 &vcpu->dirty_ring,
3488 vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
Avi Kivity039576c2007-03-20 12:46:50 +02003489 else
Carsten Otte5b1c1492012-01-04 10:25:23 +01003490 return kvm_arch_vcpu_fault(vcpu, vmf);
Avi Kivity9a2bb7f2007-02-22 12:58:31 +02003491 get_page(page);
npiggin@suse.dee4a533a2007-12-05 18:15:52 +11003492 vmf->page = page;
3493 return 0;
Avi Kivity9a2bb7f2007-02-22 12:58:31 +02003494}
3495
Alexey Dobriyanf0f37e2f2009-09-27 22:29:37 +04003496static const struct vm_operations_struct kvm_vcpu_vm_ops = {
npiggin@suse.dee4a533a2007-12-05 18:15:52 +11003497 .fault = kvm_vcpu_fault,
Avi Kivity9a2bb7f2007-02-22 12:58:31 +02003498};
3499
3500static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
3501{
Peter Xufb04a1e2020-09-30 21:22:22 -04003502 struct kvm_vcpu *vcpu = file->private_data;
3503 unsigned long pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
3504
3505 if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
3506 kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
3507 ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
3508 return -EINVAL;
3509
Avi Kivity9a2bb7f2007-02-22 12:58:31 +02003510 vma->vm_ops = &kvm_vcpu_vm_ops;
3511 return 0;
3512}
3513
Avi Kivitybccf2152007-02-21 18:04:26 +02003514static int kvm_vcpu_release(struct inode *inode, struct file *filp)
3515{
3516 struct kvm_vcpu *vcpu = filp->private_data;
3517
Al Viro66c0b392008-04-19 20:33:56 +01003518 kvm_put_kvm(vcpu->kvm);
Avi Kivitybccf2152007-02-21 18:04:26 +02003519 return 0;
3520}
3521
Christian Borntraeger3d3aab12008-12-02 11:17:32 +01003522static struct file_operations kvm_vcpu_fops = {
Avi Kivitybccf2152007-02-21 18:04:26 +02003523 .release = kvm_vcpu_release,
3524 .unlocked_ioctl = kvm_vcpu_ioctl,
Avi Kivity9a2bb7f2007-02-22 12:58:31 +02003525 .mmap = kvm_vcpu_mmap,
Arnd Bergmann6038f372010-08-15 18:52:59 +02003526 .llseek = noop_llseek,
Marc Zyngier7ddfd3e2018-06-17 10:16:21 +01003527 KVM_COMPAT(kvm_vcpu_compat_ioctl),
Avi Kivitybccf2152007-02-21 18:04:26 +02003528};
3529
3530/*
3531 * Allocates an inode for the vcpu.
3532 */
3533static int create_vcpu_fd(struct kvm_vcpu *vcpu)
3534{
Masatake YAMATOe46b4692018-01-20 04:04:22 +09003535 char name[8 + 1 + ITOA_MAX_LEN + 1];
3536
3537 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
3538 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
Avi Kivitybccf2152007-02-21 18:04:26 +02003539}
3540
Greg KH3e7093d2019-07-31 20:56:20 +02003541static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
Luiz Capitulino45b59392016-09-16 10:27:35 -04003542{
Paolo Bonzini741cbba2019-08-03 08:14:25 +02003543#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
Paolo Bonzinid56f5132020-06-04 15:16:52 +02003544 struct dentry *debugfs_dentry;
Luiz Capitulino45b59392016-09-16 10:27:35 -04003545 char dir_name[ITOA_MAX_LEN * 2];
Luiz Capitulino45b59392016-09-16 10:27:35 -04003546
Luiz Capitulino45b59392016-09-16 10:27:35 -04003547 if (!debugfs_initialized())
Greg KH3e7093d2019-07-31 20:56:20 +02003548 return;
Luiz Capitulino45b59392016-09-16 10:27:35 -04003549
3550 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
Paolo Bonzinid56f5132020-06-04 15:16:52 +02003551 debugfs_dentry = debugfs_create_dir(dir_name,
3552 vcpu->kvm->debugfs_dentry);
Luiz Capitulino45b59392016-09-16 10:27:35 -04003553
Paolo Bonzinid56f5132020-06-04 15:16:52 +02003554 kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
Paolo Bonzini741cbba2019-08-03 08:14:25 +02003555#endif
Luiz Capitulino45b59392016-09-16 10:27:35 -04003556}
3557
Avi Kivityc5ea7662007-02-20 18:41:05 +02003558/*
3559 * Creates some virtual cpus. Good luck creating more than one.
3560 */
Gleb Natapov73880c82009-06-09 15:56:28 +03003561static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
Avi Kivityc5ea7662007-02-20 18:41:05 +02003562{
3563 int r;
David Hildenbrande09fefd2015-11-05 09:03:50 +01003564 struct kvm_vcpu *vcpu;
Sean Christopherson8bd826d2019-12-18 13:55:30 -08003565 struct page *page;
Avi Kivityc5ea7662007-02-20 18:41:05 +02003566
Greg Kurz0b1b1df2016-05-09 18:13:37 +02003567 if (id >= KVM_MAX_VCPU_ID)
Andy Honig338c7db2013-11-18 16:09:22 -08003568 return -EINVAL;
3569
Paolo Bonzini6c7caeb2016-06-13 14:48:25 +02003570 mutex_lock(&kvm->lock);
3571 if (kvm->created_vcpus == KVM_MAX_VCPUS) {
3572 mutex_unlock(&kvm->lock);
3573 return -EINVAL;
3574 }
3575
3576 kvm->created_vcpus++;
3577 mutex_unlock(&kvm->lock);
3578
Sean Christopherson897cc382019-12-18 13:55:09 -08003579 r = kvm_arch_vcpu_precreate(kvm, id);
3580 if (r)
3581 goto vcpu_decrement;
3582
Sean Christopherson85f47932021-04-06 12:07:40 -07003583 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
Sean Christophersone529ef62019-12-18 13:55:15 -08003584 if (!vcpu) {
3585 r = -ENOMEM;
Paolo Bonzini6c7caeb2016-06-13 14:48:25 +02003586 goto vcpu_decrement;
3587 }
Avi Kivityc5ea7662007-02-20 18:41:05 +02003588
Peter Xufcd97ad2020-01-09 09:57:12 -05003589 BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
Shakeel Butt93bb59c2020-12-18 14:01:38 -08003590 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
Sean Christopherson8bd826d2019-12-18 13:55:30 -08003591 if (!page) {
3592 r = -ENOMEM;
Sean Christophersone529ef62019-12-18 13:55:15 -08003593 goto vcpu_free;
Sean Christopherson8bd826d2019-12-18 13:55:30 -08003594 }
3595 vcpu->run = page_address(page);
3596
3597 kvm_vcpu_init(vcpu, kvm, id);
Sean Christophersone529ef62019-12-18 13:55:15 -08003598
3599 r = kvm_arch_vcpu_create(vcpu);
3600 if (r)
Sean Christopherson8bd826d2019-12-18 13:55:30 -08003601 goto vcpu_free_run_page;
Sean Christophersone529ef62019-12-18 13:55:15 -08003602
Peter Xufb04a1e2020-09-30 21:22:22 -04003603 if (kvm->dirty_ring_size) {
3604 r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
3605 id, kvm->dirty_ring_size);
3606 if (r)
3607 goto arch_vcpu_destroy;
3608 }
3609
Shaohua Li11ec2802007-07-23 14:51:37 +08003610 mutex_lock(&kvm->lock);
David Hildenbrande09fefd2015-11-05 09:03:50 +01003611 if (kvm_get_vcpu_by_id(kvm, id)) {
3612 r = -EEXIST;
3613 goto unlock_vcpu_destroy;
3614 }
Gleb Natapov73880c82009-06-09 15:56:28 +03003615
Radim Krčmář8750e722019-11-07 07:53:42 -05003616 vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
3617 BUG_ON(kvm->vcpus[vcpu->vcpu_idx]);
Rusty Russellfb3f0f52007-07-27 17:16:56 +10003618
Jing Zhangce55c042021-06-18 22:27:06 +00003619 /* Fill the stats id string for the vcpu */
3620 snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
3621 task_pid_nr(current), id);
3622
Rusty Russellfb3f0f52007-07-27 17:16:56 +10003623 /* Now it's all set up, let userspace reach it */
Al Viro66c0b392008-04-19 20:33:56 +01003624 kvm_get_kvm(kvm);
Avi Kivitybccf2152007-02-21 18:04:26 +02003625 r = create_vcpu_fd(vcpu);
Gleb Natapov73880c82009-06-09 15:56:28 +03003626 if (r < 0) {
Sean Christopherson149487b2019-10-21 15:58:42 -07003627 kvm_put_kvm_no_destroy(kvm);
Jan Kiszkad7805922011-05-23 10:33:05 +02003628 goto unlock_vcpu_destroy;
Gleb Natapov73880c82009-06-09 15:56:28 +03003629 }
3630
Radim Krčmář8750e722019-11-07 07:53:42 -05003631 kvm->vcpus[vcpu->vcpu_idx] = vcpu;
Paolo Bonzinidd489242015-07-29 11:32:20 +02003632
3633 /*
3634 * Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus
3635 * before kvm->online_vcpu's incremented value.
3636 */
Gleb Natapov73880c82009-06-09 15:56:28 +03003637 smp_wmb();
3638 atomic_inc(&kvm->online_vcpus);
3639
Gleb Natapov73880c82009-06-09 15:56:28 +03003640 mutex_unlock(&kvm->lock);
Marcelo Tosatti42897d82012-11-27 23:29:02 -02003641 kvm_arch_vcpu_postcreate(vcpu);
Paolo Bonzini63d04342020-04-01 00:42:22 +02003642 kvm_create_vcpu_debugfs(vcpu);
Avi Kivitybccf2152007-02-21 18:04:26 +02003643 return r;
Avi Kivityc5ea7662007-02-20 18:41:05 +02003644
Jan Kiszkad7805922011-05-23 10:33:05 +02003645unlock_vcpu_destroy:
Glauber Costa7d8fece2008-09-17 23:16:59 -03003646 mutex_unlock(&kvm->lock);
Peter Xufb04a1e2020-09-30 21:22:22 -04003647 kvm_dirty_ring_free(&vcpu->dirty_ring);
3648arch_vcpu_destroy:
Hollis Blanchardd40ccc62007-11-19 14:04:43 -06003649 kvm_arch_vcpu_destroy(vcpu);
Sean Christopherson8bd826d2019-12-18 13:55:30 -08003650vcpu_free_run_page:
3651 free_page((unsigned long)vcpu->run);
Sean Christophersone529ef62019-12-18 13:55:15 -08003652vcpu_free:
3653 kmem_cache_free(kvm_vcpu_cache, vcpu);
Paolo Bonzini6c7caeb2016-06-13 14:48:25 +02003654vcpu_decrement:
3655 mutex_lock(&kvm->lock);
3656 kvm->created_vcpus--;
3657 mutex_unlock(&kvm->lock);
Avi Kivityc5ea7662007-02-20 18:41:05 +02003658 return r;
3659}
3660
Avi Kivity1961d272007-03-05 19:46:05 +02003661static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
3662{
3663 if (sigset) {
3664 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
3665 vcpu->sigset_active = 1;
3666 vcpu->sigset = *sigset;
3667 } else
3668 vcpu->sigset_active = 0;
3669 return 0;
3670}
3671
Jing Zhangce55c042021-06-18 22:27:06 +00003672static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
3673 size_t size, loff_t *offset)
3674{
3675 struct kvm_vcpu *vcpu = file->private_data;
3676
3677 return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
3678 &kvm_vcpu_stats_desc[0], &vcpu->stat,
3679 sizeof(vcpu->stat), user_buffer, size, offset);
3680}
3681
3682static const struct file_operations kvm_vcpu_stats_fops = {
3683 .read = kvm_vcpu_stats_read,
3684 .llseek = noop_llseek,
3685};
3686
3687static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
3688{
3689 int fd;
3690 struct file *file;
3691 char name[15 + ITOA_MAX_LEN + 1];
3692
3693 snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
3694
3695 fd = get_unused_fd_flags(O_CLOEXEC);
3696 if (fd < 0)
3697 return fd;
3698
3699 file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
3700 if (IS_ERR(file)) {
3701 put_unused_fd(fd);
3702 return PTR_ERR(file);
3703 }
3704 file->f_mode |= FMODE_PREAD;
3705 fd_install(fd, file);
3706
3707 return fd;
3708}
3709
Avi Kivitybccf2152007-02-21 18:04:26 +02003710static long kvm_vcpu_ioctl(struct file *filp,
3711 unsigned int ioctl, unsigned long arg)
Avi Kivity6aa8b732006-12-10 02:21:36 -08003712{
Avi Kivitybccf2152007-02-21 18:04:26 +02003713 struct kvm_vcpu *vcpu = filp->private_data;
Al Viro2f3669872007-02-09 16:38:35 +00003714 void __user *argp = (void __user *)arg;
Carsten Otte313a3dc2007-10-11 19:16:52 +02003715 int r;
Dave Hansenfa3795a2008-08-11 10:01:46 -07003716 struct kvm_fpu *fpu = NULL;
3717 struct kvm_sregs *kvm_sregs = NULL;
Avi Kivity6aa8b732006-12-10 02:21:36 -08003718
Sean Christopherson0b8f1172021-07-02 15:04:23 -07003719 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
Avi Kivity6d4e4c42007-11-21 16:41:05 +02003720 return -EIO;
Avi Kivity2122ff52010-05-13 11:25:04 +03003721
David Matlack2ea75be2014-09-19 16:03:25 -07003722 if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
3723 return -EINVAL;
3724
Avi Kivity2122ff52010-05-13 11:25:04 +03003725 /*
Paolo Bonzini5cb09442017-12-12 17:41:34 +01003726 * Some architectures have vcpu ioctls that are asynchronous to vcpu
3727 * execution; mutex_lock() would break them.
Avi Kivity2122ff52010-05-13 11:25:04 +03003728 */
Paolo Bonzini5cb09442017-12-12 17:41:34 +01003729 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
3730 if (r != -ENOIOCTLCMD)
Michael S. Tsirkin9fc77442012-09-16 11:50:30 +03003731 return r;
Avi Kivity2122ff52010-05-13 11:25:04 +03003732
Christoffer Dallec7660c2017-12-04 21:35:23 +01003733 if (mutex_lock_killable(&vcpu->mutex))
3734 return -EINTR;
Avi Kivity6aa8b732006-12-10 02:21:36 -08003735 switch (ioctl) {
Christian Borntraeger0e4524a2017-07-06 14:44:28 +02003736 case KVM_RUN: {
3737 struct pid *oldpid;
Avi Kivityf0fe5102007-03-07 13:11:17 +02003738 r = -EINVAL;
3739 if (arg)
3740 goto out;
Christian Borntraeger0e4524a2017-07-06 14:44:28 +02003741 oldpid = rcu_access_pointer(vcpu->pid);
Eric W. Biederman71dbc8a2017-07-16 21:39:32 -05003742 if (unlikely(oldpid != task_pid(current))) {
Christian Borntraeger7a72f7a2014-08-05 16:44:14 +02003743 /* The thread running this VCPU changed. */
Christoffer Dallbd2a6392018-02-23 17:23:57 +01003744 struct pid *newpid;
Xiubo Lif95ef0cd2015-02-26 14:58:23 +08003745
Christoffer Dallbd2a6392018-02-23 17:23:57 +01003746 r = kvm_arch_vcpu_run_pid_change(vcpu);
3747 if (r)
3748 break;
3749
3750 newpid = get_task_pid(current, PIDTYPE_PID);
Christian Borntraeger7a72f7a2014-08-05 16:44:14 +02003751 rcu_assign_pointer(vcpu->pid, newpid);
3752 if (oldpid)
3753 synchronize_rcu();
3754 put_pid(oldpid);
3755 }
Tianjia Zhang1b94f6f2020-04-16 13:10:57 +08003756 r = kvm_arch_vcpu_ioctl_run(vcpu);
Gleb Natapov64be5002010-10-24 16:49:08 +02003757 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
Avi Kivity6aa8b732006-12-10 02:21:36 -08003758 break;
Christian Borntraeger0e4524a2017-07-06 14:44:28 +02003759 }
Avi Kivity6aa8b732006-12-10 02:21:36 -08003760 case KVM_GET_REGS: {
Xiantao Zhang3e4bb3a2008-02-25 18:52:20 +08003761 struct kvm_regs *kvm_regs;
Avi Kivity6aa8b732006-12-10 02:21:36 -08003762
Xiantao Zhang3e4bb3a2008-02-25 18:52:20 +08003763 r = -ENOMEM;
Ben Gardonb12ce362019-02-11 11:02:49 -08003764 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
Xiantao Zhang3e4bb3a2008-02-25 18:52:20 +08003765 if (!kvm_regs)
3766 goto out;
3767 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
Avi Kivity6aa8b732006-12-10 02:21:36 -08003768 if (r)
Xiantao Zhang3e4bb3a2008-02-25 18:52:20 +08003769 goto out_free1;
Avi Kivity6aa8b732006-12-10 02:21:36 -08003770 r = -EFAULT;
Xiantao Zhang3e4bb3a2008-02-25 18:52:20 +08003771 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
3772 goto out_free1;
Avi Kivity6aa8b732006-12-10 02:21:36 -08003773 r = 0;
Xiantao Zhang3e4bb3a2008-02-25 18:52:20 +08003774out_free1:
3775 kfree(kvm_regs);
Avi Kivity6aa8b732006-12-10 02:21:36 -08003776 break;
3777 }
3778 case KVM_SET_REGS: {
Xiantao Zhang3e4bb3a2008-02-25 18:52:20 +08003779 struct kvm_regs *kvm_regs;
Avi Kivity6aa8b732006-12-10 02:21:36 -08003780
Sasha Levinff5c2c02011-12-04 19:36:29 +02003781 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
3782 if (IS_ERR(kvm_regs)) {
3783 r = PTR_ERR(kvm_regs);
Xiantao Zhang3e4bb3a2008-02-25 18:52:20 +08003784 goto out;
Sasha Levinff5c2c02011-12-04 19:36:29 +02003785 }
Xiantao Zhang3e4bb3a2008-02-25 18:52:20 +08003786 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
Xiantao Zhang3e4bb3a2008-02-25 18:52:20 +08003787 kfree(kvm_regs);
Avi Kivity6aa8b732006-12-10 02:21:36 -08003788 break;
3789 }
3790 case KVM_GET_SREGS: {
Ben Gardonb12ce362019-02-11 11:02:49 -08003791 kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
3792 GFP_KERNEL_ACCOUNT);
Dave Hansenfa3795a2008-08-11 10:01:46 -07003793 r = -ENOMEM;
3794 if (!kvm_sregs)
3795 goto out;
3796 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
Avi Kivity6aa8b732006-12-10 02:21:36 -08003797 if (r)
3798 goto out;
3799 r = -EFAULT;
Dave Hansenfa3795a2008-08-11 10:01:46 -07003800 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
Avi Kivity6aa8b732006-12-10 02:21:36 -08003801 goto out;
3802 r = 0;
3803 break;
3804 }
3805 case KVM_SET_SREGS: {
Sasha Levinff5c2c02011-12-04 19:36:29 +02003806 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
3807 if (IS_ERR(kvm_sregs)) {
3808 r = PTR_ERR(kvm_sregs);
Guo Chao18595412012-11-02 18:33:21 +08003809 kvm_sregs = NULL;
Avi Kivity6aa8b732006-12-10 02:21:36 -08003810 goto out;
Sasha Levinff5c2c02011-12-04 19:36:29 +02003811 }
Dave Hansenfa3795a2008-08-11 10:01:46 -07003812 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
Avi Kivity6aa8b732006-12-10 02:21:36 -08003813 break;
3814 }
Marcelo Tosatti62d9f0d2008-04-11 13:24:45 -03003815 case KVM_GET_MP_STATE: {
3816 struct kvm_mp_state mp_state;
3817
3818 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
3819 if (r)
3820 goto out;
3821 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08003822 if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
Marcelo Tosatti62d9f0d2008-04-11 13:24:45 -03003823 goto out;
3824 r = 0;
3825 break;
3826 }
3827 case KVM_SET_MP_STATE: {
3828 struct kvm_mp_state mp_state;
3829
3830 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08003831 if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
Marcelo Tosatti62d9f0d2008-04-11 13:24:45 -03003832 goto out;
3833 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
Marcelo Tosatti62d9f0d2008-04-11 13:24:45 -03003834 break;
3835 }
Avi Kivity6aa8b732006-12-10 02:21:36 -08003836 case KVM_TRANSLATE: {
3837 struct kvm_translation tr;
3838
3839 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08003840 if (copy_from_user(&tr, argp, sizeof(tr)))
Avi Kivity6aa8b732006-12-10 02:21:36 -08003841 goto out;
Zhang Xiantao8b006792007-11-16 13:05:55 +08003842 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
Avi Kivity6aa8b732006-12-10 02:21:36 -08003843 if (r)
3844 goto out;
3845 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08003846 if (copy_to_user(argp, &tr, sizeof(tr)))
Avi Kivity6aa8b732006-12-10 02:21:36 -08003847 goto out;
3848 r = 0;
3849 break;
3850 }
Jan Kiszkad0bfb942008-12-15 13:52:10 +01003851 case KVM_SET_GUEST_DEBUG: {
3852 struct kvm_guest_debug dbg;
Avi Kivity6aa8b732006-12-10 02:21:36 -08003853
3854 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08003855 if (copy_from_user(&dbg, argp, sizeof(dbg)))
Avi Kivity6aa8b732006-12-10 02:21:36 -08003856 goto out;
Jan Kiszkad0bfb942008-12-15 13:52:10 +01003857 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
Avi Kivity6aa8b732006-12-10 02:21:36 -08003858 break;
3859 }
Avi Kivity1961d272007-03-05 19:46:05 +02003860 case KVM_SET_SIGNAL_MASK: {
3861 struct kvm_signal_mask __user *sigmask_arg = argp;
3862 struct kvm_signal_mask kvm_sigmask;
3863 sigset_t sigset, *p;
3864
3865 p = NULL;
3866 if (argp) {
3867 r = -EFAULT;
3868 if (copy_from_user(&kvm_sigmask, argp,
Xiubo Li893bdbf2015-02-26 14:58:19 +08003869 sizeof(kvm_sigmask)))
Avi Kivity1961d272007-03-05 19:46:05 +02003870 goto out;
3871 r = -EINVAL;
Xiubo Li893bdbf2015-02-26 14:58:19 +08003872 if (kvm_sigmask.len != sizeof(sigset))
Avi Kivity1961d272007-03-05 19:46:05 +02003873 goto out;
3874 r = -EFAULT;
3875 if (copy_from_user(&sigset, sigmask_arg->sigset,
Xiubo Li893bdbf2015-02-26 14:58:19 +08003876 sizeof(sigset)))
Avi Kivity1961d272007-03-05 19:46:05 +02003877 goto out;
3878 p = &sigset;
3879 }
Andi Kleen376d41f2010-06-10 13:10:47 +02003880 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
Avi Kivity1961d272007-03-05 19:46:05 +02003881 break;
3882 }
Avi Kivityb8836732007-04-01 16:34:31 +03003883 case KVM_GET_FPU: {
Ben Gardonb12ce362019-02-11 11:02:49 -08003884 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
Dave Hansenfa3795a2008-08-11 10:01:46 -07003885 r = -ENOMEM;
3886 if (!fpu)
3887 goto out;
3888 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
Avi Kivityb8836732007-04-01 16:34:31 +03003889 if (r)
3890 goto out;
3891 r = -EFAULT;
Dave Hansenfa3795a2008-08-11 10:01:46 -07003892 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
Avi Kivityb8836732007-04-01 16:34:31 +03003893 goto out;
3894 r = 0;
3895 break;
3896 }
3897 case KVM_SET_FPU: {
Sasha Levinff5c2c02011-12-04 19:36:29 +02003898 fpu = memdup_user(argp, sizeof(*fpu));
3899 if (IS_ERR(fpu)) {
3900 r = PTR_ERR(fpu);
Guo Chao18595412012-11-02 18:33:21 +08003901 fpu = NULL;
Avi Kivityb8836732007-04-01 16:34:31 +03003902 goto out;
Sasha Levinff5c2c02011-12-04 19:36:29 +02003903 }
Dave Hansenfa3795a2008-08-11 10:01:46 -07003904 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
Avi Kivityb8836732007-04-01 16:34:31 +03003905 break;
3906 }
Jing Zhangce55c042021-06-18 22:27:06 +00003907 case KVM_GET_STATS_FD: {
3908 r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
3909 break;
3910 }
Avi Kivitybccf2152007-02-21 18:04:26 +02003911 default:
Carsten Otte313a3dc2007-10-11 19:16:52 +02003912 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
Avi Kivitybccf2152007-02-21 18:04:26 +02003913 }
3914out:
Christoffer Dallec7660c2017-12-04 21:35:23 +01003915 mutex_unlock(&vcpu->mutex);
Dave Hansenfa3795a2008-08-11 10:01:46 -07003916 kfree(fpu);
3917 kfree(kvm_sregs);
Avi Kivitybccf2152007-02-21 18:04:26 +02003918 return r;
3919}
3920
Christian Borntraegerde8e5d72015-02-03 09:35:15 +01003921#ifdef CONFIG_KVM_COMPAT
Alexander Graf1dda6062011-06-08 02:45:37 +02003922static long kvm_vcpu_compat_ioctl(struct file *filp,
3923 unsigned int ioctl, unsigned long arg)
3924{
3925 struct kvm_vcpu *vcpu = filp->private_data;
3926 void __user *argp = compat_ptr(arg);
3927 int r;
3928
Sean Christopherson0b8f1172021-07-02 15:04:23 -07003929 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
Alexander Graf1dda6062011-06-08 02:45:37 +02003930 return -EIO;
3931
3932 switch (ioctl) {
3933 case KVM_SET_SIGNAL_MASK: {
3934 struct kvm_signal_mask __user *sigmask_arg = argp;
3935 struct kvm_signal_mask kvm_sigmask;
Alexander Graf1dda6062011-06-08 02:45:37 +02003936 sigset_t sigset;
3937
3938 if (argp) {
3939 r = -EFAULT;
3940 if (copy_from_user(&kvm_sigmask, argp,
Xiubo Li893bdbf2015-02-26 14:58:19 +08003941 sizeof(kvm_sigmask)))
Alexander Graf1dda6062011-06-08 02:45:37 +02003942 goto out;
3943 r = -EINVAL;
Al Viro3968cf62017-09-03 21:45:17 -04003944 if (kvm_sigmask.len != sizeof(compat_sigset_t))
Alexander Graf1dda6062011-06-08 02:45:37 +02003945 goto out;
3946 r = -EFAULT;
Paolo Bonzini1393b4a2020-07-02 05:39:31 -04003947 if (get_compat_sigset(&sigset,
3948 (compat_sigset_t __user *)sigmask_arg->sigset))
Alexander Graf1dda6062011-06-08 02:45:37 +02003949 goto out;
Alan Cox760a9a32012-08-22 14:34:11 +01003950 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
3951 } else
3952 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
Alexander Graf1dda6062011-06-08 02:45:37 +02003953 break;
3954 }
3955 default:
3956 r = kvm_vcpu_ioctl(filp, ioctl, arg);
3957 }
3958
3959out:
3960 return r;
3961}
3962#endif
3963
Cédric Le Goatera1cd3f02019-04-18 12:39:36 +02003964static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
3965{
3966 struct kvm_device *dev = filp->private_data;
3967
3968 if (dev->ops->mmap)
3969 return dev->ops->mmap(dev, vma);
3970
3971 return -ENODEV;
3972}
3973
Scott Wood852b6d52013-04-12 14:08:42 +00003974static int kvm_device_ioctl_attr(struct kvm_device *dev,
3975 int (*accessor)(struct kvm_device *dev,
3976 struct kvm_device_attr *attr),
3977 unsigned long arg)
3978{
3979 struct kvm_device_attr attr;
3980
3981 if (!accessor)
3982 return -EPERM;
3983
3984 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
3985 return -EFAULT;
3986
3987 return accessor(dev, &attr);
3988}
3989
3990static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
3991 unsigned long arg)
3992{
3993 struct kvm_device *dev = filp->private_data;
3994
Sean Christopherson0b8f1172021-07-02 15:04:23 -07003995 if (dev->kvm->mm != current->mm || dev->kvm->vm_bugged)
Sean Christophersonddba9182019-02-15 12:48:39 -08003996 return -EIO;
3997
Scott Wood852b6d52013-04-12 14:08:42 +00003998 switch (ioctl) {
3999 case KVM_SET_DEVICE_ATTR:
4000 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
4001 case KVM_GET_DEVICE_ATTR:
4002 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
4003 case KVM_HAS_DEVICE_ATTR:
4004 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
4005 default:
4006 if (dev->ops->ioctl)
4007 return dev->ops->ioctl(dev, ioctl, arg);
4008
4009 return -ENOTTY;
4010 }
4011}
4012
Scott Wood852b6d52013-04-12 14:08:42 +00004013static int kvm_device_release(struct inode *inode, struct file *filp)
4014{
4015 struct kvm_device *dev = filp->private_data;
4016 struct kvm *kvm = dev->kvm;
4017
Cédric Le Goater2bde9b32019-04-18 12:39:41 +02004018 if (dev->ops->release) {
4019 mutex_lock(&kvm->lock);
4020 list_del(&dev->vm_node);
4021 dev->ops->release(dev);
4022 mutex_unlock(&kvm->lock);
4023 }
4024
Scott Wood852b6d52013-04-12 14:08:42 +00004025 kvm_put_kvm(kvm);
4026 return 0;
4027}
4028
4029static const struct file_operations kvm_device_fops = {
4030 .unlocked_ioctl = kvm_device_ioctl,
4031 .release = kvm_device_release,
Marc Zyngier7ddfd3e2018-06-17 10:16:21 +01004032 KVM_COMPAT(kvm_device_ioctl),
Cédric Le Goatera1cd3f02019-04-18 12:39:36 +02004033 .mmap = kvm_device_mmap,
Scott Wood852b6d52013-04-12 14:08:42 +00004034};
4035
4036struct kvm_device *kvm_device_from_filp(struct file *filp)
4037{
4038 if (filp->f_op != &kvm_device_fops)
4039 return NULL;
4040
4041 return filp->private_data;
4042}
4043
Steven Price8538cb22019-10-21 16:28:19 +01004044static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
Will Deacond60eacb2014-09-02 10:27:33 +01004045#ifdef CONFIG_KVM_MPIC
4046 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
4047 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
4048#endif
Will Deacond60eacb2014-09-02 10:27:33 +01004049};
4050
Steven Price8538cb22019-10-21 16:28:19 +01004051int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
Will Deacond60eacb2014-09-02 10:27:33 +01004052{
4053 if (type >= ARRAY_SIZE(kvm_device_ops_table))
4054 return -ENOSPC;
4055
4056 if (kvm_device_ops_table[type] != NULL)
4057 return -EEXIST;
4058
4059 kvm_device_ops_table[type] = ops;
4060 return 0;
4061}
4062
Wanpeng Li571ee1b2014-10-09 18:30:08 +08004063void kvm_unregister_device_ops(u32 type)
4064{
4065 if (kvm_device_ops_table[type] != NULL)
4066 kvm_device_ops_table[type] = NULL;
4067}
4068
Scott Wood852b6d52013-04-12 14:08:42 +00004069static int kvm_ioctl_create_device(struct kvm *kvm,
4070 struct kvm_create_device *cd)
4071{
Steven Price8538cb22019-10-21 16:28:19 +01004072 const struct kvm_device_ops *ops = NULL;
Scott Wood852b6d52013-04-12 14:08:42 +00004073 struct kvm_device *dev;
4074 bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
Paolo Bonzini1d487e92019-04-11 11:16:47 +02004075 int type;
Scott Wood852b6d52013-04-12 14:08:42 +00004076 int ret;
4077
Will Deacond60eacb2014-09-02 10:27:33 +01004078 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
Scott Wood852b6d52013-04-12 14:08:42 +00004079 return -ENODEV;
Will Deacond60eacb2014-09-02 10:27:33 +01004080
Paolo Bonzini1d487e92019-04-11 11:16:47 +02004081 type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
4082 ops = kvm_device_ops_table[type];
Will Deacond60eacb2014-09-02 10:27:33 +01004083 if (ops == NULL)
4084 return -ENODEV;
Scott Wood852b6d52013-04-12 14:08:42 +00004085
4086 if (test)
4087 return 0;
4088
Ben Gardonb12ce362019-02-11 11:02:49 -08004089 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
Scott Wood852b6d52013-04-12 14:08:42 +00004090 if (!dev)
4091 return -ENOMEM;
4092
4093 dev->ops = ops;
4094 dev->kvm = kvm;
Scott Wood852b6d52013-04-12 14:08:42 +00004095
Christoffer Dalla28ebea2016-08-09 19:13:01 +02004096 mutex_lock(&kvm->lock);
Paolo Bonzini1d487e92019-04-11 11:16:47 +02004097 ret = ops->create(dev, type);
Scott Wood852b6d52013-04-12 14:08:42 +00004098 if (ret < 0) {
Christoffer Dalla28ebea2016-08-09 19:13:01 +02004099 mutex_unlock(&kvm->lock);
Scott Wood852b6d52013-04-12 14:08:42 +00004100 kfree(dev);
4101 return ret;
4102 }
Christoffer Dalla28ebea2016-08-09 19:13:01 +02004103 list_add(&dev->vm_node, &kvm->devices);
4104 mutex_unlock(&kvm->lock);
Scott Wood852b6d52013-04-12 14:08:42 +00004105
Christoffer Dall023e9fd2016-08-09 19:13:00 +02004106 if (ops->init)
4107 ops->init(dev);
4108
Jann Horncfa39382019-01-26 01:54:33 +01004109 kvm_get_kvm(kvm);
Yann Droneaud24009b02013-08-24 22:14:07 +02004110 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
Scott Wood852b6d52013-04-12 14:08:42 +00004111 if (ret < 0) {
Sean Christopherson149487b2019-10-21 15:58:42 -07004112 kvm_put_kvm_no_destroy(kvm);
Christoffer Dalla28ebea2016-08-09 19:13:01 +02004113 mutex_lock(&kvm->lock);
4114 list_del(&dev->vm_node);
4115 mutex_unlock(&kvm->lock);
Dan Carpentera0f1d212016-11-30 22:21:05 +03004116 ops->destroy(dev);
Scott Wood852b6d52013-04-12 14:08:42 +00004117 return ret;
4118 }
4119
Scott Wood852b6d52013-04-12 14:08:42 +00004120 cd->fd = ret;
4121 return 0;
4122}
4123
Alexander Graf92b591a2014-07-14 18:33:08 +02004124static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
4125{
4126 switch (arg) {
4127 case KVM_CAP_USER_MEMORY:
4128 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
4129 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
Alexander Graf92b591a2014-07-14 18:33:08 +02004130 case KVM_CAP_INTERNAL_ERROR_DATA:
4131#ifdef CONFIG_HAVE_KVM_MSI
4132 case KVM_CAP_SIGNAL_MSI:
4133#endif
Paul Mackerras297e2102014-06-30 20:51:13 +10004134#ifdef CONFIG_HAVE_KVM_IRQFD
Paolo Bonzinidc9be0f2015-03-05 11:54:46 +01004135 case KVM_CAP_IRQFD:
Alexander Graf92b591a2014-07-14 18:33:08 +02004136 case KVM_CAP_IRQFD_RESAMPLE:
4137#endif
Jason Wange9ea5062015-09-15 14:41:59 +08004138 case KVM_CAP_IOEVENTFD_ANY_LENGTH:
Alexander Graf92b591a2014-07-14 18:33:08 +02004139 case KVM_CAP_CHECK_EXTENSION_VM:
Paolo Bonzinie5d83c72017-02-16 10:40:56 +01004140 case KVM_CAP_ENABLE_CAP_VM:
David Matlackacd05782020-04-17 15:14:46 -07004141 case KVM_CAP_HALT_POLL:
Alexander Graf92b591a2014-07-14 18:33:08 +02004142 return 1;
Paolo Bonzini4b4357e2017-03-31 13:53:23 +02004143#ifdef CONFIG_KVM_MMIO
Paolo Bonzini30422552017-03-31 13:53:22 +02004144 case KVM_CAP_COALESCED_MMIO:
4145 return KVM_COALESCED_MMIO_PAGE_OFFSET;
Peng Hao0804c842018-10-14 07:09:55 +08004146 case KVM_CAP_COALESCED_PIO:
4147 return 1;
Paolo Bonzini30422552017-03-31 13:53:22 +02004148#endif
Jay Zhou3c9bd402020-02-27 09:32:27 +08004149#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4150 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
4151 return KVM_DIRTY_LOG_MANUAL_CAPS;
4152#endif
Alexander Graf92b591a2014-07-14 18:33:08 +02004153#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4154 case KVM_CAP_IRQ_ROUTING:
4155 return KVM_MAX_IRQ_ROUTES;
4156#endif
Paolo Bonzinif481b062015-05-17 17:30:37 +02004157#if KVM_ADDRESS_SPACE_NUM > 1
4158 case KVM_CAP_MULTI_ADDRESS_SPACE:
4159 return KVM_ADDRESS_SPACE_NUM;
4160#endif
Paolo Bonzinic110ae52019-03-28 17:24:03 +01004161 case KVM_CAP_NR_MEMSLOTS:
4162 return KVM_USER_MEM_SLOTS;
Peter Xufb04a1e2020-09-30 21:22:22 -04004163 case KVM_CAP_DIRTY_LOG_RING:
4164#if KVM_DIRTY_LOG_PAGE_OFFSET > 0
4165 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4166#else
4167 return 0;
4168#endif
Jing Zhangce55c042021-06-18 22:27:06 +00004169 case KVM_CAP_BINARY_STATS_FD:
4170 return 1;
Alexander Graf92b591a2014-07-14 18:33:08 +02004171 default:
4172 break;
4173 }
4174 return kvm_vm_ioctl_check_extension(kvm, arg);
4175}
4176
Peter Xufb04a1e2020-09-30 21:22:22 -04004177static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4178{
4179 int r;
4180
4181 if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4182 return -EINVAL;
4183
4184 /* the size should be power of 2 */
4185 if (!size || (size & (size - 1)))
4186 return -EINVAL;
4187
4188 /* Should be bigger to keep the reserved entries, or a page */
4189 if (size < kvm_dirty_ring_get_rsvd_entries() *
4190 sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4191 return -EINVAL;
4192
4193 if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4194 sizeof(struct kvm_dirty_gfn))
4195 return -E2BIG;
4196
4197 /* We only allow it to set once */
4198 if (kvm->dirty_ring_size)
4199 return -EINVAL;
4200
4201 mutex_lock(&kvm->lock);
4202
4203 if (kvm->created_vcpus) {
4204 /* We don't allow to change this value after vcpu created */
4205 r = -EINVAL;
4206 } else {
4207 kvm->dirty_ring_size = size;
4208 r = 0;
4209 }
4210
4211 mutex_unlock(&kvm->lock);
4212 return r;
4213}
4214
4215static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4216{
4217 int i;
4218 struct kvm_vcpu *vcpu;
4219 int cleared = 0;
4220
4221 if (!kvm->dirty_ring_size)
4222 return -EINVAL;
4223
4224 mutex_lock(&kvm->slots_lock);
4225
4226 kvm_for_each_vcpu(i, vcpu, kvm)
4227 cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4228
4229 mutex_unlock(&kvm->slots_lock);
4230
4231 if (cleared)
4232 kvm_flush_remote_tlbs(kvm);
4233
4234 return cleared;
4235}
4236
Paolo Bonzinie5d83c72017-02-16 10:40:56 +01004237int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4238 struct kvm_enable_cap *cap)
4239{
4240 return -EINVAL;
4241}
4242
4243static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
4244 struct kvm_enable_cap *cap)
4245{
4246 switch (cap->cap) {
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02004247#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
Jay Zhou3c9bd402020-02-27 09:32:27 +08004248 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
4249 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
4250
4251 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
4252 allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
4253
4254 if (cap->flags || (cap->args[0] & ~allowed_options))
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02004255 return -EINVAL;
4256 kvm->manual_dirty_log_protect = cap->args[0];
4257 return 0;
Jay Zhou3c9bd402020-02-27 09:32:27 +08004258 }
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02004259#endif
David Matlackacd05782020-04-17 15:14:46 -07004260 case KVM_CAP_HALT_POLL: {
4261 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
4262 return -EINVAL;
4263
4264 kvm->max_halt_poll_ns = cap->args[0];
4265 return 0;
4266 }
Peter Xufb04a1e2020-09-30 21:22:22 -04004267 case KVM_CAP_DIRTY_LOG_RING:
4268 return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
Paolo Bonzinie5d83c72017-02-16 10:40:56 +01004269 default:
4270 return kvm_vm_ioctl_enable_cap(kvm, cap);
4271 }
4272}
4273
Jing Zhangfcfe1ba2021-06-18 22:27:05 +00004274static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
4275 size_t size, loff_t *offset)
4276{
4277 struct kvm *kvm = file->private_data;
4278
4279 return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
4280 &kvm_vm_stats_desc[0], &kvm->stat,
4281 sizeof(kvm->stat), user_buffer, size, offset);
4282}
4283
4284static const struct file_operations kvm_vm_stats_fops = {
4285 .read = kvm_vm_stats_read,
4286 .llseek = noop_llseek,
4287};
4288
4289static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
4290{
4291 int fd;
4292 struct file *file;
4293
4294 fd = get_unused_fd_flags(O_CLOEXEC);
4295 if (fd < 0)
4296 return fd;
4297
4298 file = anon_inode_getfile("kvm-vm-stats",
4299 &kvm_vm_stats_fops, kvm, O_RDONLY);
4300 if (IS_ERR(file)) {
4301 put_unused_fd(fd);
4302 return PTR_ERR(file);
4303 }
4304 file->f_mode |= FMODE_PREAD;
4305 fd_install(fd, file);
4306
4307 return fd;
4308}
4309
Avi Kivitybccf2152007-02-21 18:04:26 +02004310static long kvm_vm_ioctl(struct file *filp,
4311 unsigned int ioctl, unsigned long arg)
4312{
4313 struct kvm *kvm = filp->private_data;
4314 void __user *argp = (void __user *)arg;
Carsten Otte1fe779f2007-10-29 16:08:35 +01004315 int r;
Avi Kivitybccf2152007-02-21 18:04:26 +02004316
Sean Christopherson0b8f1172021-07-02 15:04:23 -07004317 if (kvm->mm != current->mm || kvm->vm_bugged)
Avi Kivity6d4e4c42007-11-21 16:41:05 +02004318 return -EIO;
Avi Kivitybccf2152007-02-21 18:04:26 +02004319 switch (ioctl) {
4320 case KVM_CREATE_VCPU:
4321 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
Avi Kivitybccf2152007-02-21 18:04:26 +02004322 break;
Paolo Bonzinie5d83c72017-02-16 10:40:56 +01004323 case KVM_ENABLE_CAP: {
4324 struct kvm_enable_cap cap;
4325
4326 r = -EFAULT;
4327 if (copy_from_user(&cap, argp, sizeof(cap)))
4328 goto out;
4329 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
4330 break;
4331 }
Izik Eidus6fc138d2007-10-09 19:20:39 +02004332 case KVM_SET_USER_MEMORY_REGION: {
4333 struct kvm_userspace_memory_region kvm_userspace_mem;
4334
4335 r = -EFAULT;
4336 if (copy_from_user(&kvm_userspace_mem, argp,
Xiubo Li893bdbf2015-02-26 14:58:19 +08004337 sizeof(kvm_userspace_mem)))
Izik Eidus6fc138d2007-10-09 19:20:39 +02004338 goto out;
4339
Takuya Yoshikawa47ae31e2013-02-27 19:43:00 +09004340 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
Avi Kivity6aa8b732006-12-10 02:21:36 -08004341 break;
4342 }
4343 case KVM_GET_DIRTY_LOG: {
4344 struct kvm_dirty_log log;
4345
4346 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08004347 if (copy_from_user(&log, argp, sizeof(log)))
Avi Kivity6aa8b732006-12-10 02:21:36 -08004348 goto out;
Avi Kivity2c6f5df2007-02-20 18:27:58 +02004349 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
Avi Kivity6aa8b732006-12-10 02:21:36 -08004350 break;
4351 }
Paolo Bonzini2a31b9d2018-10-23 02:36:47 +02004352#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4353 case KVM_CLEAR_DIRTY_LOG: {
4354 struct kvm_clear_dirty_log log;
4355
4356 r = -EFAULT;
4357 if (copy_from_user(&log, argp, sizeof(log)))
4358 goto out;
4359 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4360 break;
4361 }
4362#endif
Paolo Bonzini4b4357e2017-03-31 13:53:23 +02004363#ifdef CONFIG_KVM_MMIO
Laurent Vivier5f94c172008-05-30 16:05:54 +02004364 case KVM_REGISTER_COALESCED_MMIO: {
4365 struct kvm_coalesced_mmio_zone zone;
Xiubo Lif95ef0cd2015-02-26 14:58:23 +08004366
Laurent Vivier5f94c172008-05-30 16:05:54 +02004367 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08004368 if (copy_from_user(&zone, argp, sizeof(zone)))
Laurent Vivier5f94c172008-05-30 16:05:54 +02004369 goto out;
Laurent Vivier5f94c172008-05-30 16:05:54 +02004370 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
Laurent Vivier5f94c172008-05-30 16:05:54 +02004371 break;
4372 }
4373 case KVM_UNREGISTER_COALESCED_MMIO: {
4374 struct kvm_coalesced_mmio_zone zone;
Xiubo Lif95ef0cd2015-02-26 14:58:23 +08004375
Laurent Vivier5f94c172008-05-30 16:05:54 +02004376 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08004377 if (copy_from_user(&zone, argp, sizeof(zone)))
Laurent Vivier5f94c172008-05-30 16:05:54 +02004378 goto out;
Laurent Vivier5f94c172008-05-30 16:05:54 +02004379 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
Laurent Vivier5f94c172008-05-30 16:05:54 +02004380 break;
4381 }
4382#endif
Gregory Haskins721eecbf2009-05-20 10:30:49 -04004383 case KVM_IRQFD: {
4384 struct kvm_irqfd data;
4385
4386 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08004387 if (copy_from_user(&data, argp, sizeof(data)))
Gregory Haskins721eecbf2009-05-20 10:30:49 -04004388 goto out;
Alex Williamsond4db2932012-06-29 09:56:08 -06004389 r = kvm_irqfd(kvm, &data);
Gregory Haskins721eecbf2009-05-20 10:30:49 -04004390 break;
4391 }
Gregory Haskinsd34e6b12009-07-07 17:08:49 -04004392 case KVM_IOEVENTFD: {
4393 struct kvm_ioeventfd data;
4394
4395 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08004396 if (copy_from_user(&data, argp, sizeof(data)))
Gregory Haskinsd34e6b12009-07-07 17:08:49 -04004397 goto out;
4398 r = kvm_ioeventfd(kvm, &data);
4399 break;
4400 }
Jan Kiszka07975ad2012-03-29 21:14:12 +02004401#ifdef CONFIG_HAVE_KVM_MSI
4402 case KVM_SIGNAL_MSI: {
4403 struct kvm_msi msi;
4404
4405 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08004406 if (copy_from_user(&msi, argp, sizeof(msi)))
Jan Kiszka07975ad2012-03-29 21:14:12 +02004407 goto out;
4408 r = kvm_send_userspace_msi(kvm, &msi);
4409 break;
4410 }
4411#endif
Christoffer Dall23d43cf2012-07-24 08:51:20 -04004412#ifdef __KVM_HAVE_IRQ_LINE
4413 case KVM_IRQ_LINE_STATUS:
4414 case KVM_IRQ_LINE: {
4415 struct kvm_irq_level irq_event;
4416
4417 r = -EFAULT;
Xiubo Li893bdbf2015-02-26 14:58:19 +08004418 if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
Christoffer Dall23d43cf2012-07-24 08:51:20 -04004419 goto out;
4420
Yang Zhangaa2fbe62013-04-11 19:21:40 +08004421 r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
4422 ioctl == KVM_IRQ_LINE_STATUS);
Christoffer Dall23d43cf2012-07-24 08:51:20 -04004423 if (r)
4424 goto out;
4425
4426 r = -EFAULT;
4427 if (ioctl == KVM_IRQ_LINE_STATUS) {
Xiubo Li893bdbf2015-02-26 14:58:19 +08004428 if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
Christoffer Dall23d43cf2012-07-24 08:51:20 -04004429 goto out;
4430 }
4431
4432 r = 0;
4433 break;
4434 }
4435#endif
Alexander Grafaa8d5942013-04-15 21:12:53 +02004436#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4437 case KVM_SET_GSI_ROUTING: {
4438 struct kvm_irq_routing routing;
4439 struct kvm_irq_routing __user *urouting;
Paolo Bonzinif8c1b852016-06-01 14:09:22 +02004440 struct kvm_irq_routing_entry *entries = NULL;
Alexander Grafaa8d5942013-04-15 21:12:53 +02004441
4442 r = -EFAULT;
4443 if (copy_from_user(&routing, argp, sizeof(routing)))
4444 goto out;
4445 r = -EINVAL;
David Hildenbrand5c0aea02017-04-28 17:06:20 +02004446 if (!kvm_arch_can_set_irq_routing(kvm))
4447 goto out;
Xiubo Licaf1ff22016-06-15 18:00:33 +08004448 if (routing.nr > KVM_MAX_IRQ_ROUTES)
Alexander Grafaa8d5942013-04-15 21:12:53 +02004449 goto out;
4450 if (routing.flags)
4451 goto out;
Paolo Bonzinif8c1b852016-06-01 14:09:22 +02004452 if (routing.nr) {
Paolo Bonzinif8c1b852016-06-01 14:09:22 +02004453 urouting = argp;
Denis Efremov7ec28e22020-06-03 13:11:31 +03004454 entries = vmemdup_user(urouting->entries,
4455 array_size(sizeof(*entries),
4456 routing.nr));
4457 if (IS_ERR(entries)) {
4458 r = PTR_ERR(entries);
4459 goto out;
4460 }
Paolo Bonzinif8c1b852016-06-01 14:09:22 +02004461 }
Alexander Grafaa8d5942013-04-15 21:12:53 +02004462 r = kvm_set_irq_routing(kvm, entries, routing.nr,
4463 routing.flags);
Denis Efremov7ec28e22020-06-03 13:11:31 +03004464 kvfree(entries);
Alexander Grafaa8d5942013-04-15 21:12:53 +02004465 break;
4466 }
4467#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
Scott Wood852b6d52013-04-12 14:08:42 +00004468 case KVM_CREATE_DEVICE: {
4469 struct kvm_create_device cd;
4470
4471 r = -EFAULT;
4472 if (copy_from_user(&cd, argp, sizeof(cd)))
4473 goto out;
4474
4475 r = kvm_ioctl_create_device(kvm, &cd);
4476 if (r)
4477 goto out;
4478
4479 r = -EFAULT;
4480 if (copy_to_user(argp, &cd, sizeof(cd)))
4481 goto out;
4482
4483 r = 0;
4484 break;
4485 }
Alexander Graf92b591a2014-07-14 18:33:08 +02004486 case KVM_CHECK_EXTENSION:
4487 r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
4488 break;
Peter Xufb04a1e2020-09-30 21:22:22 -04004489 case KVM_RESET_DIRTY_RINGS:
4490 r = kvm_vm_ioctl_reset_dirty_pages(kvm);
4491 break;
Jing Zhangfcfe1ba2021-06-18 22:27:05 +00004492 case KVM_GET_STATS_FD:
4493 r = kvm_vm_ioctl_get_stats_fd(kvm);
4494 break;
Avi Kivityf17abe92007-02-21 19:28:04 +02004495 default:
Carsten Otte1fe779f2007-10-29 16:08:35 +01004496 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
Avi Kivityf17abe92007-02-21 19:28:04 +02004497 }
4498out:
4499 return r;
4500}
4501
Christian Borntraegerde8e5d72015-02-03 09:35:15 +01004502#ifdef CONFIG_KVM_COMPAT
Arnd Bergmann6ff58942009-10-22 14:19:27 +02004503struct compat_kvm_dirty_log {
4504 __u32 slot;
4505 __u32 padding1;
4506 union {
4507 compat_uptr_t dirty_bitmap; /* one bit per page */
4508 __u64 padding2;
4509 };
4510};
4511
Paolo Bonzini8750f9b2021-07-27 08:43:10 -04004512struct compat_kvm_clear_dirty_log {
4513 __u32 slot;
4514 __u32 num_pages;
4515 __u64 first_page;
4516 union {
4517 compat_uptr_t dirty_bitmap; /* one bit per page */
4518 __u64 padding2;
4519 };
4520};
4521
Arnd Bergmann6ff58942009-10-22 14:19:27 +02004522static long kvm_vm_compat_ioctl(struct file *filp,
4523 unsigned int ioctl, unsigned long arg)
4524{
4525 struct kvm *kvm = filp->private_data;
4526 int r;
4527
Sean Christopherson0b8f1172021-07-02 15:04:23 -07004528 if (kvm->mm != current->mm || kvm->vm_bugged)
Arnd Bergmann6ff58942009-10-22 14:19:27 +02004529 return -EIO;
4530 switch (ioctl) {
Paolo Bonzini8750f9b2021-07-27 08:43:10 -04004531#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4532 case KVM_CLEAR_DIRTY_LOG: {
4533 struct compat_kvm_clear_dirty_log compat_log;
4534 struct kvm_clear_dirty_log log;
4535
4536 if (copy_from_user(&compat_log, (void __user *)arg,
4537 sizeof(compat_log)))
4538 return -EFAULT;
4539 log.slot = compat_log.slot;
4540 log.num_pages = compat_log.num_pages;
4541 log.first_page = compat_log.first_page;
4542 log.padding2 = compat_log.padding2;
4543 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4544
4545 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4546 break;
4547 }
4548#endif
Arnd Bergmann6ff58942009-10-22 14:19:27 +02004549 case KVM_GET_DIRTY_LOG: {
4550 struct compat_kvm_dirty_log compat_log;
4551 struct kvm_dirty_log log;
4552
Arnd Bergmann6ff58942009-10-22 14:19:27 +02004553 if (copy_from_user(&compat_log, (void __user *)arg,
4554 sizeof(compat_log)))
Markus Elfringf6a3b162017-01-22 11:30:21 +01004555 return -EFAULT;
Arnd Bergmann6ff58942009-10-22 14:19:27 +02004556 log.slot = compat_log.slot;
4557 log.padding1 = compat_log.padding1;
4558 log.padding2 = compat_log.padding2;
4559 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4560
4561 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
Arnd Bergmann6ff58942009-10-22 14:19:27 +02004562 break;
4563 }
4564 default:
4565 r = kvm_vm_ioctl(filp, ioctl, arg);
4566 }
Arnd Bergmann6ff58942009-10-22 14:19:27 +02004567 return r;
4568}
4569#endif
4570
Christian Borntraeger3d3aab12008-12-02 11:17:32 +01004571static struct file_operations kvm_vm_fops = {
Avi Kivityf17abe92007-02-21 19:28:04 +02004572 .release = kvm_vm_release,
4573 .unlocked_ioctl = kvm_vm_ioctl,
Arnd Bergmann6038f372010-08-15 18:52:59 +02004574 .llseek = noop_llseek,
Marc Zyngier7ddfd3e2018-06-17 10:16:21 +01004575 KVM_COMPAT(kvm_vm_compat_ioctl),
Avi Kivityf17abe92007-02-21 19:28:04 +02004576};
4577
Nathan Tempelman54526d12021-04-08 22:32:14 +00004578bool file_is_kvm(struct file *file)
4579{
4580 return file && file->f_op == &kvm_vm_fops;
4581}
4582EXPORT_SYMBOL_GPL(file_is_kvm);
4583
Carsten Ottee08b9632012-01-04 10:25:20 +01004584static int kvm_dev_ioctl_create_vm(unsigned long type)
Avi Kivityf17abe92007-02-21 19:28:04 +02004585{
Heiko Carstensaac87632010-10-27 17:22:10 +02004586 int r;
Avi Kivityf17abe92007-02-21 19:28:04 +02004587 struct kvm *kvm;
Al Viro506cfba2016-07-14 18:54:17 +02004588 struct file *file;
Avi Kivityf17abe92007-02-21 19:28:04 +02004589
Carsten Ottee08b9632012-01-04 10:25:20 +01004590 kvm = kvm_create_vm(type);
Avi Kivityd6d28162007-06-28 08:38:16 -04004591 if (IS_ERR(kvm))
4592 return PTR_ERR(kvm);
Paolo Bonzini4b4357e2017-03-31 13:53:23 +02004593#ifdef CONFIG_KVM_MMIO
Takuya Yoshikawa6ce5a092010-03-15 22:13:30 +09004594 r = kvm_coalesced_mmio_init(kvm);
Markus Elfring78588332017-11-21 13:40:17 +01004595 if (r < 0)
4596 goto put_kvm;
Takuya Yoshikawa6ce5a092010-03-15 22:13:30 +09004597#endif
Al Viro506cfba2016-07-14 18:54:17 +02004598 r = get_unused_fd_flags(O_CLOEXEC);
Markus Elfring78588332017-11-21 13:40:17 +01004599 if (r < 0)
4600 goto put_kvm;
4601
Jing Zhangfcfe1ba2021-06-18 22:27:05 +00004602 snprintf(kvm->stats_id, sizeof(kvm->stats_id),
4603 "kvm-%d", task_pid_nr(current));
4604
Al Viro506cfba2016-07-14 18:54:17 +02004605 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
4606 if (IS_ERR(file)) {
4607 put_unused_fd(r);
Markus Elfring78588332017-11-21 13:40:17 +01004608 r = PTR_ERR(file);
4609 goto put_kvm;
Al Viro506cfba2016-07-14 18:54:17 +02004610 }
Janosch Frank536a6f82016-05-18 13:26:23 +02004611
Paolo Bonzini525df862017-06-27 15:45:09 +02004612 /*
4613 * Don't call kvm_put_kvm anymore at this point; file->f_op is
4614 * already set, with ->release() being kvm_vm_release(). In error
4615 * cases it will be called by the final fput(file) and will take
4616 * care of doing kvm_put_kvm(kvm).
4617 */
Janosch Frank536a6f82016-05-18 13:26:23 +02004618 if (kvm_create_vm_debugfs(kvm, r) < 0) {
Al Viro506cfba2016-07-14 18:54:17 +02004619 put_unused_fd(r);
4620 fput(file);
Janosch Frank536a6f82016-05-18 13:26:23 +02004621 return -ENOMEM;
4622 }
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02004623 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
Avi Kivityf17abe92007-02-21 19:28:04 +02004624
Al Viro506cfba2016-07-14 18:54:17 +02004625 fd_install(r, file);
Heiko Carstensaac87632010-10-27 17:22:10 +02004626 return r;
Markus Elfring78588332017-11-21 13:40:17 +01004627
4628put_kvm:
4629 kvm_put_kvm(kvm);
4630 return r;
Avi Kivityf17abe92007-02-21 19:28:04 +02004631}
4632
4633static long kvm_dev_ioctl(struct file *filp,
4634 unsigned int ioctl, unsigned long arg)
4635{
Avi Kivity07c45a32007-03-07 13:05:38 +02004636 long r = -EINVAL;
Avi Kivityf17abe92007-02-21 19:28:04 +02004637
4638 switch (ioctl) {
4639 case KVM_GET_API_VERSION:
Avi Kivityf0fe5102007-03-07 13:11:17 +02004640 if (arg)
4641 goto out;
Avi Kivityf17abe92007-02-21 19:28:04 +02004642 r = KVM_API_VERSION;
4643 break;
4644 case KVM_CREATE_VM:
Carsten Ottee08b9632012-01-04 10:25:20 +01004645 r = kvm_dev_ioctl_create_vm(arg);
Avi Kivityf17abe92007-02-21 19:28:04 +02004646 break;
Zhang Xiantao018d00d2007-11-15 23:07:47 +08004647 case KVM_CHECK_EXTENSION:
Alexander Graf784aa3d2014-07-14 18:27:35 +02004648 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
Avi Kivity5d308f42007-03-01 17:56:20 +02004649 break;
Avi Kivity07c45a32007-03-07 13:05:38 +02004650 case KVM_GET_VCPU_MMAP_SIZE:
Avi Kivity07c45a32007-03-07 13:05:38 +02004651 if (arg)
4652 goto out;
Avi Kivityadb1ff42008-01-24 15:13:08 +02004653 r = PAGE_SIZE; /* struct kvm_run */
4654#ifdef CONFIG_X86
4655 r += PAGE_SIZE; /* pio data page */
4656#endif
Paolo Bonzini4b4357e2017-03-31 13:53:23 +02004657#ifdef CONFIG_KVM_MMIO
Laurent Vivier5f94c172008-05-30 16:05:54 +02004658 r += PAGE_SIZE; /* coalesced mmio ring page */
4659#endif
Avi Kivity07c45a32007-03-07 13:05:38 +02004660 break;
Feng(Eric) Liud4c9ff22008-04-10 08:47:53 -04004661 case KVM_TRACE_ENABLE:
4662 case KVM_TRACE_PAUSE:
4663 case KVM_TRACE_DISABLE:
Marcelo Tosatti2023a292009-06-18 11:47:28 -03004664 r = -EOPNOTSUPP;
Feng(Eric) Liud4c9ff22008-04-10 08:47:53 -04004665 break;
Avi Kivity6aa8b732006-12-10 02:21:36 -08004666 default:
Carsten Otte043405e2007-10-10 17:16:19 +02004667 return kvm_arch_dev_ioctl(filp, ioctl, arg);
Avi Kivity6aa8b732006-12-10 02:21:36 -08004668 }
4669out:
4670 return r;
4671}
4672
Avi Kivity6aa8b732006-12-10 02:21:36 -08004673static struct file_operations kvm_chardev_ops = {
Avi Kivity6aa8b732006-12-10 02:21:36 -08004674 .unlocked_ioctl = kvm_dev_ioctl,
Arnd Bergmann6038f372010-08-15 18:52:59 +02004675 .llseek = noop_llseek,
Marc Zyngier7ddfd3e2018-06-17 10:16:21 +01004676 KVM_COMPAT(kvm_dev_ioctl),
Avi Kivity6aa8b732006-12-10 02:21:36 -08004677};
4678
4679static struct miscdevice kvm_dev = {
Avi Kivitybbe44322007-03-04 13:27:36 +02004680 KVM_MINOR,
Avi Kivity6aa8b732006-12-10 02:21:36 -08004681 "kvm",
4682 &kvm_chardev_ops,
4683};
4684
Takuya Yoshikawa75b71272010-11-16 17:37:41 +09004685static void hardware_enable_nolock(void *junk)
Avi Kivity1b6c0162007-05-24 13:03:52 +03004686{
4687 int cpu = raw_smp_processor_id();
Alexander Graf10474ae2009-09-15 11:37:46 +02004688 int r;
Avi Kivity1b6c0162007-05-24 13:03:52 +03004689
Rusty Russell7f59f492008-12-07 21:25:45 +10304690 if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
Avi Kivity1b6c0162007-05-24 13:03:52 +03004691 return;
Alexander Graf10474ae2009-09-15 11:37:46 +02004692
Rusty Russell7f59f492008-12-07 21:25:45 +10304693 cpumask_set_cpu(cpu, cpus_hardware_enabled);
Alexander Graf10474ae2009-09-15 11:37:46 +02004694
Radim Krčmář13a34e02014-08-28 15:13:03 +02004695 r = kvm_arch_hardware_enable();
Alexander Graf10474ae2009-09-15 11:37:46 +02004696
4697 if (r) {
4698 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4699 atomic_inc(&hardware_enable_failed);
Xiubo Li1170adc2015-02-26 14:58:26 +08004700 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
Alexander Graf10474ae2009-09-15 11:37:46 +02004701 }
Avi Kivity1b6c0162007-05-24 13:03:52 +03004702}
4703
Thomas Gleixner8c18b2d2016-07-13 17:16:37 +00004704static int kvm_starting_cpu(unsigned int cpu)
Takuya Yoshikawa75b71272010-11-16 17:37:41 +09004705{
Paolo Bonzini4a937f92013-09-10 12:58:35 +02004706 raw_spin_lock(&kvm_count_lock);
Paolo Bonzini4fa92fb2013-09-10 12:57:17 +02004707 if (kvm_usage_count)
4708 hardware_enable_nolock(NULL);
Paolo Bonzini4a937f92013-09-10 12:58:35 +02004709 raw_spin_unlock(&kvm_count_lock);
Thomas Gleixner8c18b2d2016-07-13 17:16:37 +00004710 return 0;
Takuya Yoshikawa75b71272010-11-16 17:37:41 +09004711}
4712
4713static void hardware_disable_nolock(void *junk)
Avi Kivity1b6c0162007-05-24 13:03:52 +03004714{
4715 int cpu = raw_smp_processor_id();
4716
Rusty Russell7f59f492008-12-07 21:25:45 +10304717 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
Avi Kivity1b6c0162007-05-24 13:03:52 +03004718 return;
Rusty Russell7f59f492008-12-07 21:25:45 +10304719 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
Radim Krčmář13a34e02014-08-28 15:13:03 +02004720 kvm_arch_hardware_disable();
Avi Kivity1b6c0162007-05-24 13:03:52 +03004721}
4722
Thomas Gleixner8c18b2d2016-07-13 17:16:37 +00004723static int kvm_dying_cpu(unsigned int cpu)
Takuya Yoshikawa75b71272010-11-16 17:37:41 +09004724{
Paolo Bonzini4a937f92013-09-10 12:58:35 +02004725 raw_spin_lock(&kvm_count_lock);
Paolo Bonzini4fa92fb2013-09-10 12:57:17 +02004726 if (kvm_usage_count)
4727 hardware_disable_nolock(NULL);
Paolo Bonzini4a937f92013-09-10 12:58:35 +02004728 raw_spin_unlock(&kvm_count_lock);
Thomas Gleixner8c18b2d2016-07-13 17:16:37 +00004729 return 0;
Takuya Yoshikawa75b71272010-11-16 17:37:41 +09004730}
4731
Alexander Graf10474ae2009-09-15 11:37:46 +02004732static void hardware_disable_all_nolock(void)
4733{
4734 BUG_ON(!kvm_usage_count);
4735
4736 kvm_usage_count--;
4737 if (!kvm_usage_count)
Takuya Yoshikawa75b71272010-11-16 17:37:41 +09004738 on_each_cpu(hardware_disable_nolock, NULL, 1);
Alexander Graf10474ae2009-09-15 11:37:46 +02004739}
4740
4741static void hardware_disable_all(void)
4742{
Paolo Bonzini4a937f92013-09-10 12:58:35 +02004743 raw_spin_lock(&kvm_count_lock);
Alexander Graf10474ae2009-09-15 11:37:46 +02004744 hardware_disable_all_nolock();
Paolo Bonzini4a937f92013-09-10 12:58:35 +02004745 raw_spin_unlock(&kvm_count_lock);
Alexander Graf10474ae2009-09-15 11:37:46 +02004746}
4747
4748static int hardware_enable_all(void)
4749{
4750 int r = 0;
4751
Paolo Bonzini4a937f92013-09-10 12:58:35 +02004752 raw_spin_lock(&kvm_count_lock);
Alexander Graf10474ae2009-09-15 11:37:46 +02004753
4754 kvm_usage_count++;
4755 if (kvm_usage_count == 1) {
4756 atomic_set(&hardware_enable_failed, 0);
Takuya Yoshikawa75b71272010-11-16 17:37:41 +09004757 on_each_cpu(hardware_enable_nolock, NULL, 1);
Alexander Graf10474ae2009-09-15 11:37:46 +02004758
4759 if (atomic_read(&hardware_enable_failed)) {
4760 hardware_disable_all_nolock();
4761 r = -EBUSY;
4762 }
4763 }
4764
Paolo Bonzini4a937f92013-09-10 12:58:35 +02004765 raw_spin_unlock(&kvm_count_lock);
Alexander Graf10474ae2009-09-15 11:37:46 +02004766
4767 return r;
4768}
4769
Rusty Russell9a2b85c2007-07-17 23:17:55 +10004770static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
Mike Dayd77c26f2007-10-08 09:02:08 -04004771 void *v)
Rusty Russell9a2b85c2007-07-17 23:17:55 +10004772{
Sheng Yang8e1c1812009-04-29 11:09:04 +08004773 /*
4774 * Some (well, at least mine) BIOSes hang on reboot if
4775 * in vmx root mode.
4776 *
4777 * And Intel TXT required VMX off for all cpu when system shutdown.
4778 */
Xiubo Li1170adc2015-02-26 14:58:26 +08004779 pr_info("kvm: exiting hardware virtualization\n");
Sheng Yang8e1c1812009-04-29 11:09:04 +08004780 kvm_rebooting = true;
Takuya Yoshikawa75b71272010-11-16 17:37:41 +09004781 on_each_cpu(hardware_disable_nolock, NULL, 1);
Rusty Russell9a2b85c2007-07-17 23:17:55 +10004782 return NOTIFY_OK;
4783}
4784
4785static struct notifier_block kvm_reboot_notifier = {
4786 .notifier_call = kvm_reboot,
4787 .priority = 0,
4788};
4789
Marcelo Tosattie93f8a02009-12-23 14:35:24 -02004790static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
Gregory Haskins2eeb2e92007-05-31 14:08:53 -04004791{
4792 int i;
4793
4794 for (i = 0; i < bus->dev_count; i++) {
Sasha Levin743eeb02011-07-27 16:00:48 +03004795 struct kvm_io_device *pos = bus->range[i].dev;
Gregory Haskins2eeb2e92007-05-31 14:08:53 -04004796
4797 kvm_iodevice_destructor(pos);
4798 }
Marcelo Tosattie93f8a02009-12-23 14:35:24 -02004799 kfree(bus);
Gregory Haskins2eeb2e92007-05-31 14:08:53 -04004800}
4801
Paolo Bonzinic21fbff2013-08-27 15:41:41 +02004802static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
Xiubo Li20e87b72015-02-26 14:58:25 +08004803 const struct kvm_io_range *r2)
Sasha Levin743eeb02011-07-27 16:00:48 +03004804{
Jason Wang8f4216c72015-09-15 14:41:57 +08004805 gpa_t addr1 = r1->addr;
4806 gpa_t addr2 = r2->addr;
4807
4808 if (addr1 < addr2)
Sasha Levin743eeb02011-07-27 16:00:48 +03004809 return -1;
Jason Wang8f4216c72015-09-15 14:41:57 +08004810
4811 /* If r2->len == 0, match the exact address. If r2->len != 0,
4812 * accept any overlapping write. Any order is acceptable for
4813 * overlapping ranges, because kvm_io_bus_get_first_dev ensures
4814 * we process all of them.
4815 */
4816 if (r2->len) {
4817 addr1 += r1->len;
4818 addr2 += r2->len;
4819 }
4820
4821 if (addr1 > addr2)
Sasha Levin743eeb02011-07-27 16:00:48 +03004822 return 1;
Jason Wang8f4216c72015-09-15 14:41:57 +08004823
Sasha Levin743eeb02011-07-27 16:00:48 +03004824 return 0;
4825}
4826
Paolo Bonzinia343c9b2013-07-16 13:03:29 +02004827static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
4828{
Paolo Bonzinic21fbff2013-08-27 15:41:41 +02004829 return kvm_io_bus_cmp(p1, p2);
Paolo Bonzinia343c9b2013-07-16 13:03:29 +02004830}
4831
Geoff Levand39369f72013-04-05 19:20:30 +00004832static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
Sasha Levin743eeb02011-07-27 16:00:48 +03004833 gpa_t addr, int len)
4834{
4835 struct kvm_io_range *range, key;
4836 int off;
4837
4838 key = (struct kvm_io_range) {
4839 .addr = addr,
4840 .len = len,
4841 };
4842
4843 range = bsearch(&key, bus->range, bus->dev_count,
4844 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
4845 if (range == NULL)
4846 return -ENOENT;
4847
4848 off = range - bus->range;
4849
Paolo Bonzinic21fbff2013-08-27 15:41:41 +02004850 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
Sasha Levin743eeb02011-07-27 16:00:48 +03004851 off--;
4852
4853 return off;
4854}
4855
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00004856static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
Cornelia Huck126a5af2013-07-03 16:30:53 +02004857 struct kvm_io_range *range, const void *val)
4858{
4859 int idx;
4860
4861 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
4862 if (idx < 0)
4863 return -EOPNOTSUPP;
4864
4865 while (idx < bus->dev_count &&
Paolo Bonzinic21fbff2013-08-27 15:41:41 +02004866 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00004867 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
Cornelia Huck126a5af2013-07-03 16:30:53 +02004868 range->len, val))
4869 return idx;
4870 idx++;
4871 }
4872
4873 return -EOPNOTSUPP;
4874}
4875
Michael S. Tsirkinbda90202009-06-29 22:24:32 +03004876/* kvm_io_bus_write - called under kvm->slots_lock */
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00004877int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
Michael S. Tsirkinbda90202009-06-29 22:24:32 +03004878 int len, const void *val)
Gregory Haskins2eeb2e92007-05-31 14:08:53 -04004879{
Cornelia Huck126a5af2013-07-03 16:30:53 +02004880 struct kvm_io_bus *bus;
4881 struct kvm_io_range range;
4882 int r;
4883
4884 range = (struct kvm_io_range) {
4885 .addr = addr,
4886 .len = len,
4887 };
4888
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00004889 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
David Hildenbrand90db1042017-03-23 18:24:19 +01004890 if (!bus)
4891 return -ENOMEM;
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00004892 r = __kvm_io_bus_write(vcpu, bus, &range, val);
Cornelia Huck126a5af2013-07-03 16:30:53 +02004893 return r < 0 ? r : 0;
4894}
Leo Yana2420102019-02-22 16:10:09 +08004895EXPORT_SYMBOL_GPL(kvm_io_bus_write);
Cornelia Huck126a5af2013-07-03 16:30:53 +02004896
4897/* kvm_io_bus_write_cookie - called under kvm->slots_lock */
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00004898int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
4899 gpa_t addr, int len, const void *val, long cookie)
Cornelia Huck126a5af2013-07-03 16:30:53 +02004900{
Lai Jiangshan90d83dc2010-04-19 17:41:23 +08004901 struct kvm_io_bus *bus;
Sasha Levin743eeb02011-07-27 16:00:48 +03004902 struct kvm_io_range range;
4903
4904 range = (struct kvm_io_range) {
4905 .addr = addr,
4906 .len = len,
4907 };
Lai Jiangshan90d83dc2010-04-19 17:41:23 +08004908
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00004909 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
David Hildenbrand90db1042017-03-23 18:24:19 +01004910 if (!bus)
4911 return -ENOMEM;
Cornelia Huck126a5af2013-07-03 16:30:53 +02004912
4913 /* First try the device referenced by cookie. */
4914 if ((cookie >= 0) && (cookie < bus->dev_count) &&
Paolo Bonzinic21fbff2013-08-27 15:41:41 +02004915 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00004916 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
Cornelia Huck126a5af2013-07-03 16:30:53 +02004917 val))
4918 return cookie;
4919
4920 /*
4921 * cookie contained garbage; fall back to search and return the
4922 * correct cookie value.
4923 */
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00004924 return __kvm_io_bus_write(vcpu, bus, &range, val);
Cornelia Huck126a5af2013-07-03 16:30:53 +02004925}
4926
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00004927static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
4928 struct kvm_io_range *range, void *val)
Cornelia Huck126a5af2013-07-03 16:30:53 +02004929{
4930 int idx;
4931
4932 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
Sasha Levin743eeb02011-07-27 16:00:48 +03004933 if (idx < 0)
4934 return -EOPNOTSUPP;
4935
4936 while (idx < bus->dev_count &&
Paolo Bonzinic21fbff2013-08-27 15:41:41 +02004937 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00004938 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
Cornelia Huck126a5af2013-07-03 16:30:53 +02004939 range->len, val))
4940 return idx;
Sasha Levin743eeb02011-07-27 16:00:48 +03004941 idx++;
4942 }
4943
Michael S. Tsirkinbda90202009-06-29 22:24:32 +03004944 return -EOPNOTSUPP;
4945}
Gregory Haskins2eeb2e92007-05-31 14:08:53 -04004946
Michael S. Tsirkinbda90202009-06-29 22:24:32 +03004947/* kvm_io_bus_read - called under kvm->slots_lock */
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00004948int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
Marcelo Tosattie93f8a02009-12-23 14:35:24 -02004949 int len, void *val)
Michael S. Tsirkinbda90202009-06-29 22:24:32 +03004950{
Cornelia Huck126a5af2013-07-03 16:30:53 +02004951 struct kvm_io_bus *bus;
4952 struct kvm_io_range range;
4953 int r;
4954
4955 range = (struct kvm_io_range) {
4956 .addr = addr,
4957 .len = len,
4958 };
4959
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00004960 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
David Hildenbrand90db1042017-03-23 18:24:19 +01004961 if (!bus)
4962 return -ENOMEM;
Nikolay Nikolaeve32edf42015-03-26 14:39:28 +00004963 r = __kvm_io_bus_read(vcpu, bus, &range, val);
Cornelia Huck126a5af2013-07-03 16:30:53 +02004964 return r < 0 ? r : 0;
4965}
4966
Marcelo Tosatti79fac952009-12-23 14:35:26 -02004967/* Caller must hold slots_lock. */
Sasha Levin743eeb02011-07-27 16:00:48 +03004968int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
4969 int len, struct kvm_io_device *dev)
Michael S. Tsirkin6c474692009-06-29 22:24:26 +03004970{
Gal Hammerd4c67a72018-01-16 15:34:41 +02004971 int i;
Marcelo Tosattie93f8a02009-12-23 14:35:24 -02004972 struct kvm_io_bus *new_bus, *bus;
Gal Hammerd4c67a72018-01-16 15:34:41 +02004973 struct kvm_io_range range;
Gregory Haskins090b7af2009-07-07 17:08:44 -04004974
Christian Borntraeger4a12f952017-07-07 10:51:38 +02004975 bus = kvm_get_bus(kvm, bus_idx);
David Hildenbrand90db1042017-03-23 18:24:19 +01004976 if (!bus)
4977 return -ENOMEM;
4978
Amos Kong6ea34c92013-05-25 06:44:15 +08004979 /* exclude ioeventfd which is limited by maximum fd */
4980 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
Gregory Haskins090b7af2009-07-07 17:08:44 -04004981 return -ENOSPC;
4982
Gustavo A. R. Silva90952cd2019-01-30 17:07:47 +01004983 new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
Ben Gardonb12ce362019-02-11 11:02:49 -08004984 GFP_KERNEL_ACCOUNT);
Marcelo Tosattie93f8a02009-12-23 14:35:24 -02004985 if (!new_bus)
4986 return -ENOMEM;
Gal Hammerd4c67a72018-01-16 15:34:41 +02004987
4988 range = (struct kvm_io_range) {
4989 .addr = addr,
4990 .len = len,
4991 .dev = dev,
4992 };
4993
4994 for (i = 0; i < bus->dev_count; i++)
4995 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
4996 break;
4997
4998 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
4999 new_bus->dev_count++;
5000 new_bus->range[i] = range;
5001 memcpy(new_bus->range + i + 1, bus->range + i,
5002 (bus->dev_count - i) * sizeof(struct kvm_io_range));
Marcelo Tosattie93f8a02009-12-23 14:35:24 -02005003 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5004 synchronize_srcu_expedited(&kvm->srcu);
5005 kfree(bus);
Gregory Haskins090b7af2009-07-07 17:08:44 -04005006
5007 return 0;
5008}
5009
Sean Christopherson5d3c4c7932021-04-12 15:20:49 -07005010int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5011 struct kvm_io_device *dev)
Gregory Haskins090b7af2009-07-07 17:08:44 -04005012{
Rustam Kovhaevf6588662020-09-07 11:55:35 -07005013 int i, j;
Marcelo Tosattie93f8a02009-12-23 14:35:24 -02005014 struct kvm_io_bus *new_bus, *bus;
Michael S. Tsirkin6c474692009-06-29 22:24:26 +03005015
Sean Christopherson7c896d32021-04-12 15:20:50 -07005016 lockdep_assert_held(&kvm->slots_lock);
5017
Christian Borntraeger4a12f952017-07-07 10:51:38 +02005018 bus = kvm_get_bus(kvm, bus_idx);
Peter Xudf630b82017-03-15 16:01:17 +08005019 if (!bus)
Sean Christopherson5d3c4c7932021-04-12 15:20:49 -07005020 return 0;
Peter Xudf630b82017-03-15 16:01:17 +08005021
Sean Christopherson7c896d32021-04-12 15:20:50 -07005022 for (i = 0; i < bus->dev_count; i++) {
Amos Konga13007162012-03-09 12:17:32 +08005023 if (bus->range[i].dev == dev) {
Gregory Haskins090b7af2009-07-07 17:08:44 -04005024 break;
5025 }
Sean Christopherson7c896d32021-04-12 15:20:50 -07005026 }
Marcelo Tosattie93f8a02009-12-23 14:35:24 -02005027
David Hildenbrand90db1042017-03-23 18:24:19 +01005028 if (i == bus->dev_count)
Sean Christopherson5d3c4c7932021-04-12 15:20:49 -07005029 return 0;
Amos Konga13007162012-03-09 12:17:32 +08005030
Gustavo A. R. Silva90952cd2019-01-30 17:07:47 +01005031 new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
Ben Gardonb12ce362019-02-11 11:02:49 -08005032 GFP_KERNEL_ACCOUNT);
Rustam Kovhaevf6588662020-09-07 11:55:35 -07005033 if (new_bus) {
Rustam Kovhaev871c4332020-09-18 05:05:00 -07005034 memcpy(new_bus, bus, struct_size(bus, range, i));
Rustam Kovhaevf6588662020-09-07 11:55:35 -07005035 new_bus->dev_count--;
5036 memcpy(new_bus->range + i, bus->range + i + 1,
Rustam Kovhaev871c4332020-09-18 05:05:00 -07005037 flex_array_size(new_bus, range, new_bus->dev_count - i));
Sean Christopherson2ee37572021-04-12 15:20:48 -07005038 }
5039
5040 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5041 synchronize_srcu_expedited(&kvm->srcu);
5042
5043 /* Destroy the old bus _after_ installing the (null) bus. */
5044 if (!new_bus) {
David Hildenbrand90db1042017-03-23 18:24:19 +01005045 pr_err("kvm: failed to shrink bus, removing it completely\n");
Rustam Kovhaevf6588662020-09-07 11:55:35 -07005046 for (j = 0; j < bus->dev_count; j++) {
5047 if (j == i)
5048 continue;
5049 kvm_iodevice_destructor(bus->range[j].dev);
5050 }
David Hildenbrand90db1042017-03-23 18:24:19 +01005051 }
Amos Konga13007162012-03-09 12:17:32 +08005052
Marcelo Tosattie93f8a02009-12-23 14:35:24 -02005053 kfree(bus);
Sean Christopherson5d3c4c7932021-04-12 15:20:49 -07005054 return new_bus ? 0 : -ENOMEM;
Gregory Haskins2eeb2e92007-05-31 14:08:53 -04005055}
5056
Andre Przywara8a39d002016-07-15 12:43:26 +01005057struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5058 gpa_t addr)
5059{
5060 struct kvm_io_bus *bus;
5061 int dev_idx, srcu_idx;
5062 struct kvm_io_device *iodev = NULL;
5063
5064 srcu_idx = srcu_read_lock(&kvm->srcu);
5065
5066 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
David Hildenbrand90db1042017-03-23 18:24:19 +01005067 if (!bus)
5068 goto out_unlock;
Andre Przywara8a39d002016-07-15 12:43:26 +01005069
5070 dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
5071 if (dev_idx < 0)
5072 goto out_unlock;
5073
5074 iodev = bus->range[dev_idx].dev;
5075
5076out_unlock:
5077 srcu_read_unlock(&kvm->srcu, srcu_idx);
5078
5079 return iodev;
5080}
5081EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
5082
Janosch Frank536a6f82016-05-18 13:26:23 +02005083static int kvm_debugfs_open(struct inode *inode, struct file *file,
5084 int (*get)(void *, u64 *), int (*set)(void *, u64),
5085 const char *fmt)
5086{
5087 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
5088 inode->i_private;
5089
Peter Xu605c7132021-06-25 11:32:07 -04005090 /*
5091 * The debugfs files are a reference to the kvm struct which
5092 * is still valid when kvm_destroy_vm is called. kvm_get_kvm_safe
5093 * avoids the race between open and the removal of the debugfs directory.
Janosch Frank536a6f82016-05-18 13:26:23 +02005094 */
Peter Xu605c7132021-06-25 11:32:07 -04005095 if (!kvm_get_kvm_safe(stat_data->kvm))
Janosch Frank536a6f82016-05-18 13:26:23 +02005096 return -ENOENT;
5097
Paolo Bonzini833b45d2019-09-30 18:48:44 +02005098 if (simple_attr_open(inode, file, get,
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005099 kvm_stats_debugfs_mode(stat_data->desc) & 0222
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005100 ? set : NULL,
5101 fmt)) {
Janosch Frank536a6f82016-05-18 13:26:23 +02005102 kvm_put_kvm(stat_data->kvm);
5103 return -ENOMEM;
5104 }
5105
5106 return 0;
5107}
5108
5109static int kvm_debugfs_release(struct inode *inode, struct file *file)
5110{
5111 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
5112 inode->i_private;
5113
5114 simple_attr_release(inode, file);
5115 kvm_put_kvm(stat_data->kvm);
5116
5117 return 0;
5118}
5119
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005120static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
Janosch Frank536a6f82016-05-18 13:26:23 +02005121{
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005122 *val = *(u64 *)((void *)(&kvm->stat) + offset);
Janosch Frank536a6f82016-05-18 13:26:23 +02005123
5124 return 0;
5125}
5126
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005127static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005128{
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005129 *(u64 *)((void *)(&kvm->stat) + offset) = 0;
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005130
5131 return 0;
5132}
5133
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005134static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
Janosch Frank536a6f82016-05-18 13:26:23 +02005135{
5136 int i;
Janosch Frank536a6f82016-05-18 13:26:23 +02005137 struct kvm_vcpu *vcpu;
5138
5139 *val = 0;
5140
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005141 kvm_for_each_vcpu(i, vcpu, kvm)
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005142 *val += *(u64 *)((void *)(&vcpu->stat) + offset);
Janosch Frank536a6f82016-05-18 13:26:23 +02005143
5144 return 0;
5145}
5146
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005147static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005148{
5149 int i;
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005150 struct kvm_vcpu *vcpu;
5151
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005152 kvm_for_each_vcpu(i, vcpu, kvm)
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005153 *(u64 *)((void *)(&vcpu->stat) + offset) = 0;
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005154
5155 return 0;
5156}
5157
5158static int kvm_stat_data_get(void *data, u64 *val)
5159{
5160 int r = -EFAULT;
5161 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
5162
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005163 switch (stat_data->kind) {
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005164 case KVM_STAT_VM:
5165 r = kvm_get_stat_per_vm(stat_data->kvm,
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005166 stat_data->desc->desc.offset, val);
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005167 break;
5168 case KVM_STAT_VCPU:
5169 r = kvm_get_stat_per_vcpu(stat_data->kvm,
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005170 stat_data->desc->desc.offset, val);
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005171 break;
5172 }
5173
5174 return r;
5175}
5176
5177static int kvm_stat_data_clear(void *data, u64 val)
5178{
5179 int r = -EFAULT;
5180 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
5181
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005182 if (val)
5183 return -EINVAL;
5184
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005185 switch (stat_data->kind) {
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005186 case KVM_STAT_VM:
5187 r = kvm_clear_stat_per_vm(stat_data->kvm,
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005188 stat_data->desc->desc.offset);
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005189 break;
5190 case KVM_STAT_VCPU:
5191 r = kvm_clear_stat_per_vcpu(stat_data->kvm,
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005192 stat_data->desc->desc.offset);
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005193 break;
5194 }
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005195
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005196 return r;
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005197}
5198
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005199static int kvm_stat_data_open(struct inode *inode, struct file *file)
Janosch Frank536a6f82016-05-18 13:26:23 +02005200{
5201 __simple_attr_check_format("%llu\n", 0ull);
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005202 return kvm_debugfs_open(inode, file, kvm_stat_data_get,
5203 kvm_stat_data_clear, "%llu\n");
Janosch Frank536a6f82016-05-18 13:26:23 +02005204}
5205
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005206static const struct file_operations stat_fops_per_vm = {
5207 .owner = THIS_MODULE,
5208 .open = kvm_stat_data_open,
Janosch Frank536a6f82016-05-18 13:26:23 +02005209 .release = kvm_debugfs_release,
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005210 .read = simple_attr_read,
5211 .write = simple_attr_write,
5212 .llseek = no_llseek,
Janosch Frank536a6f82016-05-18 13:26:23 +02005213};
5214
Christoph Hellwig8b88b092008-02-08 04:20:26 -08005215static int vm_stat_get(void *_offset, u64 *val)
Avi Kivityba1389b2007-11-18 16:24:12 +02005216{
5217 unsigned offset = (long)_offset;
Avi Kivityba1389b2007-11-18 16:24:12 +02005218 struct kvm *kvm;
Janosch Frank536a6f82016-05-18 13:26:23 +02005219 u64 tmp_val;
Avi Kivityba1389b2007-11-18 16:24:12 +02005220
Christoph Hellwig8b88b092008-02-08 04:20:26 -08005221 *val = 0;
Junaid Shahid0d9ce162019-01-03 17:14:28 -08005222 mutex_lock(&kvm_lock);
Janosch Frank536a6f82016-05-18 13:26:23 +02005223 list_for_each_entry(kvm, &vm_list, vm_list) {
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005224 kvm_get_stat_per_vm(kvm, offset, &tmp_val);
Janosch Frank536a6f82016-05-18 13:26:23 +02005225 *val += tmp_val;
5226 }
Junaid Shahid0d9ce162019-01-03 17:14:28 -08005227 mutex_unlock(&kvm_lock);
Christoph Hellwig8b88b092008-02-08 04:20:26 -08005228 return 0;
Avi Kivityba1389b2007-11-18 16:24:12 +02005229}
5230
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005231static int vm_stat_clear(void *_offset, u64 val)
5232{
5233 unsigned offset = (long)_offset;
5234 struct kvm *kvm;
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005235
5236 if (val)
5237 return -EINVAL;
5238
Junaid Shahid0d9ce162019-01-03 17:14:28 -08005239 mutex_lock(&kvm_lock);
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005240 list_for_each_entry(kvm, &vm_list, vm_list) {
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005241 kvm_clear_stat_per_vm(kvm, offset);
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005242 }
Junaid Shahid0d9ce162019-01-03 17:14:28 -08005243 mutex_unlock(&kvm_lock);
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005244
5245 return 0;
5246}
5247
5248DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005249DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
Avi Kivityba1389b2007-11-18 16:24:12 +02005250
Christoph Hellwig8b88b092008-02-08 04:20:26 -08005251static int vcpu_stat_get(void *_offset, u64 *val)
Avi Kivity1165f5f2007-04-19 17:27:43 +03005252{
5253 unsigned offset = (long)_offset;
Avi Kivity1165f5f2007-04-19 17:27:43 +03005254 struct kvm *kvm;
Janosch Frank536a6f82016-05-18 13:26:23 +02005255 u64 tmp_val;
Avi Kivity1165f5f2007-04-19 17:27:43 +03005256
Christoph Hellwig8b88b092008-02-08 04:20:26 -08005257 *val = 0;
Junaid Shahid0d9ce162019-01-03 17:14:28 -08005258 mutex_lock(&kvm_lock);
Janosch Frank536a6f82016-05-18 13:26:23 +02005259 list_for_each_entry(kvm, &vm_list, vm_list) {
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005260 kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
Janosch Frank536a6f82016-05-18 13:26:23 +02005261 *val += tmp_val;
5262 }
Junaid Shahid0d9ce162019-01-03 17:14:28 -08005263 mutex_unlock(&kvm_lock);
Christoph Hellwig8b88b092008-02-08 04:20:26 -08005264 return 0;
Avi Kivity1165f5f2007-04-19 17:27:43 +03005265}
5266
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005267static int vcpu_stat_clear(void *_offset, u64 val)
5268{
5269 unsigned offset = (long)_offset;
5270 struct kvm *kvm;
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005271
5272 if (val)
5273 return -EINVAL;
5274
Junaid Shahid0d9ce162019-01-03 17:14:28 -08005275 mutex_lock(&kvm_lock);
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005276 list_for_each_entry(kvm, &vm_list, vm_list) {
Milan Pandurov09cbcef2019-12-13 14:07:21 +01005277 kvm_clear_stat_per_vcpu(kvm, offset);
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005278 }
Junaid Shahid0d9ce162019-01-03 17:14:28 -08005279 mutex_unlock(&kvm_lock);
Suraj Jitindar Singhce35ef22016-10-19 13:49:47 +11005280
5281 return 0;
5282}
5283
5284DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
5285 "%llu\n");
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005286DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
Avi Kivity1165f5f2007-04-19 17:27:43 +03005287
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02005288static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
5289{
5290 struct kobj_uevent_env *env;
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02005291 unsigned long long created, active;
5292
5293 if (!kvm_dev.this_device || !kvm)
5294 return;
5295
Junaid Shahid0d9ce162019-01-03 17:14:28 -08005296 mutex_lock(&kvm_lock);
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02005297 if (type == KVM_EVENT_CREATE_VM) {
5298 kvm_createvm_count++;
5299 kvm_active_vms++;
5300 } else if (type == KVM_EVENT_DESTROY_VM) {
5301 kvm_active_vms--;
5302 }
5303 created = kvm_createvm_count;
5304 active = kvm_active_vms;
Junaid Shahid0d9ce162019-01-03 17:14:28 -08005305 mutex_unlock(&kvm_lock);
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02005306
Ben Gardonb12ce362019-02-11 11:02:49 -08005307 env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02005308 if (!env)
5309 return;
5310
5311 add_uevent_var(env, "CREATED=%llu", created);
5312 add_uevent_var(env, "COUNT=%llu", active);
5313
Claudio Imbrendafdeaf7e2017-07-24 13:40:03 +02005314 if (type == KVM_EVENT_CREATE_VM) {
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02005315 add_uevent_var(env, "EVENT=create");
Claudio Imbrendafdeaf7e2017-07-24 13:40:03 +02005316 kvm->userspace_pid = task_pid_nr(current);
5317 } else if (type == KVM_EVENT_DESTROY_VM) {
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02005318 add_uevent_var(env, "EVENT=destroy");
Claudio Imbrendafdeaf7e2017-07-24 13:40:03 +02005319 }
5320 add_uevent_var(env, "PID=%d", kvm->userspace_pid);
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02005321
Paolo Bonzini85cd39a2021-08-04 05:28:52 -04005322 if (kvm->debugfs_dentry) {
Ben Gardonb12ce362019-02-11 11:02:49 -08005323 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02005324
Claudio Imbrendafdeaf7e2017-07-24 13:40:03 +02005325 if (p) {
5326 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
5327 if (!IS_ERR(tmp))
5328 add_uevent_var(env, "STATS_PATH=%s", tmp);
5329 kfree(p);
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02005330 }
5331 }
5332 /* no need for checks, since we are adding at most only 5 keys */
5333 env->envp[env->envp_idx++] = NULL;
5334 kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
5335 kfree(env);
Claudio Imbrenda286de8f2017-07-12 17:56:44 +02005336}
5337
Greg Kroah-Hartman929f45e2018-05-29 18:22:04 +02005338static void kvm_init_debug(void)
Avi Kivity6aa8b732006-12-10 02:21:36 -08005339{
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005340 const struct file_operations *fops;
5341 const struct _kvm_stats_desc *pdesc;
5342 int i;
Avi Kivity6aa8b732006-12-10 02:21:36 -08005343
Hollis Blanchard76f7c872008-04-15 16:05:42 -05005344 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
Hamo4f69b682011-12-15 14:23:16 +08005345
Jing Zhangbc9e9e62021-06-23 17:28:46 -04005346 for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
5347 pdesc = &kvm_vm_stats_desc[i];
5348 if (kvm_stats_debugfs_mode(pdesc) & 0222)
5349 fops = &vm_stat_fops;
5350 else
5351 fops = &vm_stat_readonly_fops;
5352 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5353 kvm_debugfs_dir,
5354 (void *)(long)pdesc->desc.offset, fops);
5355 }
5356
5357 for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
5358 pdesc = &kvm_vcpu_stats_desc[i];
5359 if (kvm_stats_debugfs_mode(pdesc) & 0222)
5360 fops = &vcpu_stat_fops;
5361 else
5362 fops = &vcpu_stat_readonly_fops;
5363 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5364 kvm_debugfs_dir,
5365 (void *)(long)pdesc->desc.offset, fops);
Hamo4f69b682011-12-15 14:23:16 +08005366 }
Avi Kivity6aa8b732006-12-10 02:21:36 -08005367}
5368
Rafael J. Wysockifb3600c2011-03-23 22:16:23 +01005369static int kvm_suspend(void)
Avi Kivity59ae6c62007-02-12 00:54:48 -08005370{
Alexander Graf10474ae2009-09-15 11:37:46 +02005371 if (kvm_usage_count)
Takuya Yoshikawa75b71272010-11-16 17:37:41 +09005372 hardware_disable_nolock(NULL);
Avi Kivity59ae6c62007-02-12 00:54:48 -08005373 return 0;
5374}
5375
Rafael J. Wysockifb3600c2011-03-23 22:16:23 +01005376static void kvm_resume(void)
Avi Kivity59ae6c62007-02-12 00:54:48 -08005377{
Zachary Amsdenca84d1a2010-08-19 22:07:28 -10005378 if (kvm_usage_count) {
Wanpeng Li2eb06c32019-05-17 16:49:49 +08005379#ifdef CONFIG_LOCKDEP
5380 WARN_ON(lockdep_is_held(&kvm_count_lock));
5381#endif
Takuya Yoshikawa75b71272010-11-16 17:37:41 +09005382 hardware_enable_nolock(NULL);
Zachary Amsdenca84d1a2010-08-19 22:07:28 -10005383 }
Avi Kivity59ae6c62007-02-12 00:54:48 -08005384}
5385
Rafael J. Wysockifb3600c2011-03-23 22:16:23 +01005386static struct syscore_ops kvm_syscore_ops = {
Avi Kivity59ae6c62007-02-12 00:54:48 -08005387 .suspend = kvm_suspend,
5388 .resume = kvm_resume,
5389};
5390
Avi Kivity15ad7142007-07-11 18:17:21 +03005391static inline
5392struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
5393{
5394 return container_of(pn, struct kvm_vcpu, preempt_notifier);
5395}
5396
5397static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
5398{
5399 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
Xiubo Lif95ef0cd2015-02-26 14:58:23 +08005400
Wanpeng Li046ddee2019-08-01 11:30:14 +08005401 WRITE_ONCE(vcpu->preempted, false);
Wanpeng Lid73eb572019-07-18 19:39:06 +08005402 WRITE_ONCE(vcpu->ready, false);
Avi Kivity15ad7142007-07-11 18:17:21 +03005403
Paolo Bonzini7495e222020-01-09 09:57:19 -05005404 __this_cpu_write(kvm_running_vcpu, vcpu);
Radim Krčmáře790d9e2014-08-21 18:08:05 +02005405 kvm_arch_sched_in(vcpu, cpu);
Zhang Xiantaoe9b11c12007-11-14 20:38:21 +08005406 kvm_arch_vcpu_load(vcpu, cpu);
Avi Kivity15ad7142007-07-11 18:17:21 +03005407}
5408
5409static void kvm_sched_out(struct preempt_notifier *pn,
5410 struct task_struct *next)
5411{
5412 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5413
Peter Zijlstra3ba9f932021-06-11 10:28:13 +02005414 if (current->on_rq) {
Wanpeng Li046ddee2019-08-01 11:30:14 +08005415 WRITE_ONCE(vcpu->preempted, true);
Wanpeng Lid73eb572019-07-18 19:39:06 +08005416 WRITE_ONCE(vcpu->ready, true);
5417 }
Zhang Xiantaoe9b11c12007-11-14 20:38:21 +08005418 kvm_arch_vcpu_put(vcpu);
Paolo Bonzini7495e222020-01-09 09:57:19 -05005419 __this_cpu_write(kvm_running_vcpu, NULL);
5420}
5421
5422/**
5423 * kvm_get_running_vcpu - get the vcpu running on the current CPU.
Marc Zyngier1f03b2b2020-02-07 16:34:10 +00005424 *
5425 * We can disable preemption locally around accessing the per-CPU variable,
5426 * and use the resolved vcpu pointer after enabling preemption again,
5427 * because even if the current thread is migrated to another CPU, reading
5428 * the per-CPU value later will give us the same value as we update the
5429 * per-CPU variable in the preempt notifier handlers.
Paolo Bonzini7495e222020-01-09 09:57:19 -05005430 */
5431struct kvm_vcpu *kvm_get_running_vcpu(void)
5432{
Marc Zyngier1f03b2b2020-02-07 16:34:10 +00005433 struct kvm_vcpu *vcpu;
5434
5435 preempt_disable();
5436 vcpu = __this_cpu_read(kvm_running_vcpu);
5437 preempt_enable();
5438
5439 return vcpu;
Paolo Bonzini7495e222020-01-09 09:57:19 -05005440}
Wanpeng Li379a3c82020-04-28 14:23:27 +08005441EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
Paolo Bonzini7495e222020-01-09 09:57:19 -05005442
5443/**
5444 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
5445 */
5446struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
5447{
5448 return &kvm_running_vcpu;
Avi Kivity15ad7142007-07-11 18:17:21 +03005449}
5450
Sean Christophersonb9904082020-03-21 13:25:55 -07005451struct kvm_cpu_compat_check {
5452 void *opaque;
5453 int *ret;
5454};
5455
5456static void check_processor_compat(void *data)
Sean Christophersonf257d6d2019-04-19 22:18:17 -07005457{
Sean Christophersonb9904082020-03-21 13:25:55 -07005458 struct kvm_cpu_compat_check *c = data;
5459
5460 *c->ret = kvm_arch_check_processor_compat(c->opaque);
Sean Christophersonf257d6d2019-04-19 22:18:17 -07005461}
5462
Avi Kivity0ee75be2010-04-28 15:39:01 +03005463int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
Rusty Russellc16f8622007-07-30 21:12:19 +10005464 struct module *module)
Avi Kivity6aa8b732006-12-10 02:21:36 -08005465{
Sean Christophersonb9904082020-03-21 13:25:55 -07005466 struct kvm_cpu_compat_check c;
Avi Kivity6aa8b732006-12-10 02:21:36 -08005467 int r;
Yang, Sheng002c7f72007-07-31 14:23:01 +03005468 int cpu;
Avi Kivity6aa8b732006-12-10 02:21:36 -08005469
Zhang Xiantaof8c16bb2007-11-14 20:40:21 +08005470 r = kvm_arch_init(opaque);
5471 if (r)
Zhang Xiantaod23087842007-11-29 15:35:39 +08005472 goto out_fail;
Zhang Xiantaocb498ea2007-11-14 20:39:31 +08005473
Asias He7dac16c2013-05-08 10:57:29 +08005474 /*
5475 * kvm_arch_init makes sure there's at most one caller
5476 * for architectures that support multiple implementations,
5477 * like intel and amd on x86.
Paolo Bonzini36343f62016-10-26 13:35:56 +02005478 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
5479 * conflicts in case kvm is already setup for another implementation.
Asias He7dac16c2013-05-08 10:57:29 +08005480 */
Paolo Bonzini36343f62016-10-26 13:35:56 +02005481 r = kvm_irqfd_init();
5482 if (r)
5483 goto out_irqfd;
Asias He7dac16c2013-05-08 10:57:29 +08005484
Avi Kivity8437a6172009-06-06 14:52:35 -07005485 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
Rusty Russell7f59f492008-12-07 21:25:45 +10305486 r = -ENOMEM;
5487 goto out_free_0;
5488 }
5489
Sean Christophersonb9904082020-03-21 13:25:55 -07005490 r = kvm_arch_hardware_setup(opaque);
Avi Kivity6aa8b732006-12-10 02:21:36 -08005491 if (r < 0)
Miaohe Linfaf0be22019-11-23 10:45:50 +08005492 goto out_free_1;
Avi Kivity6aa8b732006-12-10 02:21:36 -08005493
Sean Christophersonb9904082020-03-21 13:25:55 -07005494 c.ret = &r;
5495 c.opaque = opaque;
Yang, Sheng002c7f72007-07-31 14:23:01 +03005496 for_each_online_cpu(cpu) {
Sean Christophersonb9904082020-03-21 13:25:55 -07005497 smp_call_function_single(cpu, check_processor_compat, &c, 1);
Yang, Sheng002c7f72007-07-31 14:23:01 +03005498 if (r < 0)
Miaohe Linfaf0be22019-11-23 10:45:50 +08005499 goto out_free_2;
Yang, Sheng002c7f72007-07-31 14:23:01 +03005500 }
5501
Thomas Gleixner73c1b412016-12-21 20:19:54 +01005502 r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
Thomas Gleixner8c18b2d2016-07-13 17:16:37 +00005503 kvm_starting_cpu, kvm_dying_cpu);
Avi Kivity774c47f2007-02-12 00:54:47 -08005504 if (r)
Zhang Xiantaod23087842007-11-29 15:35:39 +08005505 goto out_free_2;
Avi Kivity6aa8b732006-12-10 02:21:36 -08005506 register_reboot_notifier(&kvm_reboot_notifier);
5507
Rusty Russellc16f8622007-07-30 21:12:19 +10005508 /* A kmem cache lets us meet the alignment requirements of fx_save. */
Avi Kivity0ee75be2010-04-28 15:39:01 +03005509 if (!vcpu_align)
5510 vcpu_align = __alignof__(struct kvm_vcpu);
Paolo Bonzini46515732017-10-26 15:45:46 +02005511 kvm_vcpu_cache =
5512 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
5513 SLAB_ACCOUNT,
5514 offsetof(struct kvm_vcpu, arch),
Jing Zhangce55c042021-06-18 22:27:06 +00005515 offsetofend(struct kvm_vcpu, stats_id)
5516 - offsetof(struct kvm_vcpu, arch),
Paolo Bonzini46515732017-10-26 15:45:46 +02005517 NULL);
Rusty Russellc16f8622007-07-30 21:12:19 +10005518 if (!kvm_vcpu_cache) {
5519 r = -ENOMEM;
Rafael J. Wysockifb3600c2011-03-23 22:16:23 +01005520 goto out_free_3;
Rusty Russellc16f8622007-07-30 21:12:19 +10005521 }
5522
Gleb Natapovaf585b92010-10-14 11:22:46 +02005523 r = kvm_async_pf_init();
5524 if (r)
5525 goto out_free;
5526
Avi Kivity6aa8b732006-12-10 02:21:36 -08005527 kvm_chardev_ops.owner = module;
Christian Borntraeger3d3aab12008-12-02 11:17:32 +01005528 kvm_vm_fops.owner = module;
5529 kvm_vcpu_fops.owner = module;
Avi Kivity6aa8b732006-12-10 02:21:36 -08005530
5531 r = misc_register(&kvm_dev);
5532 if (r) {
Xiubo Li1170adc2015-02-26 14:58:26 +08005533 pr_err("kvm: misc device register failed\n");
Gleb Natapovaf585b92010-10-14 11:22:46 +02005534 goto out_unreg;
Avi Kivity6aa8b732006-12-10 02:21:36 -08005535 }
5536
Rafael J. Wysockifb3600c2011-03-23 22:16:23 +01005537 register_syscore_ops(&kvm_syscore_ops);
5538
Avi Kivity15ad7142007-07-11 18:17:21 +03005539 kvm_preempt_ops.sched_in = kvm_sched_in;
5540 kvm_preempt_ops.sched_out = kvm_sched_out;
5541
Greg Kroah-Hartman929f45e2018-05-29 18:22:04 +02005542 kvm_init_debug();
Darrick J. Wong0ea4ed82009-10-14 16:21:00 -07005543
Paolo Bonzini3c3c29f2014-09-24 13:02:46 +02005544 r = kvm_vfio_ops_init();
5545 WARN_ON(r);
5546
Avi Kivityc7addb92007-09-16 18:58:32 +02005547 return 0;
Avi Kivity6aa8b732006-12-10 02:21:36 -08005548
Gleb Natapovaf585b92010-10-14 11:22:46 +02005549out_unreg:
5550 kvm_async_pf_deinit();
Avi Kivity6aa8b732006-12-10 02:21:36 -08005551out_free:
Rusty Russellc16f8622007-07-30 21:12:19 +10005552 kmem_cache_destroy(kvm_vcpu_cache);
Zhang Xiantaod23087842007-11-29 15:35:39 +08005553out_free_3:
Avi Kivity6aa8b732006-12-10 02:21:36 -08005554 unregister_reboot_notifier(&kvm_reboot_notifier);
Thomas Gleixner8c18b2d2016-07-13 17:16:37 +00005555 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
Zhang Xiantaod23087842007-11-29 15:35:39 +08005556out_free_2:
Zhang Xiantaoe9b11c12007-11-14 20:38:21 +08005557 kvm_arch_hardware_unsetup();
Miaohe Linfaf0be22019-11-23 10:45:50 +08005558out_free_1:
Rusty Russell7f59f492008-12-07 21:25:45 +10305559 free_cpumask_var(cpus_hardware_enabled);
Zhang Xiantaod23087842007-11-29 15:35:39 +08005560out_free_0:
Cornelia Hucka0f155e2013-02-28 12:33:18 +01005561 kvm_irqfd_exit();
Paolo Bonzini36343f62016-10-26 13:35:56 +02005562out_irqfd:
Asias He7dac16c2013-05-08 10:57:29 +08005563 kvm_arch_exit();
5564out_fail:
Avi Kivity6aa8b732006-12-10 02:21:36 -08005565 return r;
5566}
Zhang Xiantaocb498ea2007-11-14 20:39:31 +08005567EXPORT_SYMBOL_GPL(kvm_init);
Avi Kivity6aa8b732006-12-10 02:21:36 -08005568
Zhang Xiantaocb498ea2007-11-14 20:39:31 +08005569void kvm_exit(void)
Avi Kivity6aa8b732006-12-10 02:21:36 -08005570{
Janosch Frank4bd33b52015-10-14 12:37:35 +02005571 debugfs_remove_recursive(kvm_debugfs_dir);
Avi Kivity6aa8b732006-12-10 02:21:36 -08005572 misc_deregister(&kvm_dev);
Rusty Russellc16f8622007-07-30 21:12:19 +10005573 kmem_cache_destroy(kvm_vcpu_cache);
Gleb Natapovaf585b92010-10-14 11:22:46 +02005574 kvm_async_pf_deinit();
Rafael J. Wysockifb3600c2011-03-23 22:16:23 +01005575 unregister_syscore_ops(&kvm_syscore_ops);
Avi Kivity6aa8b732006-12-10 02:21:36 -08005576 unregister_reboot_notifier(&kvm_reboot_notifier);
Thomas Gleixner8c18b2d2016-07-13 17:16:37 +00005577 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
Takuya Yoshikawa75b71272010-11-16 17:37:41 +09005578 on_each_cpu(hardware_disable_nolock, NULL, 1);
Zhang Xiantaoe9b11c12007-11-14 20:38:21 +08005579 kvm_arch_hardware_unsetup();
Zhang Xiantaof8c16bb2007-11-14 20:40:21 +08005580 kvm_arch_exit();
Cornelia Hucka0f155e2013-02-28 12:33:18 +01005581 kvm_irqfd_exit();
Rusty Russell7f59f492008-12-07 21:25:45 +10305582 free_cpumask_var(cpus_hardware_enabled);
Wanpeng Li571ee1b2014-10-09 18:30:08 +08005583 kvm_vfio_ops_exit();
Avi Kivity6aa8b732006-12-10 02:21:36 -08005584}
Zhang Xiantaocb498ea2007-11-14 20:39:31 +08005585EXPORT_SYMBOL_GPL(kvm_exit);
Junaid Shahidc57c8042019-11-04 12:22:02 +01005586
5587struct kvm_vm_worker_thread_context {
5588 struct kvm *kvm;
5589 struct task_struct *parent;
5590 struct completion init_done;
5591 kvm_vm_thread_fn_t thread_fn;
5592 uintptr_t data;
5593 int err;
5594};
5595
5596static int kvm_vm_worker_thread(void *context)
5597{
5598 /*
5599 * The init_context is allocated on the stack of the parent thread, so
5600 * we have to locally copy anything that is needed beyond initialization
5601 */
5602 struct kvm_vm_worker_thread_context *init_context = context;
5603 struct kvm *kvm = init_context->kvm;
5604 kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
5605 uintptr_t data = init_context->data;
5606 int err;
5607
5608 err = kthread_park(current);
5609 /* kthread_park(current) is never supposed to return an error */
5610 WARN_ON(err != 0);
5611 if (err)
5612 goto init_complete;
5613
5614 err = cgroup_attach_task_all(init_context->parent, current);
5615 if (err) {
5616 kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
5617 __func__, err);
5618 goto init_complete;
5619 }
5620
5621 set_user_nice(current, task_nice(init_context->parent));
5622
5623init_complete:
5624 init_context->err = err;
5625 complete(&init_context->init_done);
5626 init_context = NULL;
5627
5628 if (err)
5629 return err;
5630
5631 /* Wait to be woken up by the spawner before proceeding. */
5632 kthread_parkme();
5633
5634 if (!kthread_should_stop())
5635 err = thread_fn(kvm, data);
5636
5637 return err;
5638}
5639
5640int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
5641 uintptr_t data, const char *name,
5642 struct task_struct **thread_ptr)
5643{
5644 struct kvm_vm_worker_thread_context init_context = {};
5645 struct task_struct *thread;
5646
5647 *thread_ptr = NULL;
5648 init_context.kvm = kvm;
5649 init_context.parent = current;
5650 init_context.thread_fn = thread_fn;
5651 init_context.data = data;
5652 init_completion(&init_context.init_done);
5653
5654 thread = kthread_run(kvm_vm_worker_thread, &init_context,
5655 "%s-%d", name, task_pid_nr(current));
5656 if (IS_ERR(thread))
5657 return PTR_ERR(thread);
5658
5659 /* kthread_run is never supposed to return NULL */
5660 WARN_ON(thread == NULL);
5661
5662 wait_for_completion(&init_context.init_done);
5663
5664 if (!init_context.err)
5665 *thread_ptr = thread;
5666
5667 return init_context.err;
5668}