// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
 * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * KVM Xen emulation
 */

#include "x86.h"
#include "xen.h"
#include "hyperv.h"

#include <linux/kvm_host.h>
#include <linux/sched/stat.h>

#include <trace/events/kvm.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/event_channel.h>

#include "trace.h"

DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);

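/*
 * Set up (or tear down, when gfn == GPA_INVALID) the pfn cache for the
 * guest's shared_info page and seed its wallclock fields. The wallclock
 * offsets differ between the 32-bit (compat) and 64-bit shared_info
 * layouts, so the latched long_mode setting selects which offsets are
 * written.
 */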
static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
{
	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
	gpa_t gpa = gfn_to_gpa(gfn);
	int wc_ofs, sec_hi_ofs;
	int ret = 0;
	int idx = srcu_read_lock(&kvm->srcu);

	if (gfn == GPA_INVALID) {
		kvm_gfn_to_pfn_cache_destroy(kvm, gpc);
		goto out;
	}

	ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, false, true, gpa,
					PAGE_SIZE, false);
	if (ret)
		goto out;

	/* Paranoia checks on the 32-bit struct layout */
	BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
	BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924);
	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);

	/* 32-bit location by default */
	wc_ofs = offsetof(struct compat_shared_info, wc);
	sec_hi_ofs = offsetof(struct compat_shared_info, arch.wc_sec_hi);

#ifdef CONFIG_X86_64
	/* Paranoia checks on the 64-bit struct layout */
	BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00);
	BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c);

	if (kvm->arch.xen.long_mode) {
		wc_ofs = offsetof(struct shared_info, wc);
		sec_hi_ofs = offsetof(struct shared_info, wc_sec_hi);
	}
#endif

	kvm_write_wall_clock(kvm, gpa + wc_ofs, sec_hi_ofs - wc_ofs);
	kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);

out:
	srcu_read_unlock(&kvm->srcu, idx);
	return ret;
}

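/*
 * Accumulate the time spent in the previous runstate and switch to the new
 * one. While the vCPU was RUNSTATE_running, any additional scheduler delay
 * (current->sched_info.run_delay) is accounted as RUNSTATE_runnable "steal"
 * time rather than as running time.
 */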
static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
{
	struct kvm_vcpu_xen *vx = &v->arch.xen;
	u64 now = get_kvmclock_ns(v->kvm);
	u64 delta_ns = now - vx->runstate_entry_time;
	u64 run_delay = current->sched_info.run_delay;

	if (unlikely(!vx->runstate_entry_time))
		vx->current_runstate = RUNSTATE_offline;

	/*
	 * Time waiting for the scheduler isn't "stolen" if the
	 * vCPU wasn't running anyway.
	 */
	if (vx->current_runstate == RUNSTATE_running) {
		u64 steal_ns = run_delay - vx->last_steal;

		delta_ns -= steal_ns;

		vx->runstate_times[RUNSTATE_runnable] += steal_ns;
	}
	vx->last_steal = run_delay;

	vx->runstate_times[vx->current_runstate] += delta_ns;
	vx->current_runstate = state;
	vx->runstate_entry_time = now;
}

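/*
 * Update the runstate area shared with the guest. The XEN_RUNSTATE_UPDATE
 * bit is set in state_entry_time before the state and times are rewritten
 * and cleared again afterwards, with smp_wmb() between the steps, so a
 * guest reader can detect and retry around a torn update.
 */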
void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
{
	struct kvm_vcpu_xen *vx = &v->arch.xen;
	uint64_t state_entry_time;
	unsigned int offset;

	kvm_xen_update_runstate(v, state);

	if (!vx->runstate_set)
		return;

	BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);

	offset = offsetof(struct compat_vcpu_runstate_info, state_entry_time);
#ifdef CONFIG_X86_64
	/*
	 * The only difference is alignment of uint64_t in 32-bit.
	 * So the first field 'state' is accessed directly using
	 * offsetof() (where its offset happens to be zero), while the
	 * remaining fields which are all uint64_t, start at 'offset'
	 * which we tweak here by adding 4.
	 */
	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
		     offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
		     offsetof(struct compat_vcpu_runstate_info, time) + 4);

	if (v->kvm->arch.xen.long_mode)
		offset = offsetof(struct vcpu_runstate_info, state_entry_time);
#endif
	/*
	 * First write the updated state_entry_time at the appropriate
	 * location determined by 'offset'.
	 */
	state_entry_time = vx->runstate_entry_time;
	state_entry_time |= XEN_RUNSTATE_UPDATE;

	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
		     sizeof(state_entry_time));
	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
		     sizeof(state_entry_time));

	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
					  &state_entry_time, offset,
					  sizeof(state_entry_time)))
		return;
	smp_wmb();

	/*
	 * Next, write the new runstate. This is in the *same* place
	 * for 32-bit and 64-bit guests, asserted here for paranoia.
	 */
	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
		     offsetof(struct compat_vcpu_runstate_info, state));
	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) !=
		     sizeof(vx->current_runstate));
	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
		     sizeof(vx->current_runstate));

	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
					  &vx->current_runstate,
					  offsetof(struct vcpu_runstate_info, state),
					  sizeof(vx->current_runstate)))
		return;

	/*
	 * Write the actual runstate times immediately after the
	 * runstate_entry_time.
	 */
	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
		     offsetof(struct vcpu_runstate_info, time) - sizeof(u64));
	BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
		     offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64));
	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
		     sizeof_field(struct compat_vcpu_runstate_info, time));
	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
		     sizeof(vx->runstate_times));

	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
					  &vx->runstate_times[0],
					  offset + sizeof(u64),
					  sizeof(vx->runstate_times)))
		return;

	smp_wmb();

	/*
	 * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
	 * runstate_entry_time field.
	 */
	state_entry_time &= ~XEN_RUNSTATE_UPDATE;
	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
					  &state_entry_time, offset,
					  sizeof(state_entry_time)))
		return;
}

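/*
 * Returns 1 if the vCPU's vcpu_info->evtchn_upcall_pending byte is set (or
 * cannot safely be read right now), 0 otherwise. If the in-kernel shadow of
 * evtchn_pending_sel has bits set, they are first propagated to the guest's
 * vcpu_info and evtchn_upcall_pending is raised.
 */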
int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
{
	unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
	bool atomic = in_atomic() || !task_is_running(current);
	int err;
	u8 rc = 0;

	/*
	 * If the global upcall vector (HVMIRQ_callback_vector) is set and
	 * the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending.
	 */
	struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache;
	struct kvm_memslots *slots = kvm_memslots(v->kvm);
	bool ghc_valid = slots->generation == ghc->generation &&
		!kvm_is_error_hva(ghc->hva) && ghc->memslot;

	unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending);

	/* No need for compat handling here */
	BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
		     offsetof(struct compat_vcpu_info, evtchn_upcall_pending));
	BUILD_BUG_ON(sizeof(rc) !=
		     sizeof_field(struct vcpu_info, evtchn_upcall_pending));
	BUILD_BUG_ON(sizeof(rc) !=
		     sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));

	/*
	 * For efficiency, this mirrors the checks for using the valid
	 * cache in kvm_read_guest_offset_cached(), but just uses
	 * __get_user() instead, and falls back to the slow path.
	 */
	if (!evtchn_pending_sel && ghc_valid) {
		/* Fast path */
		pagefault_disable();
		err = __get_user(rc, (u8 __user *)ghc->hva + offset);
		pagefault_enable();
		if (!err)
			return rc;
	}

	/* Slow path */

	/*
	 * This function gets called from kvm_vcpu_block() after setting the
	 * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
	 * from a HLT. So we really mustn't sleep. If the page ended up absent
	 * at that point, just return 1 in order to trigger an immediate wake,
	 * and we'll end up getting called again from a context where we *can*
	 * fault in the page and wait for it.
	 */
	if (atomic)
		return 1;

	if (!ghc_valid) {
		err = kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len);
		if (err || !ghc->memslot) {
			/*
			 * If this failed, userspace has screwed up the
			 * vcpu_info mapping. No interrupts for you.
			 */
			return 0;
		}
	}

	/*
	 * Now we have a valid (protected by srcu) userspace HVA in
	 * ghc->hva which points to the struct vcpu_info. If there
	 * are any bits in the in-kernel evtchn_pending_sel then
	 * we need to write those to the guest vcpu_info and set
	 * its evtchn_upcall_pending flag. If there aren't any bits
	 * to add, we only want to *check* evtchn_upcall_pending.
	 */
	if (evtchn_pending_sel) {
		bool long_mode = v->kvm->arch.xen.long_mode;

		if (!user_access_begin((void __user *)ghc->hva, sizeof(struct vcpu_info)))
			return 0;

		if (IS_ENABLED(CONFIG_64BIT) && long_mode) {
			struct vcpu_info __user *vi = (void __user *)ghc->hva;

			/* Attempt to set the evtchn_pending_sel bits in the
			 * guest, and if that succeeds then clear the same
			 * bits in the in-kernel version. */
			asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n"
				     "\tnotq %0\n"
				     "\t" LOCK_PREFIX "andq %0, %2\n"
				     "2:\n"
				     "\t.section .fixup,\"ax\"\n"
				     "3:\tjmp\t2b\n"
				     "\t.previous\n"
				     _ASM_EXTABLE_UA(1b, 3b)
				     : "=r" (evtchn_pending_sel),
				       "+m" (vi->evtchn_pending_sel),
				       "+m" (v->arch.xen.evtchn_pending_sel)
				     : "0" (evtchn_pending_sel));
		} else {
			struct compat_vcpu_info __user *vi = (void __user *)ghc->hva;
			u32 evtchn_pending_sel32 = evtchn_pending_sel;

			/* Attempt to set the evtchn_pending_sel bits in the
			 * guest, and if that succeeds then clear the same
			 * bits in the in-kernel version. */
			asm volatile("1:\t" LOCK_PREFIX "orl %0, %1\n"
				     "\tnotl %0\n"
				     "\t" LOCK_PREFIX "andl %0, %2\n"
				     "2:\n"
				     "\t.section .fixup,\"ax\"\n"
				     "3:\tjmp\t2b\n"
				     "\t.previous\n"
				     _ASM_EXTABLE_UA(1b, 3b)
				     : "=r" (evtchn_pending_sel32),
				       "+m" (vi->evtchn_pending_sel),
				       "+m" (v->arch.xen.evtchn_pending_sel)
				     : "0" (evtchn_pending_sel32));
		}
		rc = 1;
		unsafe_put_user(rc, (u8 __user *)ghc->hva + offset, err);

 err:
		user_access_end();

		mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
	} else {
		__get_user(rc, (u8 __user *)ghc->hva + offset);
	}

	return rc;
}

int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
{
	int r = -ENOENT;

	mutex_lock(&kvm->lock);

	switch (data->type) {
	case KVM_XEN_ATTR_TYPE_LONG_MODE:
		if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
			r = -EINVAL;
		} else {
			kvm->arch.xen.long_mode = !!data->u.long_mode;
			r = 0;
		}
		break;

	case KVM_XEN_ATTR_TYPE_SHARED_INFO:
		r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
		break;

	case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
		if (data->u.vector && data->u.vector < 0x10)
			r = -EINVAL;
		else {
			kvm->arch.xen.upcall_vector = data->u.vector;
			r = 0;
		}
		break;

	default:
		break;
	}

	mutex_unlock(&kvm->lock);
	return r;
}

int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
{
	int r = -ENOENT;

	mutex_lock(&kvm->lock);

	switch (data->type) {
	case KVM_XEN_ATTR_TYPE_LONG_MODE:
		data->u.long_mode = kvm->arch.xen.long_mode;
		r = 0;
		break;

	case KVM_XEN_ATTR_TYPE_SHARED_INFO:
		if (kvm->arch.xen.shinfo_cache.active)
			data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
		else
			data->u.shared_info.gfn = GPA_INVALID;
		r = 0;
		break;

	case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
		data->u.vector = kvm->arch.xen.upcall_vector;
		r = 0;
		break;

	default:
		break;
	}

	mutex_unlock(&kvm->lock);
	return r;
}

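/*
 * Per-vCPU attributes: the guest physical addresses of the vcpu_info,
 * pvclock time info and runstate areas are cached via gfn_to_hva caches,
 * and the runstate accounting can be seeded or adjusted by userspace
 * (e.g. when restoring a migrated vCPU).
 */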
int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
{
	int idx, r = -ENOENT;

	mutex_lock(&vcpu->kvm->lock);
	idx = srcu_read_lock(&vcpu->kvm->srcu);

	switch (data->type) {
	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
		/* No compat necessary here. */
		BUILD_BUG_ON(sizeof(struct vcpu_info) !=
			     sizeof(struct compat_vcpu_info));
		BUILD_BUG_ON(offsetof(struct vcpu_info, time) !=
			     offsetof(struct compat_vcpu_info, time));

		if (data->u.gpa == GPA_INVALID) {
			vcpu->arch.xen.vcpu_info_set = false;
			r = 0;
			break;
		}

		r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
					      &vcpu->arch.xen.vcpu_info_cache,
					      data->u.gpa,
					      sizeof(struct vcpu_info));
		if (!r) {
			vcpu->arch.xen.vcpu_info_set = true;
			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
		}
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
		if (data->u.gpa == GPA_INVALID) {
			vcpu->arch.xen.vcpu_time_info_set = false;
			r = 0;
			break;
		}

		r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
					      &vcpu->arch.xen.vcpu_time_info_cache,
					      data->u.gpa,
					      sizeof(struct pvclock_vcpu_time_info));
		if (!r) {
			vcpu->arch.xen.vcpu_time_info_set = true;
			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
		}
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		if (data->u.gpa == GPA_INVALID) {
			vcpu->arch.xen.runstate_set = false;
			r = 0;
			break;
		}

		r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
					      &vcpu->arch.xen.runstate_cache,
					      data->u.gpa,
					      sizeof(struct vcpu_runstate_info));
		if (!r)
			vcpu->arch.xen.runstate_set = true;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		if (data->u.runstate.state > RUNSTATE_offline) {
			r = -EINVAL;
			break;
		}

		kvm_xen_update_runstate(vcpu, data->u.runstate.state);
		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		if (data->u.runstate.state > RUNSTATE_offline) {
			r = -EINVAL;
			break;
		}
		if (data->u.runstate.state_entry_time !=
		    (data->u.runstate.time_running +
		     data->u.runstate.time_runnable +
		     data->u.runstate.time_blocked +
		     data->u.runstate.time_offline)) {
			r = -EINVAL;
			break;
		}
		if (get_kvmclock_ns(vcpu->kvm) <
		    data->u.runstate.state_entry_time) {
			r = -EINVAL;
			break;
		}

		vcpu->arch.xen.current_runstate = data->u.runstate.state;
		vcpu->arch.xen.runstate_entry_time =
			data->u.runstate.state_entry_time;
		vcpu->arch.xen.runstate_times[RUNSTATE_running] =
			data->u.runstate.time_running;
		vcpu->arch.xen.runstate_times[RUNSTATE_runnable] =
			data->u.runstate.time_runnable;
		vcpu->arch.xen.runstate_times[RUNSTATE_blocked] =
			data->u.runstate.time_blocked;
		vcpu->arch.xen.runstate_times[RUNSTATE_offline] =
			data->u.runstate.time_offline;
		vcpu->arch.xen.last_steal = current->sched_info.run_delay;
		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		if (data->u.runstate.state > RUNSTATE_offline &&
		    data->u.runstate.state != (u64)-1) {
			r = -EINVAL;
			break;
		}
		/* The adjustment must add up */
		if (data->u.runstate.state_entry_time !=
		    (data->u.runstate.time_running +
		     data->u.runstate.time_runnable +
		     data->u.runstate.time_blocked +
		     data->u.runstate.time_offline)) {
			r = -EINVAL;
			break;
		}

		if (get_kvmclock_ns(vcpu->kvm) <
		    (vcpu->arch.xen.runstate_entry_time +
		     data->u.runstate.state_entry_time)) {
			r = -EINVAL;
			break;
		}

		vcpu->arch.xen.runstate_entry_time +=
			data->u.runstate.state_entry_time;
		vcpu->arch.xen.runstate_times[RUNSTATE_running] +=
			data->u.runstate.time_running;
		vcpu->arch.xen.runstate_times[RUNSTATE_runnable] +=
			data->u.runstate.time_runnable;
		vcpu->arch.xen.runstate_times[RUNSTATE_blocked] +=
			data->u.runstate.time_blocked;
		vcpu->arch.xen.runstate_times[RUNSTATE_offline] +=
			data->u.runstate.time_offline;

		if (data->u.runstate.state <= RUNSTATE_offline)
			kvm_xen_update_runstate(vcpu, data->u.runstate.state);
		r = 0;
		break;

	default:
		break;
	}

	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	mutex_unlock(&vcpu->kvm->lock);
	return r;
}

int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
{
	int r = -ENOENT;

	mutex_lock(&vcpu->kvm->lock);

	switch (data->type) {
	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
		if (vcpu->arch.xen.vcpu_info_set)
			data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
		else
			data->u.gpa = GPA_INVALID;
		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
		if (vcpu->arch.xen.vcpu_time_info_set)
			data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
		else
			data->u.gpa = GPA_INVALID;
		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		if (vcpu->arch.xen.runstate_set) {
			data->u.gpa = vcpu->arch.xen.runstate_cache.gpa;
			r = 0;
		}
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		data->u.runstate.state = vcpu->arch.xen.current_runstate;
		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		data->u.runstate.state = vcpu->arch.xen.current_runstate;
		data->u.runstate.state_entry_time =
			vcpu->arch.xen.runstate_entry_time;
		data->u.runstate.time_running =
			vcpu->arch.xen.runstate_times[RUNSTATE_running];
		data->u.runstate.time_runnable =
			vcpu->arch.xen.runstate_times[RUNSTATE_runnable];
		data->u.runstate.time_blocked =
			vcpu->arch.xen.runstate_times[RUNSTATE_blocked];
		data->u.runstate.time_offline =
			vcpu->arch.xen.runstate_times[RUNSTATE_offline];
		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
		r = -EINVAL;
		break;

	default:
		break;
	}

	mutex_unlock(&vcpu->kvm->lock);
	return r;
}

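/*
 * Populate the guest's hypercall page (written via the MSR configured in
 * kvm->arch.xen_hvm_config.msr). With KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL the
 * page is filled with 32-byte stubs, one per hypercall number: each loads its
 * index into %eax, executes VMCALL/VMMCALL and returns. Otherwise the page
 * contents are copied from the userspace-provided blob.
 */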
int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
{
	struct kvm *kvm = vcpu->kvm;
	u32 page_num = data & ~PAGE_MASK;
	u64 page_addr = data & PAGE_MASK;
	bool lm = is_long_mode(vcpu);

	/* Latch long_mode for shared_info pages etc. */
	vcpu->kvm->arch.xen.long_mode = lm;

	/*
	 * If Xen hypercall intercept is enabled, fill the hypercall
	 * page with VMCALL/VMMCALL instructions since that's what
	 * we catch. Else the VMM has provided the hypercall pages
	 * with instructions of its own choosing, so use those.
	 */
	if (kvm_xen_hypercall_enabled(kvm)) {
		u8 instructions[32];
		int i;

		if (page_num)
			return 1;

		/* mov imm32, %eax */
		instructions[0] = 0xb8;

		/* vmcall / vmmcall */
		kvm_x86_ops.patch_hypercall(vcpu, instructions + 5);

		/* ret */
		instructions[8] = 0xc3;

		/* int3 to pad */
		memset(instructions + 9, 0xcc, sizeof(instructions) - 9);

		for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) {
			*(u32 *)&instructions[1] = i;
			if (kvm_vcpu_write_guest(vcpu,
						 page_addr + (i * sizeof(instructions)),
						 instructions, sizeof(instructions)))
				return 1;
		}
	} else {
		/*
		 * Note, truncation is a non-issue as 'lm' is guaranteed to be
		 * false for a 32-bit kernel, i.e. when hva_t is only 4 bytes.
		 */
		hva_t blob_addr = lm ? kvm->arch.xen_hvm_config.blob_addr_64
				     : kvm->arch.xen_hvm_config.blob_addr_32;
		u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
				  : kvm->arch.xen_hvm_config.blob_size_32;
		u8 *page;

		if (page_num >= blob_size)
			return 1;

		blob_addr += page_num * PAGE_SIZE;

		page = memdup_user((u8 __user *)blob_addr, PAGE_SIZE);
		if (IS_ERR(page))
			return PTR_ERR(page);

		if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
			kfree(page);
			return 1;
		}

		/* Free the memdup_user() copy on the success path too. */
		kfree(page);
	}
	return 0;
}

int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
{
	if (xhc->flags & ~KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL)
		return -EINVAL;

	/*
	 * With hypercall interception the kernel generates its own
	 * hypercall page so it must not be provided.
	 */
	if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) &&
	    (xhc->blob_addr_32 || xhc->blob_addr_64 ||
	     xhc->blob_size_32 || xhc->blob_size_64))
		return -EINVAL;

	mutex_lock(&kvm->lock);

	if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
		static_branch_inc(&kvm_xen_enabled.key);
	else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
		static_branch_slow_dec_deferred(&kvm_xen_enabled);

	memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));

	mutex_unlock(&kvm->lock);
	return 0;
}

void kvm_xen_init_vm(struct kvm *kvm)
{
}

void kvm_xen_destroy_vm(struct kvm *kvm)
{
	kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache);

	if (kvm->arch.xen_hvm_config.msr)
		static_branch_slow_dec_deferred(&kvm_xen_enabled);
}

static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
{
	kvm_rax_write(vcpu, result);
	return kvm_skip_emulated_instruction(vcpu);
}

static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
{
	struct kvm_run *run = vcpu->run;

	if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.xen.hypercall_rip)))
		return 1;

	return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
}

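/*
 * Forward a Xen hypercall to userspace as a KVM_EXIT_XEN exit. The argument
 * registers follow the Xen ABI: rdi/rsi/rdx/r10/r8/r9 for 64-bit guests,
 * ebx/ecx/edx/esi/edi/ebp for 32-bit guests. Completion (writing the result
 * back to %rax and skipping the instruction) happens when userspace re-enters
 * the vCPU.
 */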
int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
{
	bool longmode;
	u64 input, params[6];

	input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);

	/* Hyper-V hypercalls get bit 31 set in EAX */
	if ((input & 0x80000000) &&
	    kvm_hv_hypercall_enabled(vcpu))
		return kvm_hv_hypercall(vcpu);

	longmode = is_64_bit_hypercall(vcpu);
	if (!longmode) {
		params[0] = (u32)kvm_rbx_read(vcpu);
		params[1] = (u32)kvm_rcx_read(vcpu);
		params[2] = (u32)kvm_rdx_read(vcpu);
		params[3] = (u32)kvm_rsi_read(vcpu);
		params[4] = (u32)kvm_rdi_read(vcpu);
		params[5] = (u32)kvm_rbp_read(vcpu);
	}
#ifdef CONFIG_X86_64
	else {
		params[0] = (u64)kvm_rdi_read(vcpu);
		params[1] = (u64)kvm_rsi_read(vcpu);
		params[2] = (u64)kvm_rdx_read(vcpu);
		params[3] = (u64)kvm_r10_read(vcpu);
		params[4] = (u64)kvm_r8_read(vcpu);
		params[5] = (u64)kvm_r9_read(vcpu);
	}
#endif
	trace_kvm_xen_hypercall(input, params[0], params[1], params[2],
				params[3], params[4], params[5]);

	vcpu->run->exit_reason = KVM_EXIT_XEN;
	vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
	vcpu->run->xen.u.hcall.longmode = longmode;
	vcpu->run->xen.u.hcall.cpl = kvm_x86_ops.get_cpl(vcpu);
	vcpu->run->xen.u.hcall.input = input;
	vcpu->run->xen.u.hcall.params[0] = params[0];
	vcpu->run->xen.u.hcall.params[1] = params[1];
	vcpu->run->xen.u.hcall.params[2] = params[2];
	vcpu->run->xen.u.hcall.params[3] = params[3];
	vcpu->run->xen.u.hcall.params[4] = params[4];
	vcpu->run->xen.u.hcall.params[5] = params[5];
	vcpu->arch.xen.hypercall_rip = kvm_get_linear_rip(vcpu);
	vcpu->arch.complete_userspace_io =
		kvm_xen_hypercall_complete_userspace;

	return 0;
}

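/*
 * With 2-level event channels the number of ports is bounded by the size of
 * the evtchn_pending/evtchn_mask arrays in shared_info: 4096 ports for
 * 64-bit guests and 1024 for 32-bit (compat) guests.
 */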
static inline int max_evtchn_port(struct kvm *kvm)
{
	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
		return EVTCHN_2L_NR_CHANNELS;
	else
		return COMPAT_EVTCHN_2L_NR_CHANNELS;
}

/*
 * This follows the kvm_set_irq() API, so it returns:
 * < 0   Interrupt was ignored (masked or not delivered for other reasons)
 * = 0   Interrupt was coalesced (previous irq is still pending)
 * > 0   Number of CPUs interrupt was delivered to
 */
int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
			    struct kvm *kvm)
{
	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
	struct kvm_vcpu *vcpu;
	unsigned long *pending_bits, *mask_bits;
	unsigned long flags;
	int port_word_bit;
	bool kick_vcpu = false;
	int idx;
	int rc;

	vcpu = kvm_get_vcpu_by_id(kvm, e->xen_evtchn.vcpu);
	if (!vcpu)
		return -1;

	if (!vcpu->arch.xen.vcpu_info_set)
		return -1;

	if (e->xen_evtchn.port >= max_evtchn_port(kvm))
		return -1;

	rc = -EWOULDBLOCK;
	read_lock_irqsave(&gpc->lock, flags);

	idx = srcu_read_lock(&kvm->srcu);
	if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
		goto out_rcu;

	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
		struct shared_info *shinfo = gpc->khva;
		pending_bits = (unsigned long *)&shinfo->evtchn_pending;
		mask_bits = (unsigned long *)&shinfo->evtchn_mask;
		port_word_bit = e->xen_evtchn.port / 64;
	} else {
		struct compat_shared_info *shinfo = gpc->khva;
		pending_bits = (unsigned long *)&shinfo->evtchn_pending;
		mask_bits = (unsigned long *)&shinfo->evtchn_mask;
		port_word_bit = e->xen_evtchn.port / 32;
	}

	/*
	 * If this port wasn't already set, and if it isn't masked, then
	 * we try to set the corresponding bit in the in-kernel shadow of
	 * evtchn_pending_sel for the target vCPU. And if *that* wasn't
	 * already set, then we kick the vCPU in question to write to the
	 * *real* evtchn_pending_sel in its own guest vcpu_info struct.
	 */
	if (test_and_set_bit(e->xen_evtchn.port, pending_bits)) {
		rc = 0; /* It was already raised */
	} else if (test_bit(e->xen_evtchn.port, mask_bits)) {
		rc = -1; /* Masked */
	} else {
		rc = 1; /* Delivered. But was the vCPU waking already? */
		if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
			kick_vcpu = true;
	}

 out_rcu:
	srcu_read_unlock(&kvm->srcu, idx);
	read_unlock_irqrestore(&gpc->lock, flags);

	if (kick_vcpu) {
		kvm_make_request(KVM_REQ_EVENT, vcpu);
		kvm_vcpu_kick(vcpu);
	}

	return rc;
}

/* This is the version called from kvm_set_irq() as the .set function */
static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
			 int irq_source_id, int level, bool line_status)
{
	bool mm_borrowed = false;
	int rc;

	if (!level)
		return -1;

	rc = kvm_xen_set_evtchn_fast(e, kvm);
	if (rc != -EWOULDBLOCK)
		return rc;

	if (current->mm != kvm->mm) {
		/*
		 * If not on a thread which already belongs to this KVM,
		 * we'd better be in the irqfd workqueue.
		 */
		if (WARN_ON_ONCE(current->mm))
			return -EINVAL;

		kthread_use_mm(kvm->mm);
		mm_borrowed = true;
	}

	/*
	 * For the irqfd workqueue, using the main kvm->lock mutex is
	 * fine since this function is invoked from kvm_set_irq() with
	 * no other lock held, no srcu. In the future, if it is called
	 * directly from a vCPU thread (e.g. on hypercall for an IPI),
	 * it may need to switch to using a leaf-node mutex for
	 * serializing the shared_info mapping.
	 */
	mutex_lock(&kvm->lock);

	/*
	 * It is theoretically possible for the page to be unmapped
	 * and the MMU notifier to invalidate the shared_info before
	 * we even get to use it. In that case, this looks like an
	 * infinite loop. It was tempting to do it via the userspace
	 * HVA instead... but that just *hides* the fact that it's
	 * an infinite loop, because if a fault occurs and it waits
	 * for the page to come back, it can *still* immediately
	 * fault and have to wait again, repeatedly.
	 *
	 * Conversely, the page could also have been reinstated by
	 * another thread before we even obtain the mutex above, so
	 * check again *first* before remapping it.
	 */
	do {
		struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
		int idx;

		rc = kvm_xen_set_evtchn_fast(e, kvm);
		if (rc != -EWOULDBLOCK)
			break;

		idx = srcu_read_lock(&kvm->srcu);
		rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa,
						  PAGE_SIZE, false);
		srcu_read_unlock(&kvm->srcu, idx);
	} while (!rc);

	mutex_unlock(&kvm->lock);

	if (mm_borrowed)
		kthread_unuse_mm(kvm->mm);

	return rc;
}

int kvm_xen_setup_evtchn(struct kvm *kvm,
			 struct kvm_kernel_irq_routing_entry *e,
			 const struct kvm_irq_routing_entry *ue)
{
	if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
		return -EINVAL;

	/* We only support 2 level event channels for now */
	if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
		return -EINVAL;

	e->xen_evtchn.port = ue->u.xen_evtchn.port;
	e->xen_evtchn.vcpu = ue->u.xen_evtchn.vcpu;
	e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
	e->set = evtchn_set_fn;

	return 0;
}