Blame - arch/x86/kvm/xen.c - SHIFTPHONES/mainline/linux

blob: 0e3f7d6e9fd775d68589139feb1f4adf8eea9277 [file] [log] [blame]

Joao Martins	23200b7	2018-06-13 09:55:44 -0400	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
				4	* Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
				5	*
				6	* KVM Xen emulation
				7	*/
				8
				9	#include "x86.h"
				10	#include "xen.h"
Joao Martins	79033be	2018-06-13 09:55:44 -0400	[diff] [blame]	11	#include "hyperv.h"
Joao Martins	23200b7	2018-06-13 09:55:44 -0400	[diff] [blame]	12
				13	#include <linux/kvm_host.h>
David Woodhouse	30b5c85	2021-03-01 12:53:09 +0000	[diff] [blame]	14	#include <linux/sched/stat.h>
Joao Martins	23200b7	2018-06-13 09:55:44 -0400	[diff] [blame]	15
				16	#include <trace/events/kvm.h>
Joao Martins	13ffb97	2018-06-15 21:17:14 -0400	[diff] [blame]	17	#include <xen/interface/xen.h>
David Woodhouse	30b5c85	2021-03-01 12:53:09 +0000	[diff] [blame]	18	#include <xen/interface/vcpu.h>
David Woodhouse	14243b3	2021-12-10 16:36:23 +0000	[diff] [blame]	19	#include <xen/interface/event_channel.h>
Joao Martins	23200b7	2018-06-13 09:55:44 -0400	[diff] [blame]	20
				21	#include "trace.h"
				22
/*
 * Deferred static key tracking whether any VM has a Xen hypercall MSR
 * configured: kvm_xen_hvm_config() increments it when a VM's msr goes from
 * zero to non-zero and does a deferred decrement on the opposite transition;
 * kvm_xen_destroy_vm() does a deferred decrement if the VM still had an msr.
 * NOTE(review): HZ is presumably the deferral timeout in jiffies -- confirm
 * against DEFINE_STATIC_KEY_DEFERRED_FALSE().
 */
David Woodhouse	7d6bbeb	2021-02-02 15:48:05 +0000	[diff] [blame]	23	DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
				24
Joao Martins	13ffb97	2018-06-15 21:17:14 -0400	[diff] [blame]	25	static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
				26	{
David Woodhouse	1cfc9c4	2021-12-10 16:36:22 +0000	[diff] [blame]	27	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
David Woodhouse	5574976	2021-12-10 16:36:24 +0000	[diff] [blame]	28	struct pvclock_wall_clock *wc;
Joao Martins	629b534	2018-06-28 15:06:43 -0400	[diff] [blame]	29	gpa_t gpa = gfn_to_gpa(gfn);
David Woodhouse	5574976	2021-12-10 16:36:24 +0000	[diff] [blame]	30	u32 *wc_sec_hi;
				31	u32 wc_version;
				32	u64 wall_nsec;
Paolo Bonzini	319afe6	2021-08-04 12:48:41 -0400	[diff] [blame]	33	int ret = 0;
Joao Martins	13ffb97	2018-06-15 21:17:14 -0400	[diff] [blame]	34	int idx = srcu_read_lock(&kvm->srcu);
				35
David Woodhouse	1cfc9c4	2021-12-10 16:36:22 +0000	[diff] [blame]	36	if (gfn == GPA_INVALID) {
				37	kvm_gfn_to_pfn_cache_destroy(kvm, gpc);
Joao Martins	629b534	2018-06-28 15:06:43 -0400	[diff] [blame]	38	goto out;
Paolo Bonzini	319afe6	2021-08-04 12:48:41 -0400	[diff] [blame]	39	}
David Woodhouse	1cfc9c4	2021-12-10 16:36:22 +0000	[diff] [blame]	40
David Woodhouse	5574976	2021-12-10 16:36:24 +0000	[diff] [blame]	41	do {
				42	ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, false, true,
				43	gpa, PAGE_SIZE, false);
				44	if (ret)
				45	goto out;
				46
				47	/*
				48	* This code mirrors kvm_write_wall_clock() except that it writes
				49	* directly through the pfn cache and doesn't mark the page dirty.
				50	*/
				51	wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
				52
				53	/* It could be invalid again already, so we need to check */
				54	read_lock_irq(&gpc->lock);
				55
				56	if (gpc->valid)
				57	break;
				58
				59	read_unlock_irq(&gpc->lock);
				60	} while (1);
Joao Martins	629b534	2018-06-28 15:06:43 -0400	[diff] [blame]	61
				62	/* Paranoia checks on the 32-bit struct layout */
				63	BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
				64	BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924);
				65	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
				66
Joao Martins	629b534	2018-06-28 15:06:43 -0400	[diff] [blame]	67	#ifdef CONFIG_X86_64
				68	/* Paranoia checks on the 64-bit struct layout */
				69	BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00);
				70	BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c);
				71
David Woodhouse	5574976	2021-12-10 16:36:24 +0000	[diff] [blame]	72	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
				73	struct shared_info *shinfo = gpc->khva;
Joao Martins	629b534	2018-06-28 15:06:43 -0400	[diff] [blame]	74
David Woodhouse	5574976	2021-12-10 16:36:24 +0000	[diff] [blame]	75	wc_sec_hi = &shinfo->wc_sec_hi;
				76	wc = &shinfo->wc;
				77	} else
				78	#endif
				79	{
				80	struct compat_shared_info *shinfo = gpc->khva;
				81
				82	wc_sec_hi = &shinfo->arch.wc_sec_hi;
				83	wc = &shinfo->wc;
				84	}
				85
				86	/* Increment and ensure an odd value */
				87	wc_version = wc->version = (wc->version + 1) \| 1;
				88	smp_wmb();
				89
				90	wc->nsec = do_div(wall_nsec, 1000000000);
				91	wc->sec = (u32)wall_nsec;
				92	*wc_sec_hi = wall_nsec >> 32;
				93	smp_wmb();
				94
				95	wc->version = wc_version + 1;
				96	read_unlock_irq(&gpc->lock);
				97
Joao Martins	629b534	2018-06-28 15:06:43 -0400	[diff] [blame]	98	kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);
				99
				100	out:
Joao Martins	13ffb97	2018-06-15 21:17:14 -0400	[diff] [blame]	101	srcu_read_unlock(&kvm->srcu, idx);
				102	return ret;
				103	}
				104
David Woodhouse	30b5c85	2021-03-01 12:53:09 +0000	[diff] [blame]	105	static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
				106	{
				107	struct kvm_vcpu_xen *vx = &v->arch.xen;
				108	u64 now = get_kvmclock_ns(v->kvm);
				109	u64 delta_ns = now - vx->runstate_entry_time;
				110	u64 run_delay = current->sched_info.run_delay;
				111
				112	if (unlikely(!vx->runstate_entry_time))
				113	vx->current_runstate = RUNSTATE_offline;
				114
				115	/*
				116	* Time waiting for the scheduler isn't "stolen" if the
				117	* vCPU wasn't running anyway.
				118	*/
				119	if (vx->current_runstate == RUNSTATE_running) {
				120	u64 steal_ns = run_delay - vx->last_steal;
				121
				122	delta_ns -= steal_ns;
				123
				124	vx->runstate_times[RUNSTATE_runnable] += steal_ns;
				125	}
				126	vx->last_steal = run_delay;
				127
				128	vx->runstate_times[vx->current_runstate] += delta_ns;
				129	vx->current_runstate = state;
				130	vx->runstate_entry_time = now;
				131	}
				132
				133	void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
				134	{
				135	struct kvm_vcpu_xen *vx = &v->arch.xen;
				136	uint64_t state_entry_time;
				137	unsigned int offset;
				138
				139	kvm_xen_update_runstate(v, state);
				140
				141	if (!vx->runstate_set)
				142	return;
				143
				144	BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
				145
				146	offset = offsetof(struct compat_vcpu_runstate_info, state_entry_time);
				147	#ifdef CONFIG_X86_64
				148	/*
				149	* The only difference is alignment of uint64_t in 32-bit.
				150	* So the first field 'state' is accessed directly using
				151	* offsetof() (where its offset happens to be zero), while the
				152	* remaining fields which are all uint64_t, start at 'offset'
				153	* which we tweak here by adding 4.
				154	*/
				155	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
				156	offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
				157	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
				158	offsetof(struct compat_vcpu_runstate_info, time) + 4);
				159
				160	if (v->kvm->arch.xen.long_mode)
				161	offset = offsetof(struct vcpu_runstate_info, state_entry_time);
				162	#endif
				163	/*
				164	* First write the updated state_entry_time at the appropriate
				165	* location determined by 'offset'.
				166	*/
				167	state_entry_time = vx->runstate_entry_time;
				168	state_entry_time \|= XEN_RUNSTATE_UPDATE;
				169
David Woodhouse	6a83475	2021-11-15 16:50:23 +0000	[diff] [blame]	170	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
David Woodhouse	30b5c85	2021-03-01 12:53:09 +0000	[diff] [blame]	171	sizeof(state_entry_time));
David Woodhouse	6a83475	2021-11-15 16:50:23 +0000	[diff] [blame]	172	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
David Woodhouse	30b5c85	2021-03-01 12:53:09 +0000	[diff] [blame]	173	sizeof(state_entry_time));
				174
				175	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
				176	&state_entry_time, offset,
				177	sizeof(state_entry_time)))
				178	return;
				179	smp_wmb();
				180
				181	/*
				182	* Next, write the new runstate. This is in the same place
				183	* for 32-bit and 64-bit guests, asserted here for paranoia.
				184	*/
				185	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
				186	offsetof(struct compat_vcpu_runstate_info, state));
David Woodhouse	6a83475	2021-11-15 16:50:23 +0000	[diff] [blame]	187	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) !=
David Woodhouse	30b5c85	2021-03-01 12:53:09 +0000	[diff] [blame]	188	sizeof(vx->current_runstate));
David Woodhouse	6a83475	2021-11-15 16:50:23 +0000	[diff] [blame]	189	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
David Woodhouse	30b5c85	2021-03-01 12:53:09 +0000	[diff] [blame]	190	sizeof(vx->current_runstate));
				191
				192	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
				193	&vx->current_runstate,
				194	offsetof(struct vcpu_runstate_info, state),
				195	sizeof(vx->current_runstate)))
				196	return;
				197
				198	/*
				199	* Write the actual runstate times immediately after the
				200	* runstate_entry_time.
				201	*/
				202	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
				203	offsetof(struct vcpu_runstate_info, time) - sizeof(u64));
				204	BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
				205	offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64));
David Woodhouse	6a83475	2021-11-15 16:50:23 +0000	[diff] [blame]	206	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
				207	sizeof_field(struct compat_vcpu_runstate_info, time));
				208	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
David Woodhouse	30b5c85	2021-03-01 12:53:09 +0000	[diff] [blame]	209	sizeof(vx->runstate_times));
				210
				211	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
				212	&vx->runstate_times[0],
				213	offset + sizeof(u64),
				214	sizeof(vx->runstate_times)))
				215	return;
				216
				217	smp_wmb();
				218
				219	/*
				220	* Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
				221	* runstate_entry_time field.
				222	*/
				223
				224	state_entry_time &= ~XEN_RUNSTATE_UPDATE;
				225	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
				226	&state_entry_time, offset,
				227	sizeof(state_entry_time)))
				228	return;
				229	}
				230
David Woodhouse	40da8cc	2020-12-09 20:08:30 +0000	[diff] [blame]	231	int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
				232	{
David Woodhouse	14243b3	2021-12-10 16:36:23 +0000	[diff] [blame]	233	unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
				234	bool atomic = in_atomic() \|\| !task_is_running(current);
David Woodhouse	0985dba	2021-10-23 20:47:19 +0100	[diff] [blame]	235	int err;
David Woodhouse	40da8cc	2020-12-09 20:08:30 +0000	[diff] [blame]	236	u8 rc = 0;
				237
				238	/*
				239	* If the global upcall vector (HVMIRQ_callback_vector) is set and
				240	* the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending.
				241	*/
				242	struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache;
				243	struct kvm_memslots *slots = kvm_memslots(v->kvm);
David Woodhouse	14243b3	2021-12-10 16:36:23 +0000	[diff] [blame]	244	bool ghc_valid = slots->generation == ghc->generation &&
				245	!kvm_is_error_hva(ghc->hva) && ghc->memslot;
				246
David Woodhouse	40da8cc	2020-12-09 20:08:30 +0000	[diff] [blame]	247	unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending);
				248
				249	/* No need for compat handling here */
				250	BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
				251	offsetof(struct compat_vcpu_info, evtchn_upcall_pending));
				252	BUILD_BUG_ON(sizeof(rc) !=
David Woodhouse	6a83475	2021-11-15 16:50:23 +0000	[diff] [blame]	253	sizeof_field(struct vcpu_info, evtchn_upcall_pending));
David Woodhouse	40da8cc	2020-12-09 20:08:30 +0000	[diff] [blame]	254	BUILD_BUG_ON(sizeof(rc) !=
David Woodhouse	6a83475	2021-11-15 16:50:23 +0000	[diff] [blame]	255	sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
David Woodhouse	40da8cc	2020-12-09 20:08:30 +0000	[diff] [blame]	256
				257	/*
				258	* For efficiency, this mirrors the checks for using the valid
				259	* cache in kvm_read_guest_offset_cached(), but just uses
				260	* __get_user() instead. And falls back to the slow path.
				261	*/
David Woodhouse	14243b3	2021-12-10 16:36:23 +0000	[diff] [blame]	262	if (!evtchn_pending_sel && ghc_valid) {
David Woodhouse	40da8cc	2020-12-09 20:08:30 +0000	[diff] [blame]	263	/* Fast path */
David Woodhouse	0985dba	2021-10-23 20:47:19 +0100	[diff] [blame]	264	pagefault_disable();
				265	err = __get_user(rc, (u8 __user *)ghc->hva + offset);
				266	pagefault_enable();
				267	if (!err)
				268	return rc;
David Woodhouse	40da8cc	2020-12-09 20:08:30 +0000	[diff] [blame]	269	}
				270
David Woodhouse	0985dba	2021-10-23 20:47:19 +0100	[diff] [blame]	271	/* Slow path */
				272
				273	/*
				274	* This function gets called from kvm_vcpu_block() after setting the
				275	* task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
				276	* from a HLT. So we really mustn't sleep. If the page ended up absent
				277	* at that point, just return 1 in order to trigger an immediate wake,
				278	* and we'll end up getting called again from a context where we can
				279	* fault in the page and wait for it.
				280	*/
David Woodhouse	14243b3	2021-12-10 16:36:23 +0000	[diff] [blame]	281	if (atomic)
David Woodhouse	0985dba	2021-10-23 20:47:19 +0100	[diff] [blame]	282	return 1;
				283
David Woodhouse	14243b3	2021-12-10 16:36:23 +0000	[diff] [blame]	284	if (!ghc_valid) {
				285	err = kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len);
				286	if (err \|\| !ghc->memslot) {
				287	/*
				288	* If this failed, userspace has screwed up the
				289	* vcpu_info mapping. No interrupts for you.
				290	*/
				291	return 0;
				292	}
				293	}
				294
				295	/*
				296	* Now we have a valid (protected by srcu) userspace HVA in
				297	* ghc->hva which points to the struct vcpu_info. If there
				298	* are any bits in the in-kernel evtchn_pending_sel then
				299	* we need to write those to the guest vcpu_info and set
				300	* its evtchn_upcall_pending flag. If there aren't any bits
				301	* to add, we only want to check evtchn_upcall_pending.
				302	*/
				303	if (evtchn_pending_sel) {
				304	bool long_mode = v->kvm->arch.xen.long_mode;
				305
				306	if (!user_access_begin((void __user *)ghc->hva, sizeof(struct vcpu_info)))
				307	return 0;
				308
				309	if (IS_ENABLED(CONFIG_64BIT) && long_mode) {
				310	struct vcpu_info __user vi = (void __user )ghc->hva;
				311
				312	/* Attempt to set the evtchn_pending_sel bits in the
				313	* guest, and if that succeeds then clear the same
				314	* bits in the in-kernel version. */
				315	asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n"
				316	"\tnotq %0\n"
				317	"\t" LOCK_PREFIX "andq %0, %2\n"
				318	"2:\n"
				319	"\t.section .fixup,\"ax\"\n"
				320	"3:\tjmp\t2b\n"
				321	"\t.previous\n"
				322	_ASM_EXTABLE_UA(1b, 3b)
				323	: "=r" (evtchn_pending_sel),
				324	"+m" (vi->evtchn_pending_sel),
				325	"+m" (v->arch.xen.evtchn_pending_sel)
				326	: "0" (evtchn_pending_sel));
				327	} else {
				328	struct compat_vcpu_info __user vi = (void __user )ghc->hva;
				329	u32 evtchn_pending_sel32 = evtchn_pending_sel;
				330
				331	/* Attempt to set the evtchn_pending_sel bits in the
				332	* guest, and if that succeeds then clear the same
				333	* bits in the in-kernel version. */
				334	asm volatile("1:\t" LOCK_PREFIX "orl %0, %1\n"
				335	"\tnotl %0\n"
				336	"\t" LOCK_PREFIX "andl %0, %2\n"
				337	"2:\n"
				338	"\t.section .fixup,\"ax\"\n"
				339	"3:\tjmp\t2b\n"
				340	"\t.previous\n"
				341	_ASM_EXTABLE_UA(1b, 3b)
				342	: "=r" (evtchn_pending_sel32),
				343	"+m" (vi->evtchn_pending_sel),
				344	"+m" (v->arch.xen.evtchn_pending_sel)
				345	: "0" (evtchn_pending_sel32));
				346	}
				347	rc = 1;
				348	unsafe_put_user(rc, (u8 __user *)ghc->hva + offset, err);
				349
				350	err:
				351	user_access_end();
				352
				353	mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
				354	} else {
				355	__get_user(rc, (u8 __user *)ghc->hva + offset);
				356	}
David Woodhouse	0985dba	2021-10-23 20:47:19 +0100	[diff] [blame]	357
David Woodhouse	40da8cc	2020-12-09 20:08:30 +0000	[diff] [blame]	358	return rc;
				359	}
				360
Joao Martins	a76b964	2020-12-03 15:52:25 +0000	[diff] [blame]	361	int kvm_xen_hvm_set_attr(struct kvm kvm, struct kvm_xen_hvm_attr data)
				362	{
				363	int r = -ENOENT;
				364
Joao Martins	13ffb97	2018-06-15 21:17:14 -0400	[diff] [blame]	365	mutex_lock(&kvm->lock);
				366
Joao Martins	a76b964	2020-12-03 15:52:25 +0000	[diff] [blame]	367	switch (data->type) {
David Woodhouse	a3833b8	2020-12-03 16:20:32 +0000	[diff] [blame]	368	case KVM_XEN_ATTR_TYPE_LONG_MODE:
Joao Martins	13ffb97	2018-06-15 21:17:14 -0400	[diff] [blame]	369	if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
				370	r = -EINVAL;
				371	} else {
				372	kvm->arch.xen.long_mode = !!data->u.long_mode;
				373	r = 0;
				374	}
David Woodhouse	a3833b8	2020-12-03 16:20:32 +0000	[diff] [blame]	375	break;
Joao Martins	13ffb97	2018-06-15 21:17:14 -0400	[diff] [blame]	376
				377	case KVM_XEN_ATTR_TYPE_SHARED_INFO:
				378	r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
				379	break;
				380
David Woodhouse	40da8cc	2020-12-09 20:08:30 +0000	[diff] [blame]	381	case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
David Woodhouse	0c165b3	2021-02-08 23:23:25 +0000	[diff] [blame]	382	if (data->u.vector && data->u.vector < 0x10)
David Woodhouse	40da8cc	2020-12-09 20:08:30 +0000	[diff] [blame]	383	r = -EINVAL;
				384	else {
				385	kvm->arch.xen.upcall_vector = data->u.vector;
				386	r = 0;
				387	}
				388	break;
				389
Joao Martins	a76b964	2020-12-03 15:52:25 +0000	[diff] [blame]	390	default:
				391	break;
				392	}
				393
				394	mutex_unlock(&kvm->lock);
				395	return r;
				396	}
				397
				398	int kvm_xen_hvm_get_attr(struct kvm kvm, struct kvm_xen_hvm_attr data)
				399	{
				400	int r = -ENOENT;
				401
				402	mutex_lock(&kvm->lock);
				403
				404	switch (data->type) {
David Woodhouse	a3833b8	2020-12-03 16:20:32 +0000	[diff] [blame]	405	case KVM_XEN_ATTR_TYPE_LONG_MODE:
				406	data->u.long_mode = kvm->arch.xen.long_mode;
				407	r = 0;
				408	break;
Joao Martins	13ffb97	2018-06-15 21:17:14 -0400	[diff] [blame]	409
				410	case KVM_XEN_ATTR_TYPE_SHARED_INFO:
David Woodhouse	1cfc9c4	2021-12-10 16:36:22 +0000	[diff] [blame]	411	if (kvm->arch.xen.shinfo_cache.active)
				412	data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
				413	else
				414	data->u.shared_info.gfn = GPA_INVALID;
David Woodhouse	0c165b3	2021-02-08 23:23:25 +0000	[diff] [blame]	415	r = 0;
Joao Martins	13ffb97	2018-06-15 21:17:14 -0400	[diff] [blame]	416	break;
				417
David Woodhouse	40da8cc	2020-12-09 20:08:30 +0000	[diff] [blame]	418	case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
				419	data->u.vector = kvm->arch.xen.upcall_vector;
				420	r = 0;
				421	break;
				422
Joao Martins	a76b964	2020-12-03 15:52:25 +0000	[diff] [blame]	423	default:
				424	break;
				425	}
				426
				427	mutex_unlock(&kvm->lock);
				428	return r;
				429	}
				430
David Woodhouse	3e32461	2021-02-02 16:53:25 +0000	[diff] [blame]	431	int kvm_xen_vcpu_set_attr(struct kvm_vcpu vcpu, struct kvm_xen_vcpu_attr data)
				432	{
Joao Martins	73e69a8	2018-06-29 10:52:52 -0400	[diff] [blame]	433	int idx, r = -ENOENT;
David Woodhouse	3e32461	2021-02-02 16:53:25 +0000	[diff] [blame]	434
				435	mutex_lock(&vcpu->kvm->lock);
Joao Martins	73e69a8	2018-06-29 10:52:52 -0400	[diff] [blame]	436	idx = srcu_read_lock(&vcpu->kvm->srcu);
David Woodhouse	3e32461	2021-02-02 16:53:25 +0000	[diff] [blame]	437
				438	switch (data->type) {
Joao Martins	73e69a8	2018-06-29 10:52:52 -0400	[diff] [blame]	439	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
				440	/* No compat necessary here. */
				441	BUILD_BUG_ON(sizeof(struct vcpu_info) !=
				442	sizeof(struct compat_vcpu_info));
David Woodhouse	7d7c5f7	2021-03-01 12:53:08 +0000	[diff] [blame]	443	BUILD_BUG_ON(offsetof(struct vcpu_info, time) !=
				444	offsetof(struct compat_vcpu_info, time));
Joao Martins	73e69a8	2018-06-29 10:52:52 -0400	[diff] [blame]	445
David Woodhouse	0c165b3	2021-02-08 23:23:25 +0000	[diff] [blame]	446	if (data->u.gpa == GPA_INVALID) {
				447	vcpu->arch.xen.vcpu_info_set = false;
David Woodhouse	7d7c5f7	2021-03-01 12:53:08 +0000	[diff] [blame]	448	r = 0;
David Woodhouse	0c165b3	2021-02-08 23:23:25 +0000	[diff] [blame]	449	break;
				450	}
				451
Joao Martins	73e69a8	2018-06-29 10:52:52 -0400	[diff] [blame]	452	r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
				453	&vcpu->arch.xen.vcpu_info_cache,
				454	data->u.gpa,
				455	sizeof(struct vcpu_info));
Joao Martins	aa096aa	2019-02-01 13:01:45 -0500	[diff] [blame]	456	if (!r) {
Joao Martins	73e69a8	2018-06-29 10:52:52 -0400	[diff] [blame]	457	vcpu->arch.xen.vcpu_info_set = true;
Joao Martins	aa096aa	2019-02-01 13:01:45 -0500	[diff] [blame]	458	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
				459	}
Joao Martins	73e69a8	2018-06-29 10:52:52 -0400	[diff] [blame]	460	break;
				461
Joao Martins	f2340cd	2018-07-23 11:20:57 -0400	[diff] [blame]	462	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
David Woodhouse	0c165b3	2021-02-08 23:23:25 +0000	[diff] [blame]	463	if (data->u.gpa == GPA_INVALID) {
				464	vcpu->arch.xen.vcpu_time_info_set = false;
David Woodhouse	7d7c5f7	2021-03-01 12:53:08 +0000	[diff] [blame]	465	r = 0;
David Woodhouse	0c165b3	2021-02-08 23:23:25 +0000	[diff] [blame]	466	break;
				467	}
				468
Joao Martins	f2340cd	2018-07-23 11:20:57 -0400	[diff] [blame]	469	r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
				470	&vcpu->arch.xen.vcpu_time_info_cache,
				471	data->u.gpa,
				472	sizeof(struct pvclock_vcpu_time_info));
				473	if (!r) {
				474	vcpu->arch.xen.vcpu_time_info_set = true;
				475	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
				476	}
				477	break;
				478
David Woodhouse	30b5c85	2021-03-01 12:53:09 +0000	[diff] [blame]	479	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
				480	if (!sched_info_on()) {
				481	r = -EOPNOTSUPP;
				482	break;
				483	}
				484	if (data->u.gpa == GPA_INVALID) {
				485	vcpu->arch.xen.runstate_set = false;
				486	r = 0;
				487	break;
				488	}
				489
				490	r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
				491	&vcpu->arch.xen.runstate_cache,
				492	data->u.gpa,
				493	sizeof(struct vcpu_runstate_info));
				494	if (!r) {
				495	vcpu->arch.xen.runstate_set = true;
				496	}
				497	break;
				498
				499	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
				500	if (!sched_info_on()) {
				501	r = -EOPNOTSUPP;
				502	break;
				503	}
				504	if (data->u.runstate.state > RUNSTATE_offline) {
				505	r = -EINVAL;
				506	break;
				507	}
				508
				509	kvm_xen_update_runstate(vcpu, data->u.runstate.state);
				510	r = 0;
				511	break;
				512
				513	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
				514	if (!sched_info_on()) {
				515	r = -EOPNOTSUPP;
				516	break;
				517	}
				518	if (data->u.runstate.state > RUNSTATE_offline) {
				519	r = -EINVAL;
				520	break;
				521	}
				522	if (data->u.runstate.state_entry_time !=
				523	(data->u.runstate.time_running +
				524	data->u.runstate.time_runnable +
				525	data->u.runstate.time_blocked +
				526	data->u.runstate.time_offline)) {
				527	r = -EINVAL;
				528	break;
				529	}
				530	if (get_kvmclock_ns(vcpu->kvm) <
				531	data->u.runstate.state_entry_time) {
				532	r = -EINVAL;
				533	break;
				534	}
				535
				536	vcpu->arch.xen.current_runstate = data->u.runstate.state;
				537	vcpu->arch.xen.runstate_entry_time =
				538	data->u.runstate.state_entry_time;
				539	vcpu->arch.xen.runstate_times[RUNSTATE_running] =
				540	data->u.runstate.time_running;
				541	vcpu->arch.xen.runstate_times[RUNSTATE_runnable] =
				542	data->u.runstate.time_runnable;
				543	vcpu->arch.xen.runstate_times[RUNSTATE_blocked] =
				544	data->u.runstate.time_blocked;
				545	vcpu->arch.xen.runstate_times[RUNSTATE_offline] =
				546	data->u.runstate.time_offline;
				547	vcpu->arch.xen.last_steal = current->sched_info.run_delay;
				548	r = 0;
				549	break;
				550
				551	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
				552	if (!sched_info_on()) {
				553	r = -EOPNOTSUPP;
				554	break;
				555	}
				556	if (data->u.runstate.state > RUNSTATE_offline &&
				557	data->u.runstate.state != (u64)-1) {
				558	r = -EINVAL;
				559	break;
				560	}
				561	/* The adjustment must add up */
				562	if (data->u.runstate.state_entry_time !=
				563	(data->u.runstate.time_running +
				564	data->u.runstate.time_runnable +
				565	data->u.runstate.time_blocked +
				566	data->u.runstate.time_offline)) {
				567	r = -EINVAL;
				568	break;
				569	}
				570
				571	if (get_kvmclock_ns(vcpu->kvm) <
				572	(vcpu->arch.xen.runstate_entry_time +
				573	data->u.runstate.state_entry_time)) {
				574	r = -EINVAL;
				575	break;
				576	}
				577
				578	vcpu->arch.xen.runstate_entry_time +=
				579	data->u.runstate.state_entry_time;
				580	vcpu->arch.xen.runstate_times[RUNSTATE_running] +=
				581	data->u.runstate.time_running;
				582	vcpu->arch.xen.runstate_times[RUNSTATE_runnable] +=
				583	data->u.runstate.time_runnable;
				584	vcpu->arch.xen.runstate_times[RUNSTATE_blocked] +=
				585	data->u.runstate.time_blocked;
				586	vcpu->arch.xen.runstate_times[RUNSTATE_offline] +=
				587	data->u.runstate.time_offline;
				588
				589	if (data->u.runstate.state <= RUNSTATE_offline)
				590	kvm_xen_update_runstate(vcpu, data->u.runstate.state);
				591	r = 0;
				592	break;
				593
David Woodhouse	3e32461	2021-02-02 16:53:25 +0000	[diff] [blame]	594	default:
				595	break;
				596	}
				597
Joao Martins	73e69a8	2018-06-29 10:52:52 -0400	[diff] [blame]	598	srcu_read_unlock(&vcpu->kvm->srcu, idx);
David Woodhouse	3e32461	2021-02-02 16:53:25 +0000	[diff] [blame]	599	mutex_unlock(&vcpu->kvm->lock);
				600	return r;
				601	}
				602
				603	int kvm_xen_vcpu_get_attr(struct kvm_vcpu vcpu, struct kvm_xen_vcpu_attr data)
				604	{
				605	int r = -ENOENT;
				606
				607	mutex_lock(&vcpu->kvm->lock);
				608
				609	switch (data->type) {
Joao Martins	73e69a8	2018-06-29 10:52:52 -0400	[diff] [blame]	610	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
David Woodhouse	0c165b3	2021-02-08 23:23:25 +0000	[diff] [blame]	611	if (vcpu->arch.xen.vcpu_info_set)
Joao Martins	73e69a8	2018-06-29 10:52:52 -0400	[diff] [blame]	612	data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
David Woodhouse	0c165b3	2021-02-08 23:23:25 +0000	[diff] [blame]	613	else
				614	data->u.gpa = GPA_INVALID;
				615	r = 0;
Joao Martins	73e69a8	2018-06-29 10:52:52 -0400	[diff] [blame]	616	break;
				617
Joao Martins	f2340cd	2018-07-23 11:20:57 -0400	[diff] [blame]	618	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
David Woodhouse	0c165b3	2021-02-08 23:23:25 +0000	[diff] [blame]	619	if (vcpu->arch.xen.vcpu_time_info_set)
Joao Martins	f2340cd	2018-07-23 11:20:57 -0400	[diff] [blame]	620	data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
David Woodhouse	0c165b3	2021-02-08 23:23:25 +0000	[diff] [blame]	621	else
				622	data->u.gpa = GPA_INVALID;
				623	r = 0;
Joao Martins	f2340cd	2018-07-23 11:20:57 -0400	[diff] [blame]	624	break;
				625
David Woodhouse	30b5c85	2021-03-01 12:53:09 +0000	[diff] [blame]	626	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
				627	if (!sched_info_on()) {
				628	r = -EOPNOTSUPP;
				629	break;
				630	}
				631	if (vcpu->arch.xen.runstate_set) {
				632	data->u.gpa = vcpu->arch.xen.runstate_cache.gpa;
				633	r = 0;
				634	}
				635	break;
				636
				637	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
				638	if (!sched_info_on()) {
				639	r = -EOPNOTSUPP;
				640	break;
				641	}
				642	data->u.runstate.state = vcpu->arch.xen.current_runstate;
				643	r = 0;
				644	break;
				645
				646	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
				647	if (!sched_info_on()) {
				648	r = -EOPNOTSUPP;
				649	break;
				650	}
				651	data->u.runstate.state = vcpu->arch.xen.current_runstate;
				652	data->u.runstate.state_entry_time =
				653	vcpu->arch.xen.runstate_entry_time;
				654	data->u.runstate.time_running =
				655	vcpu->arch.xen.runstate_times[RUNSTATE_running];
				656	data->u.runstate.time_runnable =
				657	vcpu->arch.xen.runstate_times[RUNSTATE_runnable];
				658	data->u.runstate.time_blocked =
				659	vcpu->arch.xen.runstate_times[RUNSTATE_blocked];
				660	data->u.runstate.time_offline =
				661	vcpu->arch.xen.runstate_times[RUNSTATE_offline];
				662	r = 0;
				663	break;
				664
				665	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
				666	r = -EINVAL;
				667	break;
				668
David Woodhouse	3e32461	2021-02-02 16:53:25 +0000	[diff] [blame]	669	default:
				670	break;
				671	}
				672
				673	mutex_unlock(&vcpu->kvm->lock);
				674	return r;
				675	}
				676
Joao Martins	23200b7	2018-06-13 09:55:44 -0400	[diff] [blame]	677	int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
				678	{
				679	struct kvm *kvm = vcpu->kvm;
				680	u32 page_num = data & ~PAGE_MASK;
				681	u64 page_addr = data & PAGE_MASK;
David Woodhouse	a3833b8	2020-12-03 16:20:32 +0000	[diff] [blame]	682	bool lm = is_long_mode(vcpu);
				683
				684	/* Latch long_mode for shared_info pages etc. */
				685	vcpu->kvm->arch.xen.long_mode = lm;
Joao Martins	23200b7	2018-06-13 09:55:44 -0400	[diff] [blame]	686
				687	/*
				688	* If Xen hypercall intercept is enabled, fill the hypercall
				689	* page with VMCALL/VMMCALL instructions since that's what
				690	* we catch. Else the VMM has provided the hypercall pages
				691	* with instructions of its own choosing, so use those.
				692	*/
				693	if (kvm_xen_hypercall_enabled(kvm)) {
				694	u8 instructions[32];
				695	int i;
				696
				697	if (page_num)
				698	return 1;
				699
				700	/* mov imm32, %eax */
				701	instructions[0] = 0xb8;
				702
				703	/* vmcall / vmmcall */
				704	kvm_x86_ops.patch_hypercall(vcpu, instructions + 5);
				705
				706	/* ret */
				707	instructions[8] = 0xc3;
				708
				709	/* int3 to pad */
				710	memset(instructions + 9, 0xcc, sizeof(instructions) - 9);
				711
				712	for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) {
				713	(u32 )&instructions[1] = i;
				714	if (kvm_vcpu_write_guest(vcpu,
				715	page_addr + (i * sizeof(instructions)),
				716	instructions, sizeof(instructions)))
				717	return 1;
				718	}
				719	} else {
Sean Christopherson	448841f	2021-02-08 12:15:02 -0800	[diff] [blame]	720	/*
				721	* Note, truncation is a non-issue as 'lm' is guaranteed to be
				722	* false for a 32-bit kernel, i.e. when hva_t is only 4 bytes.
				723	*/
				724	hva_t blob_addr = lm ? kvm->arch.xen_hvm_config.blob_addr_64
				725	: kvm->arch.xen_hvm_config.blob_addr_32;
Joao Martins	23200b7	2018-06-13 09:55:44 -0400	[diff] [blame]	726	u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
				727	: kvm->arch.xen_hvm_config.blob_size_32;
				728	u8 *page;
				729
				730	if (page_num >= blob_size)
				731	return 1;
				732
				733	blob_addr += page_num * PAGE_SIZE;
				734
				735	page = memdup_user((u8 __user *)blob_addr, PAGE_SIZE);
				736	if (IS_ERR(page))
				737	return PTR_ERR(page);
				738
				739	if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
				740	kfree(page);
				741	return 1;
				742	}
				743	}
				744	return 0;
				745	}
				746
David Woodhouse	78e9878	2021-02-02 13:19:35 +0000	[diff] [blame]	747	int kvm_xen_hvm_config(struct kvm kvm, struct kvm_xen_hvm_config xhc)
				748	{
				749	if (xhc->flags & ~KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL)
				750	return -EINVAL;
				751
				752	/*
				753	* With hypercall interception the kernel generates its own
				754	* hypercall page so it must not be provided.
				755	*/
				756	if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) &&
				757	(xhc->blob_addr_32 \|\| xhc->blob_addr_64 \|\|
				758	xhc->blob_size_32 \|\| xhc->blob_size_64))
				759	return -EINVAL;
				760
David Woodhouse	7d6bbeb	2021-02-02 15:48:05 +0000	[diff] [blame]	761	mutex_lock(&kvm->lock);
				762
				763	if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
				764	static_branch_inc(&kvm_xen_enabled.key);
				765	else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
				766	static_branch_slow_dec_deferred(&kvm_xen_enabled);
				767
David Woodhouse	78e9878	2021-02-02 13:19:35 +0000	[diff] [blame]	768	memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));
David Woodhouse	7d6bbeb	2021-02-02 15:48:05 +0000	[diff] [blame]	769
				770	mutex_unlock(&kvm->lock);
David Woodhouse	78e9878	2021-02-02 13:19:35 +0000	[diff] [blame]	771	return 0;
				772	}
				773
/* Xen hook for VM creation; there is currently nothing to initialise. */
void kvm_xen_init_vm(struct kvm *kvm)
{
}
				777
David Woodhouse	7d6bbeb	2021-02-02 15:48:05 +0000	[diff] [blame]	778	void kvm_xen_destroy_vm(struct kvm *kvm)
				779	{
David Woodhouse	1cfc9c4	2021-12-10 16:36:22 +0000	[diff] [blame]	780	kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache);
				781
David Woodhouse	7d6bbeb	2021-02-02 15:48:05 +0000	[diff] [blame]	782	if (kvm->arch.xen_hvm_config.msr)
				783	static_branch_slow_dec_deferred(&kvm_xen_enabled);
				784	}
				785
Joao Martins	23200b7	2018-06-13 09:55:44 -0400	[diff] [blame]	786	static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
				787	{
				788	kvm_rax_write(vcpu, result);
				789	return kvm_skip_emulated_instruction(vcpu);
				790	}
				791
				792	static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
				793	{
				794	struct kvm_run *run = vcpu->run;
				795
				796	if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.xen.hypercall_rip)))
				797	return 1;
				798
				799	return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
				800	}
				801
				802	int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
				803	{
				804	bool longmode;
				805	u64 input, params[6];
				806
				807	input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);
				808
Joao Martins	79033be	2018-06-13 09:55:44 -0400	[diff] [blame]	809	/* Hyper-V hypercalls get bit 31 set in EAX */
				810	if ((input & 0x80000000) &&
Vitaly Kuznetsov	8f01455	2021-01-26 14:48:14 +0100	[diff] [blame]	811	kvm_hv_hypercall_enabled(vcpu))
Joao Martins	79033be	2018-06-13 09:55:44 -0400	[diff] [blame]	812	return kvm_hv_hypercall(vcpu);
				813
Tom Lendacky	b5aead0	2021-05-24 12:48:57 -0500	[diff] [blame]	814	longmode = is_64_bit_hypercall(vcpu);
Joao Martins	23200b7	2018-06-13 09:55:44 -0400	[diff] [blame]	815	if (!longmode) {
				816	params[0] = (u32)kvm_rbx_read(vcpu);
				817	params[1] = (u32)kvm_rcx_read(vcpu);
				818	params[2] = (u32)kvm_rdx_read(vcpu);
				819	params[3] = (u32)kvm_rsi_read(vcpu);
				820	params[4] = (u32)kvm_rdi_read(vcpu);
				821	params[5] = (u32)kvm_rbp_read(vcpu);
				822	}
				823	#ifdef CONFIG_X86_64
				824	else {
				825	params[0] = (u64)kvm_rdi_read(vcpu);
				826	params[1] = (u64)kvm_rsi_read(vcpu);
				827	params[2] = (u64)kvm_rdx_read(vcpu);
				828	params[3] = (u64)kvm_r10_read(vcpu);
				829	params[4] = (u64)kvm_r8_read(vcpu);
				830	params[5] = (u64)kvm_r9_read(vcpu);
				831	}
				832	#endif
				833	trace_kvm_xen_hypercall(input, params[0], params[1], params[2],
				834	params[3], params[4], params[5]);
				835
				836	vcpu->run->exit_reason = KVM_EXIT_XEN;
				837	vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
				838	vcpu->run->xen.u.hcall.longmode = longmode;
				839	vcpu->run->xen.u.hcall.cpl = kvm_x86_ops.get_cpl(vcpu);
				840	vcpu->run->xen.u.hcall.input = input;
				841	vcpu->run->xen.u.hcall.params[0] = params[0];
				842	vcpu->run->xen.u.hcall.params[1] = params[1];
				843	vcpu->run->xen.u.hcall.params[2] = params[2];
				844	vcpu->run->xen.u.hcall.params[3] = params[3];
				845	vcpu->run->xen.u.hcall.params[4] = params[4];
				846	vcpu->run->xen.u.hcall.params[5] = params[5];
				847	vcpu->arch.xen.hypercall_rip = kvm_get_linear_rip(vcpu);
				848	vcpu->arch.complete_userspace_io =
				849	kvm_xen_hypercall_complete_userspace;
				850
				851	return 0;
				852	}
David Woodhouse	14243b3	2021-12-10 16:36:23 +0000	[diff] [blame]	853
				854	static inline int max_evtchn_port(struct kvm *kvm)
				855	{
				856	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
				857	return EVTCHN_2L_NR_CHANNELS;
				858	else
				859	return COMPAT_EVTCHN_2L_NR_CHANNELS;
				860	}
				861
				862	/*
				863	* This follows the kvm_set_irq() API, so it returns:
				864	* < 0 Interrupt was ignored (masked or not delivered for other reasons)
				865	* = 0 Interrupt was coalesced (previous irq is still pending)
				866	* > 0 Number of CPUs interrupt was delivered to
				867	*/
				868	int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
				869	struct kvm *kvm)
				870	{
				871	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
				872	struct kvm_vcpu *vcpu;
				873	unsigned long pending_bits, mask_bits;
				874	unsigned long flags;
				875	int port_word_bit;
				876	bool kick_vcpu = false;
				877	int idx;
				878	int rc;
				879
				880	vcpu = kvm_get_vcpu_by_id(kvm, e->xen_evtchn.vcpu);
				881	if (!vcpu)
				882	return -1;
				883
				884	if (!vcpu->arch.xen.vcpu_info_set)
				885	return -1;
				886
				887	if (e->xen_evtchn.port >= max_evtchn_port(kvm))
				888	return -1;
				889
				890	rc = -EWOULDBLOCK;
				891	read_lock_irqsave(&gpc->lock, flags);
				892
				893	idx = srcu_read_lock(&kvm->srcu);
				894	if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
				895	goto out_rcu;
				896
				897	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
				898	struct shared_info *shinfo = gpc->khva;
				899	pending_bits = (unsigned long *)&shinfo->evtchn_pending;
				900	mask_bits = (unsigned long *)&shinfo->evtchn_mask;
				901	port_word_bit = e->xen_evtchn.port / 64;
				902	} else {
				903	struct compat_shared_info *shinfo = gpc->khva;
				904	pending_bits = (unsigned long *)&shinfo->evtchn_pending;
				905	mask_bits = (unsigned long *)&shinfo->evtchn_mask;
				906	port_word_bit = e->xen_evtchn.port / 32;
				907	}
				908
				909	/*
				910	* If this port wasn't already set, and if it isn't masked, then
				911	* we try to set the corresponding bit in the in-kernel shadow of
				912	* evtchn_pending_sel for the target vCPU. And if that wasn't
				913	* already set, then we kick the vCPU in question to write to the
				914	* real evtchn_pending_sel in its own guest vcpu_info struct.
				915	*/
				916	if (test_and_set_bit(e->xen_evtchn.port, pending_bits)) {
				917	rc = 0; /* It was already raised */
				918	} else if (test_bit(e->xen_evtchn.port, mask_bits)) {
				919	rc = -1; /* Masked */
				920	} else {
				921	rc = 1; /* Delivered. But was the vCPU waking already? */
				922	if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
				923	kick_vcpu = true;
				924	}
				925
				926	out_rcu:
				927	srcu_read_unlock(&kvm->srcu, idx);
				928	read_unlock_irqrestore(&gpc->lock, flags);
				929
				930	if (kick_vcpu) {
				931	kvm_make_request(KVM_REQ_EVENT, vcpu);
				932	kvm_vcpu_kick(vcpu);
				933	}
				934
				935	return rc;
				936	}
				937
				938	/* This is the version called from kvm_set_irq() as the .set function */
				939	static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry e, struct kvm kvm,
				940	int irq_source_id, int level, bool line_status)
				941	{
				942	bool mm_borrowed = false;
				943	int rc;
				944
				945	if (!level)
				946	return -1;
				947
				948	rc = kvm_xen_set_evtchn_fast(e, kvm);
				949	if (rc != -EWOULDBLOCK)
				950	return rc;
				951
				952	if (current->mm != kvm->mm) {
				953	/*
				954	* If not on a thread which already belongs to this KVM,
				955	* we'd better be in the irqfd workqueue.
				956	*/
				957	if (WARN_ON_ONCE(current->mm))
				958	return -EINVAL;
				959
				960	kthread_use_mm(kvm->mm);
				961	mm_borrowed = true;
				962	}
				963
				964	/*
				965	* For the irqfd workqueue, using the main kvm->lock mutex is
				966	* fine since this function is invoked from kvm_set_irq() with
				967	* no other lock held, no srcu. In future if it will be called
				968	* directly from a vCPU thread (e.g. on hypercall for an IPI)
				969	* then it may need to switch to using a leaf-node mutex for
				970	* serializing the shared_info mapping.
				971	*/
				972	mutex_lock(&kvm->lock);
				973
				974	/*
				975	* It is theoretically possible for the page to be unmapped
				976	* and the MMU notifier to invalidate the shared_info before
				977	* we even get to use it. In that case, this looks like an
				978	* infinite loop. It was tempting to do it via the userspace
				979	* HVA instead... but that just hides the fact that it's
				980	* an infinite loop, because if a fault occurs and it waits
				981	* for the page to come back, it can still immediately
				982	* fault and have to wait again, repeatedly.
				983	*
				984	* Conversely, the page could also have been reinstated by
				985	* another thread before we even obtain the mutex above, so
				986	* check again first before remapping it.
				987	*/
				988	do {
				989	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
				990	int idx;
				991
				992	rc = kvm_xen_set_evtchn_fast(e, kvm);
				993	if (rc != -EWOULDBLOCK)
				994	break;
				995
				996	idx = srcu_read_lock(&kvm->srcu);
				997	rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa,
				998	PAGE_SIZE, false);
				999	srcu_read_unlock(&kvm->srcu, idx);
				1000	} while(!rc);
				1001
				1002	mutex_unlock(&kvm->lock);
				1003
				1004	if (mm_borrowed)
				1005	kthread_unuse_mm(kvm->mm);
				1006
				1007	return rc;
				1008	}
				1009
				1010	int kvm_xen_setup_evtchn(struct kvm *kvm,
				1011	struct kvm_kernel_irq_routing_entry *e,
				1012	const struct kvm_irq_routing_entry *ue)
				1013
				1014	{
				1015	if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
				1016	return -EINVAL;
				1017
				1018	/* We only support 2 level event channels for now */
				1019	if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
				1020	return -EINVAL;
				1021
				1022	e->xen_evtchn.port = ue->u.xen_evtchn.port;
				1023	e->xen_evtchn.vcpu = ue->u.xen_evtchn.vcpu;
				1024	e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
				1025	e->set = evtchn_set_fn;
				1026
				1027	return 0;
				1028	}