arm64: KVM: Yield CPU when vcpu executes a WFE

On an (even slightly) oversubscribed system, spinlocks quickly
become a bottleneck, as some vcpus are spinning, waiting for a
lock to be released, while the vcpu holding the lock may not be
running at all.

The solution is to trap blocking WFEs and tell KVM that we're
now spinning. This ensures that other vcpus will get a scheduling
boost, allowing the lock to be released more quickly. Also, using
CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT slightly improves the performance
when the VM is severely overcommitted.

Acked-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index a5f28e2..c98ef47 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -63,6 +63,7 @@
  * TAC:		Trap ACTLR
  * TSC:		Trap SMC
  * TSW:		Trap cache operations by set/way
+ * TWE:		Trap WFE
  * TWI:		Trap WFI
  * TIDCP:	Trap L2CTLR/L2ECTLR
  * BSU_IS:	Upgrade barriers to the inner shareable domain
@@ -72,8 +73,9 @@
  * FMO:		Override CPSR.F and enable signaling with VF
  * SWIO:	Turn set/way invalidates into set/way clean+invalidate
  */
-#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWI | HCR_VM | HCR_BSU_IS | \
-			 HCR_FB | HCR_TAC | HCR_AMO | HCR_IMO | HCR_FMO | \
+#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
+			 HCR_BSU_IS | HCR_FB | HCR_TAC | \
+			 HCR_AMO | HCR_IMO | HCR_FMO | \
 			 HCR_SWIO | HCR_TIDCP | HCR_RW)
 #define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF)
 
@@ -242,4 +244,6 @@
 
 #define ESR_EL2_EC_xABT_xFSR_EXTABT	0x10
 
+#define ESR_EL2_EC_WFI_ISS_WFE	(1 << 0)
+
 #endif /* __ARM64_KVM_ARM_H__ */