#ifndef _GEN_PV_LOCK_SLOWPATH
#error "do not include this file"
#endif

#include <linux/hash.h>
#include <linux/bootmem.h>
#include <linux/debug_locks.h>

/*
 * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
 * of spinning them.
 *
 * This relies on the architecture to provide two paravirt hypercalls:
 *
 *   pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val
 *   pv_kick(cpu)             -- wakes a suspended vcpu
 *
 * Using these we implement __pv_queued_spin_lock_slowpath() and
 * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and
 * native_queued_spin_unlock().
 */

#define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)
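
/*
 * Illustrative sketch only (not part of this file): the contract the two
 * hypercalls must honour. The bodies below are hypothetical; each
 * architecture supplies its own implementation.
 *
 *	static void pv_wait(u8 *ptr, u8 val)
 *	{
 *		if (READ_ONCE(*ptr) != val)
 *			return;			// state changed; don't halt
 *		halt_this_vcpu();		// until a pv_kick() arrives;
 *						// spurious wakeups are allowed,
 *						// callers must re-check
 *	}
 *
 *	static void pv_kick(int cpu)
 *	{
 *		wake_vcpu(cpu);			// no-op if @cpu isn't halted
 *	}
 *
 * halt_this_vcpu()/wake_vcpu() are made-up names standing in for the
 * hypervisor interface (e.g. a halt-until-event hypercall and an IPI-like
 * wakeup request).
 */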

enum vcpu_state {
	vcpu_running = 0,
	vcpu_halted,
};

struct pv_node {
	struct mcs_spinlock	mcs;
	struct mcs_spinlock	__res[3];

	int			cpu;
	u8			state;
};

/*
 * Lock and MCS node addresses hash table for fast lookup
 *
 * Hashing is done on a per-cacheline basis to minimize the need to access
 * more than one cacheline.
 *
 * Dynamically allocate a hash table big enough to hold at least 4X the
 * number of possible cpus in the system. Allocation is done on page
 * granularity. So the minimum number of hash buckets should be at least
 * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
 *
 * Since we should not be holding locks from NMI context (very rare indeed) the
 * max load factor is 0.75, which is around the point where open addressing
 * breaks down.
 */
struct pv_hash_entry {
	struct qspinlock *lock;
	struct pv_node   *node;
};

#define PV_HE_PER_LINE	(SMP_CACHE_BYTES / sizeof(struct pv_hash_entry))
#define PV_HE_MIN	(PAGE_SIZE / sizeof(struct pv_hash_entry))
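
/*
 * Worked sizing example (illustrative, assuming 64-bit pointers, 64-byte
 * cachelines and 4K pages):
 *
 *	sizeof(struct pv_hash_entry) = 16 bytes (two pointers)
 *	PV_HE_PER_LINE = 64 / 16   = 4 entries per cacheline
 *	PV_HE_MIN      = 4096 / 16 = 256 entries per page
 *
 * With e.g. 1024 possible cpus the table gets 4 * 1024 = 4096 entries,
 * i.e. 64KB; with 16 cpus the 4 * 16 = 64 entries round up to the
 * PV_HE_MIN of 256. On 32-bit (8-byte entries) a 4K page holds 512.
 */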

static struct pv_hash_entry *pv_lock_hash;
static unsigned int pv_lock_hash_bits __read_mostly;

/*
 * Allocate memory for the PV qspinlock hash buckets
 *
 * This function should be called from the paravirt spinlock initialization
 * routine.
 */
void __init __pv_init_lock_hash(void)
{
	int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);

	if (pv_hash_size < PV_HE_MIN)
		pv_hash_size = PV_HE_MIN;

	/*
	 * Allocate space from bootmem which should be page-size aligned
	 * and hence cacheline aligned.
	 */
	pv_lock_hash = alloc_large_system_hash("PV qspinlock",
					       sizeof(struct pv_hash_entry),
					       pv_hash_size, 0, HASH_EARLY,
					       &pv_lock_hash_bits, NULL,
					       pv_hash_size, pv_hash_size);
}

#define for_each_hash_entry(he, offset, hash)						\
	for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0;	\
	     offset < (1 << pv_lock_hash_bits);						\
	     offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)])
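
/*
 * Probe-sequence sketch (illustrative): with PV_HE_PER_LINE = 4 and a
 * 256-entry table (pv_lock_hash_bits = 8), a raw hash of 13 is first
 * rounded down to the cacheline boundary:
 *
 *	hash &= ~3		-> 12
 *	probes: 12, 13, 14, 15	   (same cacheline)
 *	then:   16, 17, ...	   (linear, wrapping at 255 via the mask)
 *
 * so a lookup normally touches a single cacheline and only spills into
 * neighbouring lines when that line is full.
 */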

static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
{
	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
	struct pv_hash_entry *he;

	for_each_hash_entry(he, offset, hash) {
		if (!cmpxchg(&he->lock, NULL, lock)) {
			WRITE_ONCE(he->node, node);
			return &he->lock;
		}
	}
	/*
	 * Hard assume there is a free entry for us.
	 *
	 * This is guaranteed by ensuring every blocked lock only ever consumes
	 * a single entry, and since we only have 4 nesting levels per CPU
	 * and allocated 4*num_possible_cpus() entries, this must be so.
	 *
	 * The single entry is guaranteed by having the lock owner unhash
	 * before it releases.
	 */
	BUG();
}

static struct pv_node *pv_unhash(struct qspinlock *lock)
{
	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
	struct pv_hash_entry *he;
	struct pv_node *node;

	for_each_hash_entry(he, offset, hash) {
		if (READ_ONCE(he->lock) == lock) {
			node = READ_ONCE(he->node);
			WRITE_ONCE(he->lock, NULL);
			return node;
		}
	}
	/*
	 * Hard assume we'll find an entry.
	 *
	 * This guarantees a limited lookup time and is itself guaranteed by
	 * having the lock owner do the unhash -- IFF the unlock sees the
	 * SLOW flag, there MUST be a hash entry.
	 */
	BUG();
}
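
/*
 * Hash-entry lifecycle sketch (illustrative), tying pv_hash()/pv_unhash()
 * to their callers below:
 *
 *	head waiter (pv_wait_head)	    unlocker (__pv_queued_spin_unlock)
 *	--------------------------	    ----------------------------------
 *	lp = pv_hash(lock, pn);
 *	cmpxchg(&l->locked,
 *		_Q_LOCKED_VAL, _Q_SLOW_VAL);
 *	pv_wait(&l->locked, _Q_SLOW_VAL);
 *					    sees _Q_SLOW_VAL
 *					    node = pv_unhash(lock);
 *					    smp_store_release(&l->locked, 0);
 *					    pv_kick(node->cpu);
 *
 * If the cmpxchg finds the lock already free, the waiter instead unhashes
 * itself with WRITE_ONCE(*lp, NULL) and takes the lock directly.
 */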

/*
 * Initialize the PV part of the mcs_spinlock node.
 */
static void pv_init_node(struct mcs_spinlock *node)
{
	struct pv_node *pn = (struct pv_node *)node;

	BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock));

	pn->cpu = smp_processor_id();
	pn->state = vcpu_running;
}

/*
 * Wait for node->locked to become true, halt the vcpu after a short spin.
 * pv_kick_node() is used to wake the vcpu again.
 */
static void pv_wait_node(struct mcs_spinlock *node)
{
	struct pv_node *pn = (struct pv_node *)node;
	int loop;

	for (;;) {
		for (loop = SPIN_THRESHOLD; loop; loop--) {
			if (READ_ONCE(node->locked))
				return;
			cpu_relax();
		}

		/*
		 * Order pn->state vs pn->locked thusly:
		 *
		 * [S] pn->state = vcpu_halted	  [S] next->locked = 1
		 *     MB			      MB
		 * [L] pn->locked		  [RmW] pn->state = vcpu_running
		 *
		 * Matches the xchg() from pv_kick_node().
		 */
		smp_store_mb(pn->state, vcpu_halted);

		if (!READ_ONCE(node->locked))
			pv_wait(&pn->state, vcpu_halted);

		/*
		 * Reset the vCPU state to avoid unnecessary CPU kicking
		 */
		WRITE_ONCE(pn->state, vcpu_running);

		/*
		 * If the locked flag is still not set after wakeup, it is a
		 * spurious wakeup and the vCPU should wait again. However,
		 * there is a pretty high overhead for CPU halting and kicking.
		 * So it is better to spin for a while in the hope that the
		 * MCS lock will be released soon.
		 */
	}
	/*
	 * By now our node->locked should be 1 and our caller will not actually
	 * spin-wait for it. We do however rely on our caller to do a
	 * load-acquire for us.
	 */
}

/*
 * Called after setting next->locked = 1, used to wake those stuck in
 * pv_wait_node().
 */
static void pv_kick_node(struct mcs_spinlock *node)
{
	struct pv_node *pn = (struct pv_node *)node;

	/*
	 * Note that because node->locked is already set, this actual
	 * mcs_spinlock entry could be re-used already.
	 *
	 * This should be fine however, kicking people for no reason is
	 * harmless.
	 *
	 * See the comment in pv_wait_node().
	 */
	if (xchg(&pn->state, vcpu_running) == vcpu_halted)
		pv_kick(pn->cpu);
}
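
/*
 * Illustrative bad interleaving (a sketch of the race the smp_store_mb()
 * in pv_wait_node() and the xchg() above prevent). If the waiter's store
 * of vcpu_halted could pass its re-check of node->locked:
 *
 *	waiter				    kicker
 *	------				    ------
 *	reads node->locked == 0
 *					    next->locked = 1
 *					    xchg(&pn->state, vcpu_running)
 *					       == vcpu_running -> no kick
 *	pn->state = vcpu_halted
 *	pv_wait() -> halts forever
 *
 * The full barrier in smp_store_mb() orders the state store before the
 * locked re-check, so either the waiter sees locked == 1 and skips
 * pv_wait(), or the kicker's xchg() sees vcpu_halted and kicks.
 */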

/*
 * Wait for l->locked to become clear; halt the vcpu after a short spin.
 * __pv_queued_spin_unlock() will wake us.
 */
static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
{
	struct pv_node *pn = (struct pv_node *)node;
	struct __qspinlock *l = (void *)lock;
	struct qspinlock **lp = NULL;
	int loop;

	for (;;) {
		for (loop = SPIN_THRESHOLD; loop; loop--) {
			if (!READ_ONCE(l->locked))
				return;
			cpu_relax();
		}

		WRITE_ONCE(pn->state, vcpu_halted);
		if (!lp) { /* ONCE */
			lp = pv_hash(lock, pn);

			/*
			 * We must hash before setting _Q_SLOW_VAL, such that
			 * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
			 * we'll be sure to be able to observe our hash entry.
			 *
			 *   [S] pn->state
			 *   [S] <hash>			  [Rmw] l->locked == _Q_SLOW_VAL
			 *       MB				RMB
			 *   [RmW] l->locked = _Q_SLOW_VAL  [L] <unhash>
			 *				    [L] pn->state
			 *
			 * Matches the smp_rmb() in __pv_queued_spin_unlock().
			 */
			if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
				/*
				 * The lock is free and _Q_SLOW_VAL has never
				 * been set. Therefore we need to unhash before
				 * getting the lock.
				 */
				WRITE_ONCE(*lp, NULL);
				return;
			}
		}
		pv_wait(&l->locked, _Q_SLOW_VAL);

		/*
		 * The unlocker should have freed the lock before kicking the
		 * CPU. So if the lock is still not free, it is a spurious
		 * wakeup and so the vCPU should wait again after spinning for
		 * a while.
		 */
	}

	/*
	 * Lock is unlocked now; the caller will acquire it without waiting.
	 * As with pv_wait_node() we rely on the caller to do a load-acquire
	 * for us.
	 */
}

/*
 * PV version of the unlock function to be used instead of
 * queued_spin_unlock().
 */
__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
{
	struct __qspinlock *l = (void *)lock;
	struct pv_node *node;
	u8 locked;

	/*
	 * We must not unlock if SLOW, because in that case we must first
	 * unhash. Otherwise it would be possible to have multiple @lock
	 * entries, which would be BAD.
	 */
	locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
	if (likely(locked == _Q_LOCKED_VAL))
		return;

	if (unlikely(locked != _Q_SLOW_VAL)) {
		WARN(!debug_locks_silent,
		     "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n",
		     (unsigned long)lock, atomic_read(&lock->val));
		return;
	}

	/*
	 * A failed cmpxchg doesn't provide any memory-ordering guarantees,
	 * so we need a barrier to order the read of the node data in
	 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
	 *
	 * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
	 */
	smp_rmb();

	/*
	 * Since the above failed to release, this must be the SLOW path.
	 * Therefore start by looking up the blocked node and unhashing it.
	 */
	node = pv_unhash(lock);

	/*
	 * Now that we have a reference to the (likely) blocked pv_node,
	 * release the lock.
	 */
	smp_store_release(&l->locked, 0);

	/*
	 * At this point the memory pointed at by lock can be freed/reused,
	 * however we can still use the pv_node to kick the CPU.
	 */
	if (READ_ONCE(node->state) == vcpu_halted)
		pv_kick(node->cpu);
}

/*
 * Include the architecture specific callee-save thunk of the
 * __pv_queued_spin_unlock(). This thunk is put together with
 * __pv_queued_spin_unlock() near the top of the file to make sure
 * that the callee-save thunk and the real unlock function are close
 * to each other sharing consecutive instruction cachelines.
 */
#include <asm/qspinlock_paravirt.h>
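
/*
 * Illustrative only: on x86, for example, that header may boil down to a
 * callee-save wrapper along the lines of
 *
 *	PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock);
 *
 * which emits __raw_callee_save___pv_queued_spin_unlock, preserving all
 * registers around the call so the patched unlock site stays cheap. Check
 * the architecture's asm/qspinlock_paravirt.h for the actual contents.
 */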