// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Queued spinlock
 *
 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
 * (C) Copyright 2013-2014,2018 Red Hat, Inc.
 * (C) Copyright 2015 Intel Corp.
 * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
 *
 * Authors: Waiman Long <longman@redhat.com>
 *          Peter Zijlstra <peterz@infradead.org>
 */

#ifndef _GEN_PV_LOCK_SLOWPATH

#include <linux/smp.h>
#include <linux/bug.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/mutex.h>
#include <linux/prefetch.h>
#include <asm/byteorder.h>
#include <asm/qspinlock.h>

/*
 * Include queued spinlock statistics code
 */
#include "qspinlock_stat.h"

/*
 * The basic principle of a queue-based spinlock can best be understood
 * by studying a classic queue-based spinlock implementation called the
 * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
 * Synchronization on Shared-Memory Multiprocessors" by Mellor-Crummey and
 * Scott) is available at
 *
 * https://bugzilla.kernel.org/show_bug.cgi?id=206115
 *
 * This queued spinlock implementation is based on the MCS lock; however, to
 * make it fit the 4 bytes that spinlock_t is assumed to be, and to preserve
 * its existing API, we must modify it.
 *
 * In particular, where the traditional MCS lock consists of a tail pointer
 * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
 * unlock the next pending waiter (next->locked), we compress both of these:
 * {tail, next->locked} into a single u32 value.
 *
 * A spinlock disables recursion of its own context, and there is a limit to
 * the contexts that can nest: task, softirq, hardirq and nmi. As there are
 * at most 4 nesting levels, the level can be encoded in a 2-bit number. We
 * can then encode the tail by combining the 2-bit nesting level with the cpu
 * number. With one byte for the lock value and 3 bytes for the tail, only a
 * 32-bit word is needed. Even though we only need 1 bit for the lock, we
 * extend it to a full byte to achieve better performance on architectures
 * that support atomic byte writes.
 *
 * We also change the first spinner to spin on the lock bit instead of its
 * node, thereby avoiding the need to carry a node from lock to unlock and
 * preserving the existing lock API. This also makes the unlock code simpler
 * and faster.
 *
 * N.B. The current implementation only supports architectures that allow
 * atomic operations on the smaller 8-bit and 16-bit data types.
 */
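
/*
 * As a worked example of this encoding (assuming the NR_CPUS < 16K layout
 * from asm-generic/qspinlock_types.h: bits 0-7 locked byte, bit 8 pending,
 * bits 16-17 tail index, bits 18-31 tail cpu + 1), the tail for cpu == 5
 * at nesting level idx == 2 is:
 *
 *	tail = (5 + 1) << 18 | 2 << 16 = 0x180000 | 0x20000 = 0x1a0000
 *
 * so a full lock word of 0x1a0101 reads as "cpu 5 queued at nesting level 2,
 * pending set, locked".
 */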

#include "mcs_spinlock.h"
#define MAX_NODES	4

/*
 * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in
 * size and four of them will fit nicely in one 64-byte cacheline. For
 * pvqspinlock, however, we need more space for extra data. To accommodate
 * that, we insert two more long words to pad it up to 32 bytes. IOW, only
 * two of them can fit in a cacheline in this case. That is OK as it is rare
 * to have more than 2 levels of slowpath nesting in actual use. We don't
 * want to penalize pvqspinlocks to optimize for a rare case in native
 * qspinlocks.
 */
struct qnode {
	struct mcs_spinlock mcs;
#ifdef CONFIG_PARAVIRT_SPINLOCKS
	long reserved[2];
#endif
};
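
/*
 * A minimal sketch of the size assumptions above (illustrative only, not
 * part of the build): on a 64-bit architecture,
 *
 *	BUILD_BUG_ON(sizeof(struct mcs_spinlock) != 16);	// 4 per 64B line
 *	BUILD_BUG_ON(sizeof(struct qnode) > 32);		// PV: 2 per line
 *
 * would hold for the layouts described. Struct sizes and SMP_CACHE_BYTES
 * vary per architecture, so treat this as a sanity check, not a guarantee.
 */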

/*
 * The pending bit spinning loop count.
 * This heuristic is used to limit the number of lockword accesses
 * made by atomic_cond_read_relaxed when waiting for the lock to
 * transition out of the "== _Q_PENDING_VAL" state. We don't spin
 * indefinitely because there's no guarantee that we'll make forward
 * progress.
 */
#ifndef _Q_PENDING_LOOPS
#define _Q_PENDING_LOOPS	1
#endif
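
/*
 * Architectures may override this default. x86, for instance, raises it
 * (to 1 << 9 in arch/x86/include/asm/qspinlock.h at the time of writing)
 * because spinning a little longer on an in-flight pending->locked
 * hand-over there is cheaper than falling back to queueing.
 */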

/*
 * Per-CPU queue node structures; we can never have more than 4 nested
 * contexts: task, softirq, hardirq, nmi.
 *
 * Exactly fits one 64-byte cacheline on a 64-bit architecture.
 *
 * PV doubles the storage and uses the second cacheline for PV state.
 */
static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]);

/*
 * We must be able to distinguish between no-tail and the tail at 0:0,
 * therefore increment the cpu number by one.
 */

static inline __pure u32 encode_tail(int cpu, int idx)
{
	u32 tail;

	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */

	return tail;
}

static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
{
	int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
	int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;

	return per_cpu_ptr(&qnodes[idx].mcs, cpu);
}

static inline __pure
struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx)
{
	return &((struct qnode *)base + idx)->mcs;
}
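
/*
 * Illustrative round trip: encode_tail(0, 0) yields 1 << _Q_TAIL_CPU_OFFSET,
 * never 0, so a tail of 0 always means "no tail", and
 * decode_tail(encode_tail(cpu, idx)) maps back to
 * per_cpu_ptr(&qnodes[idx].mcs, cpu).
 */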

#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)

#if _Q_PENDING_BITS == 8
/**
 * clear_pending - clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,* -> *,0,*
 */
static __always_inline void clear_pending(struct qspinlock *lock)
{
	WRITE_ONCE(lock->pending, 0);
}

/**
 * clear_pending_set_locked - take ownership and clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,0 -> *,0,1
 *
 * Lock stealing is not allowed if this function is used.
 */
static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
{
	WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
}
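
/*
 * These single-copy byte/short stores rely on the field overlay of the lock
 * word; a sketch of the little-endian variant from
 * asm-generic/qspinlock_types.h (the big-endian one reverses the members):
 *
 *	struct qspinlock {
 *		union {
 *			atomic_t val;
 *			struct {
 *				u8	locked;
 *				u8	pending;
 *			};
 *			struct {
 *				u16	locked_pending;
 *				u16	tail;
 *			};
 *		};
 *	};
 */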

/*
 * xchg_tail - Put in the new queue tail code word & retrieve previous one
 * @lock : Pointer to queued spinlock structure
 * @tail : The new queue tail code word
 * Return: The previous queue tail code word
 *
 * xchg(lock, tail), which heads an address dependency
 *
 * p,*,* -> n,*,* ; prev = xchg(lock, node)
 */
static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
	/*
	 * We can use relaxed semantics since the caller ensures that the
	 * MCS node is properly initialized before updating the tail.
	 */
	return (u32)xchg_relaxed(&lock->tail,
				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
}

#else /* _Q_PENDING_BITS == 8 */

/**
 * clear_pending - clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,* -> *,0,*
 */
static __always_inline void clear_pending(struct qspinlock *lock)
{
	atomic_andnot(_Q_PENDING_VAL, &lock->val);
}

/**
 * clear_pending_set_locked - take ownership and clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,0 -> *,0,1
 */
static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
{
	atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
}

/**
 * xchg_tail - Put in the new queue tail code word & retrieve previous one
 * @lock : Pointer to queued spinlock structure
 * @tail : The new queue tail code word
 * Return: The previous queue tail code word
 *
 * xchg(lock, tail)
 *
 * p,*,* -> n,*,* ; prev = xchg(lock, node)
 */
static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
	u32 old, new, val = atomic_read(&lock->val);

	for (;;) {
		new = (val & _Q_LOCKED_PENDING_MASK) | tail;
		/*
		 * We can use relaxed semantics since the caller ensures that
		 * the MCS node is properly initialized before updating the
		 * tail.
		 */
		old = atomic_cmpxchg_relaxed(&lock->val, val, new);
		if (old == val)
			break;

		val = old;
	}
	return old;
}
#endif /* _Q_PENDING_BITS == 8 */

/**
 * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending
 * @lock : Pointer to queued spinlock structure
 * Return: The previous lock value
 *
 * *,*,* -> *,1,*
 */
#ifndef queued_fetch_set_pending_acquire
static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock)
{
	return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
}
#endif

/**
 * set_locked - Set the lock bit and own the lock
 * @lock: Pointer to queued spinlock structure
 *
 * *,*,0 -> *,0,1
 */
static __always_inline void set_locked(struct qspinlock *lock)
{
	WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
}


/*
 * Generate the native code for queued_spin_lock_slowpath(); provide NOPs for
 * all the PV callbacks.
 */

static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
					   struct mcs_spinlock *prev) { }
static __always_inline void __pv_kick_node(struct qspinlock *lock,
					   struct mcs_spinlock *node) { }
static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
						   struct mcs_spinlock *node)
						   { return 0; }

#define pv_enabled()		false

#define pv_init_node		__pv_init_node
#define pv_wait_node		__pv_wait_node
#define pv_kick_node		__pv_kick_node
#define pv_wait_head_or_lock	__pv_wait_head_or_lock

#ifdef CONFIG_PARAVIRT_SPINLOCKS
#define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
#endif

#endif /* _GEN_PV_LOCK_SLOWPATH */

/**
 * queued_spin_lock_slowpath - acquire the queued spinlock
 * @lock: Pointer to queued spinlock structure
 * @val: Current value of the queued spinlock 32-bit word
 *
 * (queue tail, pending bit, lock value)
 *
 *              fast     :    slow                                  :    unlock
 *                       :                                          :
 * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
 *                       :       | ^--------.------.             /  :
 *                       :       v           \      \            |  :
 * pending               :    (0,1,1) +--> (0,1,0)   \           |  :
 *                       :       | ^--'              |           |  :
 *                       :       v                   |           |  :
 * uncontended           :    (n,x,y) +--> (n,0,0) --'            |  :
 *   queue               :       | ^--'                           |  :
 *                       :       v                                |  :
 * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
 *   queue               :         ^--'                             :
 */
void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
	struct mcs_spinlock *prev, *next, *node;
	u32 old, tail;
	int idx;

	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));

	if (pv_enabled())
		goto pv_queue;

	if (virt_spin_lock(lock))
		return;

	/*
	 * Wait for in-progress pending->locked hand-overs with a bounded
	 * number of spins so that we guarantee forward progress.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	if (val == _Q_PENDING_VAL) {
		int cnt = _Q_PENDING_LOOPS;
		val = atomic_cond_read_relaxed(&lock->val,
					       (VAL != _Q_PENDING_VAL) || !cnt--);
	}

	/*
	 * If we observe any contention, queue.
	 */
	if (val & ~_Q_LOCKED_MASK)
		goto queue;

	/*
	 * trylock || pending
	 *
	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
	 */
	val = queued_fetch_set_pending_acquire(lock);

	/*
	 * If we observe contention, there is a concurrent locker.
	 *
	 * Undo and queue; our setting of PENDING might have made the
	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting
	 * on @next to become !NULL.
	 */
	if (unlikely(val & ~_Q_LOCKED_MASK)) {

		/* Undo PENDING if we set it. */
		if (!(val & _Q_PENDING_MASK))
			clear_pending(lock);

		goto queue;
	}

	/*
	 * We're pending, wait for the owner to go away.
	 *
	 * 0,1,1 -> 0,1,0
	 *
	 * this wait loop must be a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because not all
	 * clear_pending_set_locked() implementations imply full
	 * barriers.
	 */
	if (val & _Q_LOCKED_MASK)
		atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK));

	/*
	 * take ownership and clear the pending bit.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	clear_pending_set_locked(lock);
	lockevent_inc(lock_pending);
	return;

	/*
	 * End of pending bit optimistic spinning and beginning of MCS
	 * queuing.
	 */
queue:
	lockevent_inc(lock_slowpath);
pv_queue:
	node = this_cpu_ptr(&qnodes[0].mcs);
	idx = node->count++;
	tail = encode_tail(smp_processor_id(), idx);

	/*
	 * 4 nodes are allocated based on the assumption that there will
	 * not be nested NMIs taking spinlocks. That may not be true on
	 * some architectures, though the chance of needing more than
	 * 4 nodes is still extremely small. When that happens, we fall
	 * back to spinning on the lock directly without using any MCS
	 * node. This is not the most elegant solution, but it is simple
	 * enough.
	 */
	if (unlikely(idx >= MAX_NODES)) {
		lockevent_inc(lock_no_node);
		while (!queued_spin_trylock(lock))
			cpu_relax();
		goto release;
	}

	node = grab_mcs_node(node, idx);

	/*
	 * Keep counts of non-zero index values:
	 */
	lockevent_cond_inc(lock_use_node2 + idx - 1, idx);

	/*
	 * Ensure that we increment the head node->count before initialising
	 * the actual node. If the compiler were to reorder these stores,
	 * an IRQ could overwrite our assignments.
	 */
	barrier();

	node->locked = 0;
	node->next = NULL;
	pv_init_node(node);

	/*
	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
	 * attempt the trylock once more in the hope someone let go while we
	 * weren't watching.
	 */
	if (queued_spin_trylock(lock))
		goto release;

	/*
	 * Ensure that the initialisation of @node is complete before we
	 * publish the updated tail via xchg_tail() and potentially link
	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
	 */
	smp_wmb();

	/*
	 * Publish the updated tail.
	 * We have already touched the queueing cacheline; don't bother with
	 * pending stuff.
	 *
	 * p,*,* -> n,*,*
	 */
	old = xchg_tail(lock, tail);
	next = NULL;

	/*
	 * If there was a previous node, link ourselves to it and wait until
	 * we reach the head of the waitqueue.
	 */
	if (old & _Q_TAIL_MASK) {
		prev = decode_tail(old);

		/* Link @node into the waitqueue. */
		WRITE_ONCE(prev->next, node);

		pv_wait_node(node, prev);
		arch_mcs_spin_lock_contended(&node->locked);

		/*
		 * While waiting for the MCS lock, the next pointer may have
		 * been set by another lock waiter. We optimistically load
		 * the next pointer & prefetch the cacheline for writing
		 * to reduce latency in the upcoming MCS unlock operation.
		 */
		next = READ_ONCE(node->next);
		if (next)
			prefetchw(next);
	}

	/*
	 * We're at the head of the waitqueue; wait for the owner & pending
	 * to go away.
	 *
	 * *,x,y -> *,0,0
	 *
	 * this wait loop must use a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because the set_locked() function below
	 * does not imply a full barrier.
	 *
	 * The PV pv_wait_head_or_lock() function, if active, will acquire
	 * the lock and return a non-zero value. So we have to skip the
	 * atomic_cond_read_acquire() call. As the next PV queue head hasn't
	 * been designated yet, there is no way for the locked value to
	 * become _Q_SLOW_VAL. So both the set_locked() and the
	 * atomic_cmpxchg_relaxed() calls will be safe.
	 *
	 * If PV isn't active, 0 will be returned instead.
	 */
	if ((val = pv_wait_head_or_lock(lock, node)))
		goto locked;

	val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK));

locked:
	/*
	 * claim the lock:
	 *
	 * n,0,0 -> 0,0,1 : lock, uncontended
	 * *,*,0 -> *,*,1 : lock, contended
	 *
	 * If the queue head is the only one in the queue (lock value == tail)
	 * and nobody is pending, clear the tail code and grab the lock.
	 * Otherwise, we only need to grab the lock.
	 */

	/*
	 * In the PV case we might already have _Q_LOCKED_VAL set, because
	 * of lock stealing; therefore we must also allow:
	 *
	 * n,0,1 -> 0,0,1
	 *
	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
	 * above wait condition, therefore any concurrent setting of
	 * PENDING will make the uncontended transition fail.
	 */
	if ((val & _Q_TAIL_MASK) == tail) {
		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
			goto release; /* No contention */
	}

	/*
	 * Either somebody is queued behind us or _Q_PENDING_VAL got set
	 * which will then detect the remaining tail and queue behind us
	 * ensuring we'll see a @next.
	 */
	set_locked(lock);

	/*
	 * Contended path; wait for @next if not yet observed, then release.
	 */
	if (!next)
		next = smp_cond_load_relaxed(&node->next, (VAL));

	arch_mcs_spin_unlock_contended(&next->locked);
	pv_kick_node(lock, next);

release:
	/*
	 * release the node
	 */
	__this_cpu_dec(qnodes[0].mcs.count);
}
EXPORT_SYMBOL(queued_spin_lock_slowpath);
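
/*
 * For context, callers reach this slowpath through the inline fastpath in
 * include/asm-generic/qspinlock.h, which (roughly, at the time of writing)
 * looks like:
 *
 *	static __always_inline void queued_spin_lock(struct qspinlock *lock)
 *	{
 *		int val = 0;
 *
 *		if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val,
 *						      _Q_LOCKED_VAL)))
 *			return;
 *
 *		queued_spin_lock_slowpath(lock, val);
 *	}
 *
 * i.e. the slowpath only runs once the single-cmpxchg fast acquire fails.
 */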

/*
 * Generate the paravirt code for queued_spin_lock_slowpath().
 */
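/*
 * The generation trick in miniature (an illustrative sketch, not kernel
 * code): compile the file once with no-op hooks to get the native slowpath,
 * then re-include it with the hooks redefined to emit the paravirt variant.
 *
 *	// template.c
 *	#ifndef _GEN_TEMPLATE
 *	#define hook()	do { } while (0)	// native: no-op
 *	#endif
 *
 *	void slowpath(void)
 *	{
 *		hook();				// common body
 *	}
 *
 *	#if !defined(_GEN_TEMPLATE) && defined(CONFIG_PV)
 *	#define _GEN_TEMPLATE
 *	#undef hook
 *	#define hook()	pv_hook()		// PV: real callback
 *	#undef slowpath
 *	#define slowpath pv_slowpath
 *	#include "template.c"			// second pass emits pv_slowpath()
 *	#endif
 */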
#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS)
#define _GEN_PV_LOCK_SLOWPATH

#undef  pv_enabled
#define pv_enabled()	true

#undef pv_init_node
#undef pv_wait_node
#undef pv_kick_node
#undef pv_wait_head_or_lock

#undef  queued_spin_lock_slowpath
#define queued_spin_lock_slowpath	__pv_queued_spin_lock_slowpath

#include "qspinlock_paravirt.h"
#include "qspinlock.c"

bool nopvspin __initdata;
static __init int parse_nopvspin(char *arg)
{
	nopvspin = true;
	return 0;
}
early_param("nopvspin", parse_nopvspin);
#endif