Blame - kernel/futex/pi.c - SHIFTPHONES/mainline/linux

blob: 183b28c32c832e7169b6953d7c99ef844477b8b3 [file] [log] [blame]

Peter Zijlstra	85dc28f	2021-09-23 14:10:58 -0300	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-or-later
				2
				3	#include <linux/slab.h>
				4	#include <linux/sched/task.h>
				5
				6	#include "futex.h"
				7	#include "../locking/rtmutex_common.h"
				8
				9	/*
				10	* PI code:
				11	*/
				12	int refill_pi_state_cache(void)
				13	{
				14	struct futex_pi_state *pi_state;
				15
				16	if (likely(current->pi_state_cache))
				17	return 0;
				18
				19	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
				20
				21	if (!pi_state)
				22	return -ENOMEM;
				23
				24	INIT_LIST_HEAD(&pi_state->list);
				25	/* pi_mutex gets initialized later */
				26	pi_state->owner = NULL;
				27	refcount_set(&pi_state->refcount, 1);
				28	pi_state->key = FUTEX_KEY_INIT;
				29
				30	current->pi_state_cache = pi_state;
				31
				32	return 0;
				33	}
				34
				35	static struct futex_pi_state *alloc_pi_state(void)
				36	{
				37	struct futex_pi_state *pi_state = current->pi_state_cache;
				38
				39	WARN_ON(!pi_state);
				40	current->pi_state_cache = NULL;
				41
				42	return pi_state;
				43	}
				44
				45	static void pi_state_update_owner(struct futex_pi_state *pi_state,
				46	struct task_struct *new_owner)
				47	{
				48	struct task_struct *old_owner = pi_state->owner;
				49
				50	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
				51
				52	if (old_owner) {
				53	raw_spin_lock(&old_owner->pi_lock);
				54	WARN_ON(list_empty(&pi_state->list));
				55	list_del_init(&pi_state->list);
				56	raw_spin_unlock(&old_owner->pi_lock);
				57	}
				58
				59	if (new_owner) {
				60	raw_spin_lock(&new_owner->pi_lock);
				61	WARN_ON(!list_empty(&pi_state->list));
				62	list_add(&pi_state->list, &new_owner->pi_state_list);
				63	pi_state->owner = new_owner;
				64	raw_spin_unlock(&new_owner->pi_lock);
				65	}
				66	}
				67
				68	void get_pi_state(struct futex_pi_state *pi_state)
				69	{
				70	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
				71	}
				72
				73	/*
				74	* Drops a reference to the pi_state object and frees or caches it
				75	* when the last reference is gone.
				76	*/
				77	void put_pi_state(struct futex_pi_state *pi_state)
				78	{
				79	if (!pi_state)
				80	return;
				81
				82	if (!refcount_dec_and_test(&pi_state->refcount))
				83	return;
				84
				85	/*
				86	* If pi_state->owner is NULL, the owner is most probably dying
				87	* and has cleaned up the pi_state already
				88	*/
				89	if (pi_state->owner) {
				90	unsigned long flags;
				91
				92	raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
				93	pi_state_update_owner(pi_state, NULL);
				94	rt_mutex_proxy_unlock(&pi_state->pi_mutex);
				95	raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
				96	}
				97
				98	if (current->pi_state_cache) {
				99	kfree(pi_state);
				100	} else {
				101	/*
				102	* pi_state->list is already empty.
				103	* clear pi_state->owner.
				104	* refcount is at 0 - put it back to 1.
				105	*/
				106	pi_state->owner = NULL;
				107	refcount_set(&pi_state->refcount, 1);
				108	current->pi_state_cache = pi_state;
				109	}
				110	}
				111
				112	/*
				113	* We need to check the following states:
				114	*
				115	*      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
				116	*
				117	* [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
				118	* [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
				119	*
				120	* [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
				121	*
				122	* [4]  Found  | Found    | NULL      | 0         | 1      | Valid
				123	* [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
				124	*
				125	* [6]  Found  | Found    | task      | 0         | 1      | Valid
				126	*
				127	* [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
				128	*
				129	* [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
				130	* [9]  Found  | Found    | task      | 0         | 0      | Invalid
				131	* [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
				132	*
				133	* [1] Indicates that the kernel can acquire the futex atomically. We
				134	* came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
				135	*
				136	* [2] Valid, if TID does not belong to a kernel thread. If no matching
				137	* thread is found then it indicates that the owner TID has died.
				138	*
				139	* [3] Invalid. The waiter is queued on a non PI futex
				140	*
				141	* [4] Valid state after exit_robust_list(), which sets the user space
				142	* value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
				143	*
				144	* [5] The user space value got manipulated between exit_robust_list()
				145	* and exit_pi_state_list()
				146	*
				147	* [6] Valid state after exit_pi_state_list() which sets the new owner in
				148	* the pi_state but cannot access the user space value.
				149	*
				150	* [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
				151	*
				152	* [8] Owner and user space value match
				153	*
				154	* [9] There is no transient state which sets the user space TID to 0
				155	* except exit_robust_list(), but this is indicated by the
				156	* FUTEX_OWNER_DIED bit. See [4]
				157	*
				158	* [10] There is no transient state which leaves owner and user space
				159	* TID out of sync. Except one error case where the kernel is denied
				160	* write access to the user address, see fixup_pi_state_owner().
				161	*
				162	*
				163	* Serialization and lifetime rules:
				164	*
				165	* hb->lock:
				166	*
				167	* hb -> futex_q, relation
				168	* futex_q -> pi_state, relation
				169	*
				170	* (cannot be raw because hb can contain arbitrary amount
				171	* of futex_q's)
				172	*
				173	* pi_mutex->wait_lock:
				174	*
				175	* {uval, pi_state}
				176	*
				177	* (and pi_mutex 'obviously')
				178	*
				179	* p->pi_lock:
				180	*
				181	* p->pi_state_list -> pi_state->list, relation
				182	* pi_mutex->owner -> pi_state->owner, relation
				183	*
				184	* pi_state->refcount:
				185	*
				186	* pi_state lifetime
				187	*
				188	*
				189	* Lock order:
				190	*
				191	* hb->lock
				192	* pi_mutex->wait_lock
				193	* p->pi_lock
				194	*
				195	*/
				196
				197	/*
				198	* Validate that the existing waiter has a pi_state and sanity check
				199	* the pi_state against the user space value. If correct, attach to
				200	* it.
				201	*/
				202	static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
				203	struct futex_pi_state *pi_state,
				204	struct futex_pi_state **ps)
				205	{
				206	pid_t pid = uval & FUTEX_TID_MASK;
				207	u32 uval2;
				208	int ret;
				209
				210	/*
				211	* Userspace might have messed up non-PI and PI futexes [3]
				212	*/
				213	if (unlikely(!pi_state))
				214	return -EINVAL;
				215
				216	/*
				217	* We get here with hb->lock held, and having found a
				218	* futex_top_waiter(). This means that futex_lock_pi() of said futex_q
				219	* has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
				220	* which in turn means that futex_lock_pi() still has a reference on
				221	* our pi_state.
				222	*
				223	* The waiter holding a reference on @pi_state also protects against
				224	* the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
				225	* and futex_wait_requeue_pi() as it cannot go to 0 and consequently
				226	* free pi_state before we can take a reference ourselves.
				227	*/
				228	WARN_ON(!refcount_read(&pi_state->refcount));
				229
				230	/*
				231	* Now that we have a pi_state, we can acquire wait_lock
				232	* and do the state validation.
				233	*/
				234	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
				235
				236	/*
				237	* Since {uval, pi_state} is serialized by wait_lock, and our current
				238	* uval was read without holding it, it can have changed. Verify it
				239	* still is what we expect it to be, otherwise retry the entire
				240	* operation.
				241	*/
				242	if (futex_get_value_locked(&uval2, uaddr))
				243	goto out_efault;
				244
				245	if (uval != uval2)
				246	goto out_eagain;
				247
				248	/*
				249	* Handle the owner died case:
				250	*/
				251	if (uval & FUTEX_OWNER_DIED) {
				252	/*
				253	* exit_pi_state_list sets owner to NULL and wakes the
				254	* topmost waiter. The task which acquires the
				255	* pi_state->rt_mutex will fixup owner.
				256	*/
				257	if (!pi_state->owner) {
				258	/*
				259	* No pi state owner, but the user space TID
				260	* is not 0. Inconsistent state. [5]
				261	*/
				262	if (pid)
				263	goto out_einval;
				264	/*
				265	* Take a ref on the state and return success. [4]
				266	*/
				267	goto out_attach;
				268	}
				269
				270	/*
				271	* If TID is 0, then either the dying owner has not
				272	* yet executed exit_pi_state_list() or some waiter
				273	* acquired the rtmutex in the pi state, but did not
				274	* yet fixup the TID in user space.
				275	*
				276	* Take a ref on the state and return success. [6]
				277	*/
				278	if (!pid)
				279	goto out_attach;
				280	} else {
				281	/*
				282	* If the owner died bit is not set, then the pi_state
				283	* must have an owner. [7]
				284	*/
				285	if (!pi_state->owner)
				286	goto out_einval;
				287	}
				288
				289	/*
				290	* Bail out if user space manipulated the futex value. If pi
				291	* state exists then the owner TID must be the same as the
				292	* user space TID. [9/10]
				293	*/
				294	if (pid != task_pid_vnr(pi_state->owner))
				295	goto out_einval;
				296
				297	out_attach:
				298	get_pi_state(pi_state);
				299	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
				300	*ps = pi_state;
				301	return 0;
				302
				303	out_einval:
				304	ret = -EINVAL;
				305	goto out_error;
				306
				307	out_eagain:
				308	ret = -EAGAIN;
				309	goto out_error;
				310
				311	out_efault:
				312	ret = -EFAULT;
				313	goto out_error;
				314
				315	out_error:
				316	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
				317	return ret;
				318	}
				319
				320	static int handle_exit_race(u32 __user *uaddr, u32 uval,
				321	struct task_struct *tsk)
				322	{
				323	u32 uval2;
				324
				325	/*
				326	* If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
				327	* caller that the alleged owner is busy.
				328	*/
				329	if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
				330	return -EBUSY;
				331
				332	/*
				333	* Reread the user space value to handle the following situation:
				334	*
				335	* CPU0 CPU1
				336	*
				337	* sys_exit() sys_futex()
				338	* do_exit() futex_lock_pi()
				339	* futex_lock_pi_atomic()
				340	* exit_signals(tsk) No waiters:
				341	* tsk->flags \|= PF_EXITING; *uaddr == 0x00000PID
				342	* mm_release(tsk) Set waiter bit
				343	* exit_robust_list(tsk) { *uaddr = 0x80000PID;
				344	* Set owner died attach_to_pi_owner() {
				345	* *uaddr = 0xC0000000; tsk = get_task(PID);
				346	* } if (!tsk->flags & PF_EXITING) {
				347	* ... attach();
				348	* tsk->futex_state = } else {
				349	* FUTEX_STATE_DEAD; if (tsk->futex_state !=
				350	* FUTEX_STATE_DEAD)
				351	* return -EAGAIN;
				352	* return -ESRCH; <--- FAIL
				353	* }
				354	*
				355	* Returning ESRCH unconditionally is wrong here because the
				356	* user space value has been changed by the exiting task.
				357	*
				358	* The same logic applies to the case where the exiting task is
				359	* already gone.
				360	*/
				361	if (futex_get_value_locked(&uval2, uaddr))
				362	return -EFAULT;
				363
				364	/* If the user space value has changed, try again. */
				365	if (uval2 != uval)
				366	return -EAGAIN;
				367
				368	/*
				369	* The exiting task did not have a robust list, the robust list was
				370	* corrupted or the user space value in *uaddr is simply bogus.
				371	* Give up and tell user space.
				372	*/
				373	return -ESRCH;
				374	}
				375
				376	static void __attach_to_pi_owner(struct task_struct p, union futex_key key,
				377	struct futex_pi_state **ps)
				378	{
				379	/*
				380	* No existing pi state. First waiter. [2]
				381	*
				382	* This creates pi_state, we have hb->lock held, this means nothing can
				383	* observe this state, wait_lock is irrelevant.
				384	*/
				385	struct futex_pi_state *pi_state = alloc_pi_state();
				386
				387	/*
				388	* Initialize the pi_mutex in locked state and make @p
				389	* the owner of it:
				390	*/
				391	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
				392
				393	/* Store the key for possible exit cleanups: */
				394	pi_state->key = *key;
				395
				396	WARN_ON(!list_empty(&pi_state->list));
				397	list_add(&pi_state->list, &p->pi_state_list);
				398	/*
				399	* Assignment without holding pi_state->pi_mutex.wait_lock is safe
				400	* because there is no concurrency as the object is not published yet.
				401	*/
				402	pi_state->owner = p;
				403
				404	*ps = pi_state;
				405	}
				406	/*
				407	* Lookup the task for the TID provided from user space and attach to
				408	* it after doing proper sanity checks.
				409	*/
				410	static int attach_to_pi_owner(u32 __user uaddr, u32 uval, union futex_key key,
				411	struct futex_pi_state **ps,
				412	struct task_struct **exiting)
				413	{
				414	pid_t pid = uval & FUTEX_TID_MASK;
				415	struct task_struct *p;
				416
				417	/*
				418	* We are the first waiter - try to look up the real owner and attach
				419	* the new pi_state to it, but bail out when TID = 0 [1]
				420	*
				421	* The !pid check is paranoid. None of the call sites should end up
				422	* with pid == 0, but better safe than sorry. Let the caller retry
				423	*/
				424	if (!pid)
				425	return -EAGAIN;
				426	p = find_get_task_by_vpid(pid);
				427	if (!p)
				428	return handle_exit_race(uaddr, uval, NULL);
				429
				430	if (unlikely(p->flags & PF_KTHREAD)) {
				431	put_task_struct(p);
				432	return -EPERM;
				433	}
				434
				435	/*
				436	* We need to look at the task state to figure out, whether the
				437	* task is exiting. To protect against the change of the task state
				438	* in futex_exit_release(), we do this protected by p->pi_lock:
				439	*/
				440	raw_spin_lock_irq(&p->pi_lock);
				441	if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
				442	/*
				443	* The task is on the way out. When the futex state is
				444	* FUTEX_STATE_DEAD, we know that the task has finished
				445	* the cleanup:
				446	*/
				447	int ret = handle_exit_race(uaddr, uval, p);
				448
				449	raw_spin_unlock_irq(&p->pi_lock);
				450	/*
				451	* If the owner task is between FUTEX_STATE_EXITING and
				452	* FUTEX_STATE_DEAD then store the task pointer and keep
				453	* the reference on the task struct. The calling code will
				454	* drop all locks, wait for the task to reach
				455	* FUTEX_STATE_DEAD and then drop the refcount. This is
				456	* required to prevent a live lock when the current task
				457	* preempted the exiting task between the two states.
				458	*/
				459	if (ret == -EBUSY)
				460	*exiting = p;
				461	else
				462	put_task_struct(p);
				463	return ret;
				464	}
				465
				466	__attach_to_pi_owner(p, key, ps);
				467	raw_spin_unlock_irq(&p->pi_lock);
				468
				469	put_task_struct(p);
				470
				471	return 0;
				472	}
				473
				474	static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
				475	{
				476	int err;
				477	u32 curval;
				478
				479	if (unlikely(should_fail_futex(true)))
				480	return -EFAULT;
				481
				482	err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
				483	if (unlikely(err))
				484	return err;
				485
				486	/* If user space value changed, let the caller retry */
				487	return curval != uval ? -EAGAIN : 0;
				488	}
				489
				490	/**
				491	* futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
				492	* @uaddr: the pi futex user address
				493	* @hb: the pi futex hash bucket
				494	* @key: the futex key associated with uaddr and hb
				495	* @ps: the pi_state pointer where we store the result of the
				496	* lookup
				497	* @task: the task to perform the atomic lock work for. This will
				498	* be "current" except in the case of requeue pi.
				499	* @exiting: Pointer to store the task pointer of the owner task
				500	* which is in the middle of exiting
				501	* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
				502	*
				503	* Return:
				504	* - 0 - ready to wait;
				505	* - 1 - acquired the lock;
				506	* - <0 - error
				507	*
				508	* The hb->lock must be held by the caller.
				509	*
				510	* @exiting is only set when the return value is -EBUSY. If so, this holds
				511	* a refcount on the exiting task on return and the caller needs to drop it
				512	* after waiting for the exit to complete.
				513	*/
				514	int futex_lock_pi_atomic(u32 __user uaddr, struct futex_hash_bucket hb,
				515	union futex_key *key,
				516	struct futex_pi_state **ps,
				517	struct task_struct *task,
				518	struct task_struct **exiting,
				519	int set_waiters)
				520	{
				521	u32 uval, newval, vpid = task_pid_vnr(task);
				522	struct futex_q *top_waiter;
				523	int ret;
				524
				525	/*
				526	* Read the user space value first so we can validate a few
				527	* things before proceeding further.
				528	*/
				529	if (futex_get_value_locked(&uval, uaddr))
				530	return -EFAULT;
				531
				532	if (unlikely(should_fail_futex(true)))
				533	return -EFAULT;
				534
				535	/*
				536	* Detect deadlocks.
				537	*/
				538	if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
				539	return -EDEADLK;
				540
				541	if ((unlikely(should_fail_futex(true))))
				542	return -EDEADLK;
				543
				544	/*
				545	* Lookup existing state first. If it exists, try to attach to
				546	* its pi_state.
				547	*/
				548	top_waiter = futex_top_waiter(hb, key);
				549	if (top_waiter)
				550	return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
				551
				552	/*
				553	* No waiter and user TID is 0. We are here because the
				554	* waiters or the owner died bit is set or called from
				555	* requeue_cmp_pi or for whatever reason something took the
				556	* syscall.
				557	*/
				558	if (!(uval & FUTEX_TID_MASK)) {
				559	/*
				560	* We take over the futex. No other waiters and the user space
				561	* TID is 0. We preserve the owner died bit.
				562	*/
				563	newval = uval & FUTEX_OWNER_DIED;
				564	newval \|= vpid;
				565
				566	/* The futex requeue_pi code can enforce the waiters bit */
				567	if (set_waiters)
				568	newval \|= FUTEX_WAITERS;
				569
				570	ret = lock_pi_update_atomic(uaddr, uval, newval);
				571	if (ret)
				572	return ret;
				573
				574	/*
				575	* If the waiter bit was requested the caller also needs PI
				576	* state attached to the new owner of the user space futex.
				577	*
				578	* @task is guaranteed to be alive and it cannot be exiting
				579	* because it is either sleeping or waiting in
				580	* futex_requeue_pi_wakeup_sync().
				581	*
				582	* No need to do the full attach_to_pi_owner() exercise
				583	* because @task is known and valid.
				584	*/
				585	if (set_waiters) {
				586	raw_spin_lock_irq(&task->pi_lock);
				587	__attach_to_pi_owner(task, key, ps);
				588	raw_spin_unlock_irq(&task->pi_lock);
				589	}
				590	return 1;
				591	}
				592
				593	/*
				594	* First waiter. Set the waiters bit before attaching ourself to
				595	* the owner. If owner tries to unlock, it will be forced into
				596	* the kernel and blocked on hb->lock.
				597	*/
				598	newval = uval \| FUTEX_WAITERS;
				599	ret = lock_pi_update_atomic(uaddr, uval, newval);
				600	if (ret)
				601	return ret;
				602	/*
				603	* If the update of the user space value succeeded, we try to
				604	* attach to the owner. If that fails, no harm done, we only
				605	* set the FUTEX_WAITERS bit in the user space variable.
				606	*/
				607	return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
				608	}
				609
				610	/*
				611	* Caller must hold a reference on @pi_state.
				612	*/
				613	static int wake_futex_pi(u32 __user uaddr, u32 uval, struct futex_pi_state pi_state)
				614	{
				615	struct rt_mutex_waiter *top_waiter;
				616	struct task_struct *new_owner;
				617	bool postunlock = false;
				618	DEFINE_RT_WAKE_Q(wqh);
				619	u32 curval, newval;
				620	int ret = 0;
				621
				622	top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
				623	if (WARN_ON_ONCE(!top_waiter)) {
				624	/*
				625	* As per the comment in futex_unlock_pi() this should not happen.
				626	*
				627	* When this happens, give up our locks and try again, giving
				628	* the futex_lock_pi() instance time to complete, either by
				629	* waiting on the rtmutex or removing itself from the futex
				630	* queue.
				631	*/
				632	ret = -EAGAIN;
				633	goto out_unlock;
				634	}
				635
				636	new_owner = top_waiter->task;
				637
				638	/*
				639	* We pass it to the next owner. The WAITERS bit is always kept
				640	* enabled while there is PI state around. We cleanup the owner
				641	* died bit, because we are the owner.
				642	*/
				643	newval = FUTEX_WAITERS \| task_pid_vnr(new_owner);
				644
				645	if (unlikely(should_fail_futex(true))) {
				646	ret = -EFAULT;
				647	goto out_unlock;
				648	}
				649
				650	ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
				651	if (!ret && (curval != uval)) {
				652	/*
				653	* If a unconditional UNLOCK_PI operation (user space did not
				654	* try the TID->0 transition) raced with a waiter setting the
				655	* FUTEX_WAITERS flag between get_user() and locking the hash
				656	* bucket lock, retry the operation.
				657	*/
				658	if ((FUTEX_TID_MASK & curval) == uval)
				659	ret = -EAGAIN;
				660	else
				661	ret = -EINVAL;
				662	}
				663
				664	if (!ret) {
				665	/*
				666	* This is a point of no return; once we modified the uval
				667	* there is no going back and subsequent operations must
				668	* not fail.
				669	*/
				670	pi_state_update_owner(pi_state, new_owner);
				671	postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
				672	}
				673
				674	out_unlock:
				675	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
				676
				677	if (postunlock)
				678	rt_mutex_postunlock(&wqh);
				679
				680	return ret;
				681	}
				682
				683	static int __fixup_pi_state_owner(u32 __user uaddr, struct futex_q q,
				684	struct task_struct *argowner)
				685	{
				686	struct futex_pi_state *pi_state = q->pi_state;
				687	struct task_struct oldowner, newowner;
				688	u32 uval, curval, newval, newtid;
				689	int err = 0;
				690
				691	oldowner = pi_state->owner;
				692
				693	/*
				694	* We are here because either:
				695	*
				696	* - we stole the lock and pi_state->owner needs updating to reflect
				697	* that (@argowner == current),
				698	*
				699	* or:
				700	*
				701	* - someone stole our lock and we need to fix things to point to the
				702	* new owner (@argowner == NULL).
				703	*
				704	* Either way, we have to replace the TID in the user space variable.
				705	* This must be atomic as we have to preserve the owner died bit here.
				706	*
				707	* Note: We write the user space value _before_ changing the pi_state
				708	* because we can fault here. Imagine swapped out pages or a fork
				709	* that marked all the anonymous memory readonly for cow.
				710	*
				711	* Modifying pi_state _before_ the user space value would leave the
				712	* pi_state in an inconsistent state when we fault here, because we
				713	* need to drop the locks to handle the fault. This might be observed
				714	* in the PID checks when attaching to PI state .
				715	*/
				716	retry:
				717	if (!argowner) {
				718	if (oldowner != current) {
				719	/*
				720	* We raced against a concurrent self; things are
				721	* already fixed up. Nothing to do.
				722	*/
				723	return 0;
				724	}
				725
				726	if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
				727	/* We got the lock. pi_state is correct. Tell caller. */
				728	return 1;
				729	}
				730
				731	/*
				732	* The trylock just failed, so either there is an owner or
				733	* there is a higher priority waiter than this one.
				734	*/
				735	newowner = rt_mutex_owner(&pi_state->pi_mutex);
				736	/*
				737	* If the higher priority waiter has not yet taken over the
				738	* rtmutex then newowner is NULL. We can't return here with
				739	* that state because it's inconsistent vs. the user space
				740	* state. So drop the locks and try again. It's a valid
				741	* situation and not any different from the other retry
				742	* conditions.
				743	*/
				744	if (unlikely(!newowner)) {
				745	err = -EAGAIN;
				746	goto handle_err;
				747	}
				748	} else {
				749	WARN_ON_ONCE(argowner != current);
				750	if (oldowner == current) {
				751	/*
				752	* We raced against a concurrent self; things are
				753	* already fixed up. Nothing to do.
				754	*/
				755	return 1;
				756	}
				757	newowner = argowner;
				758	}
				759
				760	newtid = task_pid_vnr(newowner) \| FUTEX_WAITERS;
				761	/* Owner died? */
				762	if (!pi_state->owner)
				763	newtid \|= FUTEX_OWNER_DIED;
				764
				765	err = futex_get_value_locked(&uval, uaddr);
				766	if (err)
				767	goto handle_err;
				768
				769	for (;;) {
				770	newval = (uval & FUTEX_OWNER_DIED) \| newtid;
				771
				772	err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
				773	if (err)
				774	goto handle_err;
				775
				776	if (curval == uval)
				777	break;
				778	uval = curval;
				779	}
				780
				781	/*
				782	* We fixed up user space. Now we need to fix the pi_state
				783	* itself.
				784	*/
				785	pi_state_update_owner(pi_state, newowner);
				786
				787	return argowner == current;
				788
				789	/*
				790	* In order to reschedule or handle a page fault, we need to drop the
				791	* locks here. In the case of a fault, this gives the other task
				792	* (either the highest priority waiter itself or the task which stole
				793	* the rtmutex) the chance to try the fixup of the pi_state. So once we
				794	* are back from handling the fault we need to check the pi_state after
				795	* reacquiring the locks and before trying to do another fixup. When
				796	* the fixup has been done already we simply return.
				797	*
				798	* Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
				799	* drop hb->lock since the caller owns the hb -> futex_q relation.
				800	* Dropping the pi_mutex->wait_lock requires the state revalidate.
				801	*/
				802	handle_err:
				803	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
				804	spin_unlock(q->lock_ptr);
				805
				806	switch (err) {
				807	case -EFAULT:
				808	err = fault_in_user_writeable(uaddr);
				809	break;
				810
				811	case -EAGAIN:
				812	cond_resched();
				813	err = 0;
				814	break;
				815
				816	default:
				817	WARN_ON_ONCE(1);
				818	break;
				819	}
				820
				821	spin_lock(q->lock_ptr);
				822	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
				823
				824	/*
				825	* Check if someone else fixed it for us:
				826	*/
				827	if (pi_state->owner != oldowner)
				828	return argowner == current;
				829
				830	/* Retry if err was -EAGAIN or the fault in succeeded */
				831	if (!err)
				832	goto retry;
				833
				834	/*
				835	* fault_in_user_writeable() failed so user state is immutable. At
				836	* best we can make the kernel state consistent but user state will
				837	* be most likely hosed and any subsequent unlock operation will be
				838	* rejected due to PI futex rule [10].
				839	*
				840	* Ensure that the rtmutex owner is also the pi_state owner despite
				841	* the user space value claiming something different. There is no
				842	* point in unlocking the rtmutex if current is the owner as it
				843	* would need to wait until the next waiter has taken the rtmutex
				844	* to guarantee consistent state. Keep it simple. Userspace asked
				845	* for this wreckaged state.
				846	*
				847	* The rtmutex has an owner - either current or some other
				848	* task. See the EAGAIN loop above.
				849	*/
				850	pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
				851
				852	return err;
				853	}
				854
				855	static int fixup_pi_state_owner(u32 __user uaddr, struct futex_q q,
				856	struct task_struct *argowner)
				857	{
				858	struct futex_pi_state *pi_state = q->pi_state;
				859	int ret;
				860
				861	lockdep_assert_held(q->lock_ptr);
				862
				863	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
				864	ret = __fixup_pi_state_owner(uaddr, q, argowner);
				865	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
				866	return ret;
				867	}
				868
				869	/**
				870	* fixup_pi_owner() - Post lock pi_state and corner case management
				871	* @uaddr: user address of the futex
				872	* @q: futex_q (contains pi_state and access to the rt_mutex)
				873	* @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
				874	*
				875	* After attempting to lock an rt_mutex, this function is called to cleanup
				876	* the pi_state owner as well as handle race conditions that may allow us to
				877	* acquire the lock. Must be called with the hb lock held.
				878	*
				879	* Return:
				880	* - 1 - success, lock taken;
				881	* - 0 - success, lock not taken;
				882	* - <0 - on error (-EFAULT)
				883	*/
				884	int fixup_pi_owner(u32 __user uaddr, struct futex_q q, int locked)
				885	{
				886	if (locked) {
				887	/*
				888	* Got the lock. We might not be the anticipated owner if we
				889	* did a lock-steal - fix up the PI-state in that case:
				890	*
				891	* Speculative pi_state->owner read (we don't hold wait_lock);
				892	* since we own the lock pi_state->owner == current is the
				893	* stable state, anything else needs more attention.
				894	*/
				895	if (q->pi_state->owner != current)
				896	return fixup_pi_state_owner(uaddr, q, current);
				897	return 1;
				898	}
				899
				900	/*
				901	* If we didn't get the lock; check if anybody stole it from us. In
				902	* that case, we need to fix up the uval to point to them instead of
				903	* us, otherwise bad things happen. [10]
				904	*
				905	* Another speculative read; pi_state->owner == current is unstable
				906	* but needs our attention.
				907	*/
				908	if (q->pi_state->owner == current)
				909	return fixup_pi_state_owner(uaddr, q, NULL);
				910
				911	/*
				912	* Paranoia check. If we did not take the lock, then we should not be
				913	* the owner of the rt_mutex. Warn and establish consistent state.
				914	*/
				915	if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
				916	return fixup_pi_state_owner(uaddr, q, current);
				917
				918	return 0;
				919	}
				920
				921	/*
				922	* Userspace tried a 0 -> TID atomic transition of the futex value
				923	* and failed. The kernel side here does the whole locking operation:
				924	* if there are waiters then it will block as a consequence of relying
				925	* on rt-mutexes, it does PI, etc. (Due to races the kernel might see
				926	* a 0 value of the futex too.).
				927	*
				928	* Also serves as futex trylock_pi()'ing, and due semantics.
				929	*/
				930	int futex_lock_pi(u32 __user uaddr, unsigned int flags, ktime_t time, int trylock)
				931	{
				932	struct hrtimer_sleeper timeout, *to;
				933	struct task_struct *exiting = NULL;
				934	struct rt_mutex_waiter rt_waiter;
				935	struct futex_hash_bucket *hb;
				936	struct futex_q q = futex_q_init;
				937	int res, ret;
				938
				939	if (!IS_ENABLED(CONFIG_FUTEX_PI))
				940	return -ENOSYS;
				941
				942	if (refill_pi_state_cache())
				943	return -ENOMEM;
				944
				945	to = futex_setup_timer(time, &timeout, flags, 0);
				946
				947	retry:
				948	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
				949	if (unlikely(ret != 0))
				950	goto out;
				951
				952	retry_private:
				953	hb = futex_q_lock(&q);
				954
				955	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
				956	&exiting, 0);
				957	if (unlikely(ret)) {
				958	/*
				959	* Atomic work succeeded and we got the lock,
				960	* or failed. Either way, we do _not_ block.
				961	*/
				962	switch (ret) {
				963	case 1:
				964	/* We got the lock. */
				965	ret = 0;
				966	goto out_unlock_put_key;
				967	case -EFAULT:
				968	goto uaddr_faulted;
				969	case -EBUSY:
				970	case -EAGAIN:
				971	/*
				972	* Two reasons for this:
				973	* - EBUSY: Task is exiting and we just wait for the
				974	* exit to complete.
				975	* - EAGAIN: The user space value changed.
				976	*/
				977	futex_q_unlock(hb);
				978	/*
				979	* Handle the case where the owner is in the middle of
				980	* exiting. Wait for the exit to complete otherwise
				981	* this task might loop forever, aka. live lock.
				982	*/
				983	wait_for_owner_exiting(ret, exiting);
				984	cond_resched();
				985	goto retry;
				986	default:
				987	goto out_unlock_put_key;
				988	}
				989	}
				990
				991	WARN_ON(!q.pi_state);
				992
				993	/*
				994	* Only actually queue now that the atomic ops are done:
				995	*/
				996	__futex_queue(&q, hb);
				997
				998	if (trylock) {
				999	ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
				1000	/* Fixup the trylock return value: */
				1001	ret = ret ? 0 : -EWOULDBLOCK;
				1002	goto no_block;
				1003	}
				1004
				1005	rt_mutex_init_waiter(&rt_waiter);
				1006
				1007	/*
				1008	* On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
				1009	* hold it while doing rt_mutex_start_proxy(), because then it will
				1010	* include hb->lock in the blocking chain, even through we'll not in
				1011	* fact hold it while blocking. This will lead it to report -EDEADLK
				1012	* and BUG when futex_unlock_pi() interleaves with this.
				1013	*
				1014	* Therefore acquire wait_lock while holding hb->lock, but drop the
				1015	* latter before calling __rt_mutex_start_proxy_lock(). This
				1016	* interleaves with futex_unlock_pi() -- which does a similar lock
				1017	* handoff -- such that the latter can observe the futex_q::pi_state
				1018	* before __rt_mutex_start_proxy_lock() is done.
				1019	*/
				1020	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
				1021	spin_unlock(q.lock_ptr);
				1022	/*
				1023	* __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
				1024	* such that futex_unlock_pi() is guaranteed to observe the waiter when
				1025	* it sees the futex_q::pi_state.
				1026	*/
				1027	ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
				1028	raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
				1029
				1030	if (ret) {
				1031	if (ret == 1)
				1032	ret = 0;
				1033	goto cleanup;
				1034	}
				1035
				1036	if (unlikely(to))
				1037	hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
				1038
				1039	ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
				1040
				1041	cleanup:
				1042	spin_lock(q.lock_ptr);
				1043	/*
				1044	* If we failed to acquire the lock (deadlock/signal/timeout), we must
				1045	* first acquire the hb->lock before removing the lock from the
				1046	* rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
				1047	* lists consistent.
				1048	*
				1049	* In particular; it is important that futex_unlock_pi() can not
				1050	* observe this inconsistency.
				1051	*/
				1052	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
				1053	ret = 0;
				1054
				1055	no_block:
				1056	/*
				1057	* Fixup the pi_state owner and possibly acquire the lock if we
				1058	* haven't already.
				1059	*/
				1060	res = fixup_pi_owner(uaddr, &q, !ret);
				1061	/*
				1062	* If fixup_pi_owner() returned an error, propagate that. If it acquired
				1063	* the lock, clear our -ETIMEDOUT or -EINTR.
				1064	*/
				1065	if (res)
				1066	ret = (res < 0) ? res : 0;
				1067
				1068	futex_unqueue_pi(&q);
				1069	spin_unlock(q.lock_ptr);
				1070	goto out;
				1071
				1072	out_unlock_put_key:
				1073	futex_q_unlock(hb);
				1074
				1075	out:
				1076	if (to) {
				1077	hrtimer_cancel(&to->timer);
				1078	destroy_hrtimer_on_stack(&to->timer);
				1079	}
				1080	return ret != -EINTR ? ret : -ERESTARTNOINTR;
				1081
				1082	uaddr_faulted:
				1083	futex_q_unlock(hb);
				1084
				1085	ret = fault_in_user_writeable(uaddr);
				1086	if (ret)
				1087	goto out;
				1088
				1089	if (!(flags & FLAGS_SHARED))
				1090	goto retry_private;
				1091
				1092	goto retry;
				1093	}
				1094
				1095	/*
				1096	* Userspace attempted a TID -> 0 atomic transition, and failed.
				1097	* This is the in-kernel slowpath: we look up the PI state (if any),
				1098	* and do the rt-mutex unlock.
				1099	*/
				1100	int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
				1101	{
				1102	u32 curval, uval, vpid = task_pid_vnr(current);
				1103	union futex_key key = FUTEX_KEY_INIT;
				1104	struct futex_hash_bucket *hb;
				1105	struct futex_q *top_waiter;
				1106	int ret;
				1107
				1108	if (!IS_ENABLED(CONFIG_FUTEX_PI))
				1109	return -ENOSYS;
				1110
				1111	retry:
				1112	if (get_user(uval, uaddr))
				1113	return -EFAULT;
				1114	/*
				1115	* We release only a lock we actually own:
				1116	*/
				1117	if ((uval & FUTEX_TID_MASK) != vpid)
				1118	return -EPERM;
				1119
				1120	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
				1121	if (ret)
				1122	return ret;
				1123
				1124	hb = futex_hash(&key);
				1125	spin_lock(&hb->lock);
				1126
				1127	/*
				1128	* Check waiters first. We do not trust user space values at
				1129	* all and we at least want to know if user space fiddled
				1130	* with the futex value instead of blindly unlocking.
				1131	*/
				1132	top_waiter = futex_top_waiter(hb, &key);
				1133	if (top_waiter) {
				1134	struct futex_pi_state *pi_state = top_waiter->pi_state;
				1135
				1136	ret = -EINVAL;
				1137	if (!pi_state)
				1138	goto out_unlock;
				1139
				1140	/*
				1141	* If current does not own the pi_state then the futex is
				1142	* inconsistent and user space fiddled with the futex value.
				1143	*/
				1144	if (pi_state->owner != current)
				1145	goto out_unlock;
				1146
				1147	get_pi_state(pi_state);
				1148	/*
				1149	* By taking wait_lock while still holding hb->lock, we ensure
				1150	* there is no point where we hold neither; and therefore
				1151	* wake_futex_p() must observe a state consistent with what we
				1152	* observed.
				1153	*
				1154	* In particular; this forces __rt_mutex_start_proxy() to
				1155	* complete such that we're guaranteed to observe the
				1156	* rt_waiter. Also see the WARN in wake_futex_pi().
				1157	*/
				1158	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
				1159	spin_unlock(&hb->lock);
				1160
				1161	/* drops pi_state->pi_mutex.wait_lock */
				1162	ret = wake_futex_pi(uaddr, uval, pi_state);
				1163
				1164	put_pi_state(pi_state);
				1165
				1166	/*
				1167	* Success, we're done! No tricky corner cases.
				1168	*/
				1169	if (!ret)
				1170	return ret;
				1171	/*
				1172	* The atomic access to the futex value generated a
				1173	* pagefault, so retry the user-access and the wakeup:
				1174	*/
				1175	if (ret == -EFAULT)
				1176	goto pi_faulted;
				1177	/*
				1178	* A unconditional UNLOCK_PI op raced against a waiter
				1179	* setting the FUTEX_WAITERS bit. Try again.
				1180	*/
				1181	if (ret == -EAGAIN)
				1182	goto pi_retry;
				1183	/*
				1184	* wake_futex_pi has detected invalid state. Tell user
				1185	* space.
				1186	*/
				1187	return ret;
				1188	}
				1189
				1190	/*
				1191	* We have no kernel internal state, i.e. no waiters in the
				1192	* kernel. Waiters which are about to queue themselves are stuck
				1193	* on hb->lock. So we can safely ignore them. We do neither
				1194	* preserve the WAITERS bit not the OWNER_DIED one. We are the
				1195	* owner.
				1196	*/
				1197	if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
				1198	spin_unlock(&hb->lock);
				1199	switch (ret) {
				1200	case -EFAULT:
				1201	goto pi_faulted;
				1202
				1203	case -EAGAIN:
				1204	goto pi_retry;
				1205
				1206	default:
				1207	WARN_ON_ONCE(1);
				1208	return ret;
				1209	}
				1210	}
				1211
				1212	/*
				1213	* If uval has changed, let user space handle it.
				1214	*/
				1215	ret = (curval == uval) ? 0 : -EAGAIN;
				1216
				1217	out_unlock:
				1218	spin_unlock(&hb->lock);
				1219	return ret;
				1220
				1221	pi_retry:
				1222	cond_resched();
				1223	goto retry;
				1224
				1225	pi_faulted:
				1226
				1227	ret = fault_in_user_writeable(uaddr);
				1228	if (!ret)
				1229	goto retry;
				1230
				1231	return ret;
				1232	}
				1233