// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Fast Userspace Mutexes (which I call "Futexes!").
 * (C) Rusty Russell, IBM 2002
 *
 * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
 * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
 *
 * Removed page pinning, fix privately mapped COW pages and other cleanups
 * (C) Copyright 2003, 2004 Jamie Lokier
 *
 * Robust futex support started by Ingo Molnar
 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
 *
 * PI-futex support started by Ingo Molnar and Thomas Gleixner
 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
 *
 * PRIVATE futexes by Eric Dumazet
 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
 *
 * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
 * Copyright (C) IBM Corporation, 2009
 * Thanks to Thomas Gleixner for conceptual design and careful reviews.
 *
 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
 * enough at me, Linus for the original (flawed) idea, Matthew
 * Kirkwood for proof-of-concept implementation.
 *
 * "The futexes are also cursed."
 * "But they come in a choice of three flavours!"
 */
#include <linux/compat.h>
#include <linux/jhash.h>
#include <linux/pagemap.h>
#include <linux/memblock.h>
#include <linux/fault-inject.h>
#include <linux/slab.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"

/*
 * The base of the bucket array and its size are always used together
 * (after initialization only in futex_hash()), so ensure that they
 * reside in the same cacheline.
 */
static struct {
	struct futex_hash_bucket *queues;
	unsigned long		hashsize;
} __futex_data __read_mostly __aligned(2*sizeof(long));
#define futex_queues	(__futex_data.queues)
#define futex_hashsize	(__futex_data.hashsize)


/*
 * Fault injections for futexes.
 */
#ifdef CONFIG_FAIL_FUTEX

static struct {
	struct fault_attr attr;

	bool ignore_private;
} fail_futex = {
	.attr = FAULT_ATTR_INITIALIZER,
	.ignore_private = false,
};

static int __init setup_fail_futex(char *str)
{
	return setup_fault_attr(&fail_futex.attr, str);
}
__setup("fail_futex=", setup_fail_futex);

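/*
 * should_fail_futex() - decide whether to inject a fault for this futex op.
 * @fshared:	true for a PROCESS_SHARED futex, false for PROCESS_PRIVATE
 *
 * Returns true when the configured fault-injection attributes say the
 * current operation should fail. Private futexes are exempted when the
 * "ignore-private" debugfs knob is set.
 */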
bool should_fail_futex(bool fshared)
{
	if (fail_futex.ignore_private && !fshared)
		return false;

	return should_fail(&fail_futex.attr, 1);
}

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

static int __init fail_futex_debugfs(void)
{
	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
	struct dentry *dir;

	dir = fault_create_debugfs_attr("fail_futex", NULL,
					&fail_futex.attr);
	if (IS_ERR(dir))
		return PTR_ERR(dir);

	debugfs_create_bool("ignore-private", mode, dir,
			    &fail_futex.ignore_private);
	return 0;
}

late_initcall(fail_futex_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

#endif /* CONFIG_FAIL_FUTEX */

/**
 * futex_hash - Return the hash bucket in the global hash
 * @key:	Pointer to the futex key for which the hash is calculated
 *
 * We hash on the keys returned from get_futex_key (see below) and return the
 * corresponding hash bucket in the global hash.
 */
struct futex_hash_bucket *futex_hash(union futex_key *key)
{
	u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
			  key->both.offset);

	return &futex_queues[hash & (futex_hashsize - 1)];
}


/**
 * futex_setup_timer - set up the sleeping hrtimer.
 * @time:	ptr to the given timeout value
 * @timeout:	the hrtimer_sleeper structure to be set up
 * @flags:	futex flags
 * @range_ns:	optional range in ns
 *
 * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
 *	   value given
 */
struct hrtimer_sleeper *
futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
		  int flags, u64 range_ns)
{
	if (!time)
		return NULL;

	hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
				      CLOCK_REALTIME : CLOCK_MONOTONIC,
				      HRTIMER_MODE_ABS);
	/*
	 * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
	 * effectively the same as calling hrtimer_set_expires().
	 */
	hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);

	return timeout;
}

/*
 * Generate a machine wide unique identifier for this inode.
 *
 * This relies on u64 not wrapping in the life-time of the machine; which with
 * 1ns resolution means almost 585 years.
 *
 * This further relies on the fact that a well formed program will not unmap
 * the file while it has a (shared) futex waiting on it. This mapping will have
 * a file reference which pins the mount and inode.
 *
 * If for some reason an inode gets evicted and read back in again, it will get
 * a new sequence number and will _NOT_ match, even though it is the exact same
 * file.
 *
 * It is important that futex_match() will never have a false-positive, esp.
 * for PI futexes that can mess up the state. The above argues that false-negatives
 * are only possible for malformed programs.
 */
static u64 get_inode_sequence_number(struct inode *inode)
{
	static atomic64_t i_seq;
	u64 old;

	/* Does the inode already have a sequence number? */
	old = atomic64_read(&inode->i_sequence);
	if (likely(old))
		return old;

	for (;;) {
		u64 new = atomic64_add_return(1, &i_seq);
		if (WARN_ON_ONCE(!new))
			continue;

		old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
		if (old)
			return old;
		return new;
	}
}

/**
 * get_futex_key() - Get parameters which are the keys for a futex
 * @uaddr:	virtual address of the futex
 * @fshared:	false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
 * @key:	address where result is stored.
 * @rw:		mapping needs to be read/write (values: FUTEX_READ,
 *		FUTEX_WRITE)
 *
 * Return: a negative error code or 0
 *
 * The key words are stored in @key on success.
 *
 * For shared mappings (when @fshared), the key is:
 *
 *   ( inode->i_sequence, page->index, offset_within_page )
 *
 * [ also see get_inode_sequence_number() ]
 *
 * For private mappings (or when !@fshared), the key is:
 *
 *   ( current->mm, address, 0 )
 *
 * This allows (cross process, where applicable) identification of the futex
 * without keeping the page pinned for the duration of the FUTEX_WAIT.
 *
 * lock_page() might sleep, the caller should not hold a spinlock.
 */
int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
		  enum futex_access rw)
{
	unsigned long address = (unsigned long)uaddr;
	struct mm_struct *mm = current->mm;
	struct page *page, *tail;
	struct address_space *mapping;
	int err, ro = 0;

	/*
	 * The futex address must be "naturally" aligned.
	 */
	key->both.offset = address % PAGE_SIZE;
	if (unlikely((address % sizeof(u32)) != 0))
		return -EINVAL;
	address -= key->both.offset;

	if (unlikely(!access_ok(uaddr, sizeof(u32))))
		return -EFAULT;

	if (unlikely(should_fail_futex(fshared)))
		return -EFAULT;

	/*
	 * PROCESS_PRIVATE futexes are fast.
	 * As the mm cannot disappear under us and the 'key' only needs the
	 * virtual address, we don't even have to find the underlying vma.
	 * Note: We do have to check that 'uaddr' is a valid user address,
	 * but access_ok() should be faster than find_vma().
	 */
	if (!fshared) {
		key->private.mm = mm;
		key->private.address = address;
		return 0;
	}

again:
	/* Ignore any VERIFY_READ mapping (futex common case) */
	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
	/*
	 * If write access is not required (eg. FUTEX_WAIT), try
	 * and get read-only access.
	 */
	if (err == -EFAULT && rw == FUTEX_READ) {
		err = get_user_pages_fast(address, 1, 0, &page);
		ro = 1;
	}
	if (err < 0)
		return err;
	else
		err = 0;

	/*
	 * The treatment of mapping from this point on is critical. The page
	 * lock protects many things but in this context the page lock
	 * stabilizes mapping, prevents inode freeing in the shared
	 * file-backed region case and guards against movement to swap cache.
	 *
	 * Strictly speaking the page lock is not needed in all cases being
	 * considered here and the page lock forces unnecessary serialization.
	 * From this point on, mapping will be re-verified if necessary and
	 * the page lock will be acquired only if it is unavoidable.
	 *
	 * Mapping checks require the head page for any compound page so the
	 * head page and mapping are looked up now. For anonymous pages, it
	 * does not matter if the page splits in the future as the key is
	 * based on the address. For filesystem-backed pages, the tail is
	 * required as the index of the page determines the key. For
	 * base pages, there is no tail page and tail == page.
	 */
	tail = page;
	page = compound_head(page);
	mapping = READ_ONCE(page->mapping);

	/*
	 * If page->mapping is NULL, then it cannot be a PageAnon
	 * page; but it might be the ZERO_PAGE or in the gate area or
	 * in a special mapping (all cases which we are happy to fail);
	 * or it may have been a good file page when get_user_pages_fast
	 * found it, but truncated or holepunched or subjected to
	 * invalidate_complete_page2 before we got the page lock (also
	 * cases which we are happy to fail). And we hold a reference,
	 * so refcount care in invalidate_complete_page's remove_mapping
	 * prevents drop_caches from setting mapping to NULL beneath us.
	 *
	 * The case we do have to guard against is when memory pressure made
	 * shmem_writepage move it from filecache to swapcache beneath us:
	 * an unlikely race, but we do need to retry for page->mapping.
	 */
	if (unlikely(!mapping)) {
		int shmem_swizzled;

		/*
		 * Page lock is required to identify which special case above
		 * applies. If this is really a shmem page then the page lock
		 * will prevent unexpected transitions.
		 */
		lock_page(page);
		shmem_swizzled = PageSwapCache(page) || page->mapping;
		unlock_page(page);
		put_page(page);

		if (shmem_swizzled)
			goto again;

		return -EFAULT;
	}

	/*
	 * Private mappings are handled in a simple way.
	 *
	 * If the futex key is stored on an anonymous page, then the associated
	 * object is the mm which is implicitly pinned by the calling process.
	 *
	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
	 * it's a read-only handle, it's expected that futexes attach to
	 * the object not the particular process.
	 */
	if (PageAnon(page)) {
		/*
		 * A RO anonymous page will never change and thus doesn't make
		 * sense for futex operations.
		 */
		if (unlikely(should_fail_futex(true)) || ro) {
			err = -EFAULT;
			goto out;
		}

		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
		key->private.mm = mm;
		key->private.address = address;

	} else {
		struct inode *inode;

		/*
		 * The associated futex object in this case is the inode and
		 * the page->mapping must be traversed. Ordinarily this should
		 * be stabilised under page lock but it's not strictly
		 * necessary in this case as we just want to pin the inode, not
		 * update the radix tree or anything like that.
		 *
		 * The RCU read lock is taken as the inode is finally freed
		 * under RCU. If the mapping still matches expectations then the
		 * mapping->host can be safely accessed as being a valid inode.
		 */
		rcu_read_lock();

		if (READ_ONCE(page->mapping) != mapping) {
			rcu_read_unlock();
			put_page(page);

			goto again;
		}

		inode = READ_ONCE(mapping->host);
		if (!inode) {
			rcu_read_unlock();
			put_page(page);

			goto again;
		}

		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
		key->shared.i_seq = get_inode_sequence_number(inode);
		key->shared.pgoff = page_to_pgoff(tail);
		rcu_read_unlock();
	}

out:
	put_page(page);
	return err;
}

/**
 * fault_in_user_writeable() - Fault in user address and verify RW access
 * @uaddr:	pointer to faulting user space address
 *
 * Slow path to fixup the fault we just took in the atomic write
 * access to @uaddr.
 *
 * We have no generic implementation of a non-destructive write to the
 * user address. We know that we faulted in the atomic pagefault
 * disabled section so we can as well avoid the #PF overhead by
 * calling get_user_pages() right away.
 */
int fault_in_user_writeable(u32 __user *uaddr)
{
	struct mm_struct *mm = current->mm;
	int ret;

	mmap_read_lock(mm);
	ret = fixup_user_fault(mm, (unsigned long)uaddr,
			       FAULT_FLAG_WRITE, NULL);
	mmap_read_unlock(mm);

	return ret < 0 ? ret : 0;
}

/**
 * futex_top_waiter() - Return the highest priority waiter on a futex
 * @hb:		the hash bucket the futex_q's reside in
 * @key:	the futex key (to distinguish it from other futex futex_q's)
 *
 * Must be called with the hb lock held.
 */
struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key)
{
	struct futex_q *this;

	plist_for_each_entry(this, &hb->chain, list) {
		if (futex_match(&this->key, key))
			return this;
	}
	return NULL;
}

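/*
 * futex_cmpxchg_value_locked() - cmpxchg a futex value with pagefaults disabled
 * @curval:	stores the current futex value on success
 * @uaddr:	user space address of the futex
 * @uval:	expected value
 * @newval:	value to store if *uaddr == @uval
 *
 * Wrapper around futex_atomic_cmpxchg_inatomic() that disables page faults,
 * so it can be used in atomic (hash bucket lock held) context. Returns 0 or
 * a negative error code (e.g. -EFAULT) from the atomic operation.
 */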
int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval)
{
	int ret;

	pagefault_disable();
	ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
	pagefault_enable();

	return ret;
}

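/*
 * futex_get_value_locked() - fetch a futex value with pagefaults disabled
 * @dest:	kernel destination for the value
 * @from:	user space address of the futex
 *
 * Reads the user space value without risking a sleeping page fault, which
 * makes it safe while holding hash bucket locks. Returns 0 on success or
 * -EFAULT if the access would have to fault.
 */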
int futex_get_value_locked(u32 *dest, u32 __user *from)
{
	int ret;

	pagefault_disable();
	ret = __get_user(*dest, from);
	pagefault_enable();

	return ret ? -EFAULT : 0;
}

/**
 * wait_for_owner_exiting - Block until the owner has exited
 * @ret:	owner's current futex lock status
 * @exiting:	Pointer to the exiting task
 *
 * Caller must hold a refcount on @exiting.
 */
void wait_for_owner_exiting(int ret, struct task_struct *exiting)
{
	if (ret != -EBUSY) {
		WARN_ON_ONCE(exiting);
		return;
	}

	if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
		return;

	mutex_lock(&exiting->futex_exit_mutex);
	/*
	 * No point in doing state checking here. If the waiter got here
	 * while the task was in exec()->exec_futex_release() then it can
	 * have any FUTEX_STATE_* value when the waiter has acquired the
	 * mutex. OK, if running, EXITING or DEAD if it reached exit()
	 * already. Highly unlikely and not a problem. Just one more round
	 * through the futex maze.
	 */
	mutex_unlock(&exiting->futex_exit_mutex);

	put_task_struct(exiting);
}

/**
 * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket
 * @q:	The futex_q to unqueue
 *
 * The q->lock_ptr must not be NULL and must be held by the caller.
 */
void __futex_unqueue(struct futex_q *q)
{
	struct futex_hash_bucket *hb;

	if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
		return;
	lockdep_assert_held(q->lock_ptr);

	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
	plist_del(&q->list, &hb->chain);
	futex_hb_waiters_dec(hb);
}

/* The key must be already stored in q->key. */
struct futex_hash_bucket *futex_q_lock(struct futex_q *q)
	__acquires(&hb->lock)
{
	struct futex_hash_bucket *hb;

	hb = futex_hash(&q->key);

	/*
	 * Increment the counter before taking the lock so that
	 * a potential waker won't miss a to-be-slept task that is
	 * waiting for the spinlock. This is safe as all futex_q_lock()
	 * users end up calling futex_queue(). Similarly, for housekeeping,
	 * decrement the counter at futex_q_unlock() when some error has
	 * occurred and we don't end up adding the task to the list.
	 */
	futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */

	q->lock_ptr = &hb->lock;

	spin_lock(&hb->lock);
	return hb;
}

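/*
 * futex_q_unlock() - drop the hash bucket lock taken by futex_q_lock()
 * @hb:	the hash bucket which was locked
 *
 * Used on the paths where the futex_q never gets queued: drops the bucket
 * lock and reverts the waiter count increment done in futex_q_lock().
 */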
void futex_q_unlock(struct futex_hash_bucket *hb)
	__releases(&hb->lock)
{
	spin_unlock(&hb->lock);
	futex_hb_waiters_dec(hb);
}

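/*
 * __futex_queue() - enqueue the futex_q on its hash bucket's plist
 * @q:	The futex_q to enqueue
 * @hb:	The destination hash bucket
 *
 * The hb->lock must be held by the caller. The entry is sorted into the
 * bucket's plist with the priority computed below.
 */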
void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
{
	int prio;

	/*
	 * The priority used to register this element is
	 * - either the real thread-priority for the real-time threads
	 * (i.e. threads with a priority lower than MAX_RT_PRIO)
	 * - or MAX_RT_PRIO for non-RT threads.
	 * Thus, all RT-threads are woken first in priority order, and
	 * the others are woken last, in FIFO order.
	 */
	prio = min(current->normal_prio, MAX_RT_PRIO);

	plist_node_init(&q->list, prio);
	plist_add(&q->list, &hb->chain);
	q->task = current;
}

/**
 * futex_unqueue() - Remove the futex_q from its futex_hash_bucket
 * @q:	The futex_q to unqueue
 *
 * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must
 * be paired with exactly one earlier call to futex_queue().
 *
 * Return:
 *  - 1 - if the futex_q was still queued (and we removed it);
 *  - 0 - if the futex_q was already removed by the waking thread
int futex_unqueue(struct futex_q *q)
{
	spinlock_t *lock_ptr;
	int ret = 0;

	/* In the common case we don't take the spinlock, which is nice. */
retry:
	/*
	 * q->lock_ptr can change between this read and the following spin_lock.
	 * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
	 * optimizing lock_ptr out of the logic below.
	 */
	lock_ptr = READ_ONCE(q->lock_ptr);
	if (lock_ptr != NULL) {
		spin_lock(lock_ptr);
		/*
		 * q->lock_ptr can change between reading it and
		 * spin_lock(), causing us to take the wrong lock. This
		 * corrects the race condition.
		 *
		 * Reasoning goes like this: if we have the wrong lock,
		 * q->lock_ptr must have changed (maybe several times)
		 * between reading it and the spin_lock(). It can
		 * change again after the spin_lock() but only if it was
		 * already changed before the spin_lock(). It cannot,
		 * however, change back to the original value. Therefore
		 * we can detect whether we acquired the correct lock.
		 */
		if (unlikely(lock_ptr != q->lock_ptr)) {
			spin_unlock(lock_ptr);
			goto retry;
		}
		__futex_unqueue(q);

		BUG_ON(q->pi_state);

		spin_unlock(lock_ptr);
		ret = 1;
	}

	return ret;
}

/*
 * PI futexes cannot be requeued and must remove themselves from the
 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held.
 */
void futex_unqueue_pi(struct futex_q *q)
{
	__futex_unqueue(q);

	BUG_ON(!q->pi_state);
	put_pi_state(q->pi_state);
	q->pi_state = NULL;
}

/* Constants for the pending_op argument of handle_futex_death */
#define HANDLE_DEATH_PENDING	true
#define HANDLE_DEATH_LIST	false

/*
 * Process a futex-list entry, check whether it's owned by the
 * dying task, and do notification if so:
 */
static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
			      bool pi, bool pending_op)
{
	u32 uval, nval, mval;
	int err;

	/* Futex address must be 32bit aligned */
	if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
		return -1;

retry:
	if (get_user(uval, uaddr))
		return -1;

	/*
	 * Special case for regular (non PI) futexes. The unlock path in
	 * user space has two race scenarios:
	 *
	 * 1. The unlock path releases the user space futex value and
	 *    before it can execute the futex() syscall to wake up
	 *    waiters it is killed.
	 *
	 * 2. A woken up waiter is killed before it can acquire the
	 *    futex in user space.
	 *
	 * In both cases the TID validation below prevents a wakeup of
	 * potential waiters which can cause these waiters to block
	 * forever.
	 *
	 * In both cases the following conditions are met:
	 *
	 *	1) task->robust_list->list_op_pending != NULL
	 *	   @pending_op == true
	 *	2) User space futex value == 0
	 *	3) Regular futex: @pi == false
	 *
	 * If these conditions are met, it is safe to attempt waking up a
	 * potential waiter without touching the user space futex value and
	 * trying to set the OWNER_DIED bit. The user space futex value is
	 * uncontended and the rest of the user space mutex state is
	 * consistent, so a woken waiter will just take over the
	 * uncontended futex. Setting the OWNER_DIED bit would create
	 * inconsistent state and malfunction of the user space owner died
	 * handling.
	 */
	if (pending_op && !pi && !uval) {
		futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
		return 0;
	}

	if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
		return 0;

	/*
	 * Ok, this dying thread is truly holding a futex
	 * of interest. Set the OWNER_DIED bit atomically
	 * via cmpxchg, and if the value had FUTEX_WAITERS
	 * set, wake up a waiter (if any). (We have to do a
	 * futex_wake() even if OWNER_DIED is already set -
	 * to handle the rare but possible case of recursive
	 * thread-death.) The rest of the cleanup is done in
	 * userspace.
	 */
	mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;

	/*
	 * We are not holding a lock here, but we want to have
	 * the pagefault_disable/enable() protection because
	 * we want to handle the fault gracefully. If the
	 * access fails we try to fault in the futex with R/W
	 * verification via get_user_pages. get_user() above
	 * does not guarantee R/W access. If that fails we
	 * give up and leave the futex locked.
	 */
	if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) {
		switch (err) {
		case -EFAULT:
			if (fault_in_user_writeable(uaddr))
				return -1;
			goto retry;

		case -EAGAIN:
			cond_resched();
			goto retry;

		default:
			WARN_ON_ONCE(1);
			return err;
		}
	}

	if (nval != uval)
		goto retry;

	/*
	 * Wake robust non-PI futexes here. The wakeup of
	 * PI futexes happens in exit_pi_state():
	 */
	if (!pi && (uval & FUTEX_WAITERS))
		futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);

	return 0;
}

/*
 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
 */
static inline int fetch_robust_entry(struct robust_list __user **entry,
				     struct robust_list __user * __user *head,
				     unsigned int *pi)
{
	unsigned long uentry;

	if (get_user(uentry, (unsigned long __user *)head))
		return -EFAULT;

	*entry = (void __user *)(uentry & ~1UL);
	*pi = uentry & 1;

	return 0;
}

/*
 * Walk curr->robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
 *
 * We silently return on any sign of list-walking problem.
 */
static void exit_robust_list(struct task_struct *curr)
{
	struct robust_list_head __user *head = curr->robust_list;
	struct robust_list __user *entry, *next_entry, *pending;
	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
	unsigned int next_pi;
	unsigned long futex_offset;
	int rc;

	/*
	 * Fetch the list head (which was registered earlier, via
	 * sys_set_robust_list()):
	 */
	if (fetch_robust_entry(&entry, &head->list.next, &pi))
		return;
	/*
	 * Fetch the relative futex offset:
	 */
	if (get_user(futex_offset, &head->futex_offset))
		return;
	/*
	 * Fetch any possibly pending lock-add first, and handle it
	 * if it exists:
	 */
	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
		return;

	next_entry = NULL;	/* avoid warning with gcc */
	while (entry != &head->list) {
		/*
		 * Fetch the next entry in the list before calling
		 * handle_futex_death:
		 */
		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice:
		 */
		if (entry != pending) {
			if (handle_futex_death((void __user *)entry + futex_offset,
					       curr, pi, HANDLE_DEATH_LIST))
				return;
		}
		if (rc)
			return;
		entry = next_entry;
		pi = next_pi;
		/*
		 * Avoid excessively long or circular lists:
		 */
		if (!--limit)
			break;

		cond_resched();
	}

	if (pending) {
		handle_futex_death((void __user *)pending + futex_offset,
				   curr, pip, HANDLE_DEATH_PENDING);
	}
}

#ifdef CONFIG_COMPAT
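/*
 * futex_uaddr() - compute the user address of a compat robust list entry.
 * The 32-bit entry pointer plus the (signed) futex offset gives the futex
 * word belonging to this list entry.
 */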
static void __user *futex_uaddr(struct robust_list __user *entry,
				compat_long_t futex_offset)
{
	compat_uptr_t base = ptr_to_compat(entry);
	void __user *uaddr = compat_ptr(base + futex_offset);

	return uaddr;
}

/*
 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
 */
static inline int
compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
			  compat_uptr_t __user *head, unsigned int *pi)
{
	if (get_user(*uentry, head))
		return -EFAULT;

	*entry = compat_ptr((*uentry) & ~1);
	*pi = (unsigned int)(*uentry) & 1;

	return 0;
}

/*
 * Walk curr->robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
 *
 * We silently return on any sign of list-walking problem.
 */
static void compat_exit_robust_list(struct task_struct *curr)
{
	struct compat_robust_list_head __user *head = curr->compat_robust_list;
	struct robust_list __user *entry, *next_entry, *pending;
	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
	unsigned int next_pi;
	compat_uptr_t uentry, next_uentry, upending;
	compat_long_t futex_offset;
	int rc;

	/*
	 * Fetch the list head (which was registered earlier, via
	 * sys_set_robust_list()):
	 */
	if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
		return;
	/*
	 * Fetch the relative futex offset:
	 */
	if (get_user(futex_offset, &head->futex_offset))
		return;
	/*
	 * Fetch any possibly pending lock-add first, and handle it
	 * if it exists:
	 */
	if (compat_fetch_robust_entry(&upending, &pending,
				      &head->list_op_pending, &pip))
		return;

	next_entry = NULL;	/* avoid warning with gcc */
	while (entry != (struct robust_list __user *) &head->list) {
		/*
		 * Fetch the next entry in the list before calling
		 * handle_futex_death:
		 */
		rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
					       (compat_uptr_t __user *)&entry->next, &next_pi);
		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice:
		 */
		if (entry != pending) {
			void __user *uaddr = futex_uaddr(entry, futex_offset);

			if (handle_futex_death(uaddr, curr, pi,
					       HANDLE_DEATH_LIST))
				return;
		}
		if (rc)
			return;
		uentry = next_uentry;
		entry = next_entry;
		pi = next_pi;
		/*
		 * Avoid excessively long or circular lists:
		 */
		if (!--limit)
			break;

		cond_resched();
	}
	if (pending) {
		void __user *uaddr = futex_uaddr(pending, futex_offset);

		handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
	}
}
#endif

#ifdef CONFIG_FUTEX_PI

/*
 * This task is holding PI mutexes at exit time => bad.
 * Kernel cleans up PI-state, but userspace is likely hosed.
 * (Robust-futex cleanup is separate and might save the day for userspace.)
 */
static void exit_pi_state_list(struct task_struct *curr)
{
	struct list_head *next, *head = &curr->pi_state_list;
	struct futex_pi_state *pi_state;
	struct futex_hash_bucket *hb;
	union futex_key key = FUTEX_KEY_INIT;

	/*
	 * We are a ZOMBIE and nobody can enqueue itself on
	 * pi_state_list anymore, but we have to be careful
	 * versus waiters unqueueing themselves:
	 */
	raw_spin_lock_irq(&curr->pi_lock);
	while (!list_empty(head)) {
		next = head->next;
		pi_state = list_entry(next, struct futex_pi_state, list);
		key = pi_state->key;
		hb = futex_hash(&key);

| 954 | /* |
| 955 | * We can race against put_pi_state() removing itself from the |
| 956 | * list (a waiter going away). put_pi_state() will first |
| 957 | * decrement the reference count and then modify the list, so |
 | 958 | 		 * it's possible to see the list entry but fail this reference |
| 959 | * acquire. |
| 960 | * |
| 961 | * In that case; drop the locks to let put_pi_state() make |
| 962 | * progress and retry the loop. |
| 963 | */ |
| 964 | if (!refcount_inc_not_zero(&pi_state->refcount)) { |
| 965 | raw_spin_unlock_irq(&curr->pi_lock); |
| 966 | cpu_relax(); |
| 967 | raw_spin_lock_irq(&curr->pi_lock); |
| 968 | continue; |
| 969 | } |
| 970 | raw_spin_unlock_irq(&curr->pi_lock); |
| 971 | |
| 972 | spin_lock(&hb->lock); |
| 973 | raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); |
| 974 | raw_spin_lock(&curr->pi_lock); |
| 975 | /* |
| 976 | * We dropped the pi-lock, so re-check whether this |
| 977 | * task still owns the PI-state: |
| 978 | */ |
| 979 | if (head->next != next) { |
| 980 | /* retain curr->pi_lock for the loop invariant */ |
| 981 | raw_spin_unlock(&pi_state->pi_mutex.wait_lock); |
| 982 | spin_unlock(&hb->lock); |
| 983 | put_pi_state(pi_state); |
| 984 | continue; |
| 985 | } |
| 986 | |
| 987 | WARN_ON(pi_state->owner != curr); |
| 988 | WARN_ON(list_empty(&pi_state->list)); |
| 989 | list_del_init(&pi_state->list); |
| 990 | pi_state->owner = NULL; |
| 991 | |
| 992 | raw_spin_unlock(&curr->pi_lock); |
| 993 | raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); |
| 994 | spin_unlock(&hb->lock); |
| 995 | |
| 996 | rt_mutex_futex_unlock(&pi_state->pi_mutex); |
| 997 | put_pi_state(pi_state); |
| 998 | |
| 999 | raw_spin_lock_irq(&curr->pi_lock); |
| 1000 | } |
| 1001 | raw_spin_unlock_irq(&curr->pi_lock); |
| 1002 | } |
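/*
 * Illustrative helper (made-up name, not in the kernel) showing the
 * reference pattern used in the loop above: refcount_inc_not_zero() only
 * succeeds while the count is non-zero, so if a racing put_pi_state()
 * already dropped the last reference the exit path backs off, lets the
 * release finish, and retries the list walk.
 */
static struct futex_pi_state *example_get_live_pi_state(struct futex_pi_state *pi_state)
{
	/* NULL means the object is already on its way to being freed. */
	return refcount_inc_not_zero(&pi_state->refcount) ? pi_state : NULL;
}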
| 1003 | #else |
| 1004 | static inline void exit_pi_state_list(struct task_struct *curr) { } |
| 1005 | #endif |
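/*
 * Illustrative userspace sketch (not part of this file): what the robust
 * and PI exit cleanup above means for applications. When the owner of a
 * robust mutex dies, handle_futex_death() sets FUTEX_OWNER_DIED in the
 * lock word and wakes a waiter; that waiter then gets EOWNERDEAD from
 * pthread_mutex_lock() and can repair the protected state. With
 * PTHREAD_PRIO_INHERIT the kernel-side pi_state torn down in
 * exit_pi_state_list() backs the same lock.
 */
#include <pthread.h>
#include <errno.h>

static pthread_mutex_t example_lock;

static void example_init_robust_pi_mutex(void)
{
	pthread_mutexattr_t attr;

	pthread_mutexattr_init(&attr);
	pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST);
	pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
	pthread_mutex_init(&example_lock, &attr);
	pthread_mutexattr_destroy(&attr);
}

static int example_lock_handling_dead_owner(void)
{
	int ret = pthread_mutex_lock(&example_lock);

	if (ret == EOWNERDEAD) {
		/* The previous owner died holding the lock: repair the protected
		 * data here, then mark the mutex consistent and usable again. */
		pthread_mutex_consistent(&example_lock);
		ret = 0;
	}
	return ret;
}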
| 1006 | |
Thomas Gleixner | af8cbda | 2019-11-06 22:55:43 +0100 | [diff] [blame] | 1007 | static void futex_cleanup(struct task_struct *tsk) |
Thomas Gleixner | ba31c1a4 | 2019-11-06 22:55:36 +0100 | [diff] [blame] | 1008 | { |
| 1009 | if (unlikely(tsk->robust_list)) { |
| 1010 | exit_robust_list(tsk); |
| 1011 | tsk->robust_list = NULL; |
| 1012 | } |
| 1013 | |
| 1014 | #ifdef CONFIG_COMPAT |
| 1015 | if (unlikely(tsk->compat_robust_list)) { |
| 1016 | compat_exit_robust_list(tsk); |
| 1017 | tsk->compat_robust_list = NULL; |
| 1018 | } |
| 1019 | #endif |
| 1020 | |
| 1021 | if (unlikely(!list_empty(&tsk->pi_state_list))) |
| 1022 | exit_pi_state_list(tsk); |
| 1023 | } |
| 1024 | |
Thomas Gleixner | 18f6943 | 2019-11-06 22:55:41 +0100 | [diff] [blame] | 1025 | /** |
 | 1026 |  * futex_exit_recursive - Set the task's futex state to FUTEX_STATE_DEAD |
| 1027 | * @tsk: task to set the state on |
| 1028 | * |
 | 1029 |  * Set the futex exit state of the task locklessly. The futex waiter code |
| 1030 | * observes that state when a task is exiting and loops until the task has |
| 1031 | * actually finished the futex cleanup. The worst case for this is that the |
| 1032 | * waiter runs through the wait loop until the state becomes visible. |
| 1033 | * |
Eric W. Biederman | 05ea042 | 2021-11-22 09:33:00 -0600 | [diff] [blame] | 1034 | * This is called from the recursive fault handling path in make_task_dead(). |
Thomas Gleixner | 18f6943 | 2019-11-06 22:55:41 +0100 | [diff] [blame] | 1035 | * |
| 1036 | * This is best effort. Either the futex exit code has run already or |
| 1037 | * not. If the OWNER_DIED bit has been set on the futex then the waiter can |
| 1038 | * take it over. If not, the problem is pushed back to user space. If the |
| 1039 | * futex exit code did not run yet, then an already queued waiter might |
| 1040 | * block forever, but there is nothing which can be done about that. |
| 1041 | */ |
| 1042 | void futex_exit_recursive(struct task_struct *tsk) |
| 1043 | { |
Thomas Gleixner | 3f186d9 | 2019-11-06 22:55:44 +0100 | [diff] [blame] | 1044 | /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ |
| 1045 | if (tsk->futex_state == FUTEX_STATE_EXITING) |
| 1046 | mutex_unlock(&tsk->futex_exit_mutex); |
Thomas Gleixner | 18f6943 | 2019-11-06 22:55:41 +0100 | [diff] [blame] | 1047 | tsk->futex_state = FUTEX_STATE_DEAD; |
| 1048 | } |
| 1049 | |
Thomas Gleixner | af8cbda | 2019-11-06 22:55:43 +0100 | [diff] [blame] | 1050 | static void futex_cleanup_begin(struct task_struct *tsk) |
Thomas Gleixner | 150d715 | 2019-11-06 22:55:39 +0100 | [diff] [blame] | 1051 | { |
Thomas Gleixner | 18f6943 | 2019-11-06 22:55:41 +0100 | [diff] [blame] | 1052 | /* |
Thomas Gleixner | 3f186d9 | 2019-11-06 22:55:44 +0100 | [diff] [blame] | 1053 | 	 * Prevent various races against a concurrent incoming waiter, |
 | 1054 | 	 * including live locks, by forcing the waiter to block on |
| 1055 | * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in |
| 1056 | * attach_to_pi_owner(). |
| 1057 | */ |
| 1058 | mutex_lock(&tsk->futex_exit_mutex); |
| 1059 | |
| 1060 | /* |
Thomas Gleixner | 4a8e991 | 2019-11-06 22:55:42 +0100 | [diff] [blame] | 1061 | * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. |
| 1062 | * |
| 1063 | * This ensures that all subsequent checks of tsk->futex_state in |
| 1064 | * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with |
| 1065 | * tsk->pi_lock held. |
| 1066 | * |
 | 1067 | 	 * It also guarantees that a pi_state which was queued right before |
| 1068 | * the state change under tsk->pi_lock by a concurrent waiter must |
| 1069 | * be observed in exit_pi_state_list(). |
Thomas Gleixner | 18f6943 | 2019-11-06 22:55:41 +0100 | [diff] [blame] | 1070 | */ |
| 1071 | raw_spin_lock_irq(&tsk->pi_lock); |
Thomas Gleixner | 4a8e991 | 2019-11-06 22:55:42 +0100 | [diff] [blame] | 1072 | tsk->futex_state = FUTEX_STATE_EXITING; |
Thomas Gleixner | 18f6943 | 2019-11-06 22:55:41 +0100 | [diff] [blame] | 1073 | raw_spin_unlock_irq(&tsk->pi_lock); |
Thomas Gleixner | af8cbda | 2019-11-06 22:55:43 +0100 | [diff] [blame] | 1074 | } |
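/*
 * Simplified sketch of the waiter side of this handshake (not the actual
 * code; the helper name is made up). A waiter that observes
 * FUTEX_STATE_EXITING on the prospective PI owner blocks on
 * futex_exit_mutex until futex_cleanup_end() releases it, then retries
 * its futex operation against the now-final state.
 */
static int example_wait_for_exiting_owner(struct task_struct *owner)
{
	if (READ_ONCE(owner->futex_state) != FUTEX_STATE_EXITING)
		return 0;

	/*
	 * The lock/unlock pair simply waits until the exiting task has run
	 * futex_cleanup() and dropped the mutex in futex_cleanup_end().
	 */
	mutex_lock(&owner->futex_exit_mutex);
	mutex_unlock(&owner->futex_exit_mutex);

	return -EAGAIN;		/* caller retries the whole futex operation */
}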
Thomas Gleixner | 18f6943 | 2019-11-06 22:55:41 +0100 | [diff] [blame] | 1075 | |
Thomas Gleixner | af8cbda | 2019-11-06 22:55:43 +0100 | [diff] [blame] | 1076 | static void futex_cleanup_end(struct task_struct *tsk, int state) |
| 1077 | { |
| 1078 | /* |
| 1079 | * Lockless store. The only side effect is that an observer might |
| 1080 | * take another loop until it becomes visible. |
| 1081 | */ |
| 1082 | tsk->futex_state = state; |
Thomas Gleixner | 3f186d9 | 2019-11-06 22:55:44 +0100 | [diff] [blame] | 1083 | /* |
| 1084 | * Drop the exit protection. This unblocks waiters which observed |
 | 1085 | 	 * FUTEX_STATE_EXITING so they can reevaluate the state. |
| 1086 | */ |
| 1087 | mutex_unlock(&tsk->futex_exit_mutex); |
Thomas Gleixner | af8cbda | 2019-11-06 22:55:43 +0100 | [diff] [blame] | 1088 | } |
Thomas Gleixner | 18f6943 | 2019-11-06 22:55:41 +0100 | [diff] [blame] | 1089 | |
Thomas Gleixner | af8cbda | 2019-11-06 22:55:43 +0100 | [diff] [blame] | 1090 | void futex_exec_release(struct task_struct *tsk) |
| 1091 | { |
| 1092 | /* |
| 1093 | * The state handling is done for consistency, but in the case of |
Ingo Molnar | 93d0955 | 2021-05-12 20:04:28 +0200 | [diff] [blame] | 1094 | * exec() there is no way to prevent further damage as the PID stays |
Thomas Gleixner | af8cbda | 2019-11-06 22:55:43 +0100 | [diff] [blame] | 1095 | * the same. But for the unlikely and arguably buggy case that a |
| 1096 | * futex is held on exec(), this provides at least as much state |
 | 1097 | 	 * consistency protection as is possible. |
| 1098 | */ |
| 1099 | futex_cleanup_begin(tsk); |
| 1100 | futex_cleanup(tsk); |
| 1101 | /* |
| 1102 | * Reset the state to FUTEX_STATE_OK. The task is alive and about |
 | 1103 | 	 * to exec a new binary. |
| 1104 | */ |
| 1105 | futex_cleanup_end(tsk, FUTEX_STATE_OK); |
| 1106 | } |
| 1107 | |
| 1108 | void futex_exit_release(struct task_struct *tsk) |
| 1109 | { |
| 1110 | futex_cleanup_begin(tsk); |
| 1111 | futex_cleanup(tsk); |
| 1112 | futex_cleanup_end(tsk, FUTEX_STATE_DEAD); |
Thomas Gleixner | 150d715 | 2019-11-06 22:55:39 +0100 | [diff] [blame] | 1113 | } |
| 1114 | |
Benjamin Herrenschmidt | f6d107f | 2008-03-27 14:52:15 +1100 | [diff] [blame] | 1115 | static int __init futex_init(void) |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1116 | { |
Heiko Carstens | 63b1a81 | 2014-01-16 14:54:50 +0100 | [diff] [blame] | 1117 | unsigned int futex_shift; |
Davidlohr Bueso | a52b89e | 2014-01-12 15:31:23 -0800 | [diff] [blame] | 1118 | unsigned long i; |
| 1119 | |
| 1120 | #if CONFIG_BASE_SMALL |
| 1121 | futex_hashsize = 16; |
| 1122 | #else |
| 1123 | futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); |
| 1124 | #endif |
| 1125 | |
| 1126 | futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), |
| 1127 | futex_hashsize, 0, |
| 1128 | futex_hashsize < 256 ? HASH_SMALL : 0, |
Heiko Carstens | 63b1a81 | 2014-01-16 14:54:50 +0100 | [diff] [blame] | 1129 | &futex_shift, NULL, |
| 1130 | futex_hashsize, futex_hashsize); |
| 1131 | futex_hashsize = 1UL << futex_shift; |
Heiko Carstens | 03b8c7b | 2014-03-02 13:09:47 +0100 | [diff] [blame] | 1132 | |
Davidlohr Bueso | a52b89e | 2014-01-12 15:31:23 -0800 | [diff] [blame] | 1133 | for (i = 0; i < futex_hashsize; i++) { |
Linus Torvalds | 11d4616 | 2014-03-20 22:11:17 -0700 | [diff] [blame] | 1134 | atomic_set(&futex_queues[i].waiters, 0); |
Dima Zavin | 732375c | 2011-07-07 17:27:59 -0700 | [diff] [blame] | 1135 | plist_head_init(&futex_queues[i].chain); |
Thomas Gleixner | 3e4ab74 | 2008-02-23 15:23:55 -0800 | [diff] [blame] | 1136 | spin_lock_init(&futex_queues[i].lock); |
| 1137 | } |
| 1138 | |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1139 | return 0; |
| 1140 | } |
Yang Yang | 25f71d1 | 2016-12-30 16:17:55 +0800 | [diff] [blame] | 1141 | core_initcall(futex_init); |
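/*
 * Worked example of the table sizing above (illustrative, userspace):
 * with 16 possible CPUs the default is roundup_pow_of_two(256 * 16) =
 * 4096 hash buckets, while CONFIG_BASE_SMALL pins it at 16.
 * alloc_large_system_hash() may adjust the size again, which is why
 * futex_hashsize is re-derived from the returned futex_shift.
 */
#include <stdio.h>

/* Plain stand-in for the kernel's roundup_pow_of_two(). */
static unsigned long example_roundup_pow_of_two(unsigned long x)
{
	unsigned long p = 1;

	while (p < x)
		p <<= 1;
	return p;
}

int main(void)
{
	unsigned long cpus = 16;	/* hypothetical num_possible_cpus() */

	printf("futex_hashsize = %lu\n",
	       example_roundup_pow_of_two(256 * cpus));	/* prints 4096 */
	return 0;
}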