// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side.
 *
 * After the application reads the CQ ring tail, it must use an
 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
 * before writing the tail (using smp_load_acquire to read the tail will
 * do). It also needs an smp_mb() before updating CQ head (ordering the
 * entry load(s) with the head store), pairing with an implicit barrier
 * through a control-dependency in io_get_cqring (smp_store_release to
 * store head will do). Failure to do so could lead to reading invalid
 * CQ entries.
 *
 * Likewise, the application must use an appropriate smp_wmb() before
 * writing the SQ tail (ordering SQ entry stores with the tail store),
 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
 * to store the tail will do). And it needs a barrier ordering the SQ
 * head load before writing new SQ entries (smp_load_acquire to read
 * head will do).
 *
 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
 * updating the SQ tail; a full memory barrier smp_mb() is needed in
 * between.
 *
 * Also see the examples in the liburing library:
 *
 *	git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 * from data shared between the kernel and application. This is done both
 * for ordering purposes and to ensure that once a value is loaded from
 * data that the application could potentially modify, it remains stable.
 *
 * Copyright (C) 2018-2019 Jens Axboe
 * Copyright (c) 2018-2019 Christoph Hellwig
 */
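/*
 * Illustrative only, not part of the kernel code: a minimal userspace sketch
 * of the SQ submission ordering described above. The ring-pointer names
 * (khead, ktail, kring_mask, kring_entries, sqes, array, sqe_tail) follow
 * liburing's struct io_uring_sq and are assumptions here; the kernel-style
 * barrier helper names are used for illustration (liburing provides
 * equivalent io_uring_smp_load_acquire()/io_uring_smp_store_release()).
 * Real applications should simply use liburing.
 *
 *	unsigned head = smp_load_acquire(sq->khead);	// order head load
 *	unsigned tail = sq->sqe_tail;			// before SQE stores
 *
 *	if (tail - head < *sq->kring_entries) {
 *		unsigned idx = tail & *sq->kring_mask;
 *
 *		sq->sqes[idx] = *sqe;		// fill the SQE
 *		sq->array[idx] = idx;		// publish its index
 *		// release: SQE and index stores become visible before the
 *		// kernel can observe the new tail value
 *		smp_store_release(sq->ktail, tail + 1);
 *	}
 */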
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/sizes.h>
#include <linux/hugetlb.h>
#include <linux/highmem.h>
#include <linux/namei.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/eventpoll.h>
#include <linux/fs_struct.h>
#include <linux/splice.h>
#include <linux/task_work.h>
#include <linux/pagemap.h>
#include <linux/io_uring.h>

#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "internal.h"
#include "io-wq.h"

#define IORING_MAX_ENTRIES	32768
#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)

/*
 * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
 */
#define IORING_FILE_TABLE_SHIFT	9
#define IORING_MAX_FILES_TABLE	(1U << IORING_FILE_TABLE_SHIFT)
#define IORING_FILE_TABLE_MASK	(IORING_MAX_FILES_TABLE - 1)
#define IORING_MAX_FIXED_FILES	(64 * IORING_MAX_FILES_TABLE)
#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

struct io_uring {
	u32 head ____cacheline_aligned_in_smp;
	u32 tail ____cacheline_aligned_in_smp;
};

/*
 * This data is shared with the application through the mmap at offsets
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_sqring_offsets when calling io_uring_setup.
 */
struct io_rings {
	/*
	 * Head and tail offsets into the ring; the offsets need to be
	 * masked to get valid indices.
	 *
	 * The kernel controls head of the sq ring and the tail of the cq ring,
	 * and the application controls tail of the sq ring and the head of the
	 * cq ring.
	 */
	struct io_uring		sq, cq;
	/*
	 * Bitmasks to apply to head and tail offsets (constant, equals
	 * ring_entries - 1)
	 */
	u32			sq_ring_mask, cq_ring_mask;
	/* Ring sizes (constant, power of 2) */
	u32			sq_ring_entries, cq_ring_entries;
	/*
	 * Number of invalid entries dropped by the kernel due to
	 * invalid index stored in array
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * After a new SQ head value was read by the application this
	 * counter includes all submissions that were dropped reaching
	 * the new SQ head (and possibly more).
	 */
	u32			sq_dropped;
	/*
	 * Runtime SQ flags
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application.
	 *
	 * The application needs a full memory barrier before checking
	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
	 */
	u32			sq_flags;
	/*
	 * Runtime CQ flags
	 *
	 * Written by the application, shouldn't be modified by the
	 * kernel.
	 */
	u32			cq_flags;
	/*
	 * Number of completion events lost because the queue was full;
	 * this should be avoided by the application by making sure
	 * there are not more requests pending than there is space in
	 * the completion queue.
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * As completion events come in out of order this counter is not
	 * ordered with any other data.
	 */
	u32			cq_overflow;
	/*
	 * Ring buffer of completion events.
	 *
	 * The kernel writes completion events fresh every time they are
	 * produced, so the application is allowed to modify pending
	 * entries.
	 */
	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
};
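/*
 * Illustrative only, not from this file: a rough userspace sketch of how an
 * application maps the two rings described by struct io_rings, assuming "p"
 * is the struct io_uring_params filled in by io_uring_setup() and "ring_fd"
 * is the returned file descriptor. The local variable names are made up for
 * illustration and error handling is omitted; the mmap offsets and the
 * sq_off/cq_off fields are the UAPI ones.
 *
 *	char *sq_ptr = mmap(NULL, p.sq_off.array + p.sq_entries * sizeof(__u32),
 *			    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *			    ring_fd, IORING_OFF_SQ_RING);
 *	unsigned *sq_tail = (unsigned *)(sq_ptr + p.sq_off.tail);
 *	unsigned *sq_mask = (unsigned *)(sq_ptr + p.sq_off.ring_mask);
 *
 *	char *cq_ptr = mmap(NULL, p.cq_off.cqes +
 *				  p.cq_entries * sizeof(struct io_uring_cqe),
 *			    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *			    ring_fd, IORING_OFF_CQ_RING);
 *	struct io_uring_cqe *cqes = (void *)(cq_ptr + p.cq_off.cqes);
 */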

struct io_mapped_ubuf {
	u64		ubuf;
	size_t		len;
	struct bio_vec	*bvec;
	unsigned int	nr_bvecs;
	unsigned long	acct_pages;
};

struct fixed_file_table {
	struct file		**files;
};

struct fixed_file_ref_node {
	struct percpu_ref		refs;
	struct list_head		node;
	struct list_head		file_list;
	struct fixed_file_data		*file_data;
	struct llist_node		llist;
};

struct fixed_file_data {
	struct fixed_file_table		*table;
	struct io_ring_ctx		*ctx;

	struct percpu_ref		*cur_refs;
	struct percpu_ref		refs;
	struct completion		done;
	struct list_head		ref_list;
	spinlock_t			lock;
};

struct io_buffer {
	struct list_head list;
	__u64 addr;
	__s32 len;
	__u16 bid;
};

struct io_restriction {
	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
	u8 sqe_flags_allowed;
	u8 sqe_flags_required;
	bool registered;
};

struct io_sq_data {
	refcount_t		refs;
	struct mutex		lock;

	/* ctx's that are using this sqd */
	struct list_head	ctx_list;
	struct list_head	ctx_new_list;
	struct mutex		ctx_lock;

	struct task_struct	*thread;
	struct wait_queue_head	wait;
};

struct io_ring_ctx {
	struct {
		struct percpu_ref	refs;
	} ____cacheline_aligned_in_smp;

	struct {
		unsigned int		flags;
		unsigned int		compat: 1;
		unsigned int		limit_mem: 1;
		unsigned int		cq_overflow_flushed: 1;
		unsigned int		drain_next: 1;
		unsigned int		eventfd_async: 1;
		unsigned int		restricted: 1;

		/*
		 * Ring buffer of indices into array of io_uring_sqe, which is
		 * mmapped by the application using the IORING_OFF_SQES offset.
		 *
		 * This indirection could e.g. be used to assign fixed
		 * io_uring_sqe entries to operations and only submit them to
		 * the queue when needed.
		 *
		 * The kernel modifies neither the indices array nor the entries
		 * array.
		 */
		u32			*sq_array;
		unsigned		cached_sq_head;
		unsigned		sq_entries;
		unsigned		sq_mask;
		unsigned		sq_thread_idle;
		unsigned		cached_sq_dropped;
		atomic_t		cached_cq_overflow;
		unsigned long		sq_check_overflow;

		struct list_head	defer_list;
		struct list_head	timeout_list;
		struct list_head	cq_overflow_list;

		wait_queue_head_t	inflight_wait;
		struct io_uring_sqe	*sq_sqes;
	} ____cacheline_aligned_in_smp;

	struct io_rings	*rings;

	/* IO offload */
	struct io_wq		*io_wq;

	/*
	 * For SQPOLL usage - we hold a reference to the parent task, so we
	 * have access to the ->files
	 */
	struct task_struct	*sqo_task;

	/* Only used for accounting purposes */
	struct mm_struct	*mm_account;

	struct io_sq_data	*sq_data;	/* if using sq thread polling */

	struct wait_queue_head	sqo_sq_wait;
	struct wait_queue_entry	sqo_wait_entry;
	struct list_head	sqd_list;

	/*
	 * If used, fixed file set. Writers must ensure that ->refs is dead,
	 * readers must ensure that ->refs is alive as long as the file* is
	 * used. Only updated through io_uring_register(2).
	 */
	struct fixed_file_data	*file_data;
	unsigned		nr_user_files;

	/* if used, fixed mapped user buffers */
	unsigned		nr_user_bufs;
	struct io_mapped_ubuf	*user_bufs;

	struct user_struct	*user;

	const struct cred	*creds;

	struct completion	ref_comp;
	struct completion	sq_thread_comp;

	/* if all else fails... */
	struct io_kiocb		*fallback_req;

#if defined(CONFIG_UNIX)
	struct socket		*ring_sock;
#endif

	struct idr		io_buffer_idr;

	struct idr		personality_idr;

	struct {
		unsigned		cached_cq_tail;
		unsigned		cq_entries;
		unsigned		cq_mask;
		atomic_t		cq_timeouts;
		unsigned long		cq_check_overflow;
		struct wait_queue_head	cq_wait;
		struct fasync_struct	*cq_fasync;
		struct eventfd_ctx	*cq_ev_fd;
	} ____cacheline_aligned_in_smp;

	struct {
		struct mutex		uring_lock;
		wait_queue_head_t	wait;
	} ____cacheline_aligned_in_smp;

	struct {
		spinlock_t		completion_lock;

		/*
		 * ->iopoll_list is protected by the ctx->uring_lock for
		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
		 * For SQPOLL, only the single threaded io_sq_thread() will
		 * manipulate the list, hence no extra locking is needed there.
		 */
		struct list_head	iopoll_list;
		struct hlist_head	*cancel_hash;
		unsigned		cancel_hash_bits;
		bool			poll_multi_file;

		spinlock_t		inflight_lock;
		struct list_head	inflight_list;
	} ____cacheline_aligned_in_smp;

	struct delayed_work		file_put_work;
	struct llist_head		file_put_llist;

	struct work_struct		exit_work;
	struct io_restriction		restrictions;
};

/*
 * First field must be the file pointer in all the
 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 */
struct io_poll_iocb {
	struct file			*file;
	union {
		struct wait_queue_head	*head;
		u64			addr;
	};
	__poll_t			events;
	bool				done;
	bool				canceled;
	struct wait_queue_entry		wait;
};

struct io_close {
	struct file			*file;
	struct file			*put_file;
	int				fd;
};

struct io_timeout_data {
	struct io_kiocb			*req;
	struct hrtimer			timer;
	struct timespec64		ts;
	enum hrtimer_mode		mode;
};

struct io_accept {
	struct file			*file;
	struct sockaddr __user		*addr;
	int __user			*addr_len;
	int				flags;
	unsigned long			nofile;
};

struct io_sync {
	struct file			*file;
	loff_t				len;
	loff_t				off;
	int				flags;
	int				mode;
};

struct io_cancel {
	struct file			*file;
	u64				addr;
};

struct io_timeout {
	struct file			*file;
	u64				addr;
	int				flags;
	u32				off;
	u32				target_seq;
	struct list_head		list;
};

struct io_rw {
	/* NOTE: kiocb has the file as the first member, so don't do it here */
	struct kiocb			kiocb;
	u64				addr;
	u64				len;
};

struct io_connect {
	struct file			*file;
	struct sockaddr __user		*addr;
	int				addr_len;
};

struct io_sr_msg {
	struct file			*file;
	union {
		struct user_msghdr __user *umsg;
		void __user		*buf;
	};
	int				msg_flags;
	int				bgid;
	size_t				len;
	struct io_buffer		*kbuf;
};

struct io_open {
	struct file			*file;
	int				dfd;
	struct filename			*filename;
	struct open_how			how;
	unsigned long			nofile;
};

struct io_files_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

struct io_fadvise {
	struct file			*file;
	u64				offset;
	u32				len;
	u32				advice;
};

struct io_madvise {
	struct file			*file;
	u64				addr;
	u32				len;
	u32				advice;
};

struct io_epoll {
	struct file			*file;
	int				epfd;
	int				op;
	int				fd;
	struct epoll_event		event;
};

struct io_splice {
	struct file			*file_out;
	struct file			*file_in;
	loff_t				off_out;
	loff_t				off_in;
	u64				len;
	unsigned int			flags;
};

struct io_provide_buf {
	struct file			*file;
	__u64				addr;
	__s32				len;
	__u32				bgid;
	__u16				nbufs;
	__u16				bid;
};

struct io_statx {
	struct file			*file;
	int				dfd;
	unsigned int			mask;
	unsigned int			flags;
	const char __user		*filename;
	struct statx __user		*buffer;
};

struct io_completion {
	struct file			*file;
	struct list_head		list;
	int				cflags;
};

struct io_async_connect {
	struct sockaddr_storage		address;
};

struct io_async_msghdr {
	struct iovec			fast_iov[UIO_FASTIOV];
	struct iovec			*iov;
	struct sockaddr __user		*uaddr;
	struct msghdr			msg;
	struct sockaddr_storage		addr;
};

struct io_async_rw {
	struct iovec			fast_iov[UIO_FASTIOV];
	const struct iovec		*free_iovec;
	struct iov_iter			iter;
	size_t				bytes_done;
	struct wait_page_queue		wpq;
};
553
Pavel Begunkov6b47ee62020-01-18 20:22:41 +0300554enum {
555 REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
556 REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
557 REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
558 REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
559 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
Jens Axboebcda7ba2020-02-23 16:42:51 -0700560 REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
Pavel Begunkov6b47ee62020-01-18 20:22:41 +0300561
Pavel Begunkovdea3b492020-04-12 02:05:04 +0300562 REQ_F_LINK_HEAD_BIT,
Pavel Begunkov6b47ee62020-01-18 20:22:41 +0300563 REQ_F_FAIL_LINK_BIT,
564 REQ_F_INFLIGHT_BIT,
565 REQ_F_CUR_POS_BIT,
566 REQ_F_NOWAIT_BIT,
Pavel Begunkov6b47ee62020-01-18 20:22:41 +0300567 REQ_F_LINK_TIMEOUT_BIT,
Pavel Begunkov6b47ee62020-01-18 20:22:41 +0300568 REQ_F_ISREG_BIT,
Pavel Begunkov6b47ee62020-01-18 20:22:41 +0300569 REQ_F_COMP_LOCKED_BIT,
Pavel Begunkov99bc4c32020-02-07 22:04:45 +0300570 REQ_F_NEED_CLEANUP_BIT,
Jens Axboed7718a92020-02-14 22:23:12 -0700571 REQ_F_POLLED_BIT,
Jens Axboebcda7ba2020-02-23 16:42:51 -0700572 REQ_F_BUFFER_SELECTED_BIT,
Jens Axboe5b0bbee2020-04-27 10:41:22 -0600573 REQ_F_NO_FILE_TABLE_BIT,
Xiaoguang Wang7cdaf582020-06-10 19:41:19 +0800574 REQ_F_WORK_INITIALIZED_BIT,
Jens Axboe84557872020-03-03 15:28:17 -0700575
576 /* not a real bit, just to check we're not overflowing the space */
577 __REQ_F_LAST_BIT,
Pavel Begunkov6b47ee62020-01-18 20:22:41 +0300578};
579
580enum {
581 /* ctx owns file */
582 REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
583 /* drain existing IO first */
584 REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
585 /* linked sqes */
586 REQ_F_LINK = BIT(REQ_F_LINK_BIT),
587 /* doesn't sever on completion < 0 */
588 REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
589 /* IOSQE_ASYNC */
590 REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
Jens Axboebcda7ba2020-02-23 16:42:51 -0700591 /* IOSQE_BUFFER_SELECT */
592 REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
Pavel Begunkov6b47ee62020-01-18 20:22:41 +0300593
Pavel Begunkovdea3b492020-04-12 02:05:04 +0300594 /* head of a link */
595 REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT),
Pavel Begunkov6b47ee62020-01-18 20:22:41 +0300596 /* fail rest of links */
597 REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
598 /* on inflight list */
599 REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
600 /* read/write uses file position */
601 REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
602 /* must not punt to workers */
603 REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
Pavel Begunkov6b47ee62020-01-18 20:22:41 +0300604 /* has linked timeout */
605 REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
Pavel Begunkov6b47ee62020-01-18 20:22:41 +0300606 /* regular file */
607 REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
Pavel Begunkov6b47ee62020-01-18 20:22:41 +0300608 /* completion under lock */
609 REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT),
Pavel Begunkov99bc4c32020-02-07 22:04:45 +0300610 /* needs cleanup */
611 REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
Jens Axboed7718a92020-02-14 22:23:12 -0700612 /* already went through poll handler */
613 REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
Jens Axboebcda7ba2020-02-23 16:42:51 -0700614 /* buffer already selected */
615 REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
Jens Axboe5b0bbee2020-04-27 10:41:22 -0600616 /* doesn't need file table for this request */
617 REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT),
Xiaoguang Wang7cdaf582020-06-10 19:41:19 +0800618 /* io_wq_work is initialized */
619 REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
Jens Axboed7718a92020-02-14 22:23:12 -0700620};
621
622struct async_poll {
623 struct io_poll_iocb poll;
Jens Axboe807abcb2020-07-17 17:09:27 -0600624 struct io_poll_iocb *double_poll;
Pavel Begunkov6b47ee62020-01-18 20:22:41 +0300625};
626
Jens Axboe09bb8392019-03-13 12:39:28 -0600627/*
628 * NOTE! Each of the iocb union members has the file pointer
629 * as the first entry in their struct definition. So you can
630 * access the file pointer through any of the sub-structs,
631 * or directly as just 'ki_filp' in this struct.
632 */
Jens Axboe2b188cc2019-01-07 10:46:33 -0700633struct io_kiocb {
Jens Axboe221c5eb2019-01-17 09:41:58 -0700634 union {
Jens Axboe09bb8392019-03-13 12:39:28 -0600635 struct file *file;
Jens Axboe9adbd452019-12-20 08:45:55 -0700636 struct io_rw rw;
Jens Axboe221c5eb2019-01-17 09:41:58 -0700637 struct io_poll_iocb poll;
Jens Axboe8ed8d3c2019-12-16 11:55:28 -0700638 struct io_accept accept;
639 struct io_sync sync;
Jens Axboefbf23842019-12-17 18:45:56 -0700640 struct io_cancel cancel;
Jens Axboeb29472e2019-12-17 18:50:29 -0700641 struct io_timeout timeout;
Jens Axboe3fbb51c2019-12-20 08:51:52 -0700642 struct io_connect connect;
Jens Axboee47293f2019-12-20 08:58:21 -0700643 struct io_sr_msg sr_msg;
Jens Axboe15b71ab2019-12-11 11:20:36 -0700644 struct io_open open;
Jens Axboeb5dba592019-12-11 14:02:38 -0700645 struct io_close close;
Jens Axboe05f3fb32019-12-09 11:22:50 -0700646 struct io_files_update files_update;
Jens Axboe4840e412019-12-25 22:03:45 -0700647 struct io_fadvise fadvise;
Jens Axboec1ca7572019-12-25 22:18:28 -0700648 struct io_madvise madvise;
Jens Axboe3e4827b2020-01-08 15:18:09 -0700649 struct io_epoll epoll;
Pavel Begunkov7d67af22020-02-24 11:32:45 +0300650 struct io_splice splice;
Jens Axboeddf0322d2020-02-23 16:41:33 -0700651 struct io_provide_buf pbuf;
Bijan Mottahedeh1d9e1282020-05-22 21:31:16 -0700652 struct io_statx statx;
Pavel Begunkov3ca405e2020-07-13 23:37:08 +0300653 /* use only after cleaning per-op data, see io_clean_op() */
654 struct io_completion compl;
Jens Axboe221c5eb2019-01-17 09:41:58 -0700655 };
Jens Axboe2b188cc2019-01-07 10:46:33 -0700656
Jens Axboee8c2bc12020-08-15 18:44:09 -0700657 /* opcode allocated if it needs to store data for async defer */
658 void *async_data;
Jens Axboed625c6e2019-12-17 19:53:05 -0700659 u8 opcode;
Xiaoguang Wang65a65432020-06-11 23:39:36 +0800660 /* polled IO has completed */
661 u8 iopoll_completed;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700662
Bijan Mottahedeh4f4eeba2020-05-19 14:52:49 -0700663 u16 buf_index;
Pavel Begunkov9cf7c102020-07-13 23:37:15 +0300664 u32 result;
Bijan Mottahedeh4f4eeba2020-05-19 14:52:49 -0700665
Pavel Begunkov010e8e62020-07-30 18:43:45 +0300666 struct io_ring_ctx *ctx;
667 unsigned int flags;
668 refcount_t refs;
669 struct task_struct *task;
670 u64 user_data;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700671
Pavel Begunkov010e8e62020-07-30 18:43:45 +0300672 struct list_head link_list;
Jens Axboed7718a92020-02-14 22:23:12 -0700673
Pavel Begunkovd21ffe72020-07-13 23:37:10 +0300674 /*
675 * 1. used with ctx->iopoll_list with reads/writes
676 * 2. to track reqs with ->files (see io_op_def::file_table)
677 */
Pavel Begunkov010e8e62020-07-30 18:43:45 +0300678 struct list_head inflight_entry;
Jens Axboefcb323c2019-10-24 12:39:47 -0600679
Pavel Begunkov010e8e62020-07-30 18:43:45 +0300680 struct percpu_ref *fixed_file_refs;
681 struct callback_head task_work;
682 /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
683 struct hlist_node hash_node;
684 struct async_poll *apoll;
685 struct io_wq_work work;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700686};
687
Pavel Begunkov27dc8332020-07-13 23:37:14 +0300688struct io_defer_entry {
689 struct list_head list;
690 struct io_kiocb *req;
Pavel Begunkov9cf7c102020-07-13 23:37:15 +0300691 u32 seq;
Pavel Begunkov27dc8332020-07-13 23:37:14 +0300692};
693
Jens Axboedef596e2019-01-09 08:59:42 -0700694#define IO_IOPOLL_BATCH 8
Jens Axboe2b188cc2019-01-07 10:46:33 -0700695
Jens Axboe013538b2020-06-22 09:29:15 -0600696struct io_comp_state {
697 unsigned int nr;
698 struct list_head list;
699 struct io_ring_ctx *ctx;
700};
701
Jens Axboe9a56a232019-01-09 09:06:50 -0700702struct io_submit_state {
703 struct blk_plug plug;
704
705 /*
Jens Axboe2579f912019-01-09 09:10:43 -0700706 * io_kiocb alloc cache
707 */
708 void *reqs[IO_IOPOLL_BATCH];
Pavel Begunkov6c8a3132020-02-01 03:58:00 +0300709 unsigned int free_reqs;
Jens Axboe2579f912019-01-09 09:10:43 -0700710
711 /*
Jens Axboe013538b2020-06-22 09:29:15 -0600712 * Batch completion logic
713 */
714 struct io_comp_state comp;
715
716 /*
Jens Axboe9a56a232019-01-09 09:06:50 -0700717 * File reference cache
718 */
719 struct file *file;
720 unsigned int fd;
721 unsigned int has_refs;
Jens Axboe9a56a232019-01-09 09:06:50 -0700722 unsigned int ios_left;
723};
724
Jens Axboed3656342019-12-18 09:50:26 -0700725struct io_op_def {
Jens Axboed3656342019-12-18 09:50:26 -0700726 /* needs current->mm setup, does mm access */
727 unsigned needs_mm : 1;
728 /* needs req->file assigned */
729 unsigned needs_file : 1;
Jens Axboefd2206e2020-06-02 16:40:47 -0600730 /* don't fail if file grab fails */
731 unsigned needs_file_no_error : 1;
Jens Axboed3656342019-12-18 09:50:26 -0700732 /* hash wq insertion if file is a regular file */
733 unsigned hash_reg_file : 1;
734 /* unbound wq insertion if file is a non-regular file */
735 unsigned unbound_nonreg_file : 1;
Jens Axboe66f4af92020-01-16 15:36:52 -0700736 /* opcode is not supported by this kernel */
737 unsigned not_supported : 1;
Jens Axboef86cd202020-01-29 13:46:44 -0700738 /* needs file table */
739 unsigned file_table : 1;
Jens Axboeff002b32020-02-07 16:05:21 -0700740 /* needs ->fs */
741 unsigned needs_fs : 1;
Jens Axboe8a727582020-02-20 09:59:44 -0700742 /* set if opcode supports polled "wait" */
743 unsigned pollin : 1;
744 unsigned pollout : 1;
Jens Axboebcda7ba2020-02-23 16:42:51 -0700745 /* op supports buffer selection */
746 unsigned buffer_select : 1;
Jens Axboee8c2bc12020-08-15 18:44:09 -0700747 /* needs rlimit(RLIMIT_FSIZE) assigned */
Pavel Begunkov57f1a642020-07-15 12:46:52 +0300748 unsigned needs_fsize : 1;
Jens Axboee8c2bc12020-08-15 18:44:09 -0700749 /* must always have async data allocated */
750 unsigned needs_async_data : 1;
751 /* size of async data needed, if any */
752 unsigned short async_size;
Jens Axboed3656342019-12-18 09:50:26 -0700753};
754
static const struct io_op_def io_op_defs[] __read_mostly = {
	[IORING_OP_NOP] = {},
	[IORING_OP_READV] = {
		.needs_mm		= 1,
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.needs_async_data	= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITEV] = {
		.needs_mm		= 1,
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.needs_fsize		= 1,
		.needs_async_data	= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_FSYNC] = {
		.needs_file		= 1,
	},
	[IORING_OP_READ_FIXED] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITE_FIXED] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.needs_fsize		= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_POLL_ADD] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
	},
	[IORING_OP_POLL_REMOVE] = {},
	[IORING_OP_SYNC_FILE_RANGE] = {
		.needs_file		= 1,
	},
	[IORING_OP_SENDMSG] = {
		.needs_mm		= 1,
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.needs_fs		= 1,
		.pollout		= 1,
		.needs_async_data	= 1,
		.async_size		= sizeof(struct io_async_msghdr),
	},
	[IORING_OP_RECVMSG] = {
		.needs_mm		= 1,
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.needs_fs		= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.needs_async_data	= 1,
		.async_size		= sizeof(struct io_async_msghdr),
	},
	[IORING_OP_TIMEOUT] = {
		.needs_mm		= 1,
		.needs_async_data	= 1,
		.async_size		= sizeof(struct io_timeout_data),
	},
	[IORING_OP_TIMEOUT_REMOVE] = {},
	[IORING_OP_ACCEPT] = {
		.needs_mm		= 1,
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.file_table		= 1,
		.pollin			= 1,
	},
	[IORING_OP_ASYNC_CANCEL] = {},
	[IORING_OP_LINK_TIMEOUT] = {
		.needs_mm		= 1,
		.needs_async_data	= 1,
		.async_size		= sizeof(struct io_timeout_data),
	},
	[IORING_OP_CONNECT] = {
		.needs_mm		= 1,
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.needs_async_data	= 1,
		.async_size		= sizeof(struct io_async_connect),
	},
	[IORING_OP_FALLOCATE] = {
		.needs_file		= 1,
		.needs_fsize		= 1,
	},
	[IORING_OP_OPENAT] = {
		.file_table		= 1,
		.needs_fs		= 1,
	},
	[IORING_OP_CLOSE] = {
		.needs_file		= 1,
		.needs_file_no_error	= 1,
		.file_table		= 1,
	},
	[IORING_OP_FILES_UPDATE] = {
		.needs_mm		= 1,
		.file_table		= 1,
	},
	[IORING_OP_STATX] = {
		.needs_mm		= 1,
		.needs_fs		= 1,
		.file_table		= 1,
	},
	[IORING_OP_READ] = {
		.needs_mm		= 1,
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITE] = {
		.needs_mm		= 1,
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.needs_fsize		= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_FADVISE] = {
		.needs_file		= 1,
	},
	[IORING_OP_MADVISE] = {
		.needs_mm		= 1,
	},
	[IORING_OP_SEND] = {
		.needs_mm		= 1,
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
	},
	[IORING_OP_RECV] = {
		.needs_mm		= 1,
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
	},
	[IORING_OP_OPENAT2] = {
		.file_table		= 1,
		.needs_fs		= 1,
	},
	[IORING_OP_EPOLL_CTL] = {
		.unbound_nonreg_file	= 1,
		.file_table		= 1,
	},
	[IORING_OP_SPLICE] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
	},
	[IORING_OP_PROVIDE_BUFFERS] = {},
	[IORING_OP_REMOVE_BUFFERS] = {},
	[IORING_OP_TEE] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
	},
};

enum io_mem_account {
	ACCT_LOCKED,
	ACCT_PINNED,
};

static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
			     struct io_comp_state *cs);
static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void io_put_req(struct io_kiocb *req);
static void io_double_put_req(struct io_kiocb *req);
static void __io_double_put_req(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void __io_queue_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_files_update *ip,
				 unsigned nr_args);
static int io_prep_work_files(struct io_kiocb *req);
static void __io_clean_op(struct io_kiocb *req);
static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
		       int fd, struct file **out_file, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req,
			   const struct io_uring_sqe *sqe,
			   struct io_comp_state *cs);
static void io_file_put_work(struct work_struct *work);

static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
			       struct iovec **iovec, struct iov_iter *iter,
			       bool needs_lock);
static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
			     const struct iovec *fast_iov,
			     struct iov_iter *iter, bool force);

static struct kmem_cache *req_cachep;

static const struct file_operations io_uring_fops __read_mostly;

struct sock *io_uring_get_socket(struct file *file)
{
#if defined(CONFIG_UNIX)
	if (file->f_op == &io_uring_fops) {
		struct io_ring_ctx *ctx = file->private_data;

		return ctx->ring_sock->sk;
	}
#endif
	return NULL;
}
EXPORT_SYMBOL(io_uring_get_socket);

static inline void io_clean_op(struct io_kiocb *req)
{
	if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED |
			  REQ_F_INFLIGHT))
		__io_clean_op(req);
}

static void io_sq_thread_drop_mm(void)
{
	struct mm_struct *mm = current->mm;

	if (mm) {
		kthread_unuse_mm(mm);
		mmput(mm);
	}
}

static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
{
	if (!current->mm) {
		if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) ||
			     !ctx->sqo_task->mm ||
			     !mmget_not_zero(ctx->sqo_task->mm)))
			return -EFAULT;
		kthread_use_mm(ctx->sqo_task->mm);
	}

	return 0;
}

static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
				   struct io_kiocb *req)
{
	if (!io_op_defs[req->opcode].needs_mm)
		return 0;
	return __io_sq_thread_acquire_mm(ctx);
}

static inline void req_set_fail_links(struct io_kiocb *req)
{
	if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
		req->flags |= REQ_F_FAIL_LINK;
}

/*
 * Note: must call io_req_init_async() for the first time you
 * touch any members of io_wq_work.
 */
static inline void io_req_init_async(struct io_kiocb *req)
{
	if (req->flags & REQ_F_WORK_INITIALIZED)
		return;

	memset(&req->work, 0, sizeof(req->work));
	req->flags |= REQ_F_WORK_INITIALIZED;
}

static inline bool io_async_submit(struct io_ring_ctx *ctx)
{
	return ctx->flags & IORING_SETUP_SQPOLL;
}

static void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);

	complete(&ctx->ref_comp);
}

static inline bool io_is_timeout_noseq(struct io_kiocb *req)
{
	return !req->timeout.off;
}

static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
	struct io_ring_ctx *ctx;
	int hash_bits;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return NULL;

	ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
	if (!ctx->fallback_req)
		goto err;

	/*
	 * Use 5 bits less than the max cq entries, that should give us around
	 * 32 entries per hash list if totally full and uniformly spread.
	 */
	hash_bits = ilog2(p->cq_entries);
	hash_bits -= 5;
	if (hash_bits <= 0)
		hash_bits = 1;
	ctx->cancel_hash_bits = hash_bits;
	ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
					GFP_KERNEL);
	if (!ctx->cancel_hash)
		goto err;
	__hash_init(ctx->cancel_hash, 1U << hash_bits);

	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
		goto err;

	ctx->flags = p->flags;
	init_waitqueue_head(&ctx->sqo_sq_wait);
	INIT_LIST_HEAD(&ctx->sqd_list);
	init_waitqueue_head(&ctx->cq_wait);
	INIT_LIST_HEAD(&ctx->cq_overflow_list);
	init_completion(&ctx->ref_comp);
	init_completion(&ctx->sq_thread_comp);
	idr_init(&ctx->io_buffer_idr);
	idr_init(&ctx->personality_idr);
	mutex_init(&ctx->uring_lock);
	init_waitqueue_head(&ctx->wait);
	spin_lock_init(&ctx->completion_lock);
	INIT_LIST_HEAD(&ctx->iopoll_list);
	INIT_LIST_HEAD(&ctx->defer_list);
	INIT_LIST_HEAD(&ctx->timeout_list);
	init_waitqueue_head(&ctx->inflight_wait);
	spin_lock_init(&ctx->inflight_lock);
	INIT_LIST_HEAD(&ctx->inflight_list);
	INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
	init_llist_head(&ctx->file_put_llist);
	return ctx;
err:
	if (ctx->fallback_req)
		kmem_cache_free(req_cachep, ctx->fallback_req);
	kfree(ctx->cancel_hash);
	kfree(ctx);
	return NULL;
}

static bool req_need_defer(struct io_kiocb *req, u32 seq)
{
	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
		struct io_ring_ctx *ctx = req->ctx;

		return seq != ctx->cached_cq_tail
				+ atomic_read(&ctx->cached_cq_overflow);
	}

	return false;
}

static void __io_commit_cqring(struct io_ring_ctx *ctx)
{
	struct io_rings *rings = ctx->rings;

	/* order cqe stores with ring update */
	smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);

	if (wq_has_sleeper(&ctx->cq_wait)) {
		wake_up_interruptible(&ctx->cq_wait);
		kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
	}
}

/*
 * Returns true if we need to defer file table putting. This can only happen
 * from the error path with REQ_F_COMP_LOCKED set.
 */
static bool io_req_clean_work(struct io_kiocb *req)
{
	if (!(req->flags & REQ_F_WORK_INITIALIZED))
		return false;

	req->flags &= ~REQ_F_WORK_INITIALIZED;

	if (req->work.mm) {
		mmdrop(req->work.mm);
		req->work.mm = NULL;
	}
	if (req->work.creds) {
		put_cred(req->work.creds);
		req->work.creds = NULL;
	}
	if (req->work.fs) {
		struct fs_struct *fs = req->work.fs;

		if (req->flags & REQ_F_COMP_LOCKED)
			return true;

		spin_lock(&req->work.fs->lock);
		if (--fs->users)
			fs = NULL;
		spin_unlock(&req->work.fs->lock);
		if (fs)
			free_fs_struct(fs);
		req->work.fs = NULL;
	}

	return false;
}

static void io_prep_async_work(struct io_kiocb *req)
{
	const struct io_op_def *def = &io_op_defs[req->opcode];

	io_req_init_async(req);

	if (req->flags & REQ_F_ISREG) {
		if (def->hash_reg_file || (req->ctx->flags & IORING_SETUP_IOPOLL))
			io_wq_hash_work(&req->work, file_inode(req->file));
	} else {
		if (def->unbound_nonreg_file)
			req->work.flags |= IO_WQ_WORK_UNBOUND;
	}
	if (!req->work.mm && def->needs_mm) {
		mmgrab(current->mm);
		req->work.mm = current->mm;
	}
	if (!req->work.creds)
		req->work.creds = get_current_cred();
	if (!req->work.fs && def->needs_fs) {
		spin_lock(&current->fs->lock);
		if (!current->fs->in_exec) {
			req->work.fs = current->fs;
			req->work.fs->users++;
		} else {
			req->work.flags |= IO_WQ_WORK_CANCEL;
		}
		spin_unlock(&current->fs->lock);
	}
	if (def->needs_fsize)
		req->work.fsize = rlimit(RLIMIT_FSIZE);
	else
		req->work.fsize = RLIM_INFINITY;
}

static void io_prep_async_link(struct io_kiocb *req)
{
	struct io_kiocb *cur;

	io_prep_async_work(req);
	if (req->flags & REQ_F_LINK_HEAD)
		list_for_each_entry(cur, &req->link_list, link_list)
			io_prep_async_work(cur);
}

static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *link = io_prep_linked_timeout(req);

	trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
					&req->work, req->flags);
	io_wq_enqueue(ctx->io_wq, &req->work);
	return link;
}

static void io_queue_async_work(struct io_kiocb *req)
{
	struct io_kiocb *link;

	/* init ->work of the whole link before punting */
	io_prep_async_link(req);
	link = __io_queue_async_work(req);

	if (link)
		io_queue_linked_timeout(link);
}

static void io_kill_timeout(struct io_kiocb *req)
{
	struct io_timeout_data *io = req->async_data;
	int ret;

	ret = hrtimer_try_to_cancel(&io->timer);
	if (ret != -1) {
		atomic_set(&req->ctx->cq_timeouts,
			atomic_read(&req->ctx->cq_timeouts) + 1);
		list_del_init(&req->timeout.list);
		req->flags |= REQ_F_COMP_LOCKED;
		io_cqring_fill_event(req, 0);
		io_put_req(req);
	}
}

static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk)
{
	struct io_ring_ctx *ctx = req->ctx;

	if (!tsk || req->task == tsk)
		return true;
	if (ctx->flags & IORING_SETUP_SQPOLL) {
		if (ctx->sq_data && req->task == ctx->sq_data->thread)
			return true;
	}
	return false;
}

/*
 * Returns true if we found and killed one or more timeouts
 */
static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk)
{
	struct io_kiocb *req, *tmp;
	int canceled = 0;

	spin_lock_irq(&ctx->completion_lock);
	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
		if (io_task_match(req, tsk)) {
			io_kill_timeout(req);
			canceled++;
		}
	}
	spin_unlock_irq(&ctx->completion_lock);
	return canceled != 0;
}

static void __io_queue_deferred(struct io_ring_ctx *ctx)
{
	do {
		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
						struct io_defer_entry, list);
		struct io_kiocb *link;

		if (req_need_defer(de->req, de->seq))
			break;
		list_del_init(&de->list);
		/* punt-init is done before queueing for defer */
		link = __io_queue_async_work(de->req);
		if (link) {
			__io_queue_linked_timeout(link);
			/* drop submission reference */
			link->flags |= REQ_F_COMP_LOCKED;
			io_put_req(link);
		}
		kfree(de);
	} while (!list_empty(&ctx->defer_list));
}

static void io_flush_timeouts(struct io_ring_ctx *ctx)
{
	while (!list_empty(&ctx->timeout_list)) {
		struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
						struct io_kiocb, timeout.list);

		if (io_is_timeout_noseq(req))
			break;
		if (req->timeout.target_seq != ctx->cached_cq_tail
1320 - atomic_read(&ctx->cq_timeouts))
Pavel Begunkov360428f2020-05-30 14:54:17 +03001321 break;
Pavel Begunkovbfe68a22020-05-30 14:54:18 +03001322
Pavel Begunkov135fcde2020-07-13 23:37:12 +03001323 list_del_init(&req->timeout.list);
Pavel Begunkov360428f2020-05-30 14:54:17 +03001324 io_kill_timeout(req);
1325 }
1326}
1327
Jens Axboede0617e2019-04-06 21:51:27 -06001328static void io_commit_cqring(struct io_ring_ctx *ctx)
1329{
Pavel Begunkov360428f2020-05-30 14:54:17 +03001330 io_flush_timeouts(ctx);
Jens Axboede0617e2019-04-06 21:51:27 -06001331 __io_commit_cqring(ctx);
1332
Pavel Begunkov04518942020-05-26 20:34:05 +03001333 if (unlikely(!list_empty(&ctx->defer_list)))
1334 __io_queue_deferred(ctx);
Jens Axboede0617e2019-04-06 21:51:27 -06001335}
1336
Jens Axboe90554202020-09-03 12:12:41 -06001337static inline bool io_sqring_full(struct io_ring_ctx *ctx)
1338{
1339 struct io_rings *r = ctx->rings;
1340
1341 return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries;
1342}
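
/*
 * Side note (illustration, not part of this file): sq.tail and cached_sq_head
 * above are free-running 32-bit counters, so the unsigned difference gives the
 * number of queued entries even after the counters wrap. A minimal,
 * hypothetical sanity check of that arithmetic:
 */
#if 0	/* userspace example, would not build in kernel context */
#include <assert.h>

static void ring_occupancy_wraps_correctly(void)
{
	unsigned head = 0xfffffff0u;	/* counters near the 32-bit wrap point */
	unsigned tail = head + 32;	/* 32 entries queued, tail has wrapped */

	assert(tail < head);		/* a raw comparison would be misleading */
	assert(tail - head == 32);	/* the unsigned difference is still right */
}
#endif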
1343
Jens Axboe2b188cc2019-01-07 10:46:33 -07001344static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
1345{
Hristo Venev75b28af2019-08-26 17:23:46 +00001346 struct io_rings *rings = ctx->rings;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001347 unsigned tail;
1348
1349 tail = ctx->cached_cq_tail;
Stefan Bühler115e12e2019-04-24 23:54:18 +02001350 /*
1351 * writes to the cq entry need to come after reading head; the
1352 * control dependency is enough as we're using WRITE_ONCE to
1353 * fill the cq entry
1354 */
Hristo Venev75b28af2019-08-26 17:23:46 +00001355 if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001356 return NULL;
1357
1358 ctx->cached_cq_tail++;
Hristo Venev75b28af2019-08-26 17:23:46 +00001359 return &rings->cqes[tail & ctx->cq_mask];
Jens Axboe2b188cc2019-01-07 10:46:33 -07001360}
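
/*
 * Illustrative userspace-side sketch (not part of this file; consume_cqes and
 * handle are hypothetical names): draining the CQ ring with the ordering rules
 * described above. khead/ktail/mask and the cqes array are assumed to come
 * from the application's mmap of the CQ ring; the acquire load of the tail
 * pairs with the kernel's tail publication, and the release store of the head
 * pairs with the control dependency relied on in io_get_cqring().
 */
#if 0	/* userspace example, would not build in kernel context */
#include <linux/io_uring.h>

static int consume_cqes(unsigned *khead, const unsigned *ktail, unsigned mask,
			const struct io_uring_cqe *cqes,
			void (*handle)(const struct io_uring_cqe *))
{
	unsigned head = *khead;		/* only this side writes the head */
	unsigned tail = __atomic_load_n(ktail, __ATOMIC_ACQUIRE);
	int seen = 0;

	while (head != tail) {
		handle(&cqes[head & mask]);	/* entry is stable once the tail is visible */
		head++;
		seen++;
	}
	/* publish the new head so the kernel may reuse those CQE slots */
	__atomic_store_n(khead, head, __ATOMIC_RELEASE);
	return seen;
}
#endif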
1361
Jens Axboef2842ab2020-01-08 11:04:00 -07001362static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1363{
Jens Axboef0b493e2020-02-01 21:30:11 -07001364 if (!ctx->cq_ev_fd)
1365 return false;
Stefano Garzarella7e55a192020-05-15 18:38:05 +02001366 if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1367 return false;
Jens Axboef2842ab2020-01-08 11:04:00 -07001368 if (!ctx->eventfd_async)
1369 return true;
Jens Axboeb41e9852020-02-17 09:52:41 -07001370 return io_wq_current_is_worker();
Jens Axboef2842ab2020-01-08 11:04:00 -07001371}
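
/*
 * Illustrative userspace-side sketch (not part of this file; names are
 * hypothetical): registering an eventfd for completion notification and then
 * masking it via the IORING_CQ_EVENTFD_DISABLED bit that the check above
 * looks at. kcq_flags is assumed to point at the mmap'ed CQ ring word at
 * cq_off.flags.
 */
#if 0	/* userspace example, would not build in kernel context */
#include <sys/eventfd.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int setup_cq_eventfd(int ring_fd, unsigned *kcq_flags)
{
	int evfd = eventfd(0, EFD_CLOEXEC);

	if (evfd < 0)
		return -1;
	/* IORING_REGISTER_EVENTFD_ASYNC would restrict signalling to async completions */
	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_EVENTFD,
		    &evfd, 1) < 0) {
		close(evfd);
		return -1;
	}
	/* suppress notifications while the application drains the ring itself */
	__atomic_fetch_or(kcq_flags, IORING_CQ_EVENTFD_DISABLED, __ATOMIC_RELAXED);
	return evfd;
}
#endif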
1372
Jens Axboeb41e9852020-02-17 09:52:41 -07001373static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
Jens Axboe8c838782019-03-12 15:48:16 -06001374{
1375 if (waitqueue_active(&ctx->wait))
1376 wake_up(&ctx->wait);
Jens Axboe534ca6d2020-09-02 13:52:19 -06001377 if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
1378 wake_up(&ctx->sq_data->wait);
Jens Axboeb41e9852020-02-17 09:52:41 -07001379 if (io_should_trigger_evfd(ctx))
Jens Axboe9b402842019-04-11 11:45:41 -06001380 eventfd_signal(ctx->cq_ev_fd, 1);
Jens Axboe8c838782019-03-12 15:48:16 -06001381}
1382
Pavel Begunkov46930142020-07-30 18:43:49 +03001383static void io_cqring_mark_overflow(struct io_ring_ctx *ctx)
1384{
1385 if (list_empty(&ctx->cq_overflow_list)) {
1386 clear_bit(0, &ctx->sq_check_overflow);
1387 clear_bit(0, &ctx->cq_check_overflow);
1388 ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
1389 }
1390}
1391
Jens Axboee6c8aa92020-09-28 13:10:13 -06001392static inline bool io_match_files(struct io_kiocb *req,
1393 struct files_struct *files)
1394{
1395 if (!files)
1396 return true;
1397 if (req->flags & REQ_F_WORK_INITIALIZED)
1398 return req->work.files == files;
1399 return false;
1400}
1401
Jens Axboec4a2ed72019-11-21 21:01:26 -07001402/* Returns true if there are no backlogged entries after the flush */
Jens Axboee6c8aa92020-09-28 13:10:13 -06001403static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
1404 struct task_struct *tsk,
1405 struct files_struct *files)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001406{
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07001407 struct io_rings *rings = ctx->rings;
Jens Axboee6c8aa92020-09-28 13:10:13 -06001408 struct io_kiocb *req, *tmp;
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07001409 struct io_uring_cqe *cqe;
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07001410 unsigned long flags;
1411 LIST_HEAD(list);
1412
1413 if (!force) {
1414 if (list_empty_careful(&ctx->cq_overflow_list))
Jens Axboec4a2ed72019-11-21 21:01:26 -07001415 return true;
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07001416 if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
1417 rings->cq_ring_entries))
Jens Axboec4a2ed72019-11-21 21:01:26 -07001418 return false;
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07001419 }
1420
1421 spin_lock_irqsave(&ctx->completion_lock, flags);
1422
1423 /* if force is set, the ring is going away. always drop after that */
1424 if (force)
Jens Axboe69b3e542020-01-08 11:01:46 -07001425 ctx->cq_overflow_flushed = 1;
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07001426
Jens Axboec4a2ed72019-11-21 21:01:26 -07001427 cqe = NULL;
Jens Axboee6c8aa92020-09-28 13:10:13 -06001428 list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
1429 if (tsk && req->task != tsk)
1430 continue;
1431 if (!io_match_files(req, files))
1432 continue;
1433
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07001434 cqe = io_get_cqring(ctx);
1435 if (!cqe && !force)
1436 break;
1437
Pavel Begunkov40d8ddd2020-07-13 23:37:11 +03001438 list_move(&req->compl.list, &list);
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07001439 if (cqe) {
1440 WRITE_ONCE(cqe->user_data, req->user_data);
1441 WRITE_ONCE(cqe->res, req->result);
Pavel Begunkov0f7e4662020-07-13 23:37:16 +03001442 WRITE_ONCE(cqe->flags, req->compl.cflags);
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07001443 } else {
1444 WRITE_ONCE(ctx->rings->cq_overflow,
1445 atomic_inc_return(&ctx->cached_cq_overflow));
1446 }
1447 }
1448
1449 io_commit_cqring(ctx);
Pavel Begunkov46930142020-07-30 18:43:49 +03001450 io_cqring_mark_overflow(ctx);
1451
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07001452 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1453 io_cqring_ev_posted(ctx);
1454
1455 while (!list_empty(&list)) {
Pavel Begunkov40d8ddd2020-07-13 23:37:11 +03001456 req = list_first_entry(&list, struct io_kiocb, compl.list);
1457 list_del(&req->compl.list);
Jackie Liuec9c02a2019-11-08 23:50:36 +08001458 io_put_req(req);
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07001459 }
Jens Axboec4a2ed72019-11-21 21:01:26 -07001460
1461 return cqe != NULL;
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07001462}
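
/*
 * Illustrative userspace-side sketch (not part of this file; names are
 * hypothetical, and the flush trigger is an assumption): how an application
 * might react to backlogged completions. ksq_flags and koverflow are assumed
 * to point into the mmap'ed rings at sq_off.flags and cq_off.overflow; calling
 * io_uring_enter() with IORING_ENTER_GETEVENTS is assumed to give the kernel a
 * chance to flush the backlog once CQ ring space is available again.
 */
#if 0	/* userspace example, would not build in kernel context */
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static void flush_cq_backlog(int ring_fd, const unsigned *ksq_flags,
			     const unsigned *koverflow)
{
	if (*ksq_flags & IORING_SQ_CQ_OVERFLOW) {
		/* cq_off.overflow counts CQEs the kernel had to drop for good */
		unsigned dropped = __atomic_load_n(koverflow, __ATOMIC_RELAXED);

		(void)dropped;
		syscall(__NR_io_uring_enter, ring_fd, 0, 0,
			IORING_ENTER_GETEVENTS, NULL, 0);
	}
}
#endif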
1463
Jens Axboebcda7ba2020-02-23 16:42:51 -07001464static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001465{
Jens Axboe78e19bb2019-11-06 15:21:34 -07001466 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001467 struct io_uring_cqe *cqe;
1468
Jens Axboe78e19bb2019-11-06 15:21:34 -07001469 trace_io_uring_complete(ctx, req->user_data, res);
Jens Axboe51c3ff62019-11-03 06:52:50 -07001470
Jens Axboe2b188cc2019-01-07 10:46:33 -07001471 /*
1472 * If we can't get a cq entry, userspace overflowed the
1473 * submission (by quite a lot). Increment the overflow count in
1474 * the ring.
1475 */
1476 cqe = io_get_cqring(ctx);
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07001477 if (likely(cqe)) {
Jens Axboe78e19bb2019-11-06 15:21:34 -07001478 WRITE_ONCE(cqe->user_data, req->user_data);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001479 WRITE_ONCE(cqe->res, res);
Jens Axboebcda7ba2020-02-23 16:42:51 -07001480 WRITE_ONCE(cqe->flags, cflags);
Jens Axboe0f212202020-09-13 13:09:39 -06001481 } else if (ctx->cq_overflow_flushed || req->task->io_uring->in_idle) {
1482 /*
1483 * If we're in ring overflow flush mode, or in task cancel mode,
1484	 * then we cannot store the request for later flushing; we need
1485	 * to drop it on the floor.
1486 */
Jens Axboe2b188cc2019-01-07 10:46:33 -07001487 WRITE_ONCE(ctx->rings->cq_overflow,
1488 atomic_inc_return(&ctx->cached_cq_overflow));
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07001489 } else {
Jens Axboead3eb2c2019-12-18 17:12:20 -07001490 if (list_empty(&ctx->cq_overflow_list)) {
1491 set_bit(0, &ctx->sq_check_overflow);
1492 set_bit(0, &ctx->cq_check_overflow);
Xiaoguang Wang6d5f9042020-07-09 09:15:29 +08001493 ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
Jens Axboead3eb2c2019-12-18 17:12:20 -07001494 }
Pavel Begunkov40d8ddd2020-07-13 23:37:11 +03001495 io_clean_op(req);
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07001496 req->result = res;
Pavel Begunkov0f7e4662020-07-13 23:37:16 +03001497 req->compl.cflags = cflags;
Pavel Begunkov40d8ddd2020-07-13 23:37:11 +03001498 refcount_inc(&req->refs);
1499 list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001500 }
1501}
1502
Jens Axboebcda7ba2020-02-23 16:42:51 -07001503static void io_cqring_fill_event(struct io_kiocb *req, long res)
1504{
1505 __io_cqring_fill_event(req, res, 0);
1506}
1507
Jens Axboee1e16092020-06-22 09:17:17 -06001508static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001509{
Jens Axboe78e19bb2019-11-06 15:21:34 -07001510 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001511 unsigned long flags;
1512
1513 spin_lock_irqsave(&ctx->completion_lock, flags);
Jens Axboebcda7ba2020-02-23 16:42:51 -07001514 __io_cqring_fill_event(req, res, cflags);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001515 io_commit_cqring(ctx);
1516 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1517
Jens Axboe8c838782019-03-12 15:48:16 -06001518 io_cqring_ev_posted(ctx);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001519}
1520
Jens Axboe229a7b62020-06-22 10:13:11 -06001521static void io_submit_flush_completions(struct io_comp_state *cs)
Jens Axboebcda7ba2020-02-23 16:42:51 -07001522{
Jens Axboe229a7b62020-06-22 10:13:11 -06001523 struct io_ring_ctx *ctx = cs->ctx;
1524
1525 spin_lock_irq(&ctx->completion_lock);
1526 while (!list_empty(&cs->list)) {
1527 struct io_kiocb *req;
1528
Pavel Begunkov3ca405e2020-07-13 23:37:08 +03001529 req = list_first_entry(&cs->list, struct io_kiocb, compl.list);
1530 list_del(&req->compl.list);
Pavel Begunkov0f7e4662020-07-13 23:37:16 +03001531 __io_cqring_fill_event(req, req->result, req->compl.cflags);
Jens Axboe229a7b62020-06-22 10:13:11 -06001532 if (!(req->flags & REQ_F_LINK_HEAD)) {
1533 req->flags |= REQ_F_COMP_LOCKED;
1534 io_put_req(req);
1535 } else {
1536 spin_unlock_irq(&ctx->completion_lock);
1537 io_put_req(req);
1538 spin_lock_irq(&ctx->completion_lock);
1539 }
1540 }
1541 io_commit_cqring(ctx);
1542 spin_unlock_irq(&ctx->completion_lock);
1543
1544 io_cqring_ev_posted(ctx);
1545 cs->nr = 0;
1546}
1547
1548static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags,
1549 struct io_comp_state *cs)
1550{
1551 if (!cs) {
1552 io_cqring_add_event(req, res, cflags);
1553 io_put_req(req);
1554 } else {
Pavel Begunkov3ca405e2020-07-13 23:37:08 +03001555 io_clean_op(req);
Jens Axboe229a7b62020-06-22 10:13:11 -06001556 req->result = res;
Pavel Begunkov0f7e4662020-07-13 23:37:16 +03001557 req->compl.cflags = cflags;
Pavel Begunkov3ca405e2020-07-13 23:37:08 +03001558 list_add_tail(&req->compl.list, &cs->list);
Jens Axboe229a7b62020-06-22 10:13:11 -06001559 if (++cs->nr >= 32)
1560 io_submit_flush_completions(cs);
1561 }
Jens Axboee1e16092020-06-22 09:17:17 -06001562}
1563
1564static void io_req_complete(struct io_kiocb *req, long res)
1565{
Jens Axboe229a7b62020-06-22 10:13:11 -06001566 __io_req_complete(req, res, 0, NULL);
Jens Axboebcda7ba2020-02-23 16:42:51 -07001567}
1568
Jens Axboe0ddf92e2019-11-08 08:52:53 -07001569static inline bool io_is_fallback_req(struct io_kiocb *req)
1570{
1571 return req == (struct io_kiocb *)
1572 ((unsigned long) req->ctx->fallback_req & ~1UL);
1573}
1574
1575static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
1576{
1577 struct io_kiocb *req;
1578
1579 req = ctx->fallback_req;
Bijan Mottahedehdd461af2020-04-29 17:47:50 -07001580 if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req))
Jens Axboe0ddf92e2019-11-08 08:52:53 -07001581 return req;
1582
1583 return NULL;
1584}
1585
Pavel Begunkov0553b8b2020-04-08 08:58:45 +03001586static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
1587 struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001588{
Jens Axboefd6fab22019-03-14 16:30:06 -06001589 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001590 struct io_kiocb *req;
1591
Pavel Begunkovf6b6c7d2020-06-21 13:09:53 +03001592 if (!state->free_reqs) {
Jens Axboe2579f912019-01-09 09:10:43 -07001593 size_t sz;
1594 int ret;
1595
1596 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
Jens Axboefd6fab22019-03-14 16:30:06 -06001597 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
1598
1599 /*
1600 * Bulk alloc is all-or-nothing. If we fail to get a batch,
1601 * retry single alloc to be on the safe side.
1602 */
1603 if (unlikely(ret <= 0)) {
1604 state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1605 if (!state->reqs[0])
Jens Axboe0ddf92e2019-11-08 08:52:53 -07001606 goto fallback;
Jens Axboefd6fab22019-03-14 16:30:06 -06001607 ret = 1;
1608 }
Jens Axboe2579f912019-01-09 09:10:43 -07001609 state->free_reqs = ret - 1;
Pavel Begunkov6c8a3132020-02-01 03:58:00 +03001610 req = state->reqs[ret - 1];
Jens Axboe2579f912019-01-09 09:10:43 -07001611 } else {
Jens Axboe2579f912019-01-09 09:10:43 -07001612 state->free_reqs--;
Pavel Begunkov6c8a3132020-02-01 03:58:00 +03001613 req = state->reqs[state->free_reqs];
Jens Axboe2b188cc2019-01-07 10:46:33 -07001614 }
1615
Jens Axboe2579f912019-01-09 09:10:43 -07001616 return req;
Jens Axboe0ddf92e2019-11-08 08:52:53 -07001617fallback:
Pavel Begunkov0553b8b2020-04-08 08:58:45 +03001618 return io_get_fallback_req(ctx);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001619}
1620
Pavel Begunkov8da11c12020-02-24 11:32:44 +03001621static inline void io_put_file(struct io_kiocb *req, struct file *file,
1622 bool fixed)
1623{
1624 if (fixed)
Xiaoguang Wang05589552020-03-31 14:05:18 +08001625 percpu_ref_put(req->fixed_file_refs);
Pavel Begunkov8da11c12020-02-24 11:32:44 +03001626 else
1627 fput(file);
1628}
1629
Jens Axboe51a4cc12020-08-10 10:55:56 -06001630static bool io_dismantle_req(struct io_kiocb *req)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001631{
Pavel Begunkov3ca405e2020-07-13 23:37:08 +03001632 io_clean_op(req);
Pavel Begunkov929a3af2020-02-19 00:19:09 +03001633
Jens Axboee8c2bc12020-08-15 18:44:09 -07001634 if (req->async_data)
1635 kfree(req->async_data);
Pavel Begunkov8da11c12020-02-24 11:32:44 +03001636 if (req->file)
1637 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
Jens Axboefcb323c2019-10-24 12:39:47 -06001638
Jens Axboe51a4cc12020-08-10 10:55:56 -06001639 return io_req_clean_work(req);
Pavel Begunkove6543a82020-06-28 12:52:30 +03001640}
Pavel Begunkov2b85edf2019-12-28 14:13:03 +03001641
Jens Axboe51a4cc12020-08-10 10:55:56 -06001642static void __io_free_req_finish(struct io_kiocb *req)
Pavel Begunkove6543a82020-06-28 12:52:30 +03001643{
Jens Axboe0f212202020-09-13 13:09:39 -06001644 struct io_uring_task *tctx = req->task->io_uring;
Jens Axboe51a4cc12020-08-10 10:55:56 -06001645 struct io_ring_ctx *ctx = req->ctx;
Pavel Begunkovecfc5172020-06-29 13:13:03 +03001646
Jens Axboe0f212202020-09-13 13:09:39 -06001647 atomic_long_inc(&tctx->req_complete);
1648 if (tctx->in_idle)
1649 wake_up(&tctx->wait);
Jens Axboee3bc8e92020-09-24 08:45:57 -06001650 put_task_struct(req->task);
1651
Pavel Begunkovb1e50e52020-04-08 08:58:44 +03001652 if (likely(!io_is_fallback_req(req)))
1653 kmem_cache_free(req_cachep, req);
1654 else
Pavel Begunkovecfc5172020-06-29 13:13:03 +03001655 clear_bit_unlock(0, (unsigned long *) &ctx->fallback_req);
1656 percpu_ref_put(&ctx->refs);
Jens Axboee65ef562019-03-12 10:16:44 -06001657}
1658
Jens Axboe51a4cc12020-08-10 10:55:56 -06001659static void io_req_task_file_table_put(struct callback_head *cb)
1660{
1661 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
1662 struct fs_struct *fs = req->work.fs;
1663
1664 spin_lock(&req->work.fs->lock);
1665 if (--fs->users)
1666 fs = NULL;
1667 spin_unlock(&req->work.fs->lock);
1668 if (fs)
1669 free_fs_struct(fs);
1670 req->work.fs = NULL;
1671 __io_free_req_finish(req);
1672}
1673
1674static void __io_free_req(struct io_kiocb *req)
1675{
1676 if (!io_dismantle_req(req)) {
1677 __io_free_req_finish(req);
1678 } else {
1679 int ret;
1680
1681 init_task_work(&req->task_work, io_req_task_file_table_put);
1682 ret = task_work_add(req->task, &req->task_work, TWA_RESUME);
1683 if (unlikely(ret)) {
1684 struct task_struct *tsk;
1685
1686 tsk = io_wq_get_task(req->ctx->io_wq);
1687 task_work_add(tsk, &req->task_work, 0);
1688 }
1689 }
1690}
1691
Jackie Liua197f662019-11-08 08:09:12 -07001692static bool io_link_cancel_timeout(struct io_kiocb *req)
Jens Axboe9e645e112019-05-10 16:07:28 -06001693{
Jens Axboee8c2bc12020-08-15 18:44:09 -07001694 struct io_timeout_data *io = req->async_data;
Jackie Liua197f662019-11-08 08:09:12 -07001695 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe2665abf2019-11-05 12:40:47 -07001696 int ret;
1697
Jens Axboee8c2bc12020-08-15 18:44:09 -07001698 ret = hrtimer_try_to_cancel(&io->timer);
Jens Axboe2665abf2019-11-05 12:40:47 -07001699 if (ret != -1) {
Jens Axboe78e19bb2019-11-06 15:21:34 -07001700 io_cqring_fill_event(req, -ECANCELED);
Jens Axboe2665abf2019-11-05 12:40:47 -07001701 io_commit_cqring(ctx);
Pavel Begunkovdea3b492020-04-12 02:05:04 +03001702 req->flags &= ~REQ_F_LINK_HEAD;
Jackie Liuec9c02a2019-11-08 23:50:36 +08001703 io_put_req(req);
Jens Axboe2665abf2019-11-05 12:40:47 -07001704 return true;
1705 }
1706
1707 return false;
1708}
1709
Jens Axboeab0b6452020-06-30 08:43:15 -06001710static bool __io_kill_linked_timeout(struct io_kiocb *req)
Jens Axboe9e645e112019-05-10 16:07:28 -06001711{
Pavel Begunkov7c86ffe2020-06-29 13:12:59 +03001712 struct io_kiocb *link;
Jens Axboeab0b6452020-06-30 08:43:15 -06001713 bool wake_ev;
Pavel Begunkov7c86ffe2020-06-29 13:12:59 +03001714
1715 if (list_empty(&req->link_list))
Jens Axboeab0b6452020-06-30 08:43:15 -06001716 return false;
Pavel Begunkov7c86ffe2020-06-29 13:12:59 +03001717 link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
1718 if (link->opcode != IORING_OP_LINK_TIMEOUT)
Jens Axboeab0b6452020-06-30 08:43:15 -06001719 return false;
Pavel Begunkov7c86ffe2020-06-29 13:12:59 +03001720
1721 list_del_init(&link->link_list);
Jens Axboe9b7adba2020-08-10 10:54:02 -06001722 link->flags |= REQ_F_COMP_LOCKED;
Pavel Begunkov7c86ffe2020-06-29 13:12:59 +03001723 wake_ev = io_link_cancel_timeout(link);
1724 req->flags &= ~REQ_F_LINK_TIMEOUT;
Jens Axboeab0b6452020-06-30 08:43:15 -06001725 return wake_ev;
1726}
1727
1728static void io_kill_linked_timeout(struct io_kiocb *req)
Jens Axboe9e645e112019-05-10 16:07:28 -06001729{
Jens Axboe2665abf2019-11-05 12:40:47 -07001730 struct io_ring_ctx *ctx = req->ctx;
Jens Axboeab0b6452020-06-30 08:43:15 -06001731 bool wake_ev;
Jens Axboe9e645e112019-05-10 16:07:28 -06001732
Jens Axboeab0b6452020-06-30 08:43:15 -06001733 if (!(req->flags & REQ_F_COMP_LOCKED)) {
1734 unsigned long flags;
1735
1736 spin_lock_irqsave(&ctx->completion_lock, flags);
1737 wake_ev = __io_kill_linked_timeout(req);
Pavel Begunkov7c86ffe2020-06-29 13:12:59 +03001738 spin_unlock_irqrestore(&ctx->completion_lock, flags);
Jens Axboeab0b6452020-06-30 08:43:15 -06001739 } else {
1740 wake_ev = __io_kill_linked_timeout(req);
1741 }
1742
Pavel Begunkov7c86ffe2020-06-29 13:12:59 +03001743 if (wake_ev)
1744 io_cqring_ev_posted(ctx);
1745}
1746
Pavel Begunkov9b5f7bd2020-06-29 13:13:00 +03001747static struct io_kiocb *io_req_link_next(struct io_kiocb *req)
Pavel Begunkov7c86ffe2020-06-29 13:12:59 +03001748{
1749 struct io_kiocb *nxt;
Jens Axboe4d7dd462019-11-20 13:03:52 -07001750
Jens Axboe9e645e112019-05-10 16:07:28 -06001751 /*
1752	 * The list should never be empty when we are called here. But it could
1753	 * potentially happen if the chain is messed up, so check to be on the
1754	 * safe side.
1755 */
Pavel Begunkov7c86ffe2020-06-29 13:12:59 +03001756 if (unlikely(list_empty(&req->link_list)))
Pavel Begunkov9b5f7bd2020-06-29 13:13:00 +03001757 return NULL;
Jens Axboe94ae5e72019-11-14 19:39:52 -07001758
Pavel Begunkov7c86ffe2020-06-29 13:12:59 +03001759 nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list);
1760 list_del_init(&req->link_list);
1761 if (!list_empty(&nxt->link_list))
1762 nxt->flags |= REQ_F_LINK_HEAD;
Pavel Begunkov9b5f7bd2020-06-29 13:13:00 +03001763 return nxt;
Jens Axboe9e645e112019-05-10 16:07:28 -06001764}
1765
1766/*
Pavel Begunkovdea3b492020-04-12 02:05:04 +03001767 * Called if REQ_F_LINK_HEAD is set, and we fail the head request
Jens Axboe9e645e112019-05-10 16:07:28 -06001768 */
Pavel Begunkov7c86ffe2020-06-29 13:12:59 +03001769static void __io_fail_links(struct io_kiocb *req)
Jens Axboe9e645e112019-05-10 16:07:28 -06001770{
Jens Axboe2665abf2019-11-05 12:40:47 -07001771 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe9e645e112019-05-10 16:07:28 -06001772
1773 while (!list_empty(&req->link_list)) {
Pavel Begunkov44932332019-12-05 16:16:35 +03001774 struct io_kiocb *link = list_first_entry(&req->link_list,
1775 struct io_kiocb, link_list);
Jens Axboe9e645e112019-05-10 16:07:28 -06001776
Pavel Begunkov44932332019-12-05 16:16:35 +03001777 list_del_init(&link->link_list);
Dmitrii Dolgovc826bd72019-10-15 19:02:01 +02001778 trace_io_uring_fail_link(req, link);
Jens Axboe2665abf2019-11-05 12:40:47 -07001779
Pavel Begunkov7c86ffe2020-06-29 13:12:59 +03001780 io_cqring_fill_event(link, -ECANCELED);
Jens Axboe9b7adba2020-08-10 10:54:02 -06001781 link->flags |= REQ_F_COMP_LOCKED;
Pavel Begunkov7c86ffe2020-06-29 13:12:59 +03001782 __io_double_put_req(link);
Jens Axboe5d960722019-11-19 15:31:28 -07001783 req->flags &= ~REQ_F_LINK_TIMEOUT;
Jens Axboe9e645e112019-05-10 16:07:28 -06001784 }
Jens Axboe2665abf2019-11-05 12:40:47 -07001785
1786 io_commit_cqring(ctx);
Jens Axboe2665abf2019-11-05 12:40:47 -07001787 io_cqring_ev_posted(ctx);
Jens Axboe9e645e112019-05-10 16:07:28 -06001788}
1789
Pavel Begunkov7c86ffe2020-06-29 13:12:59 +03001790static void io_fail_links(struct io_kiocb *req)
Jens Axboe9e645e112019-05-10 16:07:28 -06001791{
Pavel Begunkov7c86ffe2020-06-29 13:12:59 +03001792 struct io_ring_ctx *ctx = req->ctx;
1793
1794 if (!(req->flags & REQ_F_COMP_LOCKED)) {
1795 unsigned long flags;
1796
1797 spin_lock_irqsave(&ctx->completion_lock, flags);
1798 __io_fail_links(req);
1799 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1800 } else {
1801 __io_fail_links(req);
1802 }
1803
Jens Axboe9e645e112019-05-10 16:07:28 -06001804 io_cqring_ev_posted(ctx);
1805}
1806
Pavel Begunkov3fa5e0f2020-06-30 15:20:43 +03001807static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
Jens Axboe9e645e112019-05-10 16:07:28 -06001808{
Pavel Begunkov9b0d9112020-06-28 12:52:34 +03001809 req->flags &= ~REQ_F_LINK_HEAD;
Pavel Begunkov7c86ffe2020-06-29 13:12:59 +03001810 if (req->flags & REQ_F_LINK_TIMEOUT)
1811 io_kill_linked_timeout(req);
Jens Axboe2665abf2019-11-05 12:40:47 -07001812
Jens Axboe9e645e112019-05-10 16:07:28 -06001813 /*
1814 * If LINK is set, we have dependent requests in this chain. If we
1815 * didn't fail this request, queue the first one up, moving any other
1816 * dependencies to the next request. In case of failure, fail the rest
1817 * of the chain.
1818 */
Pavel Begunkov9b5f7bd2020-06-29 13:13:00 +03001819 if (likely(!(req->flags & REQ_F_FAIL_LINK)))
1820 return io_req_link_next(req);
1821 io_fail_links(req);
1822 return NULL;
Jens Axboe4d7dd462019-11-20 13:03:52 -07001823}
Jens Axboe2665abf2019-11-05 12:40:47 -07001824
Pavel Begunkov3fa5e0f2020-06-30 15:20:43 +03001825static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
1826{
1827 if (likely(!(req->flags & REQ_F_LINK_HEAD)))
1828 return NULL;
1829 return __io_req_find_next(req);
1830}
1831
Jens Axboefd7d6de2020-08-23 11:00:37 -06001832static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb,
1833 bool twa_signal_ok)
Jens Axboec2c4c832020-07-01 15:37:11 -06001834{
1835 struct task_struct *tsk = req->task;
1836 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe0ba9c9e2020-08-06 19:41:50 -06001837 int ret, notify;
Jens Axboec2c4c832020-07-01 15:37:11 -06001838
Jens Axboe6200b0a2020-09-13 14:38:30 -06001839 if (tsk->flags & PF_EXITING)
1840 return -ESRCH;
1841
Jens Axboec2c4c832020-07-01 15:37:11 -06001842 /*
Jens Axboe0ba9c9e2020-08-06 19:41:50 -06001843 * SQPOLL kernel thread doesn't need notification, just a wakeup. For
1844 * all other cases, use TWA_SIGNAL unconditionally to ensure we're
1845 * processing task_work. There's no reliable way to tell if TWA_RESUME
1846 * will do the job.
Jens Axboec2c4c832020-07-01 15:37:11 -06001847 */
Jens Axboe0ba9c9e2020-08-06 19:41:50 -06001848 notify = 0;
Jens Axboefd7d6de2020-08-23 11:00:37 -06001849 if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok)
Jens Axboec2c4c832020-07-01 15:37:11 -06001850 notify = TWA_SIGNAL;
1851
1852 ret = task_work_add(tsk, cb, notify);
1853 if (!ret)
1854 wake_up_process(tsk);
Jens Axboe0ba9c9e2020-08-06 19:41:50 -06001855
Jens Axboec2c4c832020-07-01 15:37:11 -06001856 return ret;
1857}
1858
Jens Axboec40f6372020-06-25 15:39:59 -06001859static void __io_req_task_cancel(struct io_kiocb *req, int error)
1860{
1861 struct io_ring_ctx *ctx = req->ctx;
1862
1863 spin_lock_irq(&ctx->completion_lock);
1864 io_cqring_fill_event(req, error);
1865 io_commit_cqring(ctx);
1866 spin_unlock_irq(&ctx->completion_lock);
1867
1868 io_cqring_ev_posted(ctx);
1869 req_set_fail_links(req);
1870 io_double_put_req(req);
1871}
1872
1873static void io_req_task_cancel(struct callback_head *cb)
1874{
1875 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
Jens Axboe87ceb6a2020-09-14 08:20:12 -06001876 struct io_ring_ctx *ctx = req->ctx;
Jens Axboec40f6372020-06-25 15:39:59 -06001877
1878 __io_req_task_cancel(req, -ECANCELED);
Jens Axboe87ceb6a2020-09-14 08:20:12 -06001879 percpu_ref_put(&ctx->refs);
Jens Axboec40f6372020-06-25 15:39:59 -06001880}
1881
1882static void __io_req_task_submit(struct io_kiocb *req)
1883{
1884 struct io_ring_ctx *ctx = req->ctx;
1885
Jens Axboec40f6372020-06-25 15:39:59 -06001886 if (!__io_sq_thread_acquire_mm(ctx)) {
1887 mutex_lock(&ctx->uring_lock);
1888 __io_queue_sqe(req, NULL, NULL);
1889 mutex_unlock(&ctx->uring_lock);
Jens Axboe2665abf2019-11-05 12:40:47 -07001890 } else {
Jens Axboec40f6372020-06-25 15:39:59 -06001891 __io_req_task_cancel(req, -EFAULT);
Jens Axboe2665abf2019-11-05 12:40:47 -07001892 }
Jens Axboe9e645e112019-05-10 16:07:28 -06001893}
1894
Jens Axboec40f6372020-06-25 15:39:59 -06001895static void io_req_task_submit(struct callback_head *cb)
1896{
1897 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
Jens Axboe6d816e02020-08-11 08:04:14 -06001898 struct io_ring_ctx *ctx = req->ctx;
Jens Axboec40f6372020-06-25 15:39:59 -06001899
1900 __io_req_task_submit(req);
Jens Axboe6d816e02020-08-11 08:04:14 -06001901 percpu_ref_put(&ctx->refs);
Jens Axboec40f6372020-06-25 15:39:59 -06001902}
1903
1904static void io_req_task_queue(struct io_kiocb *req)
1905{
Jens Axboec40f6372020-06-25 15:39:59 -06001906 int ret;
1907
1908 init_task_work(&req->task_work, io_req_task_submit);
Jens Axboe6d816e02020-08-11 08:04:14 -06001909 percpu_ref_get(&req->ctx->refs);
Jens Axboec40f6372020-06-25 15:39:59 -06001910
Jens Axboefd7d6de2020-08-23 11:00:37 -06001911 ret = io_req_task_work_add(req, &req->task_work, true);
Jens Axboec40f6372020-06-25 15:39:59 -06001912 if (unlikely(ret)) {
Jens Axboec2c4c832020-07-01 15:37:11 -06001913 struct task_struct *tsk;
1914
Jens Axboec40f6372020-06-25 15:39:59 -06001915 init_task_work(&req->task_work, io_req_task_cancel);
1916 tsk = io_wq_get_task(req->ctx->io_wq);
Jens Axboec2c4c832020-07-01 15:37:11 -06001917 task_work_add(tsk, &req->task_work, 0);
1918 wake_up_process(tsk);
Jens Axboec40f6372020-06-25 15:39:59 -06001919 }
Jens Axboec40f6372020-06-25 15:39:59 -06001920}
1921
Pavel Begunkovc3524382020-06-28 12:52:32 +03001922static void io_queue_next(struct io_kiocb *req)
Jackie Liuc69f8db2019-11-09 11:00:08 +08001923{
Pavel Begunkov9b5f7bd2020-06-29 13:13:00 +03001924 struct io_kiocb *nxt = io_req_find_next(req);
Pavel Begunkov944e58b2019-11-21 23:21:01 +03001925
Pavel Begunkov906a8c32020-06-27 14:04:55 +03001926 if (nxt)
1927 io_req_task_queue(nxt);
Jackie Liuc69f8db2019-11-09 11:00:08 +08001928}
1929
Jens Axboe9e645e112019-05-10 16:07:28 -06001930static void io_free_req(struct io_kiocb *req)
1931{
Pavel Begunkovc3524382020-06-28 12:52:32 +03001932 io_queue_next(req);
Jens Axboe9e645e112019-05-10 16:07:28 -06001933 __io_free_req(req);
Jens Axboee65ef562019-03-12 10:16:44 -06001934}
1935
Pavel Begunkov2d6500d2020-06-28 12:52:33 +03001936struct req_batch {
1937 void *reqs[IO_IOPOLL_BATCH];
1938 int to_free;
Pavel Begunkov5af1d132020-07-18 11:32:52 +03001939
1940 struct task_struct *task;
1941 int task_refs;
Pavel Begunkov2d6500d2020-06-28 12:52:33 +03001942};
1943
Pavel Begunkov5af1d132020-07-18 11:32:52 +03001944static inline void io_init_req_batch(struct req_batch *rb)
Pavel Begunkov7a743e22020-03-03 21:33:13 +03001945{
Pavel Begunkov5af1d132020-07-18 11:32:52 +03001946 rb->to_free = 0;
1947 rb->task_refs = 0;
1948 rb->task = NULL;
1949}
Pavel Begunkov8766dd52020-03-14 00:31:04 +03001950
Pavel Begunkov2d6500d2020-06-28 12:52:33 +03001951static void __io_req_free_batch_flush(struct io_ring_ctx *ctx,
1952 struct req_batch *rb)
1953{
1954 kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
1955 percpu_ref_put_many(&ctx->refs, rb->to_free);
1956 rb->to_free = 0;
1957}
Pavel Begunkov7a743e22020-03-03 21:33:13 +03001958
Pavel Begunkov2d6500d2020-06-28 12:52:33 +03001959static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
1960 struct req_batch *rb)
1961{
1962 if (rb->to_free)
1963 __io_req_free_batch_flush(ctx, rb);
Pavel Begunkov5af1d132020-07-18 11:32:52 +03001964 if (rb->task) {
Jens Axboe0f212202020-09-13 13:09:39 -06001965 atomic_long_add(rb->task_refs, &rb->task->io_uring->req_complete);
Pavel Begunkov5af1d132020-07-18 11:32:52 +03001966 put_task_struct_many(rb->task, rb->task_refs);
1967 rb->task = NULL;
1968 }
Pavel Begunkov2d6500d2020-06-28 12:52:33 +03001969}
1970
1971static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
1972{
1973 if (unlikely(io_is_fallback_req(req))) {
1974 io_free_req(req);
1975 return;
1976 }
1977 if (req->flags & REQ_F_LINK_HEAD)
1978 io_queue_next(req);
1979
Jens Axboee3bc8e92020-09-24 08:45:57 -06001980 if (req->task != rb->task) {
Jens Axboe0f212202020-09-13 13:09:39 -06001981 if (rb->task) {
1982 atomic_long_add(rb->task_refs, &rb->task->io_uring->req_complete);
Jens Axboee3bc8e92020-09-24 08:45:57 -06001983 put_task_struct_many(rb->task, rb->task_refs);
Jens Axboe0f212202020-09-13 13:09:39 -06001984 }
Jens Axboee3bc8e92020-09-24 08:45:57 -06001985 rb->task = req->task;
1986 rb->task_refs = 0;
Pavel Begunkov5af1d132020-07-18 11:32:52 +03001987 }
Jens Axboee3bc8e92020-09-24 08:45:57 -06001988 rb->task_refs++;
Pavel Begunkov5af1d132020-07-18 11:32:52 +03001989
Jens Axboe51a4cc12020-08-10 10:55:56 -06001990 WARN_ON_ONCE(io_dismantle_req(req));
Pavel Begunkov2d6500d2020-06-28 12:52:33 +03001991 rb->reqs[rb->to_free++] = req;
1992 if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
1993 __io_req_free_batch_flush(req->ctx, rb);
Pavel Begunkov7a743e22020-03-03 21:33:13 +03001994}
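
/*
 * Generic illustration (not kernel code; names are hypothetical) of the
 * batching done by io_req_free_batch()/io_req_free_batch_finish(): rather
 * than one atomic put per request, puts are accumulated per owner and applied
 * as a single atomic subtraction when the owner changes or the batch ends.
 */
#if 0	/* userspace example, would not build in kernel context */
#include <stdatomic.h>

struct owner { atomic_long refs; };

struct put_batch {
	struct owner *owner;
	long nr;			/* puts accumulated for ->owner */
};

static void put_batch_flush(struct put_batch *b)
{
	if (b->owner && b->nr)
		atomic_fetch_sub(&b->owner->refs, b->nr);
	b->owner = NULL;
	b->nr = 0;
}

static void put_batch_add(struct put_batch *b, struct owner *o)
{
	if (b->owner != o) {		/* owner changed: flush what we have */
		put_batch_flush(b);
		b->owner = o;
	}
	b->nr++;			/* one deferred put */
}
#endif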
1995
Jens Axboeba816ad2019-09-28 11:36:45 -06001996/*
1997 * Drop reference to request, return next in chain (if there is one) if this
1998 * was the last reference to this request.
1999 */
Pavel Begunkov9b5f7bd2020-06-29 13:13:00 +03002000static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
Jens Axboee65ef562019-03-12 10:16:44 -06002001{
Pavel Begunkov9b5f7bd2020-06-29 13:13:00 +03002002 struct io_kiocb *nxt = NULL;
2003
Jens Axboe2a44f462020-02-25 13:25:41 -07002004 if (refcount_dec_and_test(&req->refs)) {
Pavel Begunkov9b5f7bd2020-06-29 13:13:00 +03002005 nxt = io_req_find_next(req);
Jens Axboe4d7dd462019-11-20 13:03:52 -07002006 __io_free_req(req);
Jens Axboe2a44f462020-02-25 13:25:41 -07002007 }
Pavel Begunkov9b5f7bd2020-06-29 13:13:00 +03002008 return nxt;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002009}
2010
Jens Axboe2b188cc2019-01-07 10:46:33 -07002011static void io_put_req(struct io_kiocb *req)
2012{
Jens Axboedef596e2019-01-09 08:59:42 -07002013 if (refcount_dec_and_test(&req->refs))
2014 io_free_req(req);
2015}
2016
Pavel Begunkovf4db7182020-06-25 18:20:54 +03002017static struct io_wq_work *io_steal_work(struct io_kiocb *req)
Pavel Begunkov7a743e22020-03-03 21:33:13 +03002018{
Pavel Begunkov6df1db62020-07-03 22:15:06 +03002019 struct io_kiocb *nxt;
Pavel Begunkov7a743e22020-03-03 21:33:13 +03002020
Pavel Begunkovf4db7182020-06-25 18:20:54 +03002021 /*
2022	 * A ref is owned by io-wq, in whose context we are running. So if that's
2023	 * the last one, it's safe to steal the next work. False negatives are OK,
2024	 * it will just be re-punted async in io_put_work()
2025 */
2026 if (refcount_read(&req->refs) != 1)
2027 return NULL;
2028
Pavel Begunkov9b5f7bd2020-06-29 13:13:00 +03002029 nxt = io_req_find_next(req);
Pavel Begunkov6df1db62020-07-03 22:15:06 +03002030 return nxt ? &nxt->work : NULL;
Pavel Begunkov7a743e22020-03-03 21:33:13 +03002031}
2032
Jens Axboe978db572019-11-14 22:39:04 -07002033/*
2034 * Must only be used if we don't need to care about links, usually from
2035 * within the completion handling itself.
2036 */
2037static void __io_double_put_req(struct io_kiocb *req)
Jens Axboea3a0e432019-08-20 11:03:11 -06002038{
Jens Axboe78e19bb2019-11-06 15:21:34 -07002039 /* drop both submit and complete references */
2040 if (refcount_sub_and_test(2, &req->refs))
2041 __io_free_req(req);
2042}
2043
Jens Axboe978db572019-11-14 22:39:04 -07002044static void io_double_put_req(struct io_kiocb *req)
2045{
2046 /* drop both submit and complete references */
2047 if (refcount_sub_and_test(2, &req->refs))
2048 io_free_req(req);
2049}
2050
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07002051static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
Jens Axboea3a0e432019-08-20 11:03:11 -06002052{
Jens Axboe84f97dc2019-11-06 11:27:53 -07002053 struct io_rings *rings = ctx->rings;
2054
Jens Axboead3eb2c2019-12-18 17:12:20 -07002055 if (test_bit(0, &ctx->cq_check_overflow)) {
2056 /*
2057 * noflush == true is from the waitqueue handler, just ensure
2058 * we wake up the task, and the next invocation will flush the
2059	 * entries. We cannot safely do it from here.
2060 */
2061 if (noflush && !list_empty(&ctx->cq_overflow_list))
2062 return -1U;
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07002063
Jens Axboee6c8aa92020-09-28 13:10:13 -06002064 io_cqring_overflow_flush(ctx, false, NULL, NULL);
Jens Axboead3eb2c2019-12-18 17:12:20 -07002065 }
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07002066
Jens Axboea3a0e432019-08-20 11:03:11 -06002067 /* See comment at the top of this file */
2068 smp_rmb();
Jens Axboead3eb2c2019-12-18 17:12:20 -07002069 return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
Jens Axboea3a0e432019-08-20 11:03:11 -06002070}
2071
Pavel Begunkovfb5ccc92019-10-25 12:31:30 +03002072static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
2073{
2074 struct io_rings *rings = ctx->rings;
2075
2076 /* make sure SQ entry isn't read before tail */
2077 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
2078}
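
/*
 * Illustrative userspace-side sketch (not part of this file; names are
 * hypothetical): the producer side that the smp_load_acquire() above pairs
 * with. The SQE and the SQ index array are written first, and only then is
 * the new tail published with a release store, so the kernel never observes
 * a tail that points at an unwritten entry.
 */
#if 0	/* userspace example, would not build in kernel context */
#include <linux/io_uring.h>

static void submit_one_sqe(unsigned *ktail, unsigned *sq_array, unsigned mask,
			   struct io_uring_sqe *sqes,
			   const struct io_uring_sqe *src)
{
	unsigned tail = *ktail;		/* only this side writes the tail */
	unsigned idx = tail & mask;

	sqes[idx] = *src;		/* fill the entry ... */
	sq_array[idx] = idx;		/* ... and the index array ... */
	/* ... then publish; pairs with the smp_load_acquire() of sq.tail */
	__atomic_store_n(ktail, tail + 1, __ATOMIC_RELEASE);
}
#endif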
2079
Pavel Begunkov8ff069b2020-07-16 23:28:04 +03002080static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
Jens Axboee94f1412019-12-19 12:06:02 -07002081{
Pavel Begunkov8ff069b2020-07-16 23:28:04 +03002082 unsigned int cflags;
Jens Axboee94f1412019-12-19 12:06:02 -07002083
Jens Axboebcda7ba2020-02-23 16:42:51 -07002084 cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
2085 cflags |= IORING_CQE_F_BUFFER;
Pavel Begunkov0e1b6fe2020-07-16 23:28:02 +03002086 req->flags &= ~REQ_F_BUFFER_SELECTED;
Jens Axboebcda7ba2020-02-23 16:42:51 -07002087 kfree(kbuf);
2088 return cflags;
2089}
2090
Pavel Begunkov8ff069b2020-07-16 23:28:04 +03002091static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
2092{
2093 struct io_buffer *kbuf;
2094
2095 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2096 return io_put_kbuf(req, kbuf);
2097}
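
/*
 * Illustrative userspace-side sketch (not part of this file): decoding the
 * cflags built above. For buffer-select requests the CQE carries
 * IORING_CQE_F_BUFFER plus the consumed buffer id in the upper bits of
 * cqe->flags.
 */
#if 0	/* userspace example, would not build in kernel context */
#include <linux/io_uring.h>

static int cqe_buffer_id(const struct io_uring_cqe *cqe)
{
	if (!(cqe->flags & IORING_CQE_F_BUFFER))
		return -1;	/* no provided buffer was consumed */
	return cqe->flags >> IORING_CQE_BUFFER_SHIFT;
}
#endif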
2098
Jens Axboe4c6e2772020-07-01 11:29:10 -06002099static inline bool io_run_task_work(void)
2100{
Jens Axboe6200b0a2020-09-13 14:38:30 -06002101 /*
2102 * Not safe to run on exiting task, and the task_work handling will
2103 * not add work to such a task.
2104 */
2105 if (unlikely(current->flags & PF_EXITING))
2106 return false;
Jens Axboe4c6e2772020-07-01 11:29:10 -06002107 if (current->task_works) {
2108 __set_current_state(TASK_RUNNING);
2109 task_work_run();
2110 return true;
2111 }
2112
2113 return false;
2114}
2115
Xiaoguang Wangbbde0172020-06-16 02:06:38 +08002116static void io_iopoll_queue(struct list_head *again)
2117{
2118 struct io_kiocb *req;
2119
2120 do {
Pavel Begunkovd21ffe72020-07-13 23:37:10 +03002121 req = list_first_entry(again, struct io_kiocb, inflight_entry);
2122 list_del(&req->inflight_entry);
Pavel Begunkov81b68a52020-07-30 18:43:46 +03002123 __io_complete_rw(req, -EAGAIN, 0, NULL);
Xiaoguang Wangbbde0172020-06-16 02:06:38 +08002124 } while (!list_empty(again));
2125}
2126
Jens Axboedef596e2019-01-09 08:59:42 -07002127/*
2128 * Find and free completed poll iocbs
2129 */
2130static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
2131 struct list_head *done)
2132{
Jens Axboe8237e042019-12-28 10:48:22 -07002133 struct req_batch rb;
Jens Axboedef596e2019-01-09 08:59:42 -07002134 struct io_kiocb *req;
Xiaoguang Wangbbde0172020-06-16 02:06:38 +08002135 LIST_HEAD(again);
2136
2137 /* order with ->result store in io_complete_rw_iopoll() */
2138 smp_rmb();
Jens Axboedef596e2019-01-09 08:59:42 -07002139
Pavel Begunkov5af1d132020-07-18 11:32:52 +03002140 io_init_req_batch(&rb);
Jens Axboedef596e2019-01-09 08:59:42 -07002141 while (!list_empty(done)) {
Jens Axboebcda7ba2020-02-23 16:42:51 -07002142 int cflags = 0;
2143
Pavel Begunkovd21ffe72020-07-13 23:37:10 +03002144 req = list_first_entry(done, struct io_kiocb, inflight_entry);
Xiaoguang Wangbbde0172020-06-16 02:06:38 +08002145 if (READ_ONCE(req->result) == -EAGAIN) {
Jens Axboe56450c22020-08-26 18:58:26 -06002146 req->result = 0;
Xiaoguang Wangbbde0172020-06-16 02:06:38 +08002147 req->iopoll_completed = 0;
Pavel Begunkovd21ffe72020-07-13 23:37:10 +03002148 list_move_tail(&req->inflight_entry, &again);
Xiaoguang Wangbbde0172020-06-16 02:06:38 +08002149 continue;
2150 }
Pavel Begunkovd21ffe72020-07-13 23:37:10 +03002151 list_del(&req->inflight_entry);
Jens Axboedef596e2019-01-09 08:59:42 -07002152
Jens Axboebcda7ba2020-02-23 16:42:51 -07002153 if (req->flags & REQ_F_BUFFER_SELECTED)
Pavel Begunkov8ff069b2020-07-16 23:28:04 +03002154 cflags = io_put_rw_kbuf(req);
Jens Axboebcda7ba2020-02-23 16:42:51 -07002155
2156 __io_cqring_fill_event(req, req->result, cflags);
Jens Axboedef596e2019-01-09 08:59:42 -07002157 (*nr_events)++;
2158
Pavel Begunkovc3524382020-06-28 12:52:32 +03002159 if (refcount_dec_and_test(&req->refs))
Pavel Begunkov2d6500d2020-06-28 12:52:33 +03002160 io_req_free_batch(&rb, req);
Jens Axboedef596e2019-01-09 08:59:42 -07002161 }
Jens Axboedef596e2019-01-09 08:59:42 -07002162
Jens Axboe09bb8392019-03-13 12:39:28 -06002163 io_commit_cqring(ctx);
Xiaoguang Wang32b22442020-03-11 09:26:09 +08002164 if (ctx->flags & IORING_SETUP_SQPOLL)
2165 io_cqring_ev_posted(ctx);
Pavel Begunkov2d6500d2020-06-28 12:52:33 +03002166 io_req_free_batch_finish(ctx, &rb);
Jens Axboedef596e2019-01-09 08:59:42 -07002167
Xiaoguang Wangbbde0172020-06-16 02:06:38 +08002168 if (!list_empty(&again))
2169 io_iopoll_queue(&again);
Bijan Mottahedeh581f9812020-04-03 13:51:33 -07002170}
2171
Jens Axboedef596e2019-01-09 08:59:42 -07002172static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
2173 long min)
2174{
2175 struct io_kiocb *req, *tmp;
2176 LIST_HEAD(done);
2177 bool spin;
2178 int ret;
2179
2180 /*
2181 * Only spin for completions if we don't have multiple devices hanging
2182 * off our complete list, and we're under the requested amount.
2183 */
2184 spin = !ctx->poll_multi_file && *nr_events < min;
2185
2186 ret = 0;
Pavel Begunkovd21ffe72020-07-13 23:37:10 +03002187 list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
Jens Axboe9adbd452019-12-20 08:45:55 -07002188 struct kiocb *kiocb = &req->rw.kiocb;
Jens Axboedef596e2019-01-09 08:59:42 -07002189
2190 /*
Bijan Mottahedeh581f9812020-04-03 13:51:33 -07002191 * Move completed and retryable entries to our local lists.
2192 * If we find a request that requires polling, break out
2193 * and complete those lists first, if we have entries there.
Jens Axboedef596e2019-01-09 08:59:42 -07002194 */
Xiaoguang Wang65a65432020-06-11 23:39:36 +08002195 if (READ_ONCE(req->iopoll_completed)) {
Pavel Begunkovd21ffe72020-07-13 23:37:10 +03002196 list_move_tail(&req->inflight_entry, &done);
Jens Axboedef596e2019-01-09 08:59:42 -07002197 continue;
2198 }
2199 if (!list_empty(&done))
2200 break;
2201
2202 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
2203 if (ret < 0)
2204 break;
2205
Pavel Begunkov3aadc232020-07-06 17:59:29 +03002206 /* iopoll may have completed current req */
2207 if (READ_ONCE(req->iopoll_completed))
Pavel Begunkovd21ffe72020-07-13 23:37:10 +03002208 list_move_tail(&req->inflight_entry, &done);
Pavel Begunkov3aadc232020-07-06 17:59:29 +03002209
Jens Axboedef596e2019-01-09 08:59:42 -07002210 if (ret && spin)
2211 spin = false;
2212 ret = 0;
2213 }
2214
2215 if (!list_empty(&done))
2216 io_iopoll_complete(ctx, nr_events, &done);
2217
2218 return ret;
2219}
2220
2221/*
Brian Gianforcarod195a662019-12-13 03:09:50 -08002222 * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
Jens Axboedef596e2019-01-09 08:59:42 -07002223 * non-spinning poll check - we'll still enter the driver poll loop, but only
2224 * as a non-spinning completion check.
2225 */
2226static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
2227 long min)
2228{
Pavel Begunkov540e32a2020-07-13 23:37:09 +03002229 while (!list_empty(&ctx->iopoll_list) && !need_resched()) {
Jens Axboedef596e2019-01-09 08:59:42 -07002230 int ret;
2231
2232 ret = io_do_iopoll(ctx, nr_events, min);
2233 if (ret < 0)
2234 return ret;
Pavel Begunkoveba0a4d2020-07-06 17:59:30 +03002235 if (*nr_events >= min)
Jens Axboedef596e2019-01-09 08:59:42 -07002236 return 0;
2237 }
2238
2239 return 1;
2240}
2241
2242/*
2243 * We can't just wait for polled events to come to us, we have to actively
2244 * find and complete them.
2245 */
Pavel Begunkovb2edc0a2020-07-07 16:36:22 +03002246static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
Jens Axboedef596e2019-01-09 08:59:42 -07002247{
2248 if (!(ctx->flags & IORING_SETUP_IOPOLL))
2249 return;
2250
2251 mutex_lock(&ctx->uring_lock);
Pavel Begunkov540e32a2020-07-13 23:37:09 +03002252 while (!list_empty(&ctx->iopoll_list)) {
Jens Axboedef596e2019-01-09 08:59:42 -07002253 unsigned int nr_events = 0;
2254
Pavel Begunkovb2edc0a2020-07-07 16:36:22 +03002255 io_do_iopoll(ctx, &nr_events, 0);
Jens Axboe08f54392019-08-21 22:19:11 -06002256
Pavel Begunkovb2edc0a2020-07-07 16:36:22 +03002257		/* let it sleep and repeat later if it can't complete a request */
2258 if (nr_events == 0)
2259 break;
Jens Axboe08f54392019-08-21 22:19:11 -06002260 /*
2261 * Ensure we allow local-to-the-cpu processing to take place,
2262 * in this case we need to ensure that we reap all events.
Pavel Begunkov3fcee5a2020-07-06 17:59:31 +03002263		 * Also let task_work, etc. progress by releasing the mutex
Jens Axboe08f54392019-08-21 22:19:11 -06002264 */
Pavel Begunkov3fcee5a2020-07-06 17:59:31 +03002265 if (need_resched()) {
2266 mutex_unlock(&ctx->uring_lock);
2267 cond_resched();
2268 mutex_lock(&ctx->uring_lock);
2269 }
Jens Axboedef596e2019-01-09 08:59:42 -07002270 }
2271 mutex_unlock(&ctx->uring_lock);
2272}
2273
Pavel Begunkov7668b922020-07-07 16:36:21 +03002274static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
Jens Axboedef596e2019-01-09 08:59:42 -07002275{
Pavel Begunkov7668b922020-07-07 16:36:21 +03002276 unsigned int nr_events = 0;
Jens Axboe2b2ed972019-10-25 10:06:15 -06002277 int iters = 0, ret = 0;
Jens Axboedef596e2019-01-09 08:59:42 -07002278
Xiaoguang Wangc7849be2020-02-22 14:46:05 +08002279 /*
2280 * We disallow the app entering submit/complete with polling, but we
2281 * still need to lock the ring to prevent racing with polled issue
2282 * that got punted to a workqueue.
2283 */
2284 mutex_lock(&ctx->uring_lock);
Jens Axboedef596e2019-01-09 08:59:42 -07002285 do {
Jens Axboe500f9fb2019-08-19 12:15:59 -06002286 /*
Jens Axboea3a0e432019-08-20 11:03:11 -06002287 * Don't enter poll loop if we already have events pending.
2288 * If we do, we can potentially be spinning for commands that
2289 * already triggered a CQE (eg in error).
2290 */
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07002291 if (io_cqring_events(ctx, false))
Jens Axboea3a0e432019-08-20 11:03:11 -06002292 break;
2293
2294 /*
Jens Axboe500f9fb2019-08-19 12:15:59 -06002295 * If a submit got punted to a workqueue, we can have the
2296 * application entering polling for a command before it gets
2297 * issued. That app will hold the uring_lock for the duration
2298 * of the poll right here, so we need to take a breather every
2299 * now and then to ensure that the issue has a chance to add
2300 * the poll to the issued list. Otherwise we can spin here
2301 * forever, while the workqueue is stuck trying to acquire the
2302 * very same mutex.
2303 */
2304 if (!(++iters & 7)) {
2305 mutex_unlock(&ctx->uring_lock);
Jens Axboe4c6e2772020-07-01 11:29:10 -06002306 io_run_task_work();
Jens Axboe500f9fb2019-08-19 12:15:59 -06002307 mutex_lock(&ctx->uring_lock);
2308 }
2309
Pavel Begunkov7668b922020-07-07 16:36:21 +03002310 ret = io_iopoll_getevents(ctx, &nr_events, min);
Jens Axboedef596e2019-01-09 08:59:42 -07002311 if (ret <= 0)
2312 break;
2313 ret = 0;
Pavel Begunkov7668b922020-07-07 16:36:21 +03002314 } while (min && !nr_events && !need_resched());
Jens Axboedef596e2019-01-09 08:59:42 -07002315
Jens Axboe500f9fb2019-08-19 12:15:59 -06002316 mutex_unlock(&ctx->uring_lock);
Jens Axboedef596e2019-01-09 08:59:42 -07002317 return ret;
2318}
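
/*
 * Illustrative userspace-side sketch (not part of this file; the name is
 * hypothetical): with IORING_SETUP_IOPOLL there are no interrupt-driven
 * completions, so the application is expected to reap CQEs by entering the
 * kernel with IORING_ENTER_GETEVENTS, which funnels into io_iopoll_check()
 * above.
 */
#if 0	/* userspace example, would not build in kernel context */
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static long reap_iopoll_completions(int ring_fd, unsigned min_complete)
{
	/* nothing to submit; poll the driver until min_complete CQEs are there */
	return syscall(__NR_io_uring_enter, ring_fd, 0, min_complete,
		       IORING_ENTER_GETEVENTS, NULL, 0);
}
#endif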
2319
Jens Axboe491381ce2019-10-17 09:20:46 -06002320static void kiocb_end_write(struct io_kiocb *req)
Jens Axboe2b188cc2019-01-07 10:46:33 -07002321{
Jens Axboe491381ce2019-10-17 09:20:46 -06002322 /*
2323 * Tell lockdep we inherited freeze protection from submission
2324 * thread.
2325 */
2326 if (req->flags & REQ_F_ISREG) {
2327 struct inode *inode = file_inode(req->file);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002328
Jens Axboe491381ce2019-10-17 09:20:46 -06002329 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002330 }
Jens Axboe491381ce2019-10-17 09:20:46 -06002331 file_end_write(req->file);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002332}
2333
Jens Axboea1d7c392020-06-22 11:09:46 -06002334static void io_complete_rw_common(struct kiocb *kiocb, long res,
2335 struct io_comp_state *cs)
Jens Axboe2b188cc2019-01-07 10:46:33 -07002336{
Jens Axboe9adbd452019-12-20 08:45:55 -07002337 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
Jens Axboebcda7ba2020-02-23 16:42:51 -07002338 int cflags = 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002339
Jens Axboe491381ce2019-10-17 09:20:46 -06002340 if (kiocb->ki_flags & IOCB_WRITE)
2341 kiocb_end_write(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002342
Jens Axboe4e88d6e2019-12-07 20:59:47 -07002343 if (res != req->result)
2344 req_set_fail_links(req);
Jens Axboebcda7ba2020-02-23 16:42:51 -07002345 if (req->flags & REQ_F_BUFFER_SELECTED)
Pavel Begunkov8ff069b2020-07-16 23:28:04 +03002346 cflags = io_put_rw_kbuf(req);
Jens Axboea1d7c392020-06-22 11:09:46 -06002347 __io_req_complete(req, res, cflags, cs);
Jens Axboeba816ad2019-09-28 11:36:45 -06002348}
2349
Jens Axboeb63534c2020-06-04 11:28:00 -06002350#ifdef CONFIG_BLOCK
2351static bool io_resubmit_prep(struct io_kiocb *req, int error)
2352{
2353 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
2354 ssize_t ret = -ECANCELED;
2355 struct iov_iter iter;
2356 int rw;
2357
2358 if (error) {
2359 ret = error;
2360 goto end_req;
2361 }
2362
2363 switch (req->opcode) {
2364 case IORING_OP_READV:
2365 case IORING_OP_READ_FIXED:
2366 case IORING_OP_READ:
2367 rw = READ;
2368 break;
2369 case IORING_OP_WRITEV:
2370 case IORING_OP_WRITE_FIXED:
2371 case IORING_OP_WRITE:
2372 rw = WRITE;
2373 break;
2374 default:
2375 printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n",
2376 req->opcode);
2377 goto end_req;
2378 }
2379
Jens Axboee8c2bc12020-08-15 18:44:09 -07002380 if (!req->async_data) {
Jens Axboe8f3d7492020-09-14 09:28:14 -06002381 ret = io_import_iovec(rw, req, &iovec, &iter, false);
2382 if (ret < 0)
2383 goto end_req;
2384 ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
2385 if (!ret)
2386 return true;
2387 kfree(iovec);
2388 } else {
Jens Axboeb63534c2020-06-04 11:28:00 -06002389 return true;
Jens Axboe8f3d7492020-09-14 09:28:14 -06002390 }
Jens Axboeb63534c2020-06-04 11:28:00 -06002391end_req:
Jens Axboeb63534c2020-06-04 11:28:00 -06002392 req_set_fail_links(req);
Jens Axboee1e16092020-06-22 09:17:17 -06002393 io_req_complete(req, ret);
Jens Axboeb63534c2020-06-04 11:28:00 -06002394 return false;
2395}
Jens Axboeb63534c2020-06-04 11:28:00 -06002396#endif
2397
2398static bool io_rw_reissue(struct io_kiocb *req, long res)
2399{
2400#ifdef CONFIG_BLOCK
Jens Axboe355afae2020-09-02 09:30:31 -06002401 umode_t mode = file_inode(req->file)->i_mode;
Jens Axboeb63534c2020-06-04 11:28:00 -06002402 int ret;
2403
Jens Axboe355afae2020-09-02 09:30:31 -06002404 if (!S_ISBLK(mode) && !S_ISREG(mode))
2405 return false;
Jens Axboeb63534c2020-06-04 11:28:00 -06002406 if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
2407 return false;
2408
Jens Axboefdee9462020-08-27 16:46:24 -06002409 ret = io_sq_thread_acquire_mm(req->ctx, req);
Jens Axboe6d816e02020-08-11 08:04:14 -06002410
Jens Axboefdee9462020-08-27 16:46:24 -06002411 if (io_resubmit_prep(req, ret)) {
2412 refcount_inc(&req->refs);
2413 io_queue_async_work(req);
Jens Axboeb63534c2020-06-04 11:28:00 -06002414 return true;
Jens Axboefdee9462020-08-27 16:46:24 -06002415 }
2416
Jens Axboeb63534c2020-06-04 11:28:00 -06002417#endif
2418 return false;
2419}
2420
Jens Axboea1d7c392020-06-22 11:09:46 -06002421static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
2422 struct io_comp_state *cs)
2423{
2424 if (!io_rw_reissue(req, res))
2425 io_complete_rw_common(&req->rw.kiocb, res, cs);
Jens Axboeba816ad2019-09-28 11:36:45 -06002426}
2427
2428static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
2429{
Jens Axboe9adbd452019-12-20 08:45:55 -07002430 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
Jens Axboeba816ad2019-09-28 11:36:45 -06002431
Jens Axboea1d7c392020-06-22 11:09:46 -06002432 __io_complete_rw(req, res, res2, NULL);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002433}
2434
Jens Axboedef596e2019-01-09 08:59:42 -07002435static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
2436{
Jens Axboe9adbd452019-12-20 08:45:55 -07002437 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
Jens Axboedef596e2019-01-09 08:59:42 -07002438
Jens Axboe491381ce2019-10-17 09:20:46 -06002439 if (kiocb->ki_flags & IOCB_WRITE)
2440 kiocb_end_write(req);
Jens Axboedef596e2019-01-09 08:59:42 -07002441
Xiaoguang Wang2d7d6792020-06-16 02:06:37 +08002442 if (res != -EAGAIN && res != req->result)
Jens Axboe4e88d6e2019-12-07 20:59:47 -07002443 req_set_fail_links(req);
Xiaoguang Wangbbde0172020-06-16 02:06:38 +08002444
2445 WRITE_ONCE(req->result, res);
2446 /* order with io_poll_complete() checking ->result */
Pavel Begunkovcd664b02020-06-25 12:37:10 +03002447 smp_wmb();
2448 WRITE_ONCE(req->iopoll_completed, 1);
Jens Axboedef596e2019-01-09 08:59:42 -07002449}
2450
2451/*
2452 * After the iocb has been issued, it's safe to be found on the poll list.
2453 * Adding the kiocb to the list AFTER submission ensures that we don't
2454	 * find it from an io_iopoll_getevents() thread before the issuer is done
2455 * accessing the kiocb cookie.
2456 */
2457static void io_iopoll_req_issued(struct io_kiocb *req)
2458{
2459 struct io_ring_ctx *ctx = req->ctx;
2460
2461 /*
2462 * Track whether we have multiple files in our lists. This will impact
2463	 * how we do polling later on: we avoid spinning if the requests sit on
2464	 * potentially different devices.
2465 */
Pavel Begunkov540e32a2020-07-13 23:37:09 +03002466 if (list_empty(&ctx->iopoll_list)) {
Jens Axboedef596e2019-01-09 08:59:42 -07002467 ctx->poll_multi_file = false;
2468 } else if (!ctx->poll_multi_file) {
2469 struct io_kiocb *list_req;
2470
Pavel Begunkov540e32a2020-07-13 23:37:09 +03002471 list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
Pavel Begunkovd21ffe72020-07-13 23:37:10 +03002472 inflight_entry);
Jens Axboe9adbd452019-12-20 08:45:55 -07002473 if (list_req->file != req->file)
Jens Axboedef596e2019-01-09 08:59:42 -07002474 ctx->poll_multi_file = true;
2475 }
2476
2477 /*
2478 * For fast devices, IO may have already completed. If it has, add
2479 * it to the front so we find it first.
2480 */
Xiaoguang Wang65a65432020-06-11 23:39:36 +08002481 if (READ_ONCE(req->iopoll_completed))
Pavel Begunkovd21ffe72020-07-13 23:37:10 +03002482 list_add(&req->inflight_entry, &ctx->iopoll_list);
Jens Axboedef596e2019-01-09 08:59:42 -07002483 else
Pavel Begunkovd21ffe72020-07-13 23:37:10 +03002484 list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
Xiaoguang Wangbdcd3ea2020-02-25 22:12:08 +08002485
Jens Axboe534ca6d2020-09-02 13:52:19 -06002486 if ((ctx->flags & IORING_SETUP_SQPOLL) &&
2487 wq_has_sleeper(&ctx->sq_data->wait))
2488 wake_up(&ctx->sq_data->wait);
Jens Axboedef596e2019-01-09 08:59:42 -07002489}
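/*
 * Illustrative userspace sketch, not part of this file: creating a ring in
 * the IORING_SETUP_IOPOLL mode that the bookkeeping above serves. The raw
 * io_uring_setup() syscall is used directly and assumes a libc that exposes
 * __NR_io_uring_setup; liburing's io_uring_queue_init() wraps the same call.
 * Reads/writes submitted to such a ring must use IOCB_DIRECT (O_DIRECT
 * files), as enforced by io_prep_rw() further down.
 */
#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int example_setup_iopoll_ring(unsigned entries)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_IOPOLL;	/* completions are reaped by polling, not interrupts */
	return syscall(__NR_io_uring_setup, entries, &p);
}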
2490
Pavel Begunkov9f13c352020-05-17 14:13:41 +03002491static void __io_state_file_put(struct io_submit_state *state)
Jens Axboe9a56a232019-01-09 09:06:50 -07002492{
Pavel Begunkov06ef3602020-07-16 23:28:33 +03002493 if (state->has_refs)
2494 fput_many(state->file, state->has_refs);
Pavel Begunkov9f13c352020-05-17 14:13:41 +03002495 state->file = NULL;
2496}
2497
2498static inline void io_state_file_put(struct io_submit_state *state)
2499{
2500 if (state->file)
2501 __io_state_file_put(state);
Jens Axboe9a56a232019-01-09 09:06:50 -07002502}
2503
2504/*
2505 * Get as many references to a file as we have IOs left in this submission,
2506 * assuming most submissions are for one file, or at least that each file
2507 * has more than one submission.
2508 */
Pavel Begunkov8da11c12020-02-24 11:32:44 +03002509static struct file *__io_file_get(struct io_submit_state *state, int fd)
Jens Axboe9a56a232019-01-09 09:06:50 -07002510{
2511 if (!state)
2512 return fget(fd);
2513
2514 if (state->file) {
2515 if (state->fd == fd) {
Pavel Begunkov06ef3602020-07-16 23:28:33 +03002516 state->has_refs--;
Jens Axboe9a56a232019-01-09 09:06:50 -07002517 state->ios_left--;
2518 return state->file;
2519 }
Pavel Begunkov9f13c352020-05-17 14:13:41 +03002520 __io_state_file_put(state);
Jens Axboe9a56a232019-01-09 09:06:50 -07002521 }
2522 state->file = fget_many(fd, state->ios_left);
2523 if (!state->file)
2524 return NULL;
2525
2526 state->fd = fd;
Jens Axboe9a56a232019-01-09 09:06:50 -07002527 state->ios_left--;
Pavel Begunkov06ef3602020-07-16 23:28:33 +03002528 state->has_refs = state->ios_left;
Jens Axboe9a56a232019-01-09 09:06:50 -07002529 return state->file;
2530}
2531
Jens Axboe4503b762020-06-01 10:00:27 -06002532static bool io_bdev_nowait(struct block_device *bdev)
2533{
2534#ifdef CONFIG_BLOCK
2535 return !bdev || queue_is_mq(bdev_get_queue(bdev));
2536#else
2537 return true;
2538#endif
2539}
2540
Jens Axboe2b188cc2019-01-07 10:46:33 -07002541/*
2542 * If we tracked the file through the SCM inflight mechanism, we could support
2543 * any file. For now, just ensure that anything potentially problematic is done
2544 * inline.
2545 */
Jens Axboeaf197f52020-04-28 13:15:06 -06002546static bool io_file_supports_async(struct file *file, int rw)
Jens Axboe2b188cc2019-01-07 10:46:33 -07002547{
2548 umode_t mode = file_inode(file)->i_mode;
2549
Jens Axboe4503b762020-06-01 10:00:27 -06002550 if (S_ISBLK(mode)) {
2551 if (io_bdev_nowait(file->f_inode->i_bdev))
2552 return true;
2553 return false;
2554 }
2555 if (S_ISCHR(mode) || S_ISSOCK(mode))
Jens Axboe2b188cc2019-01-07 10:46:33 -07002556 return true;
Jens Axboe4503b762020-06-01 10:00:27 -06002557 if (S_ISREG(mode)) {
2558 if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
2559 file->f_op != &io_uring_fops)
2560 return true;
2561 return false;
2562 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07002563
Jens Axboec5b85622020-06-09 19:23:05 -06002564 /* any ->read/write should understand O_NONBLOCK */
2565 if (file->f_flags & O_NONBLOCK)
2566 return true;
2567
Jens Axboeaf197f52020-04-28 13:15:06 -06002568 if (!(file->f_mode & FMODE_NOWAIT))
2569 return false;
2570
2571 if (rw == READ)
2572 return file->f_op->read_iter != NULL;
2573
2574 return file->f_op->write_iter != NULL;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002575}
2576
Jens Axboe3529d8c2019-12-19 18:24:38 -07002577static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2578 bool force_nonblock)
Jens Axboe2b188cc2019-01-07 10:46:33 -07002579{
Jens Axboedef596e2019-01-09 08:59:42 -07002580 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe9adbd452019-12-20 08:45:55 -07002581 struct kiocb *kiocb = &req->rw.kiocb;
Jens Axboe09bb8392019-03-13 12:39:28 -06002582 unsigned ioprio;
2583 int ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002584
Jens Axboe491381ce2019-10-17 09:20:46 -06002585 if (S_ISREG(file_inode(req->file)->i_mode))
2586 req->flags |= REQ_F_ISREG;
2587
Jens Axboe2b188cc2019-01-07 10:46:33 -07002588 kiocb->ki_pos = READ_ONCE(sqe->off);
Jens Axboeba042912019-12-25 16:33:42 -07002589 if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
2590 req->flags |= REQ_F_CUR_POS;
2591 kiocb->ki_pos = req->file->f_pos;
2592 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07002593 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
Pavel Begunkov3e577dc2020-02-01 03:58:42 +03002594 kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
2595 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
2596 if (unlikely(ret))
2597 return ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002598
2599 ioprio = READ_ONCE(sqe->ioprio);
2600 if (ioprio) {
2601 ret = ioprio_check_cap(ioprio);
2602 if (ret)
Jens Axboe09bb8392019-03-13 12:39:28 -06002603 return ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002604
2605 kiocb->ki_ioprio = ioprio;
2606 } else
2607 kiocb->ki_ioprio = get_current_ioprio();
2608
Stefan Bühler8449eed2019-04-27 20:34:19 +02002609 /* don't allow async punt if RWF_NOWAIT was requested */
Jens Axboec5b85622020-06-09 19:23:05 -06002610 if (kiocb->ki_flags & IOCB_NOWAIT)
Stefan Bühler8449eed2019-04-27 20:34:19 +02002611 req->flags |= REQ_F_NOWAIT;
2612
2613 if (force_nonblock)
Jens Axboe2b188cc2019-01-07 10:46:33 -07002614 kiocb->ki_flags |= IOCB_NOWAIT;
Stefan Bühler8449eed2019-04-27 20:34:19 +02002615
Jens Axboedef596e2019-01-09 08:59:42 -07002616 if (ctx->flags & IORING_SETUP_IOPOLL) {
Jens Axboedef596e2019-01-09 08:59:42 -07002617 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
2618 !kiocb->ki_filp->f_op->iopoll)
Jens Axboe09bb8392019-03-13 12:39:28 -06002619 return -EOPNOTSUPP;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002620
Jens Axboedef596e2019-01-09 08:59:42 -07002621 kiocb->ki_flags |= IOCB_HIPRI;
2622 kiocb->ki_complete = io_complete_rw_iopoll;
Xiaoguang Wang65a65432020-06-11 23:39:36 +08002623 req->iopoll_completed = 0;
Jens Axboedef596e2019-01-09 08:59:42 -07002624 } else {
Jens Axboe09bb8392019-03-13 12:39:28 -06002625 if (kiocb->ki_flags & IOCB_HIPRI)
2626 return -EINVAL;
Jens Axboedef596e2019-01-09 08:59:42 -07002627 kiocb->ki_complete = io_complete_rw;
2628 }
Jens Axboe9adbd452019-12-20 08:45:55 -07002629
Jens Axboe3529d8c2019-12-19 18:24:38 -07002630 req->rw.addr = READ_ONCE(sqe->addr);
2631 req->rw.len = READ_ONCE(sqe->len);
Bijan Mottahedeh4f4eeba2020-05-19 14:52:49 -07002632 req->buf_index = READ_ONCE(sqe->buf_index);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002633 return 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002634}
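/*
 * Illustrative userspace sketch, not part of this file: a raw SQE for a
 * vectored read, showing the fields that io_prep_rw() above and the iovec
 * import helpers below consume. The "example_" name is made up for this
 * sketch; liburing's io_uring_prep_readv() does the same job.
 */
#include <linux/io_uring.h>
#include <string.h>
#include <sys/uio.h>

static void example_prep_readv(struct io_uring_sqe *sqe, int fd,
			       const struct iovec *iov, unsigned nr_segs,
			       unsigned long long off)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_READV;
	sqe->fd = fd;
	sqe->off = off;			/* -1 means "use and update f_pos" for non-stream files */
	sqe->addr = (unsigned long) iov;	/* rw.addr: the iovec array */
	sqe->len = nr_segs;			/* rw.len: number of iovecs */
	sqe->rw_flags = 0;			/* RWF_* flags, validated by kiocb_set_rw_flags() */
	sqe->ioprio = 0;			/* 0 means "inherit the current I/O priority" */
}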
2635
2636static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2637{
2638 switch (ret) {
2639 case -EIOCBQUEUED:
2640 break;
2641 case -ERESTARTSYS:
2642 case -ERESTARTNOINTR:
2643 case -ERESTARTNOHAND:
2644 case -ERESTART_RESTARTBLOCK:
2645 /*
2646 * We can't just restart the syscall, since previously
2647 * submitted sqes may already be in progress. Just fail this
2648 * IO with EINTR.
2649 */
2650 ret = -EINTR;
Gustavo A. R. Silvadf561f662020-08-23 17:36:59 -05002651 fallthrough;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002652 default:
2653 kiocb->ki_complete(kiocb, ret, 0);
2654 }
2655}
2656
Jens Axboea1d7c392020-06-22 11:09:46 -06002657static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
2658 struct io_comp_state *cs)
Jens Axboeba816ad2019-09-28 11:36:45 -06002659{
Jens Axboeba042912019-12-25 16:33:42 -07002660 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
Jens Axboee8c2bc12020-08-15 18:44:09 -07002661 struct io_async_rw *io = req->async_data;
Jens Axboeba042912019-12-25 16:33:42 -07002662
Jens Axboe227c0c92020-08-13 11:51:40 -06002663 /* add previously done IO, if any */
Jens Axboee8c2bc12020-08-15 18:44:09 -07002664 if (io && io->bytes_done > 0) {
Jens Axboe227c0c92020-08-13 11:51:40 -06002665 if (ret < 0)
Jens Axboee8c2bc12020-08-15 18:44:09 -07002666 ret = io->bytes_done;
Jens Axboe227c0c92020-08-13 11:51:40 -06002667 else
Jens Axboee8c2bc12020-08-15 18:44:09 -07002668 ret += io->bytes_done;
Jens Axboe227c0c92020-08-13 11:51:40 -06002669 }
2670
Jens Axboeba042912019-12-25 16:33:42 -07002671 if (req->flags & REQ_F_CUR_POS)
2672 req->file->f_pos = kiocb->ki_pos;
Pavel Begunkovbcaec082020-02-24 11:30:18 +03002673 if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
Jens Axboea1d7c392020-06-22 11:09:46 -06002674 __io_complete_rw(req, ret, 0, cs);
Jens Axboeba816ad2019-09-28 11:36:45 -06002675 else
2676 io_rw_done(kiocb, ret);
2677}
2678
Jens Axboe9adbd452019-12-20 08:45:55 -07002679static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
Pavel Begunkov7d009162019-11-25 23:14:40 +03002680 struct iov_iter *iter)
Jens Axboeedafcce2019-01-09 09:16:05 -07002681{
Jens Axboe9adbd452019-12-20 08:45:55 -07002682 struct io_ring_ctx *ctx = req->ctx;
2683 size_t len = req->rw.len;
Jens Axboeedafcce2019-01-09 09:16:05 -07002684 struct io_mapped_ubuf *imu;
Pavel Begunkov4be1c612020-09-06 00:45:48 +03002685 u16 index, buf_index = req->buf_index;
Jens Axboeedafcce2019-01-09 09:16:05 -07002686 size_t offset;
2687 u64 buf_addr;
2688
Jens Axboeedafcce2019-01-09 09:16:05 -07002689 if (unlikely(buf_index >= ctx->nr_user_bufs))
2690 return -EFAULT;
Jens Axboeedafcce2019-01-09 09:16:05 -07002691 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
2692 imu = &ctx->user_bufs[index];
Jens Axboe9adbd452019-12-20 08:45:55 -07002693 buf_addr = req->rw.addr;
Jens Axboeedafcce2019-01-09 09:16:05 -07002694
2695 /* overflow */
2696 if (buf_addr + len < buf_addr)
2697 return -EFAULT;
2698 /* not inside the mapped region */
2699 if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
2700 return -EFAULT;
2701
2702 /*
2703 * May not be the start of the buffer, set the size appropriately
2704 * and advance us to the beginning.
2705 */
2706 offset = buf_addr - imu->ubuf;
2707 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
Jens Axboebd11b3a2019-07-20 08:37:31 -06002708
2709 if (offset) {
2710 /*
2711 * Don't use iov_iter_advance() here, as it's really slow for
2712 * using the latter parts of a big fixed buffer - it iterates
2713 * over each segment manually. We can cheat a bit here, because
2714 * we know that:
2715 *
2716 * 1) it's a BVEC iter, we set it up
2717 * 2) all bvecs are PAGE_SIZE in size, except potentially the
2718 * first and last bvec
2719 *
2720 * So just find our index, and adjust the iterator afterwards.
2721 * If the offset is within the first bvec (or the whole first
2722 * bvec), just use iov_iter_advance(). This makes it easier
2723 * since we can just skip the first segment, which may not
2724 * be PAGE_SIZE aligned.
2725 */
2726 const struct bio_vec *bvec = imu->bvec;
2727
2728 if (offset <= bvec->bv_len) {
2729 iov_iter_advance(iter, offset);
2730 } else {
2731 unsigned long seg_skip;
2732
2733 /* skip first vec */
2734 offset -= bvec->bv_len;
2735 seg_skip = 1 + (offset >> PAGE_SHIFT);
2736
2737 iter->bvec = bvec + seg_skip;
2738 iter->nr_segs -= seg_skip;
Aleix Roca Nonell99c79f62019-08-15 14:03:22 +02002739 iter->count -= bvec->bv_len + offset;
Jens Axboebd11b3a2019-07-20 08:37:31 -06002740 iter->iov_offset = offset & ~PAGE_MASK;
Jens Axboebd11b3a2019-07-20 08:37:31 -06002741 }
2742 }
2743
Jens Axboe5e559562019-11-13 16:12:46 -07002744 return len;
Jens Axboeedafcce2019-01-09 09:16:05 -07002745}
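/*
 * Illustrative sketch, not part of this file: the seg_skip arithmetic from
 * the comment above as a standalone helper. It assumes 4 KiB pages and that
 * every bvec after the first is page sized, which is how registered buffers
 * are laid out; the "example_" name is made up for this sketch.
 */
static unsigned long example_seg_skip(unsigned long offset,
				      unsigned int first_bvec_len)
{
	/* the first bvec may be short, so account for it separately */
	offset -= first_bvec_len;
	/* one for the first bvec, plus one full page per remaining step */
	return 1 + (offset >> 12);	/* 12 == PAGE_SHIFT for 4 KiB pages */
}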
2746
Jens Axboebcda7ba2020-02-23 16:42:51 -07002747static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
2748{
2749 if (needs_lock)
2750 mutex_unlock(&ctx->uring_lock);
2751}
2752
2753static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
2754{
2755 /*
2756 * "Normal" inline submissions always hold the uring_lock, since we
2757 * grab it from the system call. Same is true for the SQPOLL offload.
2758 * The only exception is when we've detached the request and issue it
2759 * from an async worker thread, grab the lock for that case.
2760 */
2761 if (needs_lock)
2762 mutex_lock(&ctx->uring_lock);
2763}
2764
2765static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
2766 int bgid, struct io_buffer *kbuf,
2767 bool needs_lock)
2768{
2769 struct io_buffer *head;
2770
2771 if (req->flags & REQ_F_BUFFER_SELECTED)
2772 return kbuf;
2773
2774 io_ring_submit_lock(req->ctx, needs_lock);
2775
2776 lockdep_assert_held(&req->ctx->uring_lock);
2777
2778 head = idr_find(&req->ctx->io_buffer_idr, bgid);
2779 if (head) {
2780 if (!list_empty(&head->list)) {
2781 kbuf = list_last_entry(&head->list, struct io_buffer,
2782 list);
2783 list_del(&kbuf->list);
2784 } else {
2785 kbuf = head;
2786 idr_remove(&req->ctx->io_buffer_idr, bgid);
2787 }
2788 if (*len > kbuf->len)
2789 *len = kbuf->len;
2790 } else {
2791 kbuf = ERR_PTR(-ENOBUFS);
2792 }
2793
2794 io_ring_submit_unlock(req->ctx, needs_lock);
2795
2796 return kbuf;
2797}
2798
Jens Axboe4d954c22020-02-27 07:31:19 -07002799static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
2800 bool needs_lock)
2801{
2802 struct io_buffer *kbuf;
Bijan Mottahedeh4f4eeba2020-05-19 14:52:49 -07002803 u16 bgid;
Jens Axboe4d954c22020-02-27 07:31:19 -07002804
2805 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
Bijan Mottahedeh4f4eeba2020-05-19 14:52:49 -07002806 bgid = req->buf_index;
Jens Axboe4d954c22020-02-27 07:31:19 -07002807 kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
2808 if (IS_ERR(kbuf))
2809 return kbuf;
2810 req->rw.addr = (u64) (unsigned long) kbuf;
2811 req->flags |= REQ_F_BUFFER_SELECTED;
2812 return u64_to_user_ptr(kbuf->addr);
2813}
2814
2815#ifdef CONFIG_COMPAT
2816static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
2817 bool needs_lock)
2818{
2819 struct compat_iovec __user *uiov;
2820 compat_ssize_t clen;
2821 void __user *buf;
2822 ssize_t len;
2823
2824 uiov = u64_to_user_ptr(req->rw.addr);
2825 if (!access_ok(uiov, sizeof(*uiov)))
2826 return -EFAULT;
2827 if (__get_user(clen, &uiov->iov_len))
2828 return -EFAULT;
2829 if (clen < 0)
2830 return -EINVAL;
2831
2832 len = clen;
2833 buf = io_rw_buffer_select(req, &len, needs_lock);
2834 if (IS_ERR(buf))
2835 return PTR_ERR(buf);
2836 iov[0].iov_base = buf;
2837 iov[0].iov_len = (compat_size_t) len;
2838 return 0;
2839}
2840#endif
2841
2842static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2843 bool needs_lock)
2844{
2845 struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
2846 void __user *buf;
2847 ssize_t len;
2848
2849 if (copy_from_user(iov, uiov, sizeof(*uiov)))
2850 return -EFAULT;
2851
2852 len = iov[0].iov_len;
2853 if (len < 0)
2854 return -EINVAL;
2855 buf = io_rw_buffer_select(req, &len, needs_lock);
2856 if (IS_ERR(buf))
2857 return PTR_ERR(buf);
2858 iov[0].iov_base = buf;
2859 iov[0].iov_len = len;
2860 return 0;
2861}
2862
2863static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2864 bool needs_lock)
2865{
Jens Axboedddb3e22020-06-04 11:27:01 -06002866 if (req->flags & REQ_F_BUFFER_SELECTED) {
2867 struct io_buffer *kbuf;
2868
2869 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2870 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
2871 iov[0].iov_len = kbuf->len;
Jens Axboe4d954c22020-02-27 07:31:19 -07002872 return 0;
Jens Axboedddb3e22020-06-04 11:27:01 -06002873 }
Jens Axboe4d954c22020-02-27 07:31:19 -07002874 if (!req->rw.len)
2875 return 0;
2876 else if (req->rw.len > 1)
2877 return -EINVAL;
2878
2879#ifdef CONFIG_COMPAT
2880 if (req->ctx->compat)
2881 return io_compat_import(req, iov, needs_lock);
2882#endif
2883
2884 return __io_iov_buffer_select(req, iov, needs_lock);
2885}
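/*
 * Illustrative userspace sketch, not part of this file: filling a raw SQE
 * for a buffer-select read, matching the fields the helpers above consume.
 * The buffer the kernel ends up using is reported back in the completion
 * via IORING_CQE_F_BUFFER and IORING_CQE_BUFFER_SHIFT. The "example_"
 * names are made up for this sketch.
 */
#include <linux/io_uring.h>
#include <string.h>

static void example_prep_read_bufselect(struct io_uring_sqe *sqe, int fd,
					unsigned short bgid, unsigned max_len)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_READ;
	sqe->fd = fd;
	sqe->addr = 0;			/* the kernel picks the buffer */
	sqe->len = max_len;
	sqe->buf_group = bgid;		/* group filled via IORING_OP_PROVIDE_BUFFERS */
	sqe->flags = IOSQE_BUFFER_SELECT;
}

static unsigned example_cqe_buffer_id(const struct io_uring_cqe *cqe)
{
	/* only valid when IORING_CQE_F_BUFFER is set in cqe->flags */
	return cqe->flags >> IORING_CQE_BUFFER_SHIFT;
}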
2886
Jens Axboe8452fd02020-08-18 13:58:33 -07002887static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
2888 struct iovec **iovec, struct iov_iter *iter,
2889 bool needs_lock)
Jens Axboe2b188cc2019-01-07 10:46:33 -07002890{
Jens Axboe9adbd452019-12-20 08:45:55 -07002891 void __user *buf = u64_to_user_ptr(req->rw.addr);
2892 size_t sqe_len = req->rw.len;
Jens Axboe4d954c22020-02-27 07:31:19 -07002893 ssize_t ret;
Jens Axboeedafcce2019-01-09 09:16:05 -07002894 u8 opcode;
2895
Jens Axboed625c6e2019-12-17 19:53:05 -07002896 opcode = req->opcode;
Pavel Begunkov7d009162019-11-25 23:14:40 +03002897 if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
Jens Axboeedafcce2019-01-09 09:16:05 -07002898 *iovec = NULL;
Jens Axboe9adbd452019-12-20 08:45:55 -07002899 return io_import_fixed(req, rw, iter);
Jens Axboeedafcce2019-01-09 09:16:05 -07002900 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07002901
Jens Axboebcda7ba2020-02-23 16:42:51 -07002902 /* buffer index only valid with fixed read/write, or buffer select */
Bijan Mottahedeh4f4eeba2020-05-19 14:52:49 -07002903 if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
Jens Axboe9adbd452019-12-20 08:45:55 -07002904 return -EINVAL;
2905
Jens Axboe3a6820f2019-12-22 15:19:35 -07002906 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
Jens Axboebcda7ba2020-02-23 16:42:51 -07002907 if (req->flags & REQ_F_BUFFER_SELECT) {
Jens Axboe4d954c22020-02-27 07:31:19 -07002908 buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
Pavel Begunkov867a23e2020-08-20 11:34:39 +03002909 if (IS_ERR(buf))
Jens Axboe4d954c22020-02-27 07:31:19 -07002910 return PTR_ERR(buf);
Jens Axboe3f9d6442020-03-11 12:27:04 -06002911 req->rw.len = sqe_len;
Jens Axboebcda7ba2020-02-23 16:42:51 -07002912 }
2913
Jens Axboe3a6820f2019-12-22 15:19:35 -07002914 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
2915 *iovec = NULL;
Jens Axboe3a901592020-02-25 17:48:55 -07002916 return ret < 0 ? ret : sqe_len;
Jens Axboe3a6820f2019-12-22 15:19:35 -07002917 }
2918
Jens Axboe4d954c22020-02-27 07:31:19 -07002919 if (req->flags & REQ_F_BUFFER_SELECT) {
2920 ret = io_iov_buffer_select(req, *iovec, needs_lock);
Jens Axboe3f9d6442020-03-11 12:27:04 -06002921 if (!ret) {
2922 ret = (*iovec)->iov_len;
2923 iov_iter_init(iter, rw, *iovec, 1, ret);
2924 }
Jens Axboe4d954c22020-02-27 07:31:19 -07002925 *iovec = NULL;
2926 return ret;
2927 }
2928
Jens Axboe2b188cc2019-01-07 10:46:33 -07002929#ifdef CONFIG_COMPAT
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03002930 if (req->ctx->compat)
Jens Axboe2b188cc2019-01-07 10:46:33 -07002931 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
2932 iovec, iter);
2933#endif
2934
2935 return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
2936}
2937
Jens Axboe8452fd02020-08-18 13:58:33 -07002938static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
2939 struct iovec **iovec, struct iov_iter *iter,
2940 bool needs_lock)
2941{
Jens Axboee8c2bc12020-08-15 18:44:09 -07002942 struct io_async_rw *iorw = req->async_data;
2943
2944 if (!iorw)
Jens Axboe8452fd02020-08-18 13:58:33 -07002945 return __io_import_iovec(rw, req, iovec, iter, needs_lock);
2946 *iovec = NULL;
Jens Axboee8c2bc12020-08-15 18:44:09 -07002947 return iov_iter_count(&iorw->iter);
Jens Axboe8452fd02020-08-18 13:58:33 -07002948}
2949
Jens Axboe0fef9482020-08-26 10:36:20 -06002950static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
2951{
2952 return kiocb->ki_filp->f_mode & FMODE_STREAM ? NULL : &kiocb->ki_pos;
2953}
2954
Jens Axboe32960612019-09-23 11:05:34 -06002955/*
2956 * For files that don't have ->read_iter() and ->write_iter(), handle them
2957 * by looping over ->read() or ->write() manually.
2958 */
2959static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
2960 struct iov_iter *iter)
2961{
2962 ssize_t ret = 0;
2963
2964 /*
2965 * Don't support polled IO through this interface, and we can't
2966 * support non-blocking either. For the latter, this just causes
2967 * the kiocb to be handled from an async context.
2968 */
2969 if (kiocb->ki_flags & IOCB_HIPRI)
2970 return -EOPNOTSUPP;
2971 if (kiocb->ki_flags & IOCB_NOWAIT)
2972 return -EAGAIN;
2973
2974 while (iov_iter_count(iter)) {
Pavel Begunkov311ae9e2019-11-24 11:58:24 +03002975 struct iovec iovec;
Jens Axboe32960612019-09-23 11:05:34 -06002976 ssize_t nr;
2977
Pavel Begunkov311ae9e2019-11-24 11:58:24 +03002978 if (!iov_iter_is_bvec(iter)) {
2979 iovec = iov_iter_iovec(iter);
2980 } else {
2981 /* fixed buffers import bvec */
2982 iovec.iov_base = kmap(iter->bvec->bv_page)
2983 + iter->iov_offset;
2984 iovec.iov_len = min(iter->count,
2985 iter->bvec->bv_len - iter->iov_offset);
2986 }
2987
Jens Axboe32960612019-09-23 11:05:34 -06002988 if (rw == READ) {
2989 nr = file->f_op->read(file, iovec.iov_base,
Jens Axboe0fef9482020-08-26 10:36:20 -06002990 iovec.iov_len, io_kiocb_ppos(kiocb));
Jens Axboe32960612019-09-23 11:05:34 -06002991 } else {
2992 nr = file->f_op->write(file, iovec.iov_base,
Jens Axboe0fef9482020-08-26 10:36:20 -06002993 iovec.iov_len, io_kiocb_ppos(kiocb));
Jens Axboe32960612019-09-23 11:05:34 -06002994 }
2995
Pavel Begunkov311ae9e2019-11-24 11:58:24 +03002996 if (iov_iter_is_bvec(iter))
2997 kunmap(iter->bvec->bv_page);
2998
Jens Axboe32960612019-09-23 11:05:34 -06002999 if (nr < 0) {
3000 if (!ret)
3001 ret = nr;
3002 break;
3003 }
3004 ret += nr;
3005 if (nr != iovec.iov_len)
3006 break;
3007 iov_iter_advance(iter, nr);
3008 }
3009
3010 return ret;
3011}
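/*
 * Illustrative userspace analogue, not part of this file: emulating a
 * vectored read with repeated read() calls, stopping on the first short
 * transfer just as loop_rw_iter() above does for files without
 * ->read_iter()/->write_iter().
 */
#include <sys/uio.h>
#include <unistd.h>

static ssize_t example_loop_read(int fd, const struct iovec *iov, int nr_segs)
{
	ssize_t total = 0;
	int i;

	for (i = 0; i < nr_segs; i++) {
		ssize_t nr = read(fd, iov[i].iov_base, iov[i].iov_len);

		if (nr < 0)
			return total ? total : -1;
		total += nr;
		if ((size_t)nr != iov[i].iov_len)
			break;	/* short read: stop, like the kernel loop */
	}
	return total;
}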
3012
Jens Axboeff6165b2020-08-13 09:47:43 -06003013static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
3014 const struct iovec *fast_iov, struct iov_iter *iter)
Jens Axboef67676d2019-12-02 11:03:47 -07003015{
Jens Axboee8c2bc12020-08-15 18:44:09 -07003016 struct io_async_rw *rw = req->async_data;
Pavel Begunkovb64e3442020-07-13 22:59:18 +03003017
Jens Axboeff6165b2020-08-13 09:47:43 -06003018 memcpy(&rw->iter, iter, sizeof(*iter));
Pavel Begunkovafb87652020-09-06 00:45:46 +03003019 rw->free_iovec = iovec;
Jens Axboe227c0c92020-08-13 11:51:40 -06003020 rw->bytes_done = 0;
Jens Axboeff6165b2020-08-13 09:47:43 -06003021 /* can only be fixed buffers, no need to do anything */
3022 if (iter->type == ITER_BVEC)
3023 return;
Pavel Begunkovb64e3442020-07-13 22:59:18 +03003024 if (!iovec) {
Jens Axboeff6165b2020-08-13 09:47:43 -06003025 unsigned iov_off = 0;
3026
3027 rw->iter.iov = rw->fast_iov;
3028 if (iter->iov != fast_iov) {
3029 iov_off = iter->iov - fast_iov;
3030 rw->iter.iov += iov_off;
3031 }
3032 if (rw->fast_iov != fast_iov)
3033 memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
Xiaoguang Wang45097da2020-04-08 22:29:58 +08003034 sizeof(struct iovec) * iter->nr_segs);
Pavel Begunkov99bc4c32020-02-07 22:04:45 +03003035 } else {
3036 req->flags |= REQ_F_NEED_CLEANUP;
Jens Axboef67676d2019-12-02 11:03:47 -07003037 }
3038}
3039
Jens Axboee8c2bc12020-08-15 18:44:09 -07003040static inline int __io_alloc_async_data(struct io_kiocb *req)
Xiaoguang Wang3d9932a2020-03-27 15:36:52 +08003041{
Jens Axboee8c2bc12020-08-15 18:44:09 -07003042 WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3043 req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
3044 return req->async_data == NULL;
Xiaoguang Wang3d9932a2020-03-27 15:36:52 +08003045}
3046
Jens Axboee8c2bc12020-08-15 18:44:09 -07003047static int io_alloc_async_data(struct io_kiocb *req)
Jens Axboef67676d2019-12-02 11:03:47 -07003048{
Jens Axboee8c2bc12020-08-15 18:44:09 -07003049 if (!io_op_defs[req->opcode].needs_async_data)
Jens Axboed3656342019-12-18 09:50:26 -07003050 return 0;
Xiaoguang Wang3d9932a2020-03-27 15:36:52 +08003051
Jens Axboee8c2bc12020-08-15 18:44:09 -07003052 return __io_alloc_async_data(req);
Jens Axboeb7bb4f72019-12-15 22:13:43 -07003053}
3054
Jens Axboeff6165b2020-08-13 09:47:43 -06003055static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
3056 const struct iovec *fast_iov,
Jens Axboe227c0c92020-08-13 11:51:40 -06003057 struct iov_iter *iter, bool force)
Jens Axboeb7bb4f72019-12-15 22:13:43 -07003058{
Jens Axboee8c2bc12020-08-15 18:44:09 -07003059 if (!force && !io_op_defs[req->opcode].needs_async_data)
Jens Axboe74566df2020-01-13 19:23:24 -07003060 return 0;
Jens Axboee8c2bc12020-08-15 18:44:09 -07003061 if (!req->async_data) {
3062 if (__io_alloc_async_data(req))
Jens Axboe5d204bc2020-01-31 12:06:52 -07003063 return -ENOMEM;
Jens Axboeb7bb4f72019-12-15 22:13:43 -07003064
Jens Axboeff6165b2020-08-13 09:47:43 -06003065 io_req_map_rw(req, iovec, fast_iov, iter);
Jens Axboe5d204bc2020-01-31 12:06:52 -07003066 }
Jens Axboeb7bb4f72019-12-15 22:13:43 -07003067 return 0;
Jens Axboef67676d2019-12-02 11:03:47 -07003068}
3069
Pavel Begunkovc3e330a2020-07-13 22:59:19 +03003070static inline int io_rw_prep_async(struct io_kiocb *req, int rw,
3071 bool force_nonblock)
3072{
Jens Axboee8c2bc12020-08-15 18:44:09 -07003073 struct io_async_rw *iorw = req->async_data;
Pavel Begunkovf4bff102020-09-06 00:45:45 +03003074 struct iovec *iov = iorw->fast_iov;
Pavel Begunkovc3e330a2020-07-13 22:59:19 +03003075 ssize_t ret;
3076
Jens Axboec183edf2020-09-04 22:36:52 -06003077 ret = __io_import_iovec(rw, req, &iov, &iorw->iter, !force_nonblock);
Pavel Begunkovc3e330a2020-07-13 22:59:19 +03003078 if (unlikely(ret < 0))
3079 return ret;
3080
Pavel Begunkovab0b1962020-09-06 00:45:47 +03003081 iorw->bytes_done = 0;
3082 iorw->free_iovec = iov;
3083 if (iov)
3084 req->flags |= REQ_F_NEED_CLEANUP;
Pavel Begunkovc3e330a2020-07-13 22:59:19 +03003085 return 0;
3086}
3087
Jens Axboe3529d8c2019-12-19 18:24:38 -07003088static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
3089 bool force_nonblock)
Jens Axboef67676d2019-12-02 11:03:47 -07003090{
3091 ssize_t ret;
3092
Jens Axboe3529d8c2019-12-19 18:24:38 -07003093 ret = io_prep_rw(req, sqe, force_nonblock);
3094 if (ret)
3095 return ret;
Jens Axboef67676d2019-12-02 11:03:47 -07003096
Jens Axboe3529d8c2019-12-19 18:24:38 -07003097 if (unlikely(!(req->file->f_mode & FMODE_READ)))
3098 return -EBADF;
Jens Axboef67676d2019-12-02 11:03:47 -07003099
Pavel Begunkov5f798be2020-02-08 13:28:02 +03003100 /* either don't need iovec imported or already have it */
Jens Axboee8c2bc12020-08-15 18:44:09 -07003101 if (!req->async_data || req->flags & REQ_F_NEED_CLEANUP)
Jens Axboe3529d8c2019-12-19 18:24:38 -07003102 return 0;
Pavel Begunkovc3e330a2020-07-13 22:59:19 +03003103 return io_rw_prep_async(req, READ, force_nonblock);
Jens Axboef67676d2019-12-02 11:03:47 -07003104}
3105
Jens Axboec1dd91d2020-08-03 16:43:59 -06003106/*
3107 * This is our waitqueue callback handler, registered through lock_page_async()
3108 * when we initially tried to do the IO with the iocb that armed our waitqueue.
3109 * This gets called when the page is unlocked, and we generally expect that to
3110 * happen when the page IO is completed and the page is now uptodate. This will
3111 * queue a task_work based retry of the operation, attempting to copy the data
3112 * again. If the latter fails because the page was NOT uptodate, then we will
3113 * do a thread based blocking retry of the operation. That's the unexpected
3114 * slow path.
3115 */
Jens Axboebcf5a062020-05-22 09:24:42 -06003116static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3117 int sync, void *arg)
3118{
3119 struct wait_page_queue *wpq;
3120 struct io_kiocb *req = wait->private;
Jens Axboebcf5a062020-05-22 09:24:42 -06003121 struct wait_page_key *key = arg;
Jens Axboebcf5a062020-05-22 09:24:42 -06003122 int ret;
3123
3124 wpq = container_of(wait, struct wait_page_queue, wait);
3125
Linus Torvaldscdc8fcb2020-08-03 13:01:22 -07003126 if (!wake_page_match(wpq, key))
3127 return 0;
3128
Hao Xuc8d317a2020-09-29 20:00:45 +08003129 req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
Jens Axboebcf5a062020-05-22 09:24:42 -06003130 list_del_init(&wait->entry);
3131
Pavel Begunkove7375122020-07-12 20:42:04 +03003132 init_task_work(&req->task_work, io_req_task_submit);
Jens Axboe6d816e02020-08-11 08:04:14 -06003133 percpu_ref_get(&req->ctx->refs);
3134
Jens Axboebcf5a062020-05-22 09:24:42 -06003135 /* submit ref gets dropped, acquire a new one */
3136 refcount_inc(&req->refs);
Jens Axboefd7d6de2020-08-23 11:00:37 -06003137 ret = io_req_task_work_add(req, &req->task_work, true);
Jens Axboebcf5a062020-05-22 09:24:42 -06003138 if (unlikely(ret)) {
Jens Axboec2c4c832020-07-01 15:37:11 -06003139 struct task_struct *tsk;
3140
Jens Axboebcf5a062020-05-22 09:24:42 -06003141 /* queue just for cancelation */
Pavel Begunkove7375122020-07-12 20:42:04 +03003142 init_task_work(&req->task_work, io_req_task_cancel);
Jens Axboebcf5a062020-05-22 09:24:42 -06003143 tsk = io_wq_get_task(req->ctx->io_wq);
Pavel Begunkove7375122020-07-12 20:42:04 +03003144 task_work_add(tsk, &req->task_work, 0);
Jens Axboec2c4c832020-07-01 15:37:11 -06003145 wake_up_process(tsk);
Jens Axboebcf5a062020-05-22 09:24:42 -06003146 }
Jens Axboebcf5a062020-05-22 09:24:42 -06003147 return 1;
3148}
3149
Jens Axboec1dd91d2020-08-03 16:43:59 -06003150/*
3151 * This controls whether a given IO request should be armed for async page
3152 * based retry. If we return false here, the request is handed to the async
3153 * worker threads for retry. If we're doing buffered reads on a regular file,
3154 * we prepare a private wait_page_queue entry and retry the operation. This
3155 * will either succeed because the page is now uptodate and unlocked, or it
3156 * will register a callback when the page is unlocked at IO completion. Through
3157 * that callback, io_uring uses task_work to setup a retry of the operation.
3158 * That retry will attempt the buffered read again. The retry will generally
3159 * succeed, or in rare cases where it fails, we then fall back to using the
3160 * async worker threads for a blocking retry.
3161 */
Jens Axboe227c0c92020-08-13 11:51:40 -06003162static bool io_rw_should_retry(struct io_kiocb *req)
Jens Axboebcf5a062020-05-22 09:24:42 -06003163{
Jens Axboee8c2bc12020-08-15 18:44:09 -07003164 struct io_async_rw *rw = req->async_data;
3165 struct wait_page_queue *wait = &rw->wpq;
Jens Axboebcf5a062020-05-22 09:24:42 -06003166 struct kiocb *kiocb = &req->rw.kiocb;
Jens Axboebcf5a062020-05-22 09:24:42 -06003167
3168 /* never retry for NOWAIT, we just complete with -EAGAIN */
3169 if (req->flags & REQ_F_NOWAIT)
3170 return false;
3171
Jens Axboe227c0c92020-08-13 11:51:40 -06003172 /* Only for buffered IO */
Jens Axboe3b2a4432020-08-16 10:58:43 -07003173 if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
Jens Axboebcf5a062020-05-22 09:24:42 -06003174 return false;
Jens Axboe3b2a4432020-08-16 10:58:43 -07003175
Jens Axboebcf5a062020-05-22 09:24:42 -06003176 /*
3177 * just use poll if we can, and don't attempt if the fs doesn't
3178 * support callback based unlocks
3179 */
3180 if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3181 return false;
3182
Jens Axboe3b2a4432020-08-16 10:58:43 -07003183 wait->wait.func = io_async_buf_func;
3184 wait->wait.private = req;
3185 wait->wait.flags = 0;
3186 INIT_LIST_HEAD(&wait->wait.entry);
3187 kiocb->ki_flags |= IOCB_WAITQ;
Hao Xuc8d317a2020-09-29 20:00:45 +08003188 kiocb->ki_flags &= ~IOCB_NOWAIT;
Jens Axboe3b2a4432020-08-16 10:58:43 -07003189 kiocb->ki_waitq = wait;
Jens Axboe3b2a4432020-08-16 10:58:43 -07003190 return true;
Jens Axboebcf5a062020-05-22 09:24:42 -06003191}
3192
3193static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
3194{
3195 if (req->file->f_op->read_iter)
3196 return call_read_iter(req->file, &req->rw.kiocb, iter);
Guoyu Huang2dd21112020-08-05 03:53:50 -07003197 else if (req->file->f_op->read)
3198 return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter);
3199 else
3200 return -EINVAL;
Jens Axboebcf5a062020-05-22 09:24:42 -06003201}
3202
Jens Axboea1d7c392020-06-22 11:09:46 -06003203static int io_read(struct io_kiocb *req, bool force_nonblock,
3204 struct io_comp_state *cs)
Jens Axboe2b188cc2019-01-07 10:46:33 -07003205{
3206 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
Jens Axboe9adbd452019-12-20 08:45:55 -07003207 struct kiocb *kiocb = &req->rw.kiocb;
Jens Axboeff6165b2020-08-13 09:47:43 -06003208 struct iov_iter __iter, *iter = &__iter;
Jens Axboee8c2bc12020-08-15 18:44:09 -07003209 struct io_async_rw *rw = req->async_data;
Jens Axboe227c0c92020-08-13 11:51:40 -06003210 ssize_t io_size, ret, ret2;
Jens Axboe31b51512019-01-18 22:56:34 -07003211 size_t iov_count;
Jens Axboef5cac8b2020-09-14 09:30:38 -06003212 bool no_async;
Jens Axboe2b188cc2019-01-07 10:46:33 -07003213
Jens Axboee8c2bc12020-08-15 18:44:09 -07003214 if (rw)
3215 iter = &rw->iter;
Jens Axboeff6165b2020-08-13 09:47:43 -06003216
3217 ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
Jens Axboe06b76d42019-12-19 14:44:26 -07003218 if (ret < 0)
3219 return ret;
Jens Axboeeefdf302020-08-27 16:40:19 -06003220 iov_count = iov_iter_count(iter);
Pavel Begunkovfa15baf2020-08-01 13:50:02 +03003221 io_size = ret;
3222 req->result = io_size;
Jens Axboe227c0c92020-08-13 11:51:40 -06003223 ret = 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07003224
Jens Axboefd6c2e42019-12-18 12:19:41 -07003225 /* Ensure we clear previously set non-block flag */
3226 if (!force_nonblock)
Jens Axboe29de5f62020-02-20 09:56:08 -07003227 kiocb->ki_flags &= ~IOCB_NOWAIT;
Jens Axboefd6c2e42019-12-18 12:19:41 -07003228
Pavel Begunkov24c74672020-06-21 13:09:51 +03003229 /* If the file doesn't support async, just async punt */
Jens Axboef5cac8b2020-09-14 09:30:38 -06003230 no_async = force_nonblock && !io_file_supports_async(req->file, READ);
3231 if (no_async)
Jens Axboef67676d2019-12-02 11:03:47 -07003232 goto copy_iov;
Jens Axboe9e645e112019-05-10 16:07:28 -06003233
Jens Axboe0fef9482020-08-26 10:36:20 -06003234 ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count);
Pavel Begunkovfa15baf2020-08-01 13:50:02 +03003235 if (unlikely(ret))
3236 goto out_free;
Jens Axboe2b188cc2019-01-07 10:46:33 -07003237
Jens Axboe227c0c92020-08-13 11:51:40 -06003238 ret = io_iter_do_read(req, iter);
Jens Axboe32960612019-09-23 11:05:34 -06003239
Jens Axboe227c0c92020-08-13 11:51:40 -06003240 if (!ret) {
3241 goto done;
3242 } else if (ret == -EIOCBQUEUED) {
3243 ret = 0;
3244 goto out_free;
3245 } else if (ret == -EAGAIN) {
Jens Axboeeefdf302020-08-27 16:40:19 -06003246 /* IOPOLL retry should happen for io-wq threads */
3247 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
Jens Axboef91daf52020-08-15 15:58:42 -07003248 goto done;
Jens Axboe355afae2020-09-02 09:30:31 -06003249 /* no retry on NONBLOCK marked file */
3250 if (req->file->f_flags & O_NONBLOCK)
3251 goto done;
Jens Axboe84216312020-08-24 11:45:26 -06003252 /* some cases will consume bytes even on error returns */
3253 iov_iter_revert(iter, iov_count - iov_iter_count(iter));
Jens Axboef38c7e32020-09-25 15:23:43 -06003254 ret = 0;
3255 goto copy_iov;
Jens Axboe227c0c92020-08-13 11:51:40 -06003256 } else if (ret < 0) {
Jens Axboe00d23d52020-08-25 12:59:22 -06003257 /* make sure -ERESTARTSYS -> -EINTR is done */
3258 goto done;
Jens Axboe227c0c92020-08-13 11:51:40 -06003259 }
3260
3261 /* read it all, or we did a blocking attempt. no retry. */
Jens Axboef91daf52020-08-15 15:58:42 -07003262 if (!iov_iter_count(iter) || !force_nonblock ||
3263 (req->file->f_flags & O_NONBLOCK))
Jens Axboe227c0c92020-08-13 11:51:40 -06003264 goto done;
3265
3266 io_size -= ret;
3267copy_iov:
3268 ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
3269 if (ret2) {
3270 ret = ret2;
3271 goto out_free;
3272 }
Jens Axboef5cac8b2020-09-14 09:30:38 -06003273 if (no_async)
3274 return -EAGAIN;
Jens Axboee8c2bc12020-08-15 18:44:09 -07003275 rw = req->async_data;
Jens Axboe227c0c92020-08-13 11:51:40 -06003276 /* it's copied and will be cleaned with ->async_data */
3277 iovec = NULL;
3278 /* now use our persistent iterator, if we aren't already */
Jens Axboee8c2bc12020-08-15 18:44:09 -07003279 iter = &rw->iter;
Jens Axboe227c0c92020-08-13 11:51:40 -06003280retry:
Jens Axboee8c2bc12020-08-15 18:44:09 -07003281 rw->bytes_done += ret;
Jens Axboe227c0c92020-08-13 11:51:40 -06003282 /* if we can retry, do so with the callbacks armed */
3283 if (!io_rw_should_retry(req)) {
Pavel Begunkovfa15baf2020-08-01 13:50:02 +03003284 kiocb->ki_flags &= ~IOCB_WAITQ;
3285 return -EAGAIN;
Jens Axboe2b188cc2019-01-07 10:46:33 -07003286 }
Jens Axboe227c0c92020-08-13 11:51:40 -06003287
3288 /*
3289 * Now retry read with the IOCB_WAITQ parts set in the iocb. If we
3290 * get -EIOCBQUEUED, then we'll get a notification when the desired
3291 * page gets unlocked. We can also get a partial read here, and if we
3292 * do, then just retry at the new offset.
3293 */
3294 ret = io_iter_do_read(req, iter);
3295 if (ret == -EIOCBQUEUED) {
3296 ret = 0;
3297 goto out_free;
3298 } else if (ret > 0 && ret < io_size) {
3299 /* we got some bytes, but not all. retry. */
3300 goto retry;
3301 }
3302done:
3303 kiocb_done(kiocb, ret, cs);
3304 ret = 0;
Jens Axboef67676d2019-12-02 11:03:47 -07003305out_free:
Pavel Begunkovf261c162020-08-20 11:34:10 +03003306 /* it's reportedly faster than delegating the null check to kfree() */
Pavel Begunkov252917c2020-07-13 22:59:20 +03003307 if (iovec)
Xiaoguang Wang6f2cc162020-06-18 15:01:56 +08003308 kfree(iovec);
Jens Axboe2b188cc2019-01-07 10:46:33 -07003309 return ret;
3310}
3311
Jens Axboe3529d8c2019-12-19 18:24:38 -07003312static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
3313 bool force_nonblock)
Jens Axboef67676d2019-12-02 11:03:47 -07003314{
3315 ssize_t ret;
3316
Jens Axboe3529d8c2019-12-19 18:24:38 -07003317 ret = io_prep_rw(req, sqe, force_nonblock);
3318 if (ret)
3319 return ret;
Jens Axboef67676d2019-12-02 11:03:47 -07003320
Jens Axboe3529d8c2019-12-19 18:24:38 -07003321 if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
3322 return -EBADF;
Jens Axboef67676d2019-12-02 11:03:47 -07003323
Pavel Begunkov5f798be2020-02-08 13:28:02 +03003324 /* either don't need iovec imported or already have it */
Jens Axboee8c2bc12020-08-15 18:44:09 -07003325 if (!req->async_data || req->flags & REQ_F_NEED_CLEANUP)
Jens Axboe3529d8c2019-12-19 18:24:38 -07003326 return 0;
Pavel Begunkovc3e330a2020-07-13 22:59:19 +03003327 return io_rw_prep_async(req, WRITE, force_nonblock);
Jens Axboef67676d2019-12-02 11:03:47 -07003328}
3329
Jens Axboea1d7c392020-06-22 11:09:46 -06003330static int io_write(struct io_kiocb *req, bool force_nonblock,
3331 struct io_comp_state *cs)
Jens Axboe2b188cc2019-01-07 10:46:33 -07003332{
3333 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
Jens Axboe9adbd452019-12-20 08:45:55 -07003334 struct kiocb *kiocb = &req->rw.kiocb;
Jens Axboeff6165b2020-08-13 09:47:43 -06003335 struct iov_iter __iter, *iter = &__iter;
Jens Axboee8c2bc12020-08-15 18:44:09 -07003336 struct io_async_rw *rw = req->async_data;
Jens Axboe31b51512019-01-18 22:56:34 -07003337 size_t iov_count;
Pavel Begunkovfa15baf2020-08-01 13:50:02 +03003338 ssize_t ret, ret2, io_size;
Jens Axboe2b188cc2019-01-07 10:46:33 -07003339
Jens Axboee8c2bc12020-08-15 18:44:09 -07003340 if (rw)
3341 iter = &rw->iter;
Jens Axboeff6165b2020-08-13 09:47:43 -06003342
3343 ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
Jens Axboe06b76d42019-12-19 14:44:26 -07003344 if (ret < 0)
3345 return ret;
Jens Axboeeefdf302020-08-27 16:40:19 -06003346 iov_count = iov_iter_count(iter);
Pavel Begunkovfa15baf2020-08-01 13:50:02 +03003347 io_size = ret;
3348 req->result = io_size;
Jens Axboe2b188cc2019-01-07 10:46:33 -07003349
Jens Axboefd6c2e42019-12-18 12:19:41 -07003350 /* Ensure we clear previously set non-block flag */
3351 if (!force_nonblock)
Jens Axboe9adbd452019-12-20 08:45:55 -07003352 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
Jens Axboefd6c2e42019-12-18 12:19:41 -07003353
Pavel Begunkov24c74672020-06-21 13:09:51 +03003354 /* If the file doesn't support async, just async punt */
Jens Axboeaf197f52020-04-28 13:15:06 -06003355 if (force_nonblock && !io_file_supports_async(req->file, WRITE))
Jens Axboef67676d2019-12-02 11:03:47 -07003356 goto copy_iov;
Jens Axboef67676d2019-12-02 11:03:47 -07003357
Jens Axboe10d59342019-12-09 20:16:22 -07003358 /* file path doesn't support NOWAIT for non-direct_IO */
3359 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
3360 (req->flags & REQ_F_ISREG))
Jens Axboef67676d2019-12-02 11:03:47 -07003361 goto copy_iov;
Jens Axboe9e645e112019-05-10 16:07:28 -06003362
Jens Axboe0fef9482020-08-26 10:36:20 -06003363 ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count);
Pavel Begunkovfa15baf2020-08-01 13:50:02 +03003364 if (unlikely(ret))
3365 goto out_free;
Roman Penyaev9bf79332019-03-25 20:09:24 +01003366
Pavel Begunkovfa15baf2020-08-01 13:50:02 +03003367 /*
3368 * Open-code file_start_write here to grab freeze protection,
3369 * which will be released by another thread in
3370 * io_complete_rw(). Fool lockdep by telling it the lock got
3371 * released so that it doesn't complain about the held lock when
3372 * we return to userspace.
3373 */
3374 if (req->flags & REQ_F_ISREG) {
3375 __sb_start_write(file_inode(req->file)->i_sb,
3376 SB_FREEZE_WRITE, true);
3377 __sb_writers_release(file_inode(req->file)->i_sb,
3378 SB_FREEZE_WRITE);
3379 }
3380 kiocb->ki_flags |= IOCB_WRITE;
Roman Penyaev9bf79332019-03-25 20:09:24 +01003381
Pavel Begunkovfa15baf2020-08-01 13:50:02 +03003382 if (req->file->f_op->write_iter)
Jens Axboeff6165b2020-08-13 09:47:43 -06003383 ret2 = call_write_iter(req->file, kiocb, iter);
Guoyu Huang2dd21112020-08-05 03:53:50 -07003384 else if (req->file->f_op->write)
Jens Axboeff6165b2020-08-13 09:47:43 -06003385 ret2 = loop_rw_iter(WRITE, req->file, kiocb, iter);
Guoyu Huang2dd21112020-08-05 03:53:50 -07003386 else
3387 ret2 = -EINVAL;
Jens Axboe4ed734b2020-03-20 11:23:41 -06003388
Pavel Begunkovfa15baf2020-08-01 13:50:02 +03003389 /*
3390 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
3391 * retry them without IOCB_NOWAIT.
3392 */
3393 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
3394 ret2 = -EAGAIN;
Jens Axboe355afae2020-09-02 09:30:31 -06003395 /* no retry on NONBLOCK marked file */
3396 if (ret2 == -EAGAIN && (req->file->f_flags & O_NONBLOCK))
3397 goto done;
Pavel Begunkovfa15baf2020-08-01 13:50:02 +03003398 if (!force_nonblock || ret2 != -EAGAIN) {
Jens Axboeeefdf302020-08-27 16:40:19 -06003399 /* IOPOLL retry should happen for io-wq threads */
3400 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
3401 goto copy_iov;
Jens Axboe355afae2020-09-02 09:30:31 -06003402done:
Pavel Begunkovfa15baf2020-08-01 13:50:02 +03003403 kiocb_done(kiocb, ret2, cs);
3404 } else {
Jens Axboef67676d2019-12-02 11:03:47 -07003405copy_iov:
Jens Axboe84216312020-08-24 11:45:26 -06003406 /* some cases will consume bytes even on error returns */
3407 iov_iter_revert(iter, iov_count - iov_iter_count(iter));
Jens Axboe227c0c92020-08-13 11:51:40 -06003408 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
Jens Axboeff6165b2020-08-13 09:47:43 -06003409 if (!ret)
3410 return -EAGAIN;
Jens Axboe2b188cc2019-01-07 10:46:33 -07003411 }
Jens Axboe31b51512019-01-18 22:56:34 -07003412out_free:
Pavel Begunkovf261c162020-08-20 11:34:10 +03003413 /* it's reportedly faster than delegating the null check to kfree() */
Pavel Begunkov252917c2020-07-13 22:59:20 +03003414 if (iovec)
Xiaoguang Wang6f2cc162020-06-18 15:01:56 +08003415 kfree(iovec);
Jens Axboe2b188cc2019-01-07 10:46:33 -07003416 return ret;
3417}
3418
Pavel Begunkovf2a8d5c2020-05-17 14:18:06 +03003419static int __io_splice_prep(struct io_kiocb *req,
3420 const struct io_uring_sqe *sqe)
Pavel Begunkov7d67af22020-02-24 11:32:45 +03003421{
3422 struct io_splice* sp = &req->splice;
3423 unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
3424 int ret;
3425
3426 if (req->flags & REQ_F_NEED_CLEANUP)
3427 return 0;
Pavel Begunkov3232dd02020-06-03 18:03:22 +03003428 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3429 return -EINVAL;
Pavel Begunkov7d67af22020-02-24 11:32:45 +03003430
3431 sp->file_in = NULL;
Pavel Begunkov7d67af22020-02-24 11:32:45 +03003432 sp->len = READ_ONCE(sqe->len);
3433 sp->flags = READ_ONCE(sqe->splice_flags);
3434
3435 if (unlikely(sp->flags & ~valid_flags))
3436 return -EINVAL;
3437
3438 ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in,
3439 (sp->flags & SPLICE_F_FD_IN_FIXED));
3440 if (ret)
3441 return ret;
3442 req->flags |= REQ_F_NEED_CLEANUP;
3443
Xiaoguang Wang7cdaf582020-06-10 19:41:19 +08003444 if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
3445 /*
3446 * Splice operation will be punted async, and we need to
3447 * modify io_wq_work.flags here, so initialize io_wq_work first.
3448 */
3449 io_req_init_async(req);
Pavel Begunkov7d67af22020-02-24 11:32:45 +03003450 req->work.flags |= IO_WQ_WORK_UNBOUND;
Xiaoguang Wang7cdaf582020-06-10 19:41:19 +08003451 }
Pavel Begunkov7d67af22020-02-24 11:32:45 +03003452
3453 return 0;
3454}
3455
Pavel Begunkovf2a8d5c2020-05-17 14:18:06 +03003456static int io_tee_prep(struct io_kiocb *req,
3457 const struct io_uring_sqe *sqe)
3458{
3459 if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
3460 return -EINVAL;
3461 return __io_splice_prep(req, sqe);
3462}
3463
3464static int io_tee(struct io_kiocb *req, bool force_nonblock)
3465{
3466 struct io_splice *sp = &req->splice;
3467 struct file *in = sp->file_in;
3468 struct file *out = sp->file_out;
3469 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3470 long ret = 0;
3471
3472 if (force_nonblock)
3473 return -EAGAIN;
3474 if (sp->len)
3475 ret = do_tee(in, out, sp->len, flags);
3476
3477 io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
3478 req->flags &= ~REQ_F_NEED_CLEANUP;
3479
Pavel Begunkovf2a8d5c2020-05-17 14:18:06 +03003480 if (ret != sp->len)
3481 req_set_fail_links(req);
Jens Axboee1e16092020-06-22 09:17:17 -06003482 io_req_complete(req, ret);
Pavel Begunkovf2a8d5c2020-05-17 14:18:06 +03003483 return 0;
3484}
3485
3486static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3487{
3488 struct io_splice* sp = &req->splice;
3489
3490 sp->off_in = READ_ONCE(sqe->splice_off_in);
3491 sp->off_out = READ_ONCE(sqe->off);
3492 return __io_splice_prep(req, sqe);
3493}
3494
Pavel Begunkov014db002020-03-03 21:33:12 +03003495static int io_splice(struct io_kiocb *req, bool force_nonblock)
Pavel Begunkov7d67af22020-02-24 11:32:45 +03003496{
3497 struct io_splice *sp = &req->splice;
3498 struct file *in = sp->file_in;
3499 struct file *out = sp->file_out;
3500 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3501 loff_t *poff_in, *poff_out;
Pavel Begunkovc9687422020-05-04 23:00:54 +03003502 long ret = 0;
Pavel Begunkov7d67af22020-02-24 11:32:45 +03003503
Pavel Begunkov2fb3e822020-05-01 17:09:38 +03003504 if (force_nonblock)
3505 return -EAGAIN;
Pavel Begunkov7d67af22020-02-24 11:32:45 +03003506
3507 poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
3508 poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
Pavel Begunkovc9687422020-05-04 23:00:54 +03003509
Jens Axboe948a7742020-05-17 14:21:38 -06003510 if (sp->len)
Pavel Begunkovc9687422020-05-04 23:00:54 +03003511 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
Pavel Begunkov7d67af22020-02-24 11:32:45 +03003512
3513 io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
3514 req->flags &= ~REQ_F_NEED_CLEANUP;
3515
Pavel Begunkov7d67af22020-02-24 11:32:45 +03003516 if (ret != sp->len)
3517 req_set_fail_links(req);
Jens Axboee1e16092020-06-22 09:17:17 -06003518 io_req_complete(req, ret);
Pavel Begunkov7d67af22020-02-24 11:32:45 +03003519 return 0;
3520}
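/*
 * Illustrative userspace sketch, not part of this file: the raw SQE layout
 * that io_splice_prep()/__io_splice_prep() above consume. An offset of -1
 * means "use the file position", mirroring the NULL poff_in/poff_out
 * handling in io_splice(). The "example_" name is made up for this sketch.
 */
#include <linux/io_uring.h>
#include <string.h>

static void example_prep_splice(struct io_uring_sqe *sqe, int fd_in,
				long long off_in, int fd_out,
				long long off_out, unsigned len,
				unsigned splice_flags)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_SPLICE;
	sqe->splice_fd_in = fd_in;		/* sp->file_in  */
	sqe->splice_off_in = off_in;		/* sp->off_in   */
	sqe->fd = fd_out;			/* sp->file_out */
	sqe->off = off_out;			/* sp->off_out  */
	sqe->len = len;
	sqe->splice_flags = splice_flags;	/* e.g. SPLICE_F_FD_IN_FIXED */
}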
3521
Jens Axboe2b188cc2019-01-07 10:46:33 -07003522/*
3523 * IORING_OP_NOP just posts a completion event, nothing else.
3524 */
Jens Axboe229a7b62020-06-22 10:13:11 -06003525static int io_nop(struct io_kiocb *req, struct io_comp_state *cs)
Jens Axboe2b188cc2019-01-07 10:46:33 -07003526{
3527 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe2b188cc2019-01-07 10:46:33 -07003528
Jens Axboedef596e2019-01-09 08:59:42 -07003529 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3530 return -EINVAL;
3531
Jens Axboe229a7b62020-06-22 10:13:11 -06003532 __io_req_complete(req, 0, 0, cs);
Jens Axboe2b188cc2019-01-07 10:46:33 -07003533 return 0;
3534}
3535
Jens Axboe3529d8c2019-12-19 18:24:38 -07003536static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
Christoph Hellwigc992fe22019-01-11 09:43:02 -07003537{
Jens Axboe6b063142019-01-10 22:13:58 -07003538 struct io_ring_ctx *ctx = req->ctx;
Christoph Hellwigc992fe22019-01-11 09:43:02 -07003539
Jens Axboe09bb8392019-03-13 12:39:28 -06003540 if (!req->file)
3541 return -EBADF;
Christoph Hellwigc992fe22019-01-11 09:43:02 -07003542
Jens Axboe6b063142019-01-10 22:13:58 -07003543 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
Jens Axboedef596e2019-01-09 08:59:42 -07003544 return -EINVAL;
Jens Axboeedafcce2019-01-09 09:16:05 -07003545 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
Christoph Hellwigc992fe22019-01-11 09:43:02 -07003546 return -EINVAL;
3547
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003548 req->sync.flags = READ_ONCE(sqe->fsync_flags);
3549 if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
3550 return -EINVAL;
3551
3552 req->sync.off = READ_ONCE(sqe->off);
3553 req->sync.len = READ_ONCE(sqe->len);
Christoph Hellwigc992fe22019-01-11 09:43:02 -07003554 return 0;
3555}
3556
Pavel Begunkovac45abc2020-06-08 21:08:18 +03003557static int io_fsync(struct io_kiocb *req, bool force_nonblock)
Jens Axboe78912932020-01-14 22:09:06 -07003558{
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003559 loff_t end = req->sync.off + req->sync.len;
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003560 int ret;
3561
Pavel Begunkovac45abc2020-06-08 21:08:18 +03003562 /* fsync always requires a blocking context */
3563 if (force_nonblock)
3564 return -EAGAIN;
3565
Jens Axboe9adbd452019-12-20 08:45:55 -07003566 ret = vfs_fsync_range(req->file, req->sync.off,
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003567 end > 0 ? end : LLONG_MAX,
3568 req->sync.flags & IORING_FSYNC_DATASYNC);
3569 if (ret < 0)
3570 req_set_fail_links(req);
Jens Axboee1e16092020-06-22 09:17:17 -06003571 io_req_complete(req, ret);
Christoph Hellwigc992fe22019-01-11 09:43:02 -07003572 return 0;
3573}
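/*
 * Illustrative userspace sketch, not part of this file: a raw SQE for
 * IORING_OP_FSYNC as io_prep_fsync() above parses it. With off == 0 and
 * len == 0 the whole file is synced, via the LLONG_MAX clamp in io_fsync().
 * The "example_" name is made up for this sketch.
 */
#include <linux/io_uring.h>
#include <string.h>

static void example_prep_fsync(struct io_uring_sqe *sqe, int fd,
			       unsigned long long off, unsigned len,
			       int datasync)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_FSYNC;
	sqe->fd = fd;
	sqe->off = off;		/* start of the range to sync */
	sqe->len = len;		/* length of the range */
	sqe->fsync_flags = datasync ? IORING_FSYNC_DATASYNC : 0;
}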
3574
Jens Axboed63d1b52019-12-10 10:38:56 -07003575static int io_fallocate_prep(struct io_kiocb *req,
3576 const struct io_uring_sqe *sqe)
3577{
3578 if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
3579 return -EINVAL;
Pavel Begunkov3232dd02020-06-03 18:03:22 +03003580 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3581 return -EINVAL;
Jens Axboed63d1b52019-12-10 10:38:56 -07003582
3583 req->sync.off = READ_ONCE(sqe->off);
3584 req->sync.len = READ_ONCE(sqe->addr);
3585 req->sync.mode = READ_ONCE(sqe->len);
3586 return 0;
3587}
3588
Pavel Begunkov014db002020-03-03 21:33:12 +03003589static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
Jens Axboed63d1b52019-12-10 10:38:56 -07003590{
Pavel Begunkovac45abc2020-06-08 21:08:18 +03003591 int ret;
Jens Axboed63d1b52019-12-10 10:38:56 -07003592
Pavel Begunkovac45abc2020-06-08 21:08:18 +03003593 /* fallocate always requires a blocking context */
3594 if (force_nonblock)
3595 return -EAGAIN;
Pavel Begunkovac45abc2020-06-08 21:08:18 +03003596 ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
3597 req->sync.len);
Pavel Begunkovac45abc2020-06-08 21:08:18 +03003598 if (ret < 0)
3599 req_set_fail_links(req);
Jens Axboee1e16092020-06-22 09:17:17 -06003600 io_req_complete(req, ret);
Jens Axboed63d1b52019-12-10 10:38:56 -07003601 return 0;
3602}
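/*
 * Illustrative userspace sketch, not part of this file: a raw SQE for
 * IORING_OP_FALLOCATE, using the slightly unusual field mapping that
 * io_fallocate_prep() above reads. The "example_" name is made up for
 * this sketch.
 */
#include <linux/falloc.h>
#include <linux/io_uring.h>
#include <string.h>

static void example_prep_fallocate(struct io_uring_sqe *sqe, int fd,
				   int mode, unsigned long long off,
				   unsigned long long length)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_FALLOCATE;
	sqe->fd = fd;
	sqe->off = off;		/* sync.off  <- sqe->off  */
	sqe->addr = length;	/* sync.len  <- sqe->addr */
	sqe->len = mode;	/* sync.mode <- sqe->len, e.g. FALLOC_FL_KEEP_SIZE */
}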
3603
Pavel Begunkovec65fea2020-06-03 18:03:24 +03003604static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
Jens Axboe15b71ab2019-12-11 11:20:36 -07003605{
Jens Axboef8748882020-01-08 17:47:02 -07003606 const char __user *fname;
Jens Axboe15b71ab2019-12-11 11:20:36 -07003607 int ret;
3608
Pavel Begunkovec65fea2020-06-03 18:03:24 +03003609 if (unlikely(sqe->ioprio || sqe->buf_index))
Jens Axboe15b71ab2019-12-11 11:20:36 -07003610 return -EINVAL;
Pavel Begunkovec65fea2020-06-03 18:03:24 +03003611 if (unlikely(req->flags & REQ_F_FIXED_FILE))
Jens Axboecf3040c2020-02-06 21:31:40 -07003612 return -EBADF;
Jens Axboe15b71ab2019-12-11 11:20:36 -07003613
Pavel Begunkovec65fea2020-06-03 18:03:24 +03003614 /* open.how should already be initialised */
3615 if (!(req->open.how.flags & O_PATH) && force_o_largefile())
Jens Axboe08a1d26eb2020-04-08 09:20:54 -06003616 req->open.how.flags |= O_LARGEFILE;
Jens Axboe15b71ab2019-12-11 11:20:36 -07003617
Pavel Begunkov25e72d12020-06-03 18:03:23 +03003618 req->open.dfd = READ_ONCE(sqe->fd);
3619 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
Jens Axboef8748882020-01-08 17:47:02 -07003620 req->open.filename = getname(fname);
Jens Axboe15b71ab2019-12-11 11:20:36 -07003621 if (IS_ERR(req->open.filename)) {
3622 ret = PTR_ERR(req->open.filename);
3623 req->open.filename = NULL;
3624 return ret;
3625 }
Jens Axboe4022e7a2020-03-19 19:23:18 -06003626 req->open.nofile = rlimit(RLIMIT_NOFILE);
Pavel Begunkov8fef80b2020-02-07 23:59:53 +03003627 req->flags |= REQ_F_NEED_CLEANUP;
Jens Axboe15b71ab2019-12-11 11:20:36 -07003628 return 0;
3629}
3630
Pavel Begunkovec65fea2020-06-03 18:03:24 +03003631static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3632{
3633 u64 flags, mode;
3634
Jens Axboe4eb8dde2020-09-18 19:36:24 -06003635 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3636 return -EINVAL;
Pavel Begunkovec65fea2020-06-03 18:03:24 +03003637 if (req->flags & REQ_F_NEED_CLEANUP)
3638 return 0;
3639 mode = READ_ONCE(sqe->len);
3640 flags = READ_ONCE(sqe->open_flags);
3641 req->open.how = build_open_how(flags, mode);
3642 return __io_openat_prep(req, sqe);
3643}
3644
Jens Axboecebdb982020-01-08 17:59:24 -07003645static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3646{
3647 struct open_how __user *how;
Jens Axboecebdb982020-01-08 17:59:24 -07003648 size_t len;
3649 int ret;
3650
Jens Axboe4eb8dde2020-09-18 19:36:24 -06003651 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3652 return -EINVAL;
Pavel Begunkov0bdbdd02020-02-08 13:28:03 +03003653 if (req->flags & REQ_F_NEED_CLEANUP)
3654 return 0;
Jens Axboecebdb982020-01-08 17:59:24 -07003655 how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3656 len = READ_ONCE(sqe->len);
Jens Axboecebdb982020-01-08 17:59:24 -07003657 if (len < OPEN_HOW_SIZE_VER0)
3658 return -EINVAL;
3659
3660 ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
3661 len);
3662 if (ret)
3663 return ret;
3664
Pavel Begunkovec65fea2020-06-03 18:03:24 +03003665 return __io_openat_prep(req, sqe);
Jens Axboecebdb982020-01-08 17:59:24 -07003666}
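/*
 * Illustrative userspace sketch, not part of this file: a raw SQE for
 * IORING_OP_OPENAT2, matching what io_openat2_prep() and __io_openat_prep()
 * above pull out of it. The "example_" name is made up for this sketch.
 */
#include <linux/io_uring.h>
#include <linux/openat2.h>
#include <string.h>

static void example_prep_openat2(struct io_uring_sqe *sqe, int dfd,
				 const char *path, struct open_how *how)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_OPENAT2;
	sqe->fd = dfd;				/* open.dfd */
	sqe->addr = (unsigned long) path;	/* open.filename, via getname() */
	sqe->addr2 = (unsigned long) how;	/* copied with copy_struct_from_user() */
	sqe->len = sizeof(*how);		/* struct size, versioned */
}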
3667
Pavel Begunkov014db002020-03-03 21:33:12 +03003668static int io_openat2(struct io_kiocb *req, bool force_nonblock)
Jens Axboe15b71ab2019-12-11 11:20:36 -07003669{
3670 struct open_flags op;
Jens Axboe15b71ab2019-12-11 11:20:36 -07003671 struct file *file;
3672 int ret;
3673
Jens Axboef86cd202020-01-29 13:46:44 -07003674 if (force_nonblock)
Jens Axboe15b71ab2019-12-11 11:20:36 -07003675 return -EAGAIN;
Jens Axboe15b71ab2019-12-11 11:20:36 -07003676
Jens Axboecebdb982020-01-08 17:59:24 -07003677 ret = build_open_flags(&req->open.how, &op);
Jens Axboe15b71ab2019-12-11 11:20:36 -07003678 if (ret)
3679 goto err;
3680
Jens Axboe4022e7a2020-03-19 19:23:18 -06003681 ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
Jens Axboe15b71ab2019-12-11 11:20:36 -07003682 if (ret < 0)
3683 goto err;
3684
3685 file = do_filp_open(req->open.dfd, req->open.filename, &op);
3686 if (IS_ERR(file)) {
3687 put_unused_fd(ret);
3688 ret = PTR_ERR(file);
3689 } else {
3690 fsnotify_open(file);
3691 fd_install(ret, file);
3692 }
3693err:
3694 putname(req->open.filename);
Pavel Begunkov8fef80b2020-02-07 23:59:53 +03003695 req->flags &= ~REQ_F_NEED_CLEANUP;
Jens Axboe15b71ab2019-12-11 11:20:36 -07003696 if (ret < 0)
3697 req_set_fail_links(req);
Jens Axboee1e16092020-06-22 09:17:17 -06003698 io_req_complete(req, ret);
Jens Axboe15b71ab2019-12-11 11:20:36 -07003699 return 0;
3700}
3701
Pavel Begunkov014db002020-03-03 21:33:12 +03003702static int io_openat(struct io_kiocb *req, bool force_nonblock)
Jens Axboecebdb982020-01-08 17:59:24 -07003703{
Pavel Begunkov014db002020-03-03 21:33:12 +03003704 return io_openat2(req, force_nonblock);
Jens Axboecebdb982020-01-08 17:59:24 -07003705}
3706
Jens Axboe067524e2020-03-02 16:32:28 -07003707static int io_remove_buffers_prep(struct io_kiocb *req,
3708 const struct io_uring_sqe *sqe)
3709{
3710 struct io_provide_buf *p = &req->pbuf;
3711 u64 tmp;
3712
3713 if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
3714 return -EINVAL;
3715
3716 tmp = READ_ONCE(sqe->fd);
3717 if (!tmp || tmp > USHRT_MAX)
3718 return -EINVAL;
3719
3720 memset(p, 0, sizeof(*p));
3721 p->nbufs = tmp;
3722 p->bgid = READ_ONCE(sqe->buf_group);
3723 return 0;
3724}
3725
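/*
 * Free up to 'nbufs' buffers from group 'bgid'. The head kbuf doubles
 * as the list anchor, so it is freed last, and the group is dropped
 * from the idr only once the whole list is gone. Returns how many
 * buffers were freed.
 */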
3726static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
3727 int bgid, unsigned nbufs)
3728{
3729 unsigned i = 0;
3730
3731 /* shouldn't happen */
3732 if (!nbufs)
3733 return 0;
3734
3735 /* the head kbuf is the list itself */
3736 while (!list_empty(&buf->list)) {
3737 struct io_buffer *nxt;
3738
3739 nxt = list_first_entry(&buf->list, struct io_buffer, list);
3740 list_del(&nxt->list);
3741 kfree(nxt);
3742 if (++i == nbufs)
3743 return i;
3744 }
3745 i++;
3746 kfree(buf);
3747 idr_remove(&ctx->io_buffer_idr, bgid);
3748
3749 return i;
3750}
3751
Jens Axboe229a7b62020-06-22 10:13:11 -06003752static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock,
3753 struct io_comp_state *cs)
Jens Axboe067524e2020-03-02 16:32:28 -07003754{
3755 struct io_provide_buf *p = &req->pbuf;
3756 struct io_ring_ctx *ctx = req->ctx;
3757 struct io_buffer *head;
3758 int ret = 0;
3759
3760 io_ring_submit_lock(ctx, !force_nonblock);
3761
3762 lockdep_assert_held(&ctx->uring_lock);
3763
3764 ret = -ENOENT;
3765 head = idr_find(&ctx->io_buffer_idr, p->bgid);
3766 if (head)
3767 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
3768
3769	io_ring_submit_unlock(ctx, !force_nonblock);
3770 if (ret < 0)
3771 req_set_fail_links(req);
Jens Axboe229a7b62020-06-22 10:13:11 -06003772 __io_req_complete(req, ret, 0, cs);
Jens Axboe067524e2020-03-02 16:32:28 -07003773 return 0;
3774}
3775
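/*
 * Prep for IORING_OP_PROVIDE_BUFFERS: sqe->fd is the buffer count,
 * sqe->addr the start of the contiguous buffer range, sqe->len the size
 * of each buffer, sqe->buf_group the group ID and sqe->off the first
 * buffer ID. Count and starting ID are both capped at USHRT_MAX.
 *
 * As a rough userspace sketch (raw SQE fields, not tied to any
 * particular liburing helper), registering a group of buffers looks
 * something like:
 *
 *	sqe->opcode    = IORING_OP_PROVIDE_BUFFERS;
 *	sqe->fd        = nr_bufs;
 *	sqe->addr      = (unsigned long) buf_base;
 *	sqe->len       = buf_size;
 *	sqe->buf_group = group_id;
 *	sqe->off       = first_buf_id;
 */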
Jens Axboeddf0322d2020-02-23 16:41:33 -07003776static int io_provide_buffers_prep(struct io_kiocb *req,
3777 const struct io_uring_sqe *sqe)
3778{
3779 struct io_provide_buf *p = &req->pbuf;
3780 u64 tmp;
3781
3782 if (sqe->ioprio || sqe->rw_flags)
3783 return -EINVAL;
3784
3785 tmp = READ_ONCE(sqe->fd);
3786 if (!tmp || tmp > USHRT_MAX)
3787 return -E2BIG;
3788 p->nbufs = tmp;
3789 p->addr = READ_ONCE(sqe->addr);
3790 p->len = READ_ONCE(sqe->len);
3791
Bijan Mottahedehefe68c12020-06-04 18:01:52 -07003792 if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
Jens Axboeddf0322d2020-02-23 16:41:33 -07003793 return -EFAULT;
3794
3795 p->bgid = READ_ONCE(sqe->buf_group);
3796 tmp = READ_ONCE(sqe->off);
3797 if (tmp > USHRT_MAX)
3798 return -E2BIG;
3799 p->bid = tmp;
3800 return 0;
3801}
3802
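/*
 * Carve the user range described by the request into pbuf->nbufs
 * buffers of pbuf->len bytes each, with consecutive buffer IDs, and
 * queue them behind the group head. Stops early on allocation failure
 * and returns how many buffers were added, or -ENOMEM if none were.
 */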
3803static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
3804{
3805 struct io_buffer *buf;
3806 u64 addr = pbuf->addr;
3807 int i, bid = pbuf->bid;
3808
3809 for (i = 0; i < pbuf->nbufs; i++) {
3810 buf = kmalloc(sizeof(*buf), GFP_KERNEL);
3811 if (!buf)
3812 break;
3813
3814 buf->addr = addr;
3815 buf->len = pbuf->len;
3816 buf->bid = bid;
3817 addr += pbuf->len;
3818 bid++;
3819 if (!*head) {
3820 INIT_LIST_HEAD(&buf->list);
3821 *head = buf;
3822 } else {
3823 list_add_tail(&buf->list, &(*head)->list);
3824 }
3825 }
3826
3827 return i ? i : -ENOMEM;
3828}
3829
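/*
 * Add the prepared buffers to their group, creating the group in the
 * buffer idr if it does not exist yet. If inserting a brand new group
 * fails, every buffer that was just added is torn down again before the
 * request completes with the error.
 */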
Jens Axboe229a7b62020-06-22 10:13:11 -06003830static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock,
3831 struct io_comp_state *cs)
Jens Axboeddf0322d2020-02-23 16:41:33 -07003832{
3833 struct io_provide_buf *p = &req->pbuf;
3834 struct io_ring_ctx *ctx = req->ctx;
3835 struct io_buffer *head, *list;
3836 int ret = 0;
3837
3838 io_ring_submit_lock(ctx, !force_nonblock);
3839
3840 lockdep_assert_held(&ctx->uring_lock);
3841
3842 list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
3843
3844 ret = io_add_buffers(p, &head);
3845 if (ret < 0)
3846 goto out;
3847
3848 if (!list) {
3849 ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
3850 GFP_KERNEL);
3851 if (ret < 0) {
Jens Axboe067524e2020-03-02 16:32:28 -07003852 __io_remove_buffers(ctx, head, p->bgid, -1U);
Jens Axboeddf0322d2020-02-23 16:41:33 -07003853 goto out;
3854 }
3855 }
3856out:
3857 io_ring_submit_unlock(ctx, !force_nonblock);
3858 if (ret < 0)
3859 req_set_fail_links(req);
Jens Axboe229a7b62020-06-22 10:13:11 -06003860 __io_req_complete(req, ret, 0, cs);
Jens Axboeddf0322d2020-02-23 16:41:33 -07003861 return 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07003862}
3863
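/*
 * Prep for IORING_OP_EPOLL_CTL, mirroring epoll_ctl(2): sqe->fd is the
 * epoll instance, sqe->len the op, sqe->off the target fd and sqe->addr
 * the user epoll_event (copied only for ops that take one).
 */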
Jens Axboe3e4827b2020-01-08 15:18:09 -07003864static int io_epoll_ctl_prep(struct io_kiocb *req,
3865 const struct io_uring_sqe *sqe)
3866{
3867#if defined(CONFIG_EPOLL)
3868 if (sqe->ioprio || sqe->buf_index)
3869 return -EINVAL;
Jens Axboe6ca56f82020-09-18 16:51:19 -06003870 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
Pavel Begunkov3232dd02020-06-03 18:03:22 +03003871 return -EINVAL;
Jens Axboe3e4827b2020-01-08 15:18:09 -07003872
3873 req->epoll.epfd = READ_ONCE(sqe->fd);
3874 req->epoll.op = READ_ONCE(sqe->len);
3875 req->epoll.fd = READ_ONCE(sqe->off);
3876
3877 if (ep_op_has_event(req->epoll.op)) {
3878 struct epoll_event __user *ev;
3879
3880 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
3881 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
3882 return -EFAULT;
3883 }
3884
3885 return 0;
3886#else
3887 return -EOPNOTSUPP;
3888#endif
3889}
3890
Jens Axboe229a7b62020-06-22 10:13:11 -06003891static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock,
3892 struct io_comp_state *cs)
Jens Axboe3e4827b2020-01-08 15:18:09 -07003893{
3894#if defined(CONFIG_EPOLL)
3895 struct io_epoll *ie = &req->epoll;
3896 int ret;
3897
3898 ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
3899 if (force_nonblock && ret == -EAGAIN)
3900 return -EAGAIN;
3901
3902 if (ret < 0)
3903 req_set_fail_links(req);
Jens Axboe229a7b62020-06-22 10:13:11 -06003904 __io_req_complete(req, ret, 0, cs);
Jens Axboe3e4827b2020-01-08 15:18:09 -07003905 return 0;
3906#else
3907 return -EOPNOTSUPP;
3908#endif
3909}
3910
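/*
 * Prep for IORING_OP_MADVISE: sqe->addr, sqe->len and
 * sqe->fadvise_advice map onto the madvise(2) address, length and
 * advice. Compiled out without CONFIG_ADVISE_SYSCALLS and CONFIG_MMU.
 */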
Jens Axboec1ca7572019-12-25 22:18:28 -07003911static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3912{
3913#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
3914 if (sqe->ioprio || sqe->buf_index || sqe->off)
3915 return -EINVAL;
Pavel Begunkov3232dd02020-06-03 18:03:22 +03003916 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3917 return -EINVAL;
Jens Axboec1ca7572019-12-25 22:18:28 -07003918
3919 req->madvise.addr = READ_ONCE(sqe->addr);
3920 req->madvise.len = READ_ONCE(sqe->len);
3921 req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
3922 return 0;
3923#else
3924 return -EOPNOTSUPP;
3925#endif
3926}
3927
Pavel Begunkov014db002020-03-03 21:33:12 +03003928static int io_madvise(struct io_kiocb *req, bool force_nonblock)
Jens Axboec1ca7572019-12-25 22:18:28 -07003929{
3930#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
3931 struct io_madvise *ma = &req->madvise;
3932 int ret;
3933
3934 if (force_nonblock)
3935 return -EAGAIN;
3936
3937 ret = do_madvise(ma->addr, ma->len, ma->advice);
3938 if (ret < 0)
3939 req_set_fail_links(req);
Jens Axboee1e16092020-06-22 09:17:17 -06003940 io_req_complete(req, ret);
Jens Axboec1ca7572019-12-25 22:18:28 -07003941 return 0;
3942#else
3943 return -EOPNOTSUPP;
3944#endif
3945}
3946
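/*
 * Prep for IORING_OP_FADVISE: sqe->off, sqe->len and
 * sqe->fadvise_advice map onto the posix_fadvise(2) offset, length and
 * advice.
 */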
Jens Axboe4840e412019-12-25 22:03:45 -07003947static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3948{
3949 if (sqe->ioprio || sqe->buf_index || sqe->addr)
3950 return -EINVAL;
Pavel Begunkov3232dd02020-06-03 18:03:22 +03003951 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3952 return -EINVAL;
Jens Axboe4840e412019-12-25 22:03:45 -07003953
3954 req->fadvise.offset = READ_ONCE(sqe->off);
3955 req->fadvise.len = READ_ONCE(sqe->len);
3956 req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
3957 return 0;
3958}
3959
Pavel Begunkov014db002020-03-03 21:33:12 +03003960static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
Jens Axboe4840e412019-12-25 22:03:45 -07003961{
3962 struct io_fadvise *fa = &req->fadvise;
3963 int ret;
3964
Jens Axboe3e694262020-02-01 09:22:49 -07003965 if (force_nonblock) {
3966 switch (fa->advice) {
3967 case POSIX_FADV_NORMAL:
3968 case POSIX_FADV_RANDOM:
3969 case POSIX_FADV_SEQUENTIAL:
3970 break;
3971 default:
3972 return -EAGAIN;
3973 }
3974 }
Jens Axboe4840e412019-12-25 22:03:45 -07003975
3976 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
3977 if (ret < 0)
3978 req_set_fail_links(req);
Jens Axboee1e16092020-06-22 09:17:17 -06003979 io_req_complete(req, ret);
Jens Axboe4840e412019-12-25 22:03:45 -07003980 return 0;
3981}
3982
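/*
 * Prep for IORING_OP_STATX, mapping straight onto statx(2): sqe->fd is
 * the dirfd, sqe->addr the pathname, sqe->statx_flags the flags,
 * sqe->len the request mask and sqe->addr2 the user statx buffer.
 */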
Jens Axboeeddc7ef2019-12-13 21:18:10 -07003983static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3984{
Jens Axboe6ca56f82020-09-18 16:51:19 -06003985 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
Pavel Begunkov3232dd02020-06-03 18:03:22 +03003986 return -EINVAL;
Jens Axboeeddc7ef2019-12-13 21:18:10 -07003987 if (sqe->ioprio || sqe->buf_index)
3988 return -EINVAL;
Pavel Begunkov9c280f92020-04-08 08:58:46 +03003989 if (req->flags & REQ_F_FIXED_FILE)
Jens Axboecf3040c2020-02-06 21:31:40 -07003990 return -EBADF;
Jens Axboeeddc7ef2019-12-13 21:18:10 -07003991
Bijan Mottahedeh1d9e1282020-05-22 21:31:16 -07003992 req->statx.dfd = READ_ONCE(sqe->fd);
3993 req->statx.mask = READ_ONCE(sqe->len);
Bijan Mottahedehe62753e2020-05-22 21:31:18 -07003994 req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
Bijan Mottahedeh1d9e1282020-05-22 21:31:16 -07003995 req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3996 req->statx.flags = READ_ONCE(sqe->statx_flags);
Jens Axboeeddc7ef2019-12-13 21:18:10 -07003997
Jens Axboeeddc7ef2019-12-13 21:18:10 -07003998 return 0;
3999}
4000
Pavel Begunkov014db002020-03-03 21:33:12 +03004001static int io_statx(struct io_kiocb *req, bool force_nonblock)
Jens Axboeeddc7ef2019-12-13 21:18:10 -07004002{
Bijan Mottahedeh1d9e1282020-05-22 21:31:16 -07004003 struct io_statx *ctx = &req->statx;
Jens Axboeeddc7ef2019-12-13 21:18:10 -07004004 int ret;
4005
Jens Axboe5b0bbee2020-04-27 10:41:22 -06004006 if (force_nonblock) {
4007 /* only need file table for an actual valid fd */
4008 if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
4009 req->flags |= REQ_F_NO_FILE_TABLE;
Jens Axboeeddc7ef2019-12-13 21:18:10 -07004010 return -EAGAIN;
Jens Axboe5b0bbee2020-04-27 10:41:22 -06004011 }
Jens Axboeeddc7ef2019-12-13 21:18:10 -07004012
Bijan Mottahedehe62753e2020-05-22 21:31:18 -07004013 ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
4014 ctx->buffer);
Jens Axboeeddc7ef2019-12-13 21:18:10 -07004015
Jens Axboeeddc7ef2019-12-13 21:18:10 -07004016 if (ret < 0)
4017 req_set_fail_links(req);
Jens Axboee1e16092020-06-22 09:17:17 -06004018 io_req_complete(req, ret);
Jens Axboeeddc7ef2019-12-13 21:18:10 -07004019 return 0;
4020}
4021
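/*
 * Prep for IORING_OP_CLOSE: only sqe->fd is used. Closing a fixed file
 * or the io_uring fd itself is refused with -EBADF.
 */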
Jens Axboeb5dba592019-12-11 14:02:38 -07004022static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4023{
4024 /*
4025 * If we queue this for async, it must not be cancellable. That would
Xiaoguang Wang7cdaf582020-06-10 19:41:19 +08004026	 * leave the 'file' in an indeterminate state, and we need to modify
4027	 * io_wq_work.flags, so initialize io_wq_work first.
Jens Axboeb5dba592019-12-11 14:02:38 -07004028 */
Xiaoguang Wang7cdaf582020-06-10 19:41:19 +08004029 io_req_init_async(req);
Jens Axboeb5dba592019-12-11 14:02:38 -07004030 req->work.flags |= IO_WQ_WORK_NO_CANCEL;
4031
Pavel Begunkov3232dd02020-06-03 18:03:22 +03004032 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
4033 return -EINVAL;
Jens Axboeb5dba592019-12-11 14:02:38 -07004034 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
4035 sqe->rw_flags || sqe->buf_index)
4036 return -EINVAL;
Pavel Begunkov9c280f92020-04-08 08:58:46 +03004037 if (req->flags & REQ_F_FIXED_FILE)
Jens Axboecf3040c2020-02-06 21:31:40 -07004038 return -EBADF;
Jens Axboeb5dba592019-12-11 14:02:38 -07004039
4040 req->close.fd = READ_ONCE(sqe->fd);
Jens Axboe0f212202020-09-13 13:09:39 -06004041	if (req->file && req->file->f_op == &io_uring_fops)
Jens Axboefd2206e2020-06-02 16:40:47 -06004042 return -EBADF;
4043
Pavel Begunkov3af73b22020-06-08 21:08:17 +03004044 req->close.put_file = NULL;
Jens Axboeb5dba592019-12-11 14:02:38 -07004045 return 0;
4046}
4047
Jens Axboe229a7b62020-06-22 10:13:11 -06004048static int io_close(struct io_kiocb *req, bool force_nonblock,
4049 struct io_comp_state *cs)
Jens Axboeb5dba592019-12-11 14:02:38 -07004050{
Pavel Begunkov3af73b22020-06-08 21:08:17 +03004051 struct io_close *close = &req->close;
Jens Axboeb5dba592019-12-11 14:02:38 -07004052 int ret;
4053
Pavel Begunkov3af73b22020-06-08 21:08:17 +03004054	/* might already be done during nonblock submission */
4055 if (!close->put_file) {
4056 ret = __close_fd_get_file(close->fd, &close->put_file);
4057 if (ret < 0)
4058 return (ret == -ENOENT) ? -EBADF : ret;
4059 }
Jens Axboeb5dba592019-12-11 14:02:38 -07004060
4061 /* if the file has a flush method, be safe and punt to async */
Pavel Begunkov3af73b22020-06-08 21:08:17 +03004062 if (close->put_file->f_op->flush && force_nonblock) {
Pavel Begunkov24c74672020-06-21 13:09:51 +03004063 /* was never set, but play safe */
4064 req->flags &= ~REQ_F_NOWAIT;
Pavel Begunkov0bf0eef2020-05-26 20:34:06 +03004065 /* avoid grabbing files - we don't need the files */
Pavel Begunkov24c74672020-06-21 13:09:51 +03004066 req->flags |= REQ_F_NO_FILE_TABLE;
Pavel Begunkov0bf0eef2020-05-26 20:34:06 +03004067 return -EAGAIN;
Pavel Begunkova2100672020-03-02 23:45:16 +03004068 }
Jens Axboeb5dba592019-12-11 14:02:38 -07004069
Pavel Begunkov3af73b22020-06-08 21:08:17 +03004070 /* No ->flush() or already async, safely close from here */
4071 ret = filp_close(close->put_file, req->work.files);
4072 if (ret < 0)
4073 req_set_fail_links(req);
Pavel Begunkov3af73b22020-06-08 21:08:17 +03004074 fput(close->put_file);
4075 close->put_file = NULL;
Jens Axboe229a7b62020-06-22 10:13:11 -06004076 __io_req_complete(req, ret, 0, cs);
Jens Axboe1a417f42020-01-31 17:16:48 -07004077 return 0;
Jens Axboeb5dba592019-12-11 14:02:38 -07004078}
4079
Jens Axboe3529d8c2019-12-19 18:24:38 -07004080static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
Jens Axboe5d17b4a2019-04-09 14:56:44 -06004081{
4082 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe5d17b4a2019-04-09 14:56:44 -06004083
4084 if (!req->file)
4085 return -EBADF;
Jens Axboe5d17b4a2019-04-09 14:56:44 -06004086
4087 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4088 return -EINVAL;
4089 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
4090 return -EINVAL;
4091
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07004092 req->sync.off = READ_ONCE(sqe->off);
4093 req->sync.len = READ_ONCE(sqe->len);
4094 req->sync.flags = READ_ONCE(sqe->sync_range_flags);
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07004095 return 0;
4096}
4097
Pavel Begunkovac45abc2020-06-08 21:08:18 +03004098static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07004099{
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07004100 int ret;
4101
Pavel Begunkovac45abc2020-06-08 21:08:18 +03004102 /* sync_file_range always requires a blocking context */
4103 if (force_nonblock)
4104 return -EAGAIN;
4105
Jens Axboe9adbd452019-12-20 08:45:55 -07004106 ret = sync_file_range(req->file, req->sync.off, req->sync.len,
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07004107 req->sync.flags);
4108 if (ret < 0)
4109 req_set_fail_links(req);
Jens Axboee1e16092020-06-22 09:17:17 -06004110 io_req_complete(req, ret);
Jens Axboe5d17b4a2019-04-09 14:56:44 -06004111 return 0;
4112}
4113
YueHaibing469956e2020-03-04 15:53:52 +08004114#if defined(CONFIG_NET)
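/*
 * Stash the sendmsg/recvmsg state in the request's async_data so the
 * operation can be retried from a context that may block. Returns
 * -EAGAIN to trigger that retry, or -ENOMEM if the async state could
 * not be allocated (freeing any non-inline iovec first).
 */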
Pavel Begunkov02d27d82020-02-28 10:36:36 +03004115static int io_setup_async_msg(struct io_kiocb *req,
4116 struct io_async_msghdr *kmsg)
4117{
Jens Axboee8c2bc12020-08-15 18:44:09 -07004118 struct io_async_msghdr *async_msg = req->async_data;
4119
4120 if (async_msg)
Pavel Begunkov02d27d82020-02-28 10:36:36 +03004121 return -EAGAIN;
Jens Axboee8c2bc12020-08-15 18:44:09 -07004122 if (io_alloc_async_data(req)) {
Pavel Begunkov02d27d82020-02-28 10:36:36 +03004123 if (kmsg->iov != kmsg->fast_iov)
4124 kfree(kmsg->iov);
4125 return -ENOMEM;
4126 }
Jens Axboee8c2bc12020-08-15 18:44:09 -07004127 async_msg = req->async_data;
Pavel Begunkov02d27d82020-02-28 10:36:36 +03004128 req->flags |= REQ_F_NEED_CLEANUP;
Jens Axboee8c2bc12020-08-15 18:44:09 -07004129 memcpy(async_msg, kmsg, sizeof(*kmsg));
Pavel Begunkov02d27d82020-02-28 10:36:36 +03004130 return -EAGAIN;
4131}
4132
Pavel Begunkov2ae523e2020-07-12 20:41:06 +03004133static int io_sendmsg_copy_hdr(struct io_kiocb *req,
4134 struct io_async_msghdr *iomsg)
4135{
4136 iomsg->iov = iomsg->fast_iov;
4137 iomsg->msg.msg_name = &iomsg->addr;
4138 return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
4139 req->sr_msg.msg_flags, &iomsg->iov);
4140}
4141
Jens Axboe3529d8c2019-12-19 18:24:38 -07004142static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
Jens Axboeaa1fa282019-04-19 13:38:09 -06004143{
Jens Axboee8c2bc12020-08-15 18:44:09 -07004144 struct io_async_msghdr *async_msg = req->async_data;
Jens Axboee47293f2019-12-20 08:58:21 -07004145 struct io_sr_msg *sr = &req->sr_msg;
Pavel Begunkov99bc4c32020-02-07 22:04:45 +03004146 int ret;
Jens Axboe03b12302019-12-02 18:50:25 -07004147
Pavel Begunkovd2b6f482020-06-03 18:03:25 +03004148 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4149 return -EINVAL;
4150
Jens Axboee47293f2019-12-20 08:58:21 -07004151 sr->msg_flags = READ_ONCE(sqe->msg_flags);
Pavel Begunkov270a5942020-07-12 20:41:04 +03004152 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
Jens Axboefddafac2020-01-04 20:19:44 -07004153 sr->len = READ_ONCE(sqe->len);
Jens Axboe3529d8c2019-12-19 18:24:38 -07004154
Jens Axboed8768362020-02-27 14:17:49 -07004155#ifdef CONFIG_COMPAT
4156 if (req->ctx->compat)
4157 sr->msg_flags |= MSG_CMSG_COMPAT;
4158#endif
4159
Jens Axboee8c2bc12020-08-15 18:44:09 -07004160 if (!async_msg || !io_op_defs[req->opcode].needs_async_data)
Jens Axboe3529d8c2019-12-19 18:24:38 -07004161 return 0;
Pavel Begunkov5f798be2020-02-08 13:28:02 +03004162 /* iovec is already imported */
4163 if (req->flags & REQ_F_NEED_CLEANUP)
4164 return 0;
Jens Axboe3529d8c2019-12-19 18:24:38 -07004165
Jens Axboee8c2bc12020-08-15 18:44:09 -07004166 ret = io_sendmsg_copy_hdr(req, async_msg);
Pavel Begunkov99bc4c32020-02-07 22:04:45 +03004167 if (!ret)
4168 req->flags |= REQ_F_NEED_CLEANUP;
4169 return ret;
Jens Axboe03b12302019-12-02 18:50:25 -07004170}
4171
Jens Axboe229a7b62020-06-22 10:13:11 -06004172static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
4173 struct io_comp_state *cs)
Jens Axboe03b12302019-12-02 18:50:25 -07004174{
Pavel Begunkov6b754c82020-07-16 23:28:00 +03004175 struct io_async_msghdr iomsg, *kmsg;
Jens Axboe03b12302019-12-02 18:50:25 -07004176 struct socket *sock;
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004177 unsigned flags;
Jens Axboe03b12302019-12-02 18:50:25 -07004178 int ret;
4179
Jens Axboe03b12302019-12-02 18:50:25 -07004180 sock = sock_from_file(req->file, &ret);
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004181 if (unlikely(!sock))
4182 return ret;
Jens Axboe03b12302019-12-02 18:50:25 -07004183
Jens Axboee8c2bc12020-08-15 18:44:09 -07004184 if (req->async_data) {
4185 kmsg = req->async_data;
4186 kmsg->msg.msg_name = &kmsg->addr;
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004187 /* if iov is set, it's allocated already */
4188 if (!kmsg->iov)
4189 kmsg->iov = kmsg->fast_iov;
4190 kmsg->msg.msg_iter.iov = kmsg->iov;
4191 } else {
4192 ret = io_sendmsg_copy_hdr(req, &iomsg);
Jens Axboefddafac2020-01-04 20:19:44 -07004193 if (ret)
4194 return ret;
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004195 kmsg = &iomsg;
Jens Axboefddafac2020-01-04 20:19:44 -07004196 }
4197
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004198 flags = req->sr_msg.msg_flags;
4199 if (flags & MSG_DONTWAIT)
4200 req->flags |= REQ_F_NOWAIT;
4201 else if (force_nonblock)
4202 flags |= MSG_DONTWAIT;
4203
4204 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
4205 if (force_nonblock && ret == -EAGAIN)
4206 return io_setup_async_msg(req, kmsg);
4207 if (ret == -ERESTARTSYS)
4208 ret = -EINTR;
4209
Pavel Begunkov6b754c82020-07-16 23:28:00 +03004210 if (kmsg->iov != kmsg->fast_iov)
Jens Axboe03b12302019-12-02 18:50:25 -07004211 kfree(kmsg->iov);
4212 req->flags &= ~REQ_F_NEED_CLEANUP;
Jens Axboefddafac2020-01-04 20:19:44 -07004213 if (ret < 0)
4214 req_set_fail_links(req);
Jens Axboe229a7b62020-06-22 10:13:11 -06004215 __io_req_complete(req, ret, 0, cs);
Jens Axboefddafac2020-01-04 20:19:44 -07004216 return 0;
Jens Axboefddafac2020-01-04 20:19:44 -07004217}
4218
Jens Axboe229a7b62020-06-22 10:13:11 -06004219static int io_send(struct io_kiocb *req, bool force_nonblock,
4220 struct io_comp_state *cs)
Jens Axboe03b12302019-12-02 18:50:25 -07004221{
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004222 struct io_sr_msg *sr = &req->sr_msg;
4223 struct msghdr msg;
4224 struct iovec iov;
Jens Axboe03b12302019-12-02 18:50:25 -07004225 struct socket *sock;
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004226 unsigned flags;
Jens Axboe03b12302019-12-02 18:50:25 -07004227 int ret;
4228
4229 sock = sock_from_file(req->file, &ret);
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004230 if (unlikely(!sock))
4231 return ret;
Jens Axboe03b12302019-12-02 18:50:25 -07004232
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004233 ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
4234 if (unlikely(ret))
Zheng Bin14db8412020-09-09 20:12:37 +08004235 return ret;
Jens Axboe03b12302019-12-02 18:50:25 -07004236
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004237 msg.msg_name = NULL;
4238 msg.msg_control = NULL;
4239 msg.msg_controllen = 0;
4240 msg.msg_namelen = 0;
Jens Axboe03b12302019-12-02 18:50:25 -07004241
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004242 flags = req->sr_msg.msg_flags;
4243 if (flags & MSG_DONTWAIT)
4244 req->flags |= REQ_F_NOWAIT;
4245 else if (force_nonblock)
4246 flags |= MSG_DONTWAIT;
Jens Axboe03b12302019-12-02 18:50:25 -07004247
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004248 msg.msg_flags = flags;
4249 ret = sock_sendmsg(sock, &msg);
4250 if (force_nonblock && ret == -EAGAIN)
4251 return -EAGAIN;
4252 if (ret == -ERESTARTSYS)
4253 ret = -EINTR;
Jens Axboe03b12302019-12-02 18:50:25 -07004254
Jens Axboe03b12302019-12-02 18:50:25 -07004255 if (ret < 0)
4256 req_set_fail_links(req);
Jens Axboe229a7b62020-06-22 10:13:11 -06004257 __io_req_complete(req, ret, 0, cs);
Jens Axboe03b12302019-12-02 18:50:25 -07004258 return 0;
Jens Axboe03b12302019-12-02 18:50:25 -07004259}
4260
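/*
 * Copy the user msghdr for recvmsg. With buffer selection
 * (REQ_F_BUFFER_SELECT) only a single iovec is allowed and just its
 * length is recorded; the actual buffer is picked at execution time.
 * Otherwise the full iovec array is imported.
 */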
Pavel Begunkov1400e692020-07-12 20:41:05 +03004261static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
4262 struct io_async_msghdr *iomsg)
Jens Axboe52de1fe2020-02-27 10:15:42 -07004263{
4264 struct io_sr_msg *sr = &req->sr_msg;
4265 struct iovec __user *uiov;
4266 size_t iov_len;
4267 int ret;
4268
Pavel Begunkov1400e692020-07-12 20:41:05 +03004269 ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
4270 &iomsg->uaddr, &uiov, &iov_len);
Jens Axboe52de1fe2020-02-27 10:15:42 -07004271 if (ret)
4272 return ret;
4273
4274 if (req->flags & REQ_F_BUFFER_SELECT) {
4275 if (iov_len > 1)
4276 return -EINVAL;
Pavel Begunkov1400e692020-07-12 20:41:05 +03004277 if (copy_from_user(iomsg->iov, uiov, sizeof(*uiov)))
Jens Axboe52de1fe2020-02-27 10:15:42 -07004278 return -EFAULT;
Pavel Begunkov1400e692020-07-12 20:41:05 +03004279 sr->len = iomsg->iov[0].iov_len;
4280 iov_iter_init(&iomsg->msg.msg_iter, READ, iomsg->iov, 1,
Jens Axboe52de1fe2020-02-27 10:15:42 -07004281 sr->len);
Pavel Begunkov1400e692020-07-12 20:41:05 +03004282 iomsg->iov = NULL;
Jens Axboe52de1fe2020-02-27 10:15:42 -07004283 } else {
4284 ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
Pavel Begunkov1400e692020-07-12 20:41:05 +03004285 &iomsg->iov, &iomsg->msg.msg_iter);
Jens Axboe52de1fe2020-02-27 10:15:42 -07004286 if (ret > 0)
4287 ret = 0;
4288 }
4289
4290 return ret;
4291}
4292
4293#ifdef CONFIG_COMPAT
4294static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
Pavel Begunkov1400e692020-07-12 20:41:05 +03004295 struct io_async_msghdr *iomsg)
Jens Axboe52de1fe2020-02-27 10:15:42 -07004296{
4297 struct compat_msghdr __user *msg_compat;
4298 struct io_sr_msg *sr = &req->sr_msg;
4299 struct compat_iovec __user *uiov;
4300 compat_uptr_t ptr;
4301 compat_size_t len;
4302 int ret;
4303
Pavel Begunkov270a5942020-07-12 20:41:04 +03004304 msg_compat = (struct compat_msghdr __user *) sr->umsg;
Pavel Begunkov1400e692020-07-12 20:41:05 +03004305 ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr,
Jens Axboe52de1fe2020-02-27 10:15:42 -07004306 &ptr, &len);
4307 if (ret)
4308 return ret;
4309
4310 uiov = compat_ptr(ptr);
4311 if (req->flags & REQ_F_BUFFER_SELECT) {
4312 compat_ssize_t clen;
4313
4314 if (len > 1)
4315 return -EINVAL;
4316 if (!access_ok(uiov, sizeof(*uiov)))
4317 return -EFAULT;
4318 if (__get_user(clen, &uiov->iov_len))
4319 return -EFAULT;
4320 if (clen < 0)
4321 return -EINVAL;
Pavel Begunkov1400e692020-07-12 20:41:05 +03004322		sr->len = clen;
4323 iomsg->iov = NULL;
Jens Axboe52de1fe2020-02-27 10:15:42 -07004324 } else {
4325 ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV,
Pavel Begunkov1400e692020-07-12 20:41:05 +03004326 &iomsg->iov,
4327 &iomsg->msg.msg_iter);
Jens Axboe52de1fe2020-02-27 10:15:42 -07004328 if (ret < 0)
4329 return ret;
4330 }
4331
4332 return 0;
4333}
Jens Axboe03b12302019-12-02 18:50:25 -07004334#endif
Jens Axboe52de1fe2020-02-27 10:15:42 -07004335
Pavel Begunkov1400e692020-07-12 20:41:05 +03004336static int io_recvmsg_copy_hdr(struct io_kiocb *req,
4337 struct io_async_msghdr *iomsg)
Jens Axboe52de1fe2020-02-27 10:15:42 -07004338{
Pavel Begunkov1400e692020-07-12 20:41:05 +03004339 iomsg->msg.msg_name = &iomsg->addr;
4340 iomsg->iov = iomsg->fast_iov;
Jens Axboe52de1fe2020-02-27 10:15:42 -07004341
4342#ifdef CONFIG_COMPAT
4343 if (req->ctx->compat)
Pavel Begunkov1400e692020-07-12 20:41:05 +03004344 return __io_compat_recvmsg_copy_hdr(req, iomsg);
Jens Axboe52de1fe2020-02-27 10:15:42 -07004345#endif
4346
Pavel Begunkov1400e692020-07-12 20:41:05 +03004347 return __io_recvmsg_copy_hdr(req, iomsg);
Jens Axboe52de1fe2020-02-27 10:15:42 -07004348}
4349
Jens Axboebcda7ba2020-02-23 16:42:51 -07004350static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
Pavel Begunkov7fbb1b52020-07-16 23:28:05 +03004351 bool needs_lock)
Jens Axboebcda7ba2020-02-23 16:42:51 -07004352{
4353 struct io_sr_msg *sr = &req->sr_msg;
4354 struct io_buffer *kbuf;
4355
Jens Axboebcda7ba2020-02-23 16:42:51 -07004356 kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
4357 if (IS_ERR(kbuf))
4358 return kbuf;
4359
4360 sr->kbuf = kbuf;
4361 req->flags |= REQ_F_BUFFER_SELECTED;
Jens Axboebcda7ba2020-02-23 16:42:51 -07004362 return kbuf;
Jens Axboe03b12302019-12-02 18:50:25 -07004363}
4364
Pavel Begunkov7fbb1b52020-07-16 23:28:05 +03004365static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
4366{
4367 return io_put_kbuf(req, req->sr_msg.kbuf);
4368}
4369
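/*
 * Prep for IORING_OP_RECVMSG: sqe->addr points at the user msghdr,
 * sqe->msg_flags carries the recvmsg flags and sqe->buf_group selects a
 * provided-buffer group when buffer selection is used. The msghdr is
 * copied here only if async data has already been allocated for the
 * request; otherwise it is copied at execution time.
 */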
Jens Axboe3529d8c2019-12-19 18:24:38 -07004370static int io_recvmsg_prep(struct io_kiocb *req,
4371 const struct io_uring_sqe *sqe)
Jens Axboe03b12302019-12-02 18:50:25 -07004372{
Jens Axboee8c2bc12020-08-15 18:44:09 -07004373 struct io_async_msghdr *async_msg = req->async_data;
Jens Axboee47293f2019-12-20 08:58:21 -07004374 struct io_sr_msg *sr = &req->sr_msg;
Pavel Begunkov99bc4c32020-02-07 22:04:45 +03004375 int ret;
Jens Axboe06b76d42019-12-19 14:44:26 -07004376
Pavel Begunkovd2b6f482020-06-03 18:03:25 +03004377 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4378 return -EINVAL;
4379
Jens Axboe3529d8c2019-12-19 18:24:38 -07004380 sr->msg_flags = READ_ONCE(sqe->msg_flags);
Pavel Begunkov270a5942020-07-12 20:41:04 +03004381 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
Jens Axboe0b7b21e2020-01-31 08:34:59 -07004382 sr->len = READ_ONCE(sqe->len);
Jens Axboebcda7ba2020-02-23 16:42:51 -07004383 sr->bgid = READ_ONCE(sqe->buf_group);
Jens Axboe3529d8c2019-12-19 18:24:38 -07004384
Jens Axboed8768362020-02-27 14:17:49 -07004385#ifdef CONFIG_COMPAT
4386 if (req->ctx->compat)
4387 sr->msg_flags |= MSG_CMSG_COMPAT;
4388#endif
4389
Jens Axboee8c2bc12020-08-15 18:44:09 -07004390 if (!async_msg || !io_op_defs[req->opcode].needs_async_data)
Jens Axboe06b76d42019-12-19 14:44:26 -07004391 return 0;
Pavel Begunkov5f798be2020-02-08 13:28:02 +03004392 /* iovec is already imported */
4393 if (req->flags & REQ_F_NEED_CLEANUP)
4394 return 0;
Jens Axboe03b12302019-12-02 18:50:25 -07004395
Jens Axboee8c2bc12020-08-15 18:44:09 -07004396 ret = io_recvmsg_copy_hdr(req, async_msg);
Pavel Begunkov99bc4c32020-02-07 22:04:45 +03004397 if (!ret)
4398 req->flags |= REQ_F_NEED_CLEANUP;
4399 return ret;
Jens Axboe03b12302019-12-02 18:50:25 -07004400}
4401
Jens Axboe229a7b62020-06-22 10:13:11 -06004402static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
4403 struct io_comp_state *cs)
Jens Axboe03b12302019-12-02 18:50:25 -07004404{
Pavel Begunkov6b754c82020-07-16 23:28:00 +03004405 struct io_async_msghdr iomsg, *kmsg;
Jens Axboe0fa03c62019-04-19 13:34:07 -06004406 struct socket *sock;
Pavel Begunkov7fbb1b52020-07-16 23:28:05 +03004407 struct io_buffer *kbuf;
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004408 unsigned flags;
Jens Axboe52de1fe2020-02-27 10:15:42 -07004409 int ret, cflags = 0;
Jens Axboe0fa03c62019-04-19 13:34:07 -06004410
Jens Axboe0fa03c62019-04-19 13:34:07 -06004411 sock = sock_from_file(req->file, &ret);
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004412 if (unlikely(!sock))
4413 return ret;
Jens Axboe0fa03c62019-04-19 13:34:07 -06004414
Jens Axboee8c2bc12020-08-15 18:44:09 -07004415 if (req->async_data) {
4416 kmsg = req->async_data;
4417 kmsg->msg.msg_name = &kmsg->addr;
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004418 /* if iov is set, it's allocated already */
4419 if (!kmsg->iov)
4420 kmsg->iov = kmsg->fast_iov;
4421 kmsg->msg.msg_iter.iov = kmsg->iov;
4422 } else {
4423 ret = io_recvmsg_copy_hdr(req, &iomsg);
4424 if (ret)
Pavel Begunkov681fda82020-07-15 22:20:45 +03004425 return ret;
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004426 kmsg = &iomsg;
Jens Axboe0fa03c62019-04-19 13:34:07 -06004427 }
4428
Pavel Begunkovbc02ef32020-07-16 23:28:03 +03004429 if (req->flags & REQ_F_BUFFER_SELECT) {
Pavel Begunkov7fbb1b52020-07-16 23:28:05 +03004430 kbuf = io_recv_buffer_select(req, !force_nonblock);
Pavel Begunkovbc02ef32020-07-16 23:28:03 +03004431 if (IS_ERR(kbuf))
4432 return PTR_ERR(kbuf);
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004433 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
4434 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov,
4435 1, req->sr_msg.len);
4436 }
4437
4438 flags = req->sr_msg.msg_flags;
4439 if (flags & MSG_DONTWAIT)
4440 req->flags |= REQ_F_NOWAIT;
4441 else if (force_nonblock)
4442 flags |= MSG_DONTWAIT;
4443
4444 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
4445 kmsg->uaddr, flags);
Pavel Begunkov0e1b6fe2020-07-16 23:28:02 +03004446 if (force_nonblock && ret == -EAGAIN)
4447 return io_setup_async_msg(req, kmsg);
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004448 if (ret == -ERESTARTSYS)
4449 ret = -EINTR;
Pavel Begunkov0e1b6fe2020-07-16 23:28:02 +03004450
Pavel Begunkov7fbb1b52020-07-16 23:28:05 +03004451 if (req->flags & REQ_F_BUFFER_SELECTED)
4452 cflags = io_put_recv_kbuf(req);
Pavel Begunkov6b754c82020-07-16 23:28:00 +03004453 if (kmsg->iov != kmsg->fast_iov)
Jens Axboe0b416c32019-12-15 10:57:46 -07004454 kfree(kmsg->iov);
Pavel Begunkov99bc4c32020-02-07 22:04:45 +03004455 req->flags &= ~REQ_F_NEED_CLEANUP;
Jens Axboe4e88d6e2019-12-07 20:59:47 -07004456 if (ret < 0)
4457 req_set_fail_links(req);
Jens Axboe229a7b62020-06-22 10:13:11 -06004458 __io_req_complete(req, ret, cflags, cs);
Jens Axboe0fa03c62019-04-19 13:34:07 -06004459 return 0;
Jens Axboe0fa03c62019-04-19 13:34:07 -06004460}
4461
Jens Axboe229a7b62020-06-22 10:13:11 -06004462static int io_recv(struct io_kiocb *req, bool force_nonblock,
4463 struct io_comp_state *cs)
Jens Axboefddafac2020-01-04 20:19:44 -07004464{
Pavel Begunkov6b754c82020-07-16 23:28:00 +03004465 struct io_buffer *kbuf;
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004466 struct io_sr_msg *sr = &req->sr_msg;
4467 struct msghdr msg;
4468 void __user *buf = sr->buf;
Jens Axboefddafac2020-01-04 20:19:44 -07004469 struct socket *sock;
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004470 struct iovec iov;
4471 unsigned flags;
Jens Axboebcda7ba2020-02-23 16:42:51 -07004472 int ret, cflags = 0;
Jens Axboefddafac2020-01-04 20:19:44 -07004473
Jens Axboefddafac2020-01-04 20:19:44 -07004474 sock = sock_from_file(req->file, &ret);
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004475 if (unlikely(!sock))
4476 return ret;
Jens Axboefddafac2020-01-04 20:19:44 -07004477
Pavel Begunkovbc02ef32020-07-16 23:28:03 +03004478 if (req->flags & REQ_F_BUFFER_SELECT) {
Pavel Begunkov7fbb1b52020-07-16 23:28:05 +03004479 kbuf = io_recv_buffer_select(req, !force_nonblock);
Jens Axboebcda7ba2020-02-23 16:42:51 -07004480 if (IS_ERR(kbuf))
4481 return PTR_ERR(kbuf);
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004482 buf = u64_to_user_ptr(kbuf->addr);
Jens Axboefddafac2020-01-04 20:19:44 -07004483 }
4484
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004485 ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
Pavel Begunkov14c32ee2020-07-16 23:28:01 +03004486 if (unlikely(ret))
4487 goto out_free;
Jens Axboefddafac2020-01-04 20:19:44 -07004488
Pavel Begunkov7a7cacb2020-07-16 23:27:59 +03004489 msg.msg_name = NULL;
4490 msg.msg_control = NULL;
4491 msg.msg_controllen = 0;
4492 msg.msg_namelen = 0;
4493 msg.msg_iocb = NULL;
4494 msg.msg_flags = 0;
4495
4496 flags = req->sr_msg.msg_flags;
4497 if (flags & MSG_DONTWAIT)
4498 req->flags |= REQ_F_NOWAIT;
4499 else if (force_nonblock)
4500 flags |= MSG_DONTWAIT;
4501
4502 ret = sock_recvmsg(sock, &msg, flags);
4503 if (force_nonblock && ret == -EAGAIN)
4504 return -EAGAIN;
4505 if (ret == -ERESTARTSYS)
4506 ret = -EINTR;
Pavel Begunkov14c32ee2020-07-16 23:28:01 +03004507out_free:
Pavel Begunkov7fbb1b52020-07-16 23:28:05 +03004508 if (req->flags & REQ_F_BUFFER_SELECTED)
4509 cflags = io_put_recv_kbuf(req);
Jens Axboefddafac2020-01-04 20:19:44 -07004510 if (ret < 0)
4511 req_set_fail_links(req);
Jens Axboe229a7b62020-06-22 10:13:11 -06004512 __io_req_complete(req, ret, cflags, cs);
Jens Axboefddafac2020-01-04 20:19:44 -07004513 return 0;
Jens Axboefddafac2020-01-04 20:19:44 -07004514}
4515
Jens Axboe3529d8c2019-12-19 18:24:38 -07004516static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
Jens Axboe17f2fe32019-10-17 14:42:58 -06004517{
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07004518 struct io_accept *accept = &req->accept;
4519
Jens Axboe17f2fe32019-10-17 14:42:58 -06004520 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
4521 return -EINVAL;
Hrvoje Zeba8042d6c2019-11-25 14:40:22 -05004522 if (sqe->ioprio || sqe->len || sqe->buf_index)
Jens Axboe17f2fe32019-10-17 14:42:58 -06004523 return -EINVAL;
4524
Jens Axboed55e5f52019-12-11 16:12:15 -07004525 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4526 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07004527 accept->flags = READ_ONCE(sqe->accept_flags);
Jens Axboe09952e32020-03-19 20:16:56 -06004528 accept->nofile = rlimit(RLIMIT_NOFILE);
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07004529 return 0;
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07004530}
Jens Axboe17f2fe32019-10-17 14:42:58 -06004531
Jens Axboe229a7b62020-06-22 10:13:11 -06004532static int io_accept(struct io_kiocb *req, bool force_nonblock,
4533 struct io_comp_state *cs)
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07004534{
4535 struct io_accept *accept = &req->accept;
Pavel Begunkovac45abc2020-06-08 21:08:18 +03004536 unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07004537 int ret;
4538
Jiufei Xuee697dee2020-06-10 13:41:59 +08004539 if (req->file->f_flags & O_NONBLOCK)
4540 req->flags |= REQ_F_NOWAIT;
4541
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07004542 ret = __sys_accept4_file(req->file, file_flags, accept->addr,
Jens Axboe09952e32020-03-19 20:16:56 -06004543 accept->addr_len, accept->flags,
4544 accept->nofile);
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07004545 if (ret == -EAGAIN && force_nonblock)
Jens Axboe17f2fe32019-10-17 14:42:58 -06004546 return -EAGAIN;
Pavel Begunkovac45abc2020-06-08 21:08:18 +03004547 if (ret < 0) {
4548 if (ret == -ERESTARTSYS)
4549 ret = -EINTR;
Jens Axboe4e88d6e2019-12-07 20:59:47 -07004550 req_set_fail_links(req);
Pavel Begunkovac45abc2020-06-08 21:08:18 +03004551 }
Jens Axboe229a7b62020-06-22 10:13:11 -06004552 __io_req_complete(req, ret, 0, cs);
Jens Axboe17f2fe32019-10-17 14:42:58 -06004553 return 0;
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07004554}
4555
Jens Axboe3529d8c2019-12-19 18:24:38 -07004556static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
Jens Axboef499a022019-12-02 16:28:46 -07004557{
Jens Axboe3529d8c2019-12-19 18:24:38 -07004558 struct io_connect *conn = &req->connect;
Jens Axboee8c2bc12020-08-15 18:44:09 -07004559 struct io_async_connect *io = req->async_data;
Jens Axboef499a022019-12-02 16:28:46 -07004560
Jens Axboe3fbb51c2019-12-20 08:51:52 -07004561 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
4562 return -EINVAL;
4563 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
4564 return -EINVAL;
4565
Jens Axboe3529d8c2019-12-19 18:24:38 -07004566 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4567 conn->addr_len = READ_ONCE(sqe->addr2);
4568
4569 if (!io)
4570 return 0;
4571
4572 return move_addr_to_kernel(conn->addr, conn->addr_len,
Jens Axboee8c2bc12020-08-15 18:44:09 -07004573 &io->address);
Jens Axboef499a022019-12-02 16:28:46 -07004574}
4575
Jens Axboe229a7b62020-06-22 10:13:11 -06004576static int io_connect(struct io_kiocb *req, bool force_nonblock,
4577 struct io_comp_state *cs)
Jens Axboef8e85cf2019-11-23 14:24:24 -07004578{
Jens Axboee8c2bc12020-08-15 18:44:09 -07004579 struct io_async_connect __io, *io;
Jens Axboef8e85cf2019-11-23 14:24:24 -07004580 unsigned file_flags;
Jens Axboe3fbb51c2019-12-20 08:51:52 -07004581 int ret;
Jens Axboef8e85cf2019-11-23 14:24:24 -07004582
Jens Axboee8c2bc12020-08-15 18:44:09 -07004583 if (req->async_data) {
4584 io = req->async_data;
Jens Axboef499a022019-12-02 16:28:46 -07004585 } else {
Jens Axboe3529d8c2019-12-19 18:24:38 -07004586 ret = move_addr_to_kernel(req->connect.addr,
4587 req->connect.addr_len,
Jens Axboee8c2bc12020-08-15 18:44:09 -07004588 &__io.address);
Jens Axboef499a022019-12-02 16:28:46 -07004589 if (ret)
4590 goto out;
4591 io = &__io;
4592 }
4593
Jens Axboe3fbb51c2019-12-20 08:51:52 -07004594 file_flags = force_nonblock ? O_NONBLOCK : 0;
4595
Jens Axboee8c2bc12020-08-15 18:44:09 -07004596 ret = __sys_connect_file(req->file, &io->address,
Jens Axboe3fbb51c2019-12-20 08:51:52 -07004597 req->connect.addr_len, file_flags);
Jens Axboe87f80d62019-12-03 11:23:54 -07004598 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
Jens Axboee8c2bc12020-08-15 18:44:09 -07004599 if (req->async_data)
Jens Axboeb7bb4f72019-12-15 22:13:43 -07004600 return -EAGAIN;
Jens Axboee8c2bc12020-08-15 18:44:09 -07004601 if (io_alloc_async_data(req)) {
Jens Axboef499a022019-12-02 16:28:46 -07004602 ret = -ENOMEM;
4603 goto out;
4604 }
Jens Axboee8c2bc12020-08-15 18:44:09 -07004605 io = req->async_data;
4606 memcpy(req->async_data, &__io, sizeof(__io));
Jens Axboef8e85cf2019-11-23 14:24:24 -07004607 return -EAGAIN;
Jens Axboef499a022019-12-02 16:28:46 -07004608 }
Jens Axboef8e85cf2019-11-23 14:24:24 -07004609 if (ret == -ERESTARTSYS)
4610 ret = -EINTR;
Jens Axboef499a022019-12-02 16:28:46 -07004611out:
Jens Axboe4e88d6e2019-12-07 20:59:47 -07004612 if (ret < 0)
4613 req_set_fail_links(req);
Jens Axboe229a7b62020-06-22 10:13:11 -06004614 __io_req_complete(req, ret, 0, cs);
Jens Axboef8e85cf2019-11-23 14:24:24 -07004615 return 0;
Jens Axboef8e85cf2019-11-23 14:24:24 -07004616}
YueHaibing469956e2020-03-04 15:53:52 +08004617#else /* !CONFIG_NET */
4618static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4619{
Jens Axboef8e85cf2019-11-23 14:24:24 -07004620 return -EOPNOTSUPP;
Jens Axboef8e85cf2019-11-23 14:24:24 -07004621}
4622
Randy Dunlap1e16c2f2020-06-26 16:32:50 -07004623static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
4624 struct io_comp_state *cs)
Jens Axboe221c5eb2019-01-17 09:41:58 -07004625{
YueHaibing469956e2020-03-04 15:53:52 +08004626 return -EOPNOTSUPP;
4627}
4628
Randy Dunlap1e16c2f2020-06-26 16:32:50 -07004629static int io_send(struct io_kiocb *req, bool force_nonblock,
4630 struct io_comp_state *cs)
YueHaibing469956e2020-03-04 15:53:52 +08004631{
4632 return -EOPNOTSUPP;
4633}
4634
4635static int io_recvmsg_prep(struct io_kiocb *req,
4636 const struct io_uring_sqe *sqe)
4637{
4638 return -EOPNOTSUPP;
4639}
4640
Randy Dunlap1e16c2f2020-06-26 16:32:50 -07004641static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
4642 struct io_comp_state *cs)
YueHaibing469956e2020-03-04 15:53:52 +08004643{
4644 return -EOPNOTSUPP;
4645}
4646
Randy Dunlap1e16c2f2020-06-26 16:32:50 -07004647static int io_recv(struct io_kiocb *req, bool force_nonblock,
4648 struct io_comp_state *cs)
YueHaibing469956e2020-03-04 15:53:52 +08004649{
4650 return -EOPNOTSUPP;
4651}
4652
4653static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4654{
4655 return -EOPNOTSUPP;
4656}
4657
Randy Dunlap1e16c2f2020-06-26 16:32:50 -07004658static int io_accept(struct io_kiocb *req, bool force_nonblock,
4659 struct io_comp_state *cs)
YueHaibing469956e2020-03-04 15:53:52 +08004660{
4661 return -EOPNOTSUPP;
4662}
4663
4664static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4665{
4666 return -EOPNOTSUPP;
4667}
4668
Randy Dunlap1e16c2f2020-06-26 16:32:50 -07004669static int io_connect(struct io_kiocb *req, bool force_nonblock,
4670 struct io_comp_state *cs)
YueHaibing469956e2020-03-04 15:53:52 +08004671{
4672 return -EOPNOTSUPP;
4673}
4674#endif /* CONFIG_NET */
Jens Axboe2b188cc2019-01-07 10:46:33 -07004675
Jens Axboed7718a92020-02-14 22:23:12 -07004676struct io_poll_table {
4677 struct poll_table_struct pt;
4678 struct io_kiocb *req;
4679 int error;
4680};
4681
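/*
 * Common wakeup path for poll requests and poll-driven async retries:
 * on a matching event, detach from the waitqueue and schedule the given
 * handler via task_work so completion runs in task context. If the task
 * is exiting and task_work can't be added, the poll is marked canceled
 * and the work is punted to the io-wq task instead.
 */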
Jens Axboed7718a92020-02-14 22:23:12 -07004682static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
4683 __poll_t mask, task_work_func_t func)
4684{
Jens Axboefd7d6de2020-08-23 11:00:37 -06004685 bool twa_signal_ok;
Jens Axboeaa96bf82020-04-03 11:26:26 -06004686 int ret;
Jens Axboed7718a92020-02-14 22:23:12 -07004687
4688 /* for instances that support it check for an event match first: */
4689 if (mask && !(mask & poll->events))
4690 return 0;
4691
4692 trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
4693
4694 list_del_init(&poll->wait.entry);
4695
Jens Axboed7718a92020-02-14 22:23:12 -07004696 req->result = mask;
4697 init_task_work(&req->task_work, func);
Jens Axboe6d816e02020-08-11 08:04:14 -06004698 percpu_ref_get(&req->ctx->refs);
4699
Jens Axboed7718a92020-02-14 22:23:12 -07004700 /*
Jens Axboefd7d6de2020-08-23 11:00:37 -06004701	 * If we're using the signalfd wait_queue_head for this wakeup, then
4702	 * it's not safe to use TWA_SIGNAL, as we could be recursing on the
4703	 * tsk->sighand->siglock when doing the wakeup. It should not be
4704	 * needed either, as the normal wakeup will suffice.
4705 */
4706 twa_signal_ok = (poll->head != &req->task->sighand->signalfd_wqh);
4707
4708 /*
Jens Axboee3aabf92020-05-18 11:04:17 -06004709 * If this fails, then the task is exiting. When a task exits, the
4710 * work gets canceled, so just cancel this request as well instead
4711 * of executing it. We can't safely execute it anyway, as we may not
4712	 * have the state needed for it.
Jens Axboed7718a92020-02-14 22:23:12 -07004713 */
Jens Axboefd7d6de2020-08-23 11:00:37 -06004714 ret = io_req_task_work_add(req, &req->task_work, twa_signal_ok);
Jens Axboeaa96bf82020-04-03 11:26:26 -06004715 if (unlikely(ret)) {
Jens Axboec2c4c832020-07-01 15:37:11 -06004716 struct task_struct *tsk;
4717
Jens Axboee3aabf92020-05-18 11:04:17 -06004718 WRITE_ONCE(poll->canceled, true);
Jens Axboeaa96bf82020-04-03 11:26:26 -06004719 tsk = io_wq_get_task(req->ctx->io_wq);
Jens Axboece593a62020-06-30 12:39:05 -06004720 task_work_add(tsk, &req->task_work, 0);
4721 wake_up_process(tsk);
Jens Axboeaa96bf82020-04-03 11:26:26 -06004722 }
Jens Axboed7718a92020-02-14 22:23:12 -07004723 return 1;
4724}
4725
Jens Axboe74ce6ce2020-04-13 11:09:12 -06004726static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
4727 __acquires(&req->ctx->completion_lock)
4728{
4729 struct io_ring_ctx *ctx = req->ctx;
4730
4731 if (!req->result && !READ_ONCE(poll->canceled)) {
4732 struct poll_table_struct pt = { ._key = poll->events };
4733
4734 req->result = vfs_poll(req->file, &pt) & poll->events;
4735 }
4736
4737 spin_lock_irq(&ctx->completion_lock);
4738 if (!req->result && !READ_ONCE(poll->canceled)) {
4739 add_wait_queue(poll->head, &poll->wait);
4740 return true;
4741 }
4742
4743 return false;
4744}
4745
Jens Axboed4e7cd32020-08-15 11:44:50 -07004746static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
Jens Axboe18bceab2020-05-15 11:56:54 -06004747{
Jens Axboee8c2bc12020-08-15 18:44:09 -07004748 /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
Jens Axboed4e7cd32020-08-15 11:44:50 -07004749 if (req->opcode == IORING_OP_POLL_ADD)
Jens Axboee8c2bc12020-08-15 18:44:09 -07004750 return req->async_data;
Jens Axboed4e7cd32020-08-15 11:44:50 -07004751 return req->apoll->double_poll;
4752}
4753
4754static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
4755{
4756 if (req->opcode == IORING_OP_POLL_ADD)
4757 return &req->poll;
4758 return &req->apoll->poll;
4759}
4760
4761static void io_poll_remove_double(struct io_kiocb *req)
4762{
4763 struct io_poll_iocb *poll = io_poll_get_double(req);
Jens Axboe18bceab2020-05-15 11:56:54 -06004764
4765 lockdep_assert_held(&req->ctx->completion_lock);
4766
4767 if (poll && poll->head) {
4768 struct wait_queue_head *head = poll->head;
4769
4770 spin_lock(&head->lock);
4771 list_del_init(&poll->wait.entry);
4772 if (poll->wait.private)
4773 refcount_dec(&req->refs);
4774 poll->head = NULL;
4775 spin_unlock(&head->lock);
4776 }
4777}
4778
4779static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
4780{
4781 struct io_ring_ctx *ctx = req->ctx;
4782
Jens Axboed4e7cd32020-08-15 11:44:50 -07004783 io_poll_remove_double(req);
Jens Axboe18bceab2020-05-15 11:56:54 -06004784 req->poll.done = true;
4785 io_cqring_fill_event(req, error ? error : mangle_poll(mask));
4786 io_commit_cqring(ctx);
4787}
4788
4789static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
4790{
4791 struct io_ring_ctx *ctx = req->ctx;
4792
4793 if (io_poll_rewait(req, &req->poll)) {
4794 spin_unlock_irq(&ctx->completion_lock);
4795 return;
4796 }
4797
4798 hash_del(&req->hash_node);
4799 io_poll_complete(req, req->result, 0);
4800 req->flags |= REQ_F_COMP_LOCKED;
Pavel Begunkov9b5f7bd2020-06-29 13:13:00 +03004801 *nxt = io_put_req_find_next(req);
Jens Axboe18bceab2020-05-15 11:56:54 -06004802 spin_unlock_irq(&ctx->completion_lock);
4803
4804 io_cqring_ev_posted(ctx);
4805}
4806
4807static void io_poll_task_func(struct callback_head *cb)
4808{
4809 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
Jens Axboe6d816e02020-08-11 08:04:14 -06004810 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe18bceab2020-05-15 11:56:54 -06004811 struct io_kiocb *nxt = NULL;
4812
4813 io_poll_task_handler(req, &nxt);
Pavel Begunkovea1164e2020-06-30 15:20:41 +03004814 if (nxt)
4815 __io_req_task_submit(nxt);
Jens Axboe6d816e02020-08-11 08:04:14 -06004816 percpu_ref_put(&ctx->refs);
Jens Axboe18bceab2020-05-15 11:56:54 -06004817}
4818
4819static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
4820 int sync, void *key)
4821{
4822 struct io_kiocb *req = wait->private;
Jens Axboed4e7cd32020-08-15 11:44:50 -07004823 struct io_poll_iocb *poll = io_poll_get_single(req);
Jens Axboe18bceab2020-05-15 11:56:54 -06004824 __poll_t mask = key_to_poll(key);
4825
4826 /* for instances that support it check for an event match first: */
4827 if (mask && !(mask & poll->events))
4828 return 0;
4829
Jens Axboe8706e042020-09-28 08:38:54 -06004830 list_del_init(&wait->entry);
4831
Jens Axboe807abcb2020-07-17 17:09:27 -06004832 if (poll && poll->head) {
Jens Axboe18bceab2020-05-15 11:56:54 -06004833 bool done;
4834
Jens Axboe807abcb2020-07-17 17:09:27 -06004835 spin_lock(&poll->head->lock);
4836 done = list_empty(&poll->wait.entry);
Jens Axboe18bceab2020-05-15 11:56:54 -06004837 if (!done)
Jens Axboe807abcb2020-07-17 17:09:27 -06004838 list_del_init(&poll->wait.entry);
Jens Axboed4e7cd32020-08-15 11:44:50 -07004839 /* make sure double remove sees this as being gone */
4840 wait->private = NULL;
Jens Axboe807abcb2020-07-17 17:09:27 -06004841 spin_unlock(&poll->head->lock);
Jens Axboe18bceab2020-05-15 11:56:54 -06004842 if (!done)
4843 __io_async_wake(req, poll, mask, io_poll_task_func);
4844 }
4845 refcount_dec(&req->refs);
4846 return 1;
4847}
4848
4849static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
4850 wait_queue_func_t wake_func)
4851{
4852 poll->head = NULL;
4853 poll->done = false;
4854 poll->canceled = false;
4855 poll->events = events;
4856 INIT_LIST_HEAD(&poll->wait.entry);
4857 init_waitqueue_func_entry(&poll->wait, wake_func);
4858}
4859
4860static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
Jens Axboe807abcb2020-07-17 17:09:27 -06004861 struct wait_queue_head *head,
4862 struct io_poll_iocb **poll_ptr)
Jens Axboe18bceab2020-05-15 11:56:54 -06004863{
4864 struct io_kiocb *req = pt->req;
4865
4866 /*
4867 * If poll->head is already set, it's because the file being polled
4868	 * uses multiple waitqueues for poll handling (e.g. one for read, one
4869	 * for write). Set up a separate io_poll_iocb if this happens.
4870 */
4871 if (unlikely(poll->head)) {
4872 /* already have a 2nd entry, fail a third attempt */
Jens Axboe807abcb2020-07-17 17:09:27 -06004873 if (*poll_ptr) {
Jens Axboe18bceab2020-05-15 11:56:54 -06004874 pt->error = -EINVAL;
4875 return;
4876 }
4877 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
4878 if (!poll) {
4879 pt->error = -ENOMEM;
4880 return;
4881 }
4882 io_init_poll_iocb(poll, req->poll.events, io_poll_double_wake);
4883 refcount_inc(&req->refs);
4884 poll->wait.private = req;
Jens Axboe807abcb2020-07-17 17:09:27 -06004885 *poll_ptr = poll;
Jens Axboe18bceab2020-05-15 11:56:54 -06004886 }
4887
4888 pt->error = 0;
4889 poll->head = head;
Jiufei Xuea31eb4a2020-06-17 17:53:56 +08004890
4891 if (poll->events & EPOLLEXCLUSIVE)
4892 add_wait_queue_exclusive(head, &poll->wait);
4893 else
4894 add_wait_queue(head, &poll->wait);
Jens Axboe18bceab2020-05-15 11:56:54 -06004895}
4896
4897static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
4898 struct poll_table_struct *p)
4899{
4900 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
Jens Axboe807abcb2020-07-17 17:09:27 -06004901 struct async_poll *apoll = pt->req->apoll;
Jens Axboe18bceab2020-05-15 11:56:54 -06004902
Jens Axboe807abcb2020-07-17 17:09:27 -06004903 __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
Jens Axboe18bceab2020-05-15 11:56:54 -06004904}
4905
Jens Axboed7718a92020-02-14 22:23:12 -07004906static void io_async_task_func(struct callback_head *cb)
4907{
4908 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4909 struct async_poll *apoll = req->apoll;
4910 struct io_ring_ctx *ctx = req->ctx;
4911
4912 trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
4913
Jens Axboe74ce6ce2020-04-13 11:09:12 -06004914 if (io_poll_rewait(req, &apoll->poll)) {
Jens Axboed7718a92020-02-14 22:23:12 -07004915 spin_unlock_irq(&ctx->completion_lock);
Jens Axboe6d816e02020-08-11 08:04:14 -06004916 percpu_ref_put(&ctx->refs);
Jens Axboe74ce6ce2020-04-13 11:09:12 -06004917 return;
Jens Axboed7718a92020-02-14 22:23:12 -07004918 }
4919
Jens Axboe31067252020-05-17 17:43:31 -06004920 /* If req is still hashed, it cannot have been canceled. Don't check. */
Pavel Begunkov0be0b0e2020-06-30 15:20:42 +03004921 if (hash_hashed(&req->hash_node))
Jens Axboe74ce6ce2020-04-13 11:09:12 -06004922 hash_del(&req->hash_node);
Jens Axboe2bae0472020-04-13 11:16:34 -06004923
Jens Axboed4e7cd32020-08-15 11:44:50 -07004924 io_poll_remove_double(req);
Jens Axboe74ce6ce2020-04-13 11:09:12 -06004925 spin_unlock_irq(&ctx->completion_lock);
4926
Pavel Begunkov0be0b0e2020-06-30 15:20:42 +03004927 if (!READ_ONCE(apoll->poll.canceled))
4928 __io_req_task_submit(req);
4929 else
4930 __io_req_task_cancel(req, -ECANCELED);
Dan Carpenteraa340842020-07-08 21:47:11 +03004931
Jens Axboe6d816e02020-08-11 08:04:14 -06004932 percpu_ref_put(&ctx->refs);
Jens Axboe807abcb2020-07-17 17:09:27 -06004933 kfree(apoll->double_poll);
Jens Axboe31067252020-05-17 17:43:31 -06004934 kfree(apoll);
Jens Axboed7718a92020-02-14 22:23:12 -07004935}
4936
4937static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
4938 void *key)
4939{
4940 struct io_kiocb *req = wait->private;
4941 struct io_poll_iocb *poll = &req->apoll->poll;
4942
4943 trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
4944 key_to_poll(key));
4945
4946 return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
4947}
4948
4949static void io_poll_req_insert(struct io_kiocb *req)
4950{
4951 struct io_ring_ctx *ctx = req->ctx;
4952 struct hlist_head *list;
4953
4954 list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
4955 hlist_add_head(&req->hash_node, list);
4956}
4957
4958static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
4959 struct io_poll_iocb *poll,
4960 struct io_poll_table *ipt, __poll_t mask,
4961 wait_queue_func_t wake_func)
4962 __acquires(&ctx->completion_lock)
4963{
4964 struct io_ring_ctx *ctx = req->ctx;
4965 bool cancel = false;
4966
Jens Axboe18bceab2020-05-15 11:56:54 -06004967 io_init_poll_iocb(poll, mask, wake_func);
Pavel Begunkovb90cd192020-06-21 13:09:52 +03004968 poll->file = req->file;
Jens Axboe18bceab2020-05-15 11:56:54 -06004969 poll->wait.private = req;
Jens Axboed7718a92020-02-14 22:23:12 -07004970
4971 ipt->pt._key = mask;
4972 ipt->req = req;
4973 ipt->error = -EINVAL;
4974
Jens Axboed7718a92020-02-14 22:23:12 -07004975 mask = vfs_poll(req->file, &ipt->pt) & poll->events;
4976
4977 spin_lock_irq(&ctx->completion_lock);
4978 if (likely(poll->head)) {
4979 spin_lock(&poll->head->lock);
4980 if (unlikely(list_empty(&poll->wait.entry))) {
4981 if (ipt->error)
4982 cancel = true;
4983 ipt->error = 0;
4984 mask = 0;
4985 }
4986 if (mask || ipt->error)
4987 list_del_init(&poll->wait.entry);
4988 else if (cancel)
4989 WRITE_ONCE(poll->canceled, true);
4990 else if (!poll->done) /* actually waiting for an event */
4991 io_poll_req_insert(req);
4992 spin_unlock(&poll->head->lock);
4993 }
4994
4995 return mask;
4996}
4997
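/*
 * Instead of punting a request that would block to an io-wq worker, try
 * to arm poll on its file: if the file can be polled, the opcode
 * declares a poll direction (pollin/pollout) and a nonblock attempt
 * makes sense for the file, wait for readiness and resubmit from
 * task_work. Returns true only if the request is now parked waiting for
 * a readiness event.
 */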
4998static bool io_arm_poll_handler(struct io_kiocb *req)
4999{
5000 const struct io_op_def *def = &io_op_defs[req->opcode];
5001 struct io_ring_ctx *ctx = req->ctx;
5002 struct async_poll *apoll;
5003 struct io_poll_table ipt;
5004 __poll_t mask, ret;
Jens Axboe9dab14b2020-08-25 12:27:50 -06005005 int rw;
Jens Axboed7718a92020-02-14 22:23:12 -07005006
5007 if (!req->file || !file_can_poll(req->file))
5008 return false;
Pavel Begunkov24c74672020-06-21 13:09:51 +03005009 if (req->flags & REQ_F_POLLED)
Jens Axboed7718a92020-02-14 22:23:12 -07005010 return false;
Jens Axboe9dab14b2020-08-25 12:27:50 -06005011 if (def->pollin)
5012 rw = READ;
5013 else if (def->pollout)
5014 rw = WRITE;
5015 else
5016 return false;
5017	/* if a nonblocking attempt isn't possible, no point in arming a poll handler */
5018 if (!io_file_supports_async(req->file, rw))
Jens Axboed7718a92020-02-14 22:23:12 -07005019 return false;
5020
5021 apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
5022 if (unlikely(!apoll))
5023 return false;
Jens Axboe807abcb2020-07-17 17:09:27 -06005024 apoll->double_poll = NULL;
Jens Axboed7718a92020-02-14 22:23:12 -07005025
5026 req->flags |= REQ_F_POLLED;
Jens Axboed7718a92020-02-14 22:23:12 -07005027 req->apoll = apoll;
5028 INIT_HLIST_NODE(&req->hash_node);
5029
Nathan Chancellor8755d972020-03-02 16:01:19 -07005030 mask = 0;
Jens Axboed7718a92020-02-14 22:23:12 -07005031 if (def->pollin)
Nathan Chancellor8755d972020-03-02 16:01:19 -07005032 mask |= POLLIN | POLLRDNORM;
Jens Axboed7718a92020-02-14 22:23:12 -07005033 if (def->pollout)
5034 mask |= POLLOUT | POLLWRNORM;
5035 mask |= POLLERR | POLLPRI;
5036
5037 ipt.pt._qproc = io_async_queue_proc;
5038
5039 ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
5040 io_async_wake);
Jens Axboea36da652020-08-11 09:50:19 -06005041 if (ret || ipt.error) {
Jens Axboed4e7cd32020-08-15 11:44:50 -07005042 io_poll_remove_double(req);
Jens Axboed7718a92020-02-14 22:23:12 -07005043 spin_unlock_irq(&ctx->completion_lock);
Jens Axboe807abcb2020-07-17 17:09:27 -06005044 kfree(apoll->double_poll);
Jens Axboed7718a92020-02-14 22:23:12 -07005045 kfree(apoll);
5046 return false;
5047 }
5048 spin_unlock_irq(&ctx->completion_lock);
5049 trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
5050 apoll->poll.events);
5051 return true;
5052}
5053
5054static bool __io_poll_remove_one(struct io_kiocb *req,
5055 struct io_poll_iocb *poll)
5056{
Jens Axboeb41e9852020-02-17 09:52:41 -07005057 bool do_complete = false;
Jens Axboe221c5eb2019-01-17 09:41:58 -07005058
5059 spin_lock(&poll->head->lock);
5060 WRITE_ONCE(poll->canceled, true);
Jens Axboe392edb42019-12-09 17:52:20 -07005061 if (!list_empty(&poll->wait.entry)) {
5062 list_del_init(&poll->wait.entry);
Jens Axboeb41e9852020-02-17 09:52:41 -07005063 do_complete = true;
Jens Axboe221c5eb2019-01-17 09:41:58 -07005064 }
5065 spin_unlock(&poll->head->lock);
Jens Axboe3bfa5bc2020-05-17 13:54:12 -06005066 hash_del(&req->hash_node);
Jens Axboed7718a92020-02-14 22:23:12 -07005067 return do_complete;
5068}
5069
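/*
 * Remove a pending poll request (user submitted or internal fast poll)
 * and, if it was still armed, complete it with -ECANCELED. Called with
 * ctx->completion_lock held; returns true if a completion was posted.
 */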
5070static bool io_poll_remove_one(struct io_kiocb *req)
5071{
5072 bool do_complete;
5073
Jens Axboed4e7cd32020-08-15 11:44:50 -07005074 io_poll_remove_double(req);
5075
Jens Axboed7718a92020-02-14 22:23:12 -07005076 if (req->opcode == IORING_OP_POLL_ADD) {
5077 do_complete = __io_poll_remove_one(req, &req->poll);
5078 } else {
Jens Axboe3bfa5bc2020-05-17 13:54:12 -06005079 struct async_poll *apoll = req->apoll;
5080
Jens Axboed7718a92020-02-14 22:23:12 -07005081 /* non-poll requests have submit ref still */
Jens Axboe3bfa5bc2020-05-17 13:54:12 -06005082 do_complete = __io_poll_remove_one(req, &apoll->poll);
5083 if (do_complete) {
Jens Axboed7718a92020-02-14 22:23:12 -07005084 io_put_req(req);
Jens Axboe807abcb2020-07-17 17:09:27 -06005085 kfree(apoll->double_poll);
Jens Axboe3bfa5bc2020-05-17 13:54:12 -06005086 kfree(apoll);
5087 }
Xiaoguang Wangb1f573b2020-04-12 14:50:54 +08005088 }
5089
Jens Axboeb41e9852020-02-17 09:52:41 -07005090 if (do_complete) {
5091 io_cqring_fill_event(req, -ECANCELED);
5092 io_commit_cqring(req->ctx);
5093 req->flags |= REQ_F_COMP_LOCKED;
Jens Axboef254ac02020-08-12 17:33:30 -06005094 req_set_fail_links(req);
Jens Axboeb41e9852020-02-17 09:52:41 -07005095 io_put_req(req);
5096 }
5097
5098 return do_complete;
Jens Axboe221c5eb2019-01-17 09:41:58 -07005099}
5100
Jens Axboe76e1b642020-09-26 15:05:03 -06005101/*
5102 * Returns true if we found and killed one or more poll requests
5103 */
5104static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk)
Jens Axboe221c5eb2019-01-17 09:41:58 -07005105{
Jens Axboe78076bb2019-12-04 19:56:40 -07005106 struct hlist_node *tmp;
Jens Axboe221c5eb2019-01-17 09:41:58 -07005107 struct io_kiocb *req;
Jens Axboe8e2e1fa2020-04-13 17:05:14 -06005108 int posted = 0, i;
Jens Axboe221c5eb2019-01-17 09:41:58 -07005109
5110 spin_lock_irq(&ctx->completion_lock);
Jens Axboe78076bb2019-12-04 19:56:40 -07005111 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
5112 struct hlist_head *list;
5113
5114 list = &ctx->cancel_hash[i];
Jens Axboef3606e32020-09-22 08:18:24 -06005115 hlist_for_each_entry_safe(req, tmp, list, hash_node) {
5116 if (io_task_match(req, tsk))
5117 posted += io_poll_remove_one(req);
5118 }
Jens Axboe221c5eb2019-01-17 09:41:58 -07005119 }
5120 spin_unlock_irq(&ctx->completion_lock);
Jens Axboeb41e9852020-02-17 09:52:41 -07005121
Jens Axboe8e2e1fa2020-04-13 17:05:14 -06005122 if (posted)
5123 io_cqring_ev_posted(ctx);
Jens Axboe76e1b642020-09-26 15:05:03 -06005124
5125 return posted != 0;
Jens Axboe221c5eb2019-01-17 09:41:58 -07005126}
5127
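/*
 * Find a pending poll request by user_data in the cancel hash and try to
 * remove it. Returns 0 on successful cancellation, -EALREADY if it was
 * found but is already being completed, and -ENOENT if no match exists.
 */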
Jens Axboe47f46762019-11-09 17:43:02 -07005128static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
5129{
Jens Axboe78076bb2019-12-04 19:56:40 -07005130 struct hlist_head *list;
Jens Axboe47f46762019-11-09 17:43:02 -07005131 struct io_kiocb *req;
5132
Jens Axboe78076bb2019-12-04 19:56:40 -07005133 list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
5134 hlist_for_each_entry(req, list, hash_node) {
Jens Axboeb41e9852020-02-17 09:52:41 -07005135 if (sqe_addr != req->user_data)
5136 continue;
5137 if (io_poll_remove_one(req))
Jens Axboeeac406c2019-11-14 12:09:58 -07005138 return 0;
Jens Axboeb41e9852020-02-17 09:52:41 -07005139 return -EALREADY;
Jens Axboe47f46762019-11-09 17:43:02 -07005140 }
5141
5142 return -ENOENT;
5143}
5144
Jens Axboe3529d8c2019-12-19 18:24:38 -07005145static int io_poll_remove_prep(struct io_kiocb *req,
5146 const struct io_uring_sqe *sqe)
Jens Axboe221c5eb2019-01-17 09:41:58 -07005147{
Jens Axboe221c5eb2019-01-17 09:41:58 -07005148 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5149 return -EINVAL;
5150 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
5151 sqe->poll_events)
5152 return -EINVAL;
5153
Jens Axboe0969e782019-12-17 18:40:57 -07005154 req->poll.addr = READ_ONCE(sqe->addr);
Jens Axboe0969e782019-12-17 18:40:57 -07005155 return 0;
5156}
5157
5158/*
5159 * Find a running poll command that matches one specified in sqe->addr,
5160 * and remove it if found.
5161 */
5162static int io_poll_remove(struct io_kiocb *req)
5163{
5164 struct io_ring_ctx *ctx = req->ctx;
5165 u64 addr;
5166 int ret;
5167
Jens Axboe0969e782019-12-17 18:40:57 -07005168 addr = req->poll.addr;
Jens Axboe221c5eb2019-01-17 09:41:58 -07005169 spin_lock_irq(&ctx->completion_lock);
Jens Axboe0969e782019-12-17 18:40:57 -07005170 ret = io_poll_cancel(ctx, addr);
Jens Axboe221c5eb2019-01-17 09:41:58 -07005171 spin_unlock_irq(&ctx->completion_lock);
5172
Jens Axboe4e88d6e2019-12-07 20:59:47 -07005173 if (ret < 0)
5174 req_set_fail_links(req);
Jens Axboee1e16092020-06-22 09:17:17 -06005175 io_req_complete(req, ret);
Jens Axboe221c5eb2019-01-17 09:41:58 -07005176 return 0;
5177}
5178
Jens Axboe221c5eb2019-01-17 09:41:58 -07005179static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5180 void *key)
5181{
Jens Axboec2f2eb72020-02-10 09:07:05 -07005182 struct io_kiocb *req = wait->private;
5183 struct io_poll_iocb *poll = &req->poll;
Jens Axboe221c5eb2019-01-17 09:41:58 -07005184
Jens Axboed7718a92020-02-14 22:23:12 -07005185 return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
Jens Axboe221c5eb2019-01-17 09:41:58 -07005186}
5187
Jens Axboe221c5eb2019-01-17 09:41:58 -07005188static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
5189 struct poll_table_struct *p)
5190{
5191 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5192
Jens Axboee8c2bc12020-08-15 18:44:09 -07005193 __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
Jens Axboeeac406c2019-11-14 12:09:58 -07005194}
5195
Jens Axboe3529d8c2019-12-19 18:24:38 -07005196static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
Jens Axboe221c5eb2019-01-17 09:41:58 -07005197{
5198 struct io_poll_iocb *poll = &req->poll;
Jiufei Xue5769a352020-06-17 17:53:55 +08005199 u32 events;
Jens Axboe221c5eb2019-01-17 09:41:58 -07005200
5201 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5202 return -EINVAL;
5203 if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
5204 return -EINVAL;
Jens Axboe09bb8392019-03-13 12:39:28 -06005205 if (!poll->file)
5206 return -EBADF;
Jens Axboe221c5eb2019-01-17 09:41:58 -07005207
Jiufei Xue5769a352020-06-17 17:53:55 +08005208 events = READ_ONCE(sqe->poll32_events);
5209#ifdef __BIG_ENDIAN
5210 events = swahw32(events);
5211#endif
Jiufei Xuea31eb4a2020-06-17 17:53:56 +08005212 poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP |
5213 (events & EPOLLEXCLUSIVE);
Jens Axboe0969e782019-12-17 18:40:57 -07005214 return 0;
5215}
5216
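/*
 * IORING_OP_POLL_ADD: arm a poll handler with the mask the application
 * asked for. If the file is already ready we complete inline; otherwise
 * the request stays queued until io_poll_wake() fires.
 */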
Pavel Begunkov014db002020-03-03 21:33:12 +03005217static int io_poll_add(struct io_kiocb *req)
Jens Axboe0969e782019-12-17 18:40:57 -07005218{
5219 struct io_poll_iocb *poll = &req->poll;
5220 struct io_ring_ctx *ctx = req->ctx;
5221 struct io_poll_table ipt;
Jens Axboe0969e782019-12-17 18:40:57 -07005222 __poll_t mask;
Jens Axboe0969e782019-12-17 18:40:57 -07005223
Jens Axboe78076bb2019-12-04 19:56:40 -07005224 INIT_HLIST_NODE(&req->hash_node);
Jens Axboed7718a92020-02-14 22:23:12 -07005225 ipt.pt._qproc = io_poll_queue_proc;
Jens Axboe36703242019-07-25 10:20:18 -06005226
Jens Axboed7718a92020-02-14 22:23:12 -07005227 mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
5228 io_poll_wake);
Jens Axboe221c5eb2019-01-17 09:41:58 -07005229
Jens Axboe8c838782019-03-12 15:48:16 -06005230 if (mask) { /* no async, we'd stolen it */
Jens Axboe8c838782019-03-12 15:48:16 -06005231 ipt.error = 0;
Jens Axboeb0dd8a42019-11-18 12:14:54 -07005232 io_poll_complete(req, mask, 0);
Jens Axboe8c838782019-03-12 15:48:16 -06005233 }
Jens Axboe221c5eb2019-01-17 09:41:58 -07005234 spin_unlock_irq(&ctx->completion_lock);
5235
Jens Axboe8c838782019-03-12 15:48:16 -06005236 if (mask) {
5237 io_cqring_ev_posted(ctx);
Pavel Begunkov014db002020-03-03 21:33:12 +03005238 io_put_req(req);
Jens Axboe221c5eb2019-01-17 09:41:58 -07005239 }
Jens Axboe8c838782019-03-12 15:48:16 -06005240 return ipt.error;
Jens Axboe221c5eb2019-01-17 09:41:58 -07005241}
5242
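/*
 * hrtimer callback for IORING_OP_TIMEOUT: the timer has expired, so take
 * the request off the timeout list (unless a concurrent removal already
 * did) and complete it with -ETIME.
 */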
Jens Axboe5262f562019-09-17 12:26:57 -06005243static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
5244{
Jens Axboead8a48a2019-11-15 08:49:11 -07005245 struct io_timeout_data *data = container_of(timer,
5246 struct io_timeout_data, timer);
5247 struct io_kiocb *req = data->req;
5248 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe5262f562019-09-17 12:26:57 -06005249 unsigned long flags;
5250
Jens Axboe5262f562019-09-17 12:26:57 -06005251 spin_lock_irqsave(&ctx->completion_lock, flags);
Pavel Begunkov01cec8c2020-07-30 18:43:50 +03005252 atomic_set(&req->ctx->cq_timeouts,
5253 atomic_read(&req->ctx->cq_timeouts) + 1);
5254
zhangyi (F)ef036812019-10-23 15:10:08 +08005255 /*
Jens Axboe11365042019-10-16 09:08:32 -06005256 * We could be racing with timeout deletion. If the list is empty,
5257 * then timeout lookup already found it and will be handling it.
zhangyi (F)ef036812019-10-23 15:10:08 +08005258 */
Pavel Begunkov135fcde2020-07-13 23:37:12 +03005259 if (!list_empty(&req->timeout.list))
5260 list_del_init(&req->timeout.list);
Jens Axboe842f9612019-10-29 12:34:10 -06005261
Jens Axboe78e19bb2019-11-06 15:21:34 -07005262 io_cqring_fill_event(req, -ETIME);
Jens Axboe5262f562019-09-17 12:26:57 -06005263 io_commit_cqring(ctx);
5264 spin_unlock_irqrestore(&ctx->completion_lock, flags);
5265
5266 io_cqring_ev_posted(ctx);
Jens Axboe4e88d6e2019-12-07 20:59:47 -07005267 req_set_fail_links(req);
Jens Axboe5262f562019-09-17 12:26:57 -06005268 io_put_req(req);
5269 return HRTIMER_NORESTART;
5270}
5271
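/*
 * Cancel a queued timeout and complete it with -ECANCELED. Must be called
 * with ctx->completion_lock held; returns -EALREADY if the timer callback
 * is already running and the timeout can no longer be cancelled.
 */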
Jens Axboef254ac02020-08-12 17:33:30 -06005272static int __io_timeout_cancel(struct io_kiocb *req)
Jens Axboe47f46762019-11-09 17:43:02 -07005273{
Jens Axboee8c2bc12020-08-15 18:44:09 -07005274 struct io_timeout_data *io = req->async_data;
Jens Axboef254ac02020-08-12 17:33:30 -06005275 int ret;
Jens Axboe47f46762019-11-09 17:43:02 -07005276
Jens Axboef254ac02020-08-12 17:33:30 -06005277 list_del_init(&req->timeout.list);
Jens Axboe47f46762019-11-09 17:43:02 -07005278
Jens Axboee8c2bc12020-08-15 18:44:09 -07005279 ret = hrtimer_try_to_cancel(&io->timer);
Jens Axboe47f46762019-11-09 17:43:02 -07005280 if (ret == -1)
5281 return -EALREADY;
5282
Jens Axboe4e88d6e2019-12-07 20:59:47 -07005283 req_set_fail_links(req);
Jens Axboe9b7adba2020-08-10 10:54:02 -06005284 req->flags |= REQ_F_COMP_LOCKED;
Jens Axboe47f46762019-11-09 17:43:02 -07005285 io_cqring_fill_event(req, -ECANCELED);
5286 io_put_req(req);
5287 return 0;
5288}
5289
Jens Axboef254ac02020-08-12 17:33:30 -06005290static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
5291{
5292 struct io_kiocb *req;
5293 int ret = -ENOENT;
5294
5295 list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
5296 if (user_data == req->user_data) {
5297 ret = 0;
5298 break;
5299 }
5300 }
5301
5302 if (ret == -ENOENT)
5303 return ret;
5304
5305 return __io_timeout_cancel(req);
5306}
5307
Jens Axboe3529d8c2019-12-19 18:24:38 -07005308static int io_timeout_remove_prep(struct io_kiocb *req,
5309 const struct io_uring_sqe *sqe)
Jens Axboeb29472e2019-12-17 18:50:29 -07005310{
Jens Axboeb29472e2019-12-17 18:50:29 -07005311 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5312 return -EINVAL;
Daniele Albano61710e42020-07-18 14:15:16 -06005313 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5314 return -EINVAL;
5315 if (sqe->ioprio || sqe->buf_index || sqe->len)
Jens Axboeb29472e2019-12-17 18:50:29 -07005316 return -EINVAL;
5317
5318 req->timeout.addr = READ_ONCE(sqe->addr);
5319 req->timeout.flags = READ_ONCE(sqe->timeout_flags);
5320 if (req->timeout.flags)
5321 return -EINVAL;
5322
Jens Axboeb29472e2019-12-17 18:50:29 -07005323 return 0;
5324}
5325
Jens Axboe11365042019-10-16 09:08:32 -06005326/*
 5327 * Remove an existing timeout command
5328 */
Jens Axboefc4df992019-12-10 14:38:45 -07005329static int io_timeout_remove(struct io_kiocb *req)
Jens Axboe11365042019-10-16 09:08:32 -06005330{
5331 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe47f46762019-11-09 17:43:02 -07005332 int ret;
Jens Axboe11365042019-10-16 09:08:32 -06005333
Jens Axboe11365042019-10-16 09:08:32 -06005334 spin_lock_irq(&ctx->completion_lock);
Jens Axboeb29472e2019-12-17 18:50:29 -07005335 ret = io_timeout_cancel(ctx, req->timeout.addr);
Jens Axboe11365042019-10-16 09:08:32 -06005336
Jens Axboe47f46762019-11-09 17:43:02 -07005337 io_cqring_fill_event(req, ret);
Jens Axboe11365042019-10-16 09:08:32 -06005338 io_commit_cqring(ctx);
5339 spin_unlock_irq(&ctx->completion_lock);
Jens Axboe5262f562019-09-17 12:26:57 -06005340 io_cqring_ev_posted(ctx);
Jens Axboe4e88d6e2019-12-07 20:59:47 -07005341 if (ret < 0)
5342 req_set_fail_links(req);
Jackie Liuec9c02a2019-11-08 23:50:36 +08005343 io_put_req(req);
Jens Axboe11365042019-10-16 09:08:32 -06005344 return 0;
Jens Axboe5262f562019-09-17 12:26:57 -06005345}
5346
Jens Axboe3529d8c2019-12-19 18:24:38 -07005347static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
Jens Axboe2d283902019-12-04 11:08:05 -07005348 bool is_timeout_link)
Jens Axboe5262f562019-09-17 12:26:57 -06005349{
Jens Axboead8a48a2019-11-15 08:49:11 -07005350 struct io_timeout_data *data;
Jens Axboea41525a2019-10-15 16:48:15 -06005351 unsigned flags;
Pavel Begunkov56080b02020-05-26 20:34:04 +03005352 u32 off = READ_ONCE(sqe->off);
Jens Axboe5262f562019-09-17 12:26:57 -06005353
Jens Axboead8a48a2019-11-15 08:49:11 -07005354 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
Jens Axboe5262f562019-09-17 12:26:57 -06005355 return -EINVAL;
Jens Axboead8a48a2019-11-15 08:49:11 -07005356 if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
Jens Axboea41525a2019-10-15 16:48:15 -06005357 return -EINVAL;
Pavel Begunkov56080b02020-05-26 20:34:04 +03005358 if (off && is_timeout_link)
Jens Axboe2d283902019-12-04 11:08:05 -07005359 return -EINVAL;
Jens Axboea41525a2019-10-15 16:48:15 -06005360 flags = READ_ONCE(sqe->timeout_flags);
5361 if (flags & ~IORING_TIMEOUT_ABS)
Jens Axboe5262f562019-09-17 12:26:57 -06005362 return -EINVAL;
Arnd Bergmannbdf20072019-10-01 09:53:29 -06005363
Pavel Begunkovbfe68a22020-05-30 14:54:18 +03005364 req->timeout.off = off;
Jens Axboe26a61672019-12-20 09:02:01 -07005365
Jens Axboee8c2bc12020-08-15 18:44:09 -07005366 if (!req->async_data && io_alloc_async_data(req))
Jens Axboe26a61672019-12-20 09:02:01 -07005367 return -ENOMEM;
5368
Jens Axboee8c2bc12020-08-15 18:44:09 -07005369 data = req->async_data;
Jens Axboead8a48a2019-11-15 08:49:11 -07005370 data->req = req;
Jens Axboead8a48a2019-11-15 08:49:11 -07005371
5372 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
Jens Axboe5262f562019-09-17 12:26:57 -06005373 return -EFAULT;
5374
Jens Axboe11365042019-10-16 09:08:32 -06005375 if (flags & IORING_TIMEOUT_ABS)
Jens Axboead8a48a2019-11-15 08:49:11 -07005376 data->mode = HRTIMER_MODE_ABS;
Jens Axboe11365042019-10-16 09:08:32 -06005377 else
Jens Axboead8a48a2019-11-15 08:49:11 -07005378 data->mode = HRTIMER_MODE_REL;
Jens Axboe11365042019-10-16 09:08:32 -06005379
Jens Axboead8a48a2019-11-15 08:49:11 -07005380 hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
5381 return 0;
5382}
5383
Jens Axboefc4df992019-12-10 14:38:45 -07005384static int io_timeout(struct io_kiocb *req)
Jens Axboead8a48a2019-11-15 08:49:11 -07005385{
Jens Axboead8a48a2019-11-15 08:49:11 -07005386 struct io_ring_ctx *ctx = req->ctx;
Jens Axboee8c2bc12020-08-15 18:44:09 -07005387 struct io_timeout_data *data = req->async_data;
Jens Axboead8a48a2019-11-15 08:49:11 -07005388 struct list_head *entry;
Pavel Begunkovbfe68a22020-05-30 14:54:18 +03005389 u32 tail, off = req->timeout.off;
Jens Axboead8a48a2019-11-15 08:49:11 -07005390
Pavel Begunkov733f5c92020-05-26 20:34:03 +03005391 spin_lock_irq(&ctx->completion_lock);
Jens Axboe93bd25b2019-11-11 23:34:31 -07005392
Jens Axboe5262f562019-09-17 12:26:57 -06005393 /*
 5394 * sqe->off holds how many completion events need to occur before this
Jens Axboe93bd25b2019-11-11 23:34:31 -07005395 * timeout request is satisfied. If it isn't set, then this is
 5396 * a pure timeout request and the sequence isn't used.
Jens Axboe5262f562019-09-17 12:26:57 -06005397 */
Pavel Begunkov8eb7e2d2020-06-29 13:13:02 +03005398 if (io_is_timeout_noseq(req)) {
Jens Axboe93bd25b2019-11-11 23:34:31 -07005399 entry = ctx->timeout_list.prev;
5400 goto add;
5401 }
Jens Axboe5262f562019-09-17 12:26:57 -06005402
Pavel Begunkovbfe68a22020-05-30 14:54:18 +03005403 tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
5404 req->timeout.target_seq = tail + off;
Jens Axboe5262f562019-09-17 12:26:57 -06005405
5406 /*
5407 * Insertion sort, ensuring the first entry in the list is always
5408 * the one we need first.
5409 */
Jens Axboe5262f562019-09-17 12:26:57 -06005410 list_for_each_prev(entry, &ctx->timeout_list) {
Pavel Begunkov135fcde2020-07-13 23:37:12 +03005411 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
5412 timeout.list);
Jens Axboe5262f562019-09-17 12:26:57 -06005413
Pavel Begunkov8eb7e2d2020-06-29 13:13:02 +03005414 if (io_is_timeout_noseq(nxt))
Jens Axboe93bd25b2019-11-11 23:34:31 -07005415 continue;
Pavel Begunkovbfe68a22020-05-30 14:54:18 +03005416 /* nxt.seq is behind @tail, otherwise would've been completed */
5417 if (off >= nxt->timeout.target_seq - tail)
Jens Axboe5262f562019-09-17 12:26:57 -06005418 break;
5419 }
Jens Axboe93bd25b2019-11-11 23:34:31 -07005420add:
Pavel Begunkov135fcde2020-07-13 23:37:12 +03005421 list_add(&req->timeout.list, entry);
Jens Axboead8a48a2019-11-15 08:49:11 -07005422 data->timer.function = io_timeout_fn;
5423 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
Jens Axboe842f9612019-10-29 12:34:10 -06005424 spin_unlock_irq(&ctx->completion_lock);
Jens Axboe5262f562019-09-17 12:26:57 -06005425 return 0;
5426}
5427
Jens Axboe62755e32019-10-28 21:49:21 -06005428static bool io_cancel_cb(struct io_wq_work *work, void *data)
Jens Axboede0617e2019-04-06 21:51:27 -06005429{
Jens Axboe62755e32019-10-28 21:49:21 -06005430 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
Jens Axboede0617e2019-04-06 21:51:27 -06005431
Jens Axboe62755e32019-10-28 21:49:21 -06005432 return req->user_data == (unsigned long) data;
5433}
5434
Jens Axboee977d6d2019-11-05 12:39:45 -07005435static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
Jens Axboe62755e32019-10-28 21:49:21 -06005436{
Jens Axboe62755e32019-10-28 21:49:21 -06005437 enum io_wq_cancel cancel_ret;
Jens Axboe62755e32019-10-28 21:49:21 -06005438 int ret = 0;
5439
Pavel Begunkov4f26bda2020-06-15 10:24:03 +03005440 cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false);
Jens Axboe62755e32019-10-28 21:49:21 -06005441 switch (cancel_ret) {
5442 case IO_WQ_CANCEL_OK:
5443 ret = 0;
5444 break;
5445 case IO_WQ_CANCEL_RUNNING:
5446 ret = -EALREADY;
5447 break;
5448 case IO_WQ_CANCEL_NOTFOUND:
5449 ret = -ENOENT;
5450 break;
5451 }
5452
Jens Axboee977d6d2019-11-05 12:39:45 -07005453 return ret;
5454}
5455
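/*
 * Cancel a request identified by @sqe_addr: try io-wq first, then the
 * timeout list, then the poll hash. The result is posted as a completion
 * for the cancel request itself; on success @success_ret is posted
 * instead of 0 (0 for plain IORING_OP_ASYNC_CANCEL, -ETIME from the
 * linked-timeout path).
 */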
Jens Axboe47f46762019-11-09 17:43:02 -07005456static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
5457 struct io_kiocb *req, __u64 sqe_addr,
Pavel Begunkov014db002020-03-03 21:33:12 +03005458 int success_ret)
Jens Axboe47f46762019-11-09 17:43:02 -07005459{
5460 unsigned long flags;
5461 int ret;
5462
5463 ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
5464 if (ret != -ENOENT) {
5465 spin_lock_irqsave(&ctx->completion_lock, flags);
5466 goto done;
5467 }
5468
5469 spin_lock_irqsave(&ctx->completion_lock, flags);
5470 ret = io_timeout_cancel(ctx, sqe_addr);
5471 if (ret != -ENOENT)
5472 goto done;
5473 ret = io_poll_cancel(ctx, sqe_addr);
5474done:
Jens Axboeb0dd8a42019-11-18 12:14:54 -07005475 if (!ret)
5476 ret = success_ret;
Jens Axboe47f46762019-11-09 17:43:02 -07005477 io_cqring_fill_event(req, ret);
5478 io_commit_cqring(ctx);
5479 spin_unlock_irqrestore(&ctx->completion_lock, flags);
5480 io_cqring_ev_posted(ctx);
5481
Jens Axboe4e88d6e2019-12-07 20:59:47 -07005482 if (ret < 0)
5483 req_set_fail_links(req);
Pavel Begunkov014db002020-03-03 21:33:12 +03005484 io_put_req(req);
Jens Axboe47f46762019-11-09 17:43:02 -07005485}
5486
Jens Axboe3529d8c2019-12-19 18:24:38 -07005487static int io_async_cancel_prep(struct io_kiocb *req,
5488 const struct io_uring_sqe *sqe)
Jens Axboee977d6d2019-11-05 12:39:45 -07005489{
Jens Axboefbf23842019-12-17 18:45:56 -07005490 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
Jens Axboee977d6d2019-11-05 12:39:45 -07005491 return -EINVAL;
Daniele Albano61710e42020-07-18 14:15:16 -06005492 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5493 return -EINVAL;
5494 if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
Jens Axboee977d6d2019-11-05 12:39:45 -07005495 return -EINVAL;
5496
Jens Axboefbf23842019-12-17 18:45:56 -07005497 req->cancel.addr = READ_ONCE(sqe->addr);
5498 return 0;
5499}
5500
Pavel Begunkov014db002020-03-03 21:33:12 +03005501static int io_async_cancel(struct io_kiocb *req)
Jens Axboefbf23842019-12-17 18:45:56 -07005502{
5503 struct io_ring_ctx *ctx = req->ctx;
Jens Axboefbf23842019-12-17 18:45:56 -07005504
Pavel Begunkov014db002020-03-03 21:33:12 +03005505 io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
Jens Axboe62755e32019-10-28 21:49:21 -06005506 return 0;
5507}
5508
Jens Axboe05f3fb32019-12-09 11:22:50 -07005509static int io_files_update_prep(struct io_kiocb *req,
5510 const struct io_uring_sqe *sqe)
5511{
Jens Axboe6ca56f82020-09-18 16:51:19 -06005512 if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL))
5513 return -EINVAL;
Daniele Albano61710e42020-07-18 14:15:16 -06005514 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5515 return -EINVAL;
5516 if (sqe->ioprio || sqe->rw_flags)
Jens Axboe05f3fb32019-12-09 11:22:50 -07005517 return -EINVAL;
5518
5519 req->files_update.offset = READ_ONCE(sqe->off);
5520 req->files_update.nr_args = READ_ONCE(sqe->len);
5521 if (!req->files_update.nr_args)
5522 return -EINVAL;
5523 req->files_update.arg = READ_ONCE(sqe->addr);
5524 return 0;
5525}
5526
Jens Axboe229a7b62020-06-22 10:13:11 -06005527static int io_files_update(struct io_kiocb *req, bool force_nonblock,
5528 struct io_comp_state *cs)
Jens Axboe05f3fb32019-12-09 11:22:50 -07005529{
5530 struct io_ring_ctx *ctx = req->ctx;
5531 struct io_uring_files_update up;
5532 int ret;
5533
Jens Axboef86cd202020-01-29 13:46:44 -07005534 if (force_nonblock)
Jens Axboe05f3fb32019-12-09 11:22:50 -07005535 return -EAGAIN;
Jens Axboe05f3fb32019-12-09 11:22:50 -07005536
5537 up.offset = req->files_update.offset;
5538 up.fds = req->files_update.arg;
5539
5540 mutex_lock(&ctx->uring_lock);
5541 ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
5542 mutex_unlock(&ctx->uring_lock);
5543
5544 if (ret < 0)
5545 req_set_fail_links(req);
Jens Axboe229a7b62020-06-22 10:13:11 -06005546 __io_req_complete(req, ret, 0, cs);
Jens Axboe05f3fb32019-12-09 11:22:50 -07005547 return 0;
5548}
5549
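/*
 * Run the per-opcode prep handler for a request that is going to be
 * deferred or executed out of line, after allocating its async data and
 * grabbing the file table state the opcode may need later.
 */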
Jens Axboe3529d8c2019-12-19 18:24:38 -07005550static int io_req_defer_prep(struct io_kiocb *req,
5551 const struct io_uring_sqe *sqe)
Jens Axboef67676d2019-12-02 11:03:47 -07005552{
Jens Axboee7815732019-12-17 19:45:06 -07005553 ssize_t ret = 0;
Jens Axboef67676d2019-12-02 11:03:47 -07005554
Pavel Begunkovf1d96a82020-03-13 22:29:14 +03005555 if (!sqe)
5556 return 0;
5557
Jens Axboee8c2bc12020-08-15 18:44:09 -07005558 if (io_alloc_async_data(req))
Pavel Begunkov327d6d92020-07-15 12:46:51 +03005559 return -EAGAIN;
Pavel Begunkovf56040b2020-07-23 20:25:21 +03005560 ret = io_prep_work_files(req);
5561 if (unlikely(ret))
5562 return ret;
Jens Axboecccf0ee2020-01-27 16:34:48 -07005563
Jens Axboe202700e12020-09-12 13:18:10 -06005564 io_prep_async_work(req);
5565
Jens Axboed625c6e2019-12-17 19:53:05 -07005566 switch (req->opcode) {
Jens Axboee7815732019-12-17 19:45:06 -07005567 case IORING_OP_NOP:
5568 break;
Jens Axboef67676d2019-12-02 11:03:47 -07005569 case IORING_OP_READV:
5570 case IORING_OP_READ_FIXED:
Jens Axboe3a6820f2019-12-22 15:19:35 -07005571 case IORING_OP_READ:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005572 ret = io_read_prep(req, sqe, true);
Jens Axboef67676d2019-12-02 11:03:47 -07005573 break;
5574 case IORING_OP_WRITEV:
5575 case IORING_OP_WRITE_FIXED:
Jens Axboe3a6820f2019-12-22 15:19:35 -07005576 case IORING_OP_WRITE:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005577 ret = io_write_prep(req, sqe, true);
Jens Axboef67676d2019-12-02 11:03:47 -07005578 break;
Jens Axboe0969e782019-12-17 18:40:57 -07005579 case IORING_OP_POLL_ADD:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005580 ret = io_poll_add_prep(req, sqe);
Jens Axboe0969e782019-12-17 18:40:57 -07005581 break;
5582 case IORING_OP_POLL_REMOVE:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005583 ret = io_poll_remove_prep(req, sqe);
Jens Axboe0969e782019-12-17 18:40:57 -07005584 break;
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07005585 case IORING_OP_FSYNC:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005586 ret = io_prep_fsync(req, sqe);
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07005587 break;
5588 case IORING_OP_SYNC_FILE_RANGE:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005589 ret = io_prep_sfr(req, sqe);
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07005590 break;
Jens Axboe03b12302019-12-02 18:50:25 -07005591 case IORING_OP_SENDMSG:
Jens Axboefddafac2020-01-04 20:19:44 -07005592 case IORING_OP_SEND:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005593 ret = io_sendmsg_prep(req, sqe);
Jens Axboe03b12302019-12-02 18:50:25 -07005594 break;
5595 case IORING_OP_RECVMSG:
Jens Axboefddafac2020-01-04 20:19:44 -07005596 case IORING_OP_RECV:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005597 ret = io_recvmsg_prep(req, sqe);
Jens Axboe03b12302019-12-02 18:50:25 -07005598 break;
Jens Axboef499a022019-12-02 16:28:46 -07005599 case IORING_OP_CONNECT:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005600 ret = io_connect_prep(req, sqe);
Jens Axboef499a022019-12-02 16:28:46 -07005601 break;
Jens Axboe2d283902019-12-04 11:08:05 -07005602 case IORING_OP_TIMEOUT:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005603 ret = io_timeout_prep(req, sqe, false);
Jens Axboeb7bb4f72019-12-15 22:13:43 -07005604 break;
Jens Axboeb29472e2019-12-17 18:50:29 -07005605 case IORING_OP_TIMEOUT_REMOVE:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005606 ret = io_timeout_remove_prep(req, sqe);
Jens Axboeb29472e2019-12-17 18:50:29 -07005607 break;
Jens Axboefbf23842019-12-17 18:45:56 -07005608 case IORING_OP_ASYNC_CANCEL:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005609 ret = io_async_cancel_prep(req, sqe);
Jens Axboefbf23842019-12-17 18:45:56 -07005610 break;
Jens Axboe2d283902019-12-04 11:08:05 -07005611 case IORING_OP_LINK_TIMEOUT:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005612 ret = io_timeout_prep(req, sqe, true);
Jens Axboeb7bb4f72019-12-15 22:13:43 -07005613 break;
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07005614 case IORING_OP_ACCEPT:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005615 ret = io_accept_prep(req, sqe);
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07005616 break;
Jens Axboed63d1b52019-12-10 10:38:56 -07005617 case IORING_OP_FALLOCATE:
5618 ret = io_fallocate_prep(req, sqe);
5619 break;
Jens Axboe15b71ab2019-12-11 11:20:36 -07005620 case IORING_OP_OPENAT:
5621 ret = io_openat_prep(req, sqe);
5622 break;
Jens Axboeb5dba592019-12-11 14:02:38 -07005623 case IORING_OP_CLOSE:
5624 ret = io_close_prep(req, sqe);
5625 break;
Jens Axboe05f3fb32019-12-09 11:22:50 -07005626 case IORING_OP_FILES_UPDATE:
5627 ret = io_files_update_prep(req, sqe);
5628 break;
Jens Axboeeddc7ef2019-12-13 21:18:10 -07005629 case IORING_OP_STATX:
5630 ret = io_statx_prep(req, sqe);
5631 break;
Jens Axboe4840e412019-12-25 22:03:45 -07005632 case IORING_OP_FADVISE:
5633 ret = io_fadvise_prep(req, sqe);
5634 break;
Jens Axboec1ca7572019-12-25 22:18:28 -07005635 case IORING_OP_MADVISE:
5636 ret = io_madvise_prep(req, sqe);
5637 break;
Jens Axboecebdb982020-01-08 17:59:24 -07005638 case IORING_OP_OPENAT2:
5639 ret = io_openat2_prep(req, sqe);
5640 break;
Jens Axboe3e4827b2020-01-08 15:18:09 -07005641 case IORING_OP_EPOLL_CTL:
5642 ret = io_epoll_ctl_prep(req, sqe);
5643 break;
Pavel Begunkov7d67af22020-02-24 11:32:45 +03005644 case IORING_OP_SPLICE:
5645 ret = io_splice_prep(req, sqe);
5646 break;
Jens Axboeddf0322d2020-02-23 16:41:33 -07005647 case IORING_OP_PROVIDE_BUFFERS:
5648 ret = io_provide_buffers_prep(req, sqe);
5649 break;
Jens Axboe067524e2020-03-02 16:32:28 -07005650 case IORING_OP_REMOVE_BUFFERS:
5651 ret = io_remove_buffers_prep(req, sqe);
5652 break;
Pavel Begunkovf2a8d5c2020-05-17 14:18:06 +03005653 case IORING_OP_TEE:
5654 ret = io_tee_prep(req, sqe);
5655 break;
Jens Axboef67676d2019-12-02 11:03:47 -07005656 default:
Jens Axboee7815732019-12-17 19:45:06 -07005657 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
5658 req->opcode);
5659 ret = -EINVAL;
Jens Axboeb7bb4f72019-12-15 22:13:43 -07005660 break;
Jens Axboef67676d2019-12-02 11:03:47 -07005661 }
5662
Jens Axboeb7bb4f72019-12-15 22:13:43 -07005663 return ret;
Jens Axboef67676d2019-12-02 11:03:47 -07005664}
5665
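/*
 * Sequence number used for IOSQE_IO_DRAIN accounting: the number of
 * requests submitted ahead of this one, not counting the request itself
 * or anything linked to it.
 */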
Pavel Begunkov9cf7c102020-07-13 23:37:15 +03005666static u32 io_get_sequence(struct io_kiocb *req)
5667{
5668 struct io_kiocb *pos;
5669 struct io_ring_ctx *ctx = req->ctx;
5670 u32 total_submitted, nr_reqs = 1;
5671
5672 if (req->flags & REQ_F_LINK_HEAD)
5673 list_for_each_entry(pos, &req->link_list, link_list)
5674 nr_reqs++;
5675
5676 total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
5677 return total_submitted - nr_reqs;
5678}
5679
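/*
 * Handle request deferral (IOSQE_IO_DRAIN): if this request, or anything
 * already sitting on the defer list, still has to wait for earlier
 * submissions to complete, prep it for async execution and park it on
 * ctx->defer_list (or push it straight to io-wq if the drain condition
 * cleared meanwhile), returning -EIOCBQUEUED. Returns 0 if the request
 * can be issued right away.
 */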
Jens Axboe3529d8c2019-12-19 18:24:38 -07005680static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
Jens Axboede0617e2019-04-06 21:51:27 -06005681{
Jackie Liua197f662019-11-08 08:09:12 -07005682 struct io_ring_ctx *ctx = req->ctx;
Pavel Begunkov27dc8332020-07-13 23:37:14 +03005683 struct io_defer_entry *de;
Jens Axboef67676d2019-12-02 11:03:47 -07005684 int ret;
Pavel Begunkov9cf7c102020-07-13 23:37:15 +03005685 u32 seq;
Jens Axboede0617e2019-04-06 21:51:27 -06005686
Bob Liu9d858b22019-11-13 18:06:25 +08005687	/* Still need to defer if there are pending reqs in the defer list. */
Pavel Begunkov9cf7c102020-07-13 23:37:15 +03005688 if (likely(list_empty_careful(&ctx->defer_list) &&
5689 !(req->flags & REQ_F_IO_DRAIN)))
5690 return 0;
5691
5692 seq = io_get_sequence(req);
5693 /* Still a chance to pass the sequence check */
5694 if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
Jens Axboede0617e2019-04-06 21:51:27 -06005695 return 0;
5696
Jens Axboee8c2bc12020-08-15 18:44:09 -07005697 if (!req->async_data) {
Pavel Begunkov650b5482020-05-17 14:02:11 +03005698 ret = io_req_defer_prep(req, sqe);
Pavel Begunkov327d6d92020-07-15 12:46:51 +03005699 if (ret)
Pavel Begunkov650b5482020-05-17 14:02:11 +03005700 return ret;
5701 }
Pavel Begunkovcbdcb432020-06-29 19:18:43 +03005702 io_prep_async_link(req);
Pavel Begunkov27dc8332020-07-13 23:37:14 +03005703 de = kmalloc(sizeof(*de), GFP_KERNEL);
5704 if (!de)
5705 return -ENOMEM;
Jens Axboe2d283902019-12-04 11:08:05 -07005706
Jens Axboede0617e2019-04-06 21:51:27 -06005707 spin_lock_irq(&ctx->completion_lock);
Pavel Begunkov9cf7c102020-07-13 23:37:15 +03005708 if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
Jens Axboede0617e2019-04-06 21:51:27 -06005709 spin_unlock_irq(&ctx->completion_lock);
Pavel Begunkov27dc8332020-07-13 23:37:14 +03005710 kfree(de);
Pavel Begunkovae348172020-07-23 20:25:20 +03005711 io_queue_async_work(req);
5712 return -EIOCBQUEUED;
Jens Axboede0617e2019-04-06 21:51:27 -06005713 }
5714
Jens Axboe915967f2019-11-21 09:01:20 -07005715 trace_io_uring_defer(ctx, req, req->user_data);
Pavel Begunkov27dc8332020-07-13 23:37:14 +03005716 de->req = req;
Pavel Begunkov9cf7c102020-07-13 23:37:15 +03005717 de->seq = seq;
Pavel Begunkov27dc8332020-07-13 23:37:14 +03005718 list_add_tail(&de->list, &ctx->defer_list);
Jens Axboede0617e2019-04-06 21:51:27 -06005719 spin_unlock_irq(&ctx->completion_lock);
5720 return -EIOCBQUEUED;
5721}
5722
Jens Axboef573d382020-09-22 10:19:24 -06005723static void io_req_drop_files(struct io_kiocb *req)
5724{
5725 struct io_ring_ctx *ctx = req->ctx;
5726 unsigned long flags;
5727
5728 spin_lock_irqsave(&ctx->inflight_lock, flags);
5729 list_del(&req->inflight_entry);
5730 if (waitqueue_active(&ctx->inflight_wait))
5731 wake_up(&ctx->inflight_wait);
5732 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
5733 req->flags &= ~REQ_F_INFLIGHT;
Jens Axboe0f212202020-09-13 13:09:39 -06005734 put_files_struct(req->work.files);
Jens Axboe9b828492020-09-18 20:13:06 -06005735 put_nsproxy(req->work.nsproxy);
Jens Axboef573d382020-09-22 10:19:24 -06005736 req->work.files = NULL;
5737}
5738
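/*
 * Free the per-opcode resources a request may still own once it is done:
 * selected provided buffers, async iovec/msghdr copies, the splice/tee
 * input file, any open filename, and the inflight files reference.
 */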
Pavel Begunkov3ca405e2020-07-13 23:37:08 +03005739static void __io_clean_op(struct io_kiocb *req)
Pavel Begunkov99bc4c32020-02-07 22:04:45 +03005740{
Pavel Begunkov0e1b6fe2020-07-16 23:28:02 +03005741 if (req->flags & REQ_F_BUFFER_SELECTED) {
5742 switch (req->opcode) {
5743 case IORING_OP_READV:
5744 case IORING_OP_READ_FIXED:
5745 case IORING_OP_READ:
Jens Axboebcda7ba2020-02-23 16:42:51 -07005746 kfree((void *)(unsigned long)req->rw.addr);
Pavel Begunkov0e1b6fe2020-07-16 23:28:02 +03005747 break;
5748 case IORING_OP_RECVMSG:
5749 case IORING_OP_RECV:
Jens Axboe52de1fe2020-02-27 10:15:42 -07005750 kfree(req->sr_msg.kbuf);
Pavel Begunkov0e1b6fe2020-07-16 23:28:02 +03005751 break;
5752 }
5753 req->flags &= ~REQ_F_BUFFER_SELECTED;
Pavel Begunkov99bc4c32020-02-07 22:04:45 +03005754 }
5755
Pavel Begunkov0e1b6fe2020-07-16 23:28:02 +03005756 if (req->flags & REQ_F_NEED_CLEANUP) {
5757 switch (req->opcode) {
5758 case IORING_OP_READV:
5759 case IORING_OP_READ_FIXED:
5760 case IORING_OP_READ:
5761 case IORING_OP_WRITEV:
5762 case IORING_OP_WRITE_FIXED:
Jens Axboee8c2bc12020-08-15 18:44:09 -07005763 case IORING_OP_WRITE: {
5764 struct io_async_rw *io = req->async_data;
5765 if (io->free_iovec)
5766 kfree(io->free_iovec);
Pavel Begunkov0e1b6fe2020-07-16 23:28:02 +03005767 break;
Jens Axboee8c2bc12020-08-15 18:44:09 -07005768 }
Pavel Begunkov0e1b6fe2020-07-16 23:28:02 +03005769 case IORING_OP_RECVMSG:
Jens Axboee8c2bc12020-08-15 18:44:09 -07005770 case IORING_OP_SENDMSG: {
5771 struct io_async_msghdr *io = req->async_data;
5772 if (io->iov != io->fast_iov)
5773 kfree(io->iov);
Pavel Begunkov0e1b6fe2020-07-16 23:28:02 +03005774 break;
Jens Axboee8c2bc12020-08-15 18:44:09 -07005775 }
Pavel Begunkov0e1b6fe2020-07-16 23:28:02 +03005776 case IORING_OP_SPLICE:
5777 case IORING_OP_TEE:
5778 io_put_file(req, req->splice.file_in,
5779 (req->splice.flags & SPLICE_F_FD_IN_FIXED));
5780 break;
Jens Axboef3cd48502020-09-24 14:55:54 -06005781 case IORING_OP_OPENAT:
5782 case IORING_OP_OPENAT2:
5783 if (req->open.filename)
5784 putname(req->open.filename);
5785 break;
Pavel Begunkov0e1b6fe2020-07-16 23:28:02 +03005786 }
5787 req->flags &= ~REQ_F_NEED_CLEANUP;
5788 }
Pavel Begunkovbb175342020-08-20 11:33:35 +03005789
Jens Axboef573d382020-09-22 10:19:24 -06005790 if (req->flags & REQ_F_INFLIGHT)
5791 io_req_drop_files(req);
Pavel Begunkov99bc4c32020-02-07 22:04:45 +03005792}
5793
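/*
 * Central opcode dispatch: run the prep handler when called with an sqe,
 * then issue the request, respecting @force_nonblock. For IOPOLL rings
 * the request is also added to the iopoll list once issued.
 */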
Jens Axboe3529d8c2019-12-19 18:24:38 -07005794static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
Jens Axboef13fad72020-06-22 09:34:30 -06005795 bool force_nonblock, struct io_comp_state *cs)
Jens Axboe2b188cc2019-01-07 10:46:33 -07005796{
Jackie Liua197f662019-11-08 08:09:12 -07005797 struct io_ring_ctx *ctx = req->ctx;
Jens Axboed625c6e2019-12-17 19:53:05 -07005798 int ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -07005799
Jens Axboed625c6e2019-12-17 19:53:05 -07005800 switch (req->opcode) {
Jens Axboe2b188cc2019-01-07 10:46:33 -07005801 case IORING_OP_NOP:
Jens Axboe229a7b62020-06-22 10:13:11 -06005802 ret = io_nop(req, cs);
Jens Axboe2b188cc2019-01-07 10:46:33 -07005803 break;
5804 case IORING_OP_READV:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005805 case IORING_OP_READ_FIXED:
Jens Axboe3a6820f2019-12-22 15:19:35 -07005806 case IORING_OP_READ:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005807 if (sqe) {
5808 ret = io_read_prep(req, sqe, force_nonblock);
5809 if (ret < 0)
5810 break;
5811 }
Jens Axboea1d7c392020-06-22 11:09:46 -06005812 ret = io_read(req, force_nonblock, cs);
Jens Axboe2b188cc2019-01-07 10:46:33 -07005813 break;
5814 case IORING_OP_WRITEV:
Jens Axboeedafcce2019-01-09 09:16:05 -07005815 case IORING_OP_WRITE_FIXED:
Jens Axboe3a6820f2019-12-22 15:19:35 -07005816 case IORING_OP_WRITE:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005817 if (sqe) {
5818 ret = io_write_prep(req, sqe, force_nonblock);
5819 if (ret < 0)
5820 break;
5821 }
Jens Axboea1d7c392020-06-22 11:09:46 -06005822 ret = io_write(req, force_nonblock, cs);
Jens Axboe2b188cc2019-01-07 10:46:33 -07005823 break;
Christoph Hellwigc992fe22019-01-11 09:43:02 -07005824 case IORING_OP_FSYNC:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005825 if (sqe) {
5826 ret = io_prep_fsync(req, sqe);
5827 if (ret < 0)
5828 break;
5829 }
Pavel Begunkov014db002020-03-03 21:33:12 +03005830 ret = io_fsync(req, force_nonblock);
Christoph Hellwigc992fe22019-01-11 09:43:02 -07005831 break;
Jens Axboe221c5eb2019-01-17 09:41:58 -07005832 case IORING_OP_POLL_ADD:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005833 if (sqe) {
5834 ret = io_poll_add_prep(req, sqe);
5835 if (ret)
5836 break;
5837 }
Pavel Begunkov014db002020-03-03 21:33:12 +03005838 ret = io_poll_add(req);
Jens Axboe221c5eb2019-01-17 09:41:58 -07005839 break;
5840 case IORING_OP_POLL_REMOVE:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005841 if (sqe) {
5842 ret = io_poll_remove_prep(req, sqe);
5843 if (ret < 0)
5844 break;
5845 }
Jens Axboefc4df992019-12-10 14:38:45 -07005846 ret = io_poll_remove(req);
Jens Axboe221c5eb2019-01-17 09:41:58 -07005847 break;
Jens Axboe5d17b4a2019-04-09 14:56:44 -06005848 case IORING_OP_SYNC_FILE_RANGE:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005849 if (sqe) {
5850 ret = io_prep_sfr(req, sqe);
5851 if (ret < 0)
5852 break;
5853 }
Pavel Begunkov014db002020-03-03 21:33:12 +03005854 ret = io_sync_file_range(req, force_nonblock);
Jens Axboe5d17b4a2019-04-09 14:56:44 -06005855 break;
Jens Axboe0fa03c62019-04-19 13:34:07 -06005856 case IORING_OP_SENDMSG:
Jens Axboefddafac2020-01-04 20:19:44 -07005857 case IORING_OP_SEND:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005858 if (sqe) {
5859 ret = io_sendmsg_prep(req, sqe);
5860 if (ret < 0)
5861 break;
5862 }
Jens Axboefddafac2020-01-04 20:19:44 -07005863 if (req->opcode == IORING_OP_SENDMSG)
Jens Axboe229a7b62020-06-22 10:13:11 -06005864 ret = io_sendmsg(req, force_nonblock, cs);
Jens Axboefddafac2020-01-04 20:19:44 -07005865 else
Jens Axboe229a7b62020-06-22 10:13:11 -06005866 ret = io_send(req, force_nonblock, cs);
Jens Axboe0fa03c62019-04-19 13:34:07 -06005867 break;
Jens Axboeaa1fa282019-04-19 13:38:09 -06005868 case IORING_OP_RECVMSG:
Jens Axboefddafac2020-01-04 20:19:44 -07005869 case IORING_OP_RECV:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005870 if (sqe) {
5871 ret = io_recvmsg_prep(req, sqe);
5872 if (ret)
5873 break;
5874 }
Jens Axboefddafac2020-01-04 20:19:44 -07005875 if (req->opcode == IORING_OP_RECVMSG)
Jens Axboe229a7b62020-06-22 10:13:11 -06005876 ret = io_recvmsg(req, force_nonblock, cs);
Jens Axboefddafac2020-01-04 20:19:44 -07005877 else
Jens Axboe229a7b62020-06-22 10:13:11 -06005878 ret = io_recv(req, force_nonblock, cs);
Jens Axboeaa1fa282019-04-19 13:38:09 -06005879 break;
Jens Axboe5262f562019-09-17 12:26:57 -06005880 case IORING_OP_TIMEOUT:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005881 if (sqe) {
5882 ret = io_timeout_prep(req, sqe, false);
5883 if (ret)
5884 break;
5885 }
Jens Axboefc4df992019-12-10 14:38:45 -07005886 ret = io_timeout(req);
Jens Axboe5262f562019-09-17 12:26:57 -06005887 break;
Jens Axboe11365042019-10-16 09:08:32 -06005888 case IORING_OP_TIMEOUT_REMOVE:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005889 if (sqe) {
5890 ret = io_timeout_remove_prep(req, sqe);
5891 if (ret)
5892 break;
5893 }
Jens Axboefc4df992019-12-10 14:38:45 -07005894 ret = io_timeout_remove(req);
Jens Axboe11365042019-10-16 09:08:32 -06005895 break;
Jens Axboe17f2fe32019-10-17 14:42:58 -06005896 case IORING_OP_ACCEPT:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005897 if (sqe) {
5898 ret = io_accept_prep(req, sqe);
5899 if (ret)
5900 break;
5901 }
Jens Axboe229a7b62020-06-22 10:13:11 -06005902 ret = io_accept(req, force_nonblock, cs);
Jens Axboe17f2fe32019-10-17 14:42:58 -06005903 break;
Jens Axboef8e85cf2019-11-23 14:24:24 -07005904 case IORING_OP_CONNECT:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005905 if (sqe) {
5906 ret = io_connect_prep(req, sqe);
5907 if (ret)
5908 break;
5909 }
Jens Axboe229a7b62020-06-22 10:13:11 -06005910 ret = io_connect(req, force_nonblock, cs);
Jens Axboef8e85cf2019-11-23 14:24:24 -07005911 break;
Jens Axboe62755e32019-10-28 21:49:21 -06005912 case IORING_OP_ASYNC_CANCEL:
Jens Axboe3529d8c2019-12-19 18:24:38 -07005913 if (sqe) {
5914 ret = io_async_cancel_prep(req, sqe);
5915 if (ret)
5916 break;
5917 }
Pavel Begunkov014db002020-03-03 21:33:12 +03005918 ret = io_async_cancel(req);
Jens Axboe62755e32019-10-28 21:49:21 -06005919 break;
Jens Axboed63d1b52019-12-10 10:38:56 -07005920 case IORING_OP_FALLOCATE:
5921 if (sqe) {
5922 ret = io_fallocate_prep(req, sqe);
5923 if (ret)
5924 break;
5925 }
Pavel Begunkov014db002020-03-03 21:33:12 +03005926 ret = io_fallocate(req, force_nonblock);
Jens Axboed63d1b52019-12-10 10:38:56 -07005927 break;
Jens Axboe15b71ab2019-12-11 11:20:36 -07005928 case IORING_OP_OPENAT:
5929 if (sqe) {
5930 ret = io_openat_prep(req, sqe);
5931 if (ret)
5932 break;
5933 }
Pavel Begunkov014db002020-03-03 21:33:12 +03005934 ret = io_openat(req, force_nonblock);
Jens Axboe15b71ab2019-12-11 11:20:36 -07005935 break;
Jens Axboeb5dba592019-12-11 14:02:38 -07005936 case IORING_OP_CLOSE:
5937 if (sqe) {
5938 ret = io_close_prep(req, sqe);
5939 if (ret)
5940 break;
5941 }
Jens Axboe229a7b62020-06-22 10:13:11 -06005942 ret = io_close(req, force_nonblock, cs);
Jens Axboeb5dba592019-12-11 14:02:38 -07005943 break;
Jens Axboe05f3fb32019-12-09 11:22:50 -07005944 case IORING_OP_FILES_UPDATE:
5945 if (sqe) {
5946 ret = io_files_update_prep(req, sqe);
5947 if (ret)
5948 break;
5949 }
Jens Axboe229a7b62020-06-22 10:13:11 -06005950 ret = io_files_update(req, force_nonblock, cs);
Jens Axboe05f3fb32019-12-09 11:22:50 -07005951 break;
Jens Axboeeddc7ef2019-12-13 21:18:10 -07005952 case IORING_OP_STATX:
5953 if (sqe) {
5954 ret = io_statx_prep(req, sqe);
5955 if (ret)
5956 break;
5957 }
Pavel Begunkov014db002020-03-03 21:33:12 +03005958 ret = io_statx(req, force_nonblock);
Jens Axboeeddc7ef2019-12-13 21:18:10 -07005959 break;
Jens Axboe4840e412019-12-25 22:03:45 -07005960 case IORING_OP_FADVISE:
5961 if (sqe) {
5962 ret = io_fadvise_prep(req, sqe);
5963 if (ret)
5964 break;
5965 }
Pavel Begunkov014db002020-03-03 21:33:12 +03005966 ret = io_fadvise(req, force_nonblock);
Jens Axboe4840e412019-12-25 22:03:45 -07005967 break;
Jens Axboec1ca7572019-12-25 22:18:28 -07005968 case IORING_OP_MADVISE:
5969 if (sqe) {
5970 ret = io_madvise_prep(req, sqe);
5971 if (ret)
5972 break;
5973 }
Pavel Begunkov014db002020-03-03 21:33:12 +03005974 ret = io_madvise(req, force_nonblock);
Jens Axboec1ca7572019-12-25 22:18:28 -07005975 break;
Jens Axboecebdb982020-01-08 17:59:24 -07005976 case IORING_OP_OPENAT2:
5977 if (sqe) {
5978 ret = io_openat2_prep(req, sqe);
5979 if (ret)
5980 break;
5981 }
Pavel Begunkov014db002020-03-03 21:33:12 +03005982 ret = io_openat2(req, force_nonblock);
Jens Axboecebdb982020-01-08 17:59:24 -07005983 break;
Jens Axboe3e4827b2020-01-08 15:18:09 -07005984 case IORING_OP_EPOLL_CTL:
5985 if (sqe) {
5986 ret = io_epoll_ctl_prep(req, sqe);
5987 if (ret)
5988 break;
5989 }
Jens Axboe229a7b62020-06-22 10:13:11 -06005990 ret = io_epoll_ctl(req, force_nonblock, cs);
Jens Axboe3e4827b2020-01-08 15:18:09 -07005991 break;
Pavel Begunkov7d67af22020-02-24 11:32:45 +03005992 case IORING_OP_SPLICE:
5993 if (sqe) {
5994 ret = io_splice_prep(req, sqe);
5995 if (ret < 0)
5996 break;
5997 }
Pavel Begunkov014db002020-03-03 21:33:12 +03005998 ret = io_splice(req, force_nonblock);
Pavel Begunkov7d67af22020-02-24 11:32:45 +03005999 break;
Jens Axboeddf0322d2020-02-23 16:41:33 -07006000 case IORING_OP_PROVIDE_BUFFERS:
6001 if (sqe) {
6002 ret = io_provide_buffers_prep(req, sqe);
6003 if (ret)
6004 break;
6005 }
Jens Axboe229a7b62020-06-22 10:13:11 -06006006 ret = io_provide_buffers(req, force_nonblock, cs);
Jens Axboeddf0322d2020-02-23 16:41:33 -07006007 break;
Jens Axboe067524e2020-03-02 16:32:28 -07006008 case IORING_OP_REMOVE_BUFFERS:
6009 if (sqe) {
6010 ret = io_remove_buffers_prep(req, sqe);
6011 if (ret)
6012 break;
6013 }
Jens Axboe229a7b62020-06-22 10:13:11 -06006014 ret = io_remove_buffers(req, force_nonblock, cs);
Jens Axboe31b51512019-01-18 22:56:34 -07006015 break;
Pavel Begunkovf2a8d5c2020-05-17 14:18:06 +03006016 case IORING_OP_TEE:
6017 if (sqe) {
6018 ret = io_tee_prep(req, sqe);
6019 if (ret < 0)
6020 break;
6021 }
6022 ret = io_tee(req, force_nonblock);
6023 break;
Jens Axboe2b188cc2019-01-07 10:46:33 -07006024 default:
6025 ret = -EINVAL;
6026 break;
6027 }
6028
6029 if (ret)
6030 return ret;
6031
Jens Axboeb5325762020-05-19 21:20:27 -06006032 /* If the op doesn't have a file, we're not polling for it */
6033 if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
Jens Axboe11ba8202020-01-15 21:51:17 -07006034 const bool in_async = io_wq_current_is_worker();
6035
Jens Axboe11ba8202020-01-15 21:51:17 -07006036 /* workqueue context doesn't hold uring_lock, grab it now */
6037 if (in_async)
6038 mutex_lock(&ctx->uring_lock);
6039
Jens Axboe2b188cc2019-01-07 10:46:33 -07006040 io_iopoll_req_issued(req);
Jens Axboe11ba8202020-01-15 21:51:17 -07006041
6042 if (in_async)
6043 mutex_unlock(&ctx->uring_lock);
Jens Axboedef596e2019-01-09 08:59:42 -07006044 }
6045
6046 return 0;
6047}
6048
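/*
 * io-wq worker entry point: arm any linked timeout, then run the request
 * from blocking context, retrying on -EAGAIN since polled IO can still
 * report it even for a forced sync submission.
 */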
Pavel Begunkovf4db7182020-06-25 18:20:54 +03006049static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
Pavel Begunkovd4c81f32020-06-08 21:08:19 +03006050{
Jens Axboe2b188cc2019-01-07 10:46:33 -07006051 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
Pavel Begunkov6df1db62020-07-03 22:15:06 +03006052 struct io_kiocb *timeout;
Jens Axboe561fb042019-10-24 07:25:42 -06006053 int ret = 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07006054
Pavel Begunkov6df1db62020-07-03 22:15:06 +03006055 timeout = io_prep_linked_timeout(req);
6056 if (timeout)
6057 io_queue_linked_timeout(timeout);
Pavel Begunkovd4c81f32020-06-08 21:08:19 +03006058
Jens Axboe0c9d5cc2019-12-11 19:29:43 -07006059 /* if NO_CANCEL is set, we must still run the work */
6060 if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
6061 IO_WQ_WORK_CANCEL) {
Jens Axboe561fb042019-10-24 07:25:42 -06006062 ret = -ECANCELED;
Jens Axboe0c9d5cc2019-12-11 19:29:43 -07006063 }
Jens Axboe31b51512019-01-18 22:56:34 -07006064
Jens Axboe561fb042019-10-24 07:25:42 -06006065 if (!ret) {
Jens Axboe561fb042019-10-24 07:25:42 -06006066 do {
Jens Axboef13fad72020-06-22 09:34:30 -06006067 ret = io_issue_sqe(req, NULL, false, NULL);
Jens Axboe561fb042019-10-24 07:25:42 -06006068 /*
6069 * We can get EAGAIN for polled IO even though we're
6070 * forcing a sync submission from here, since we can't
6071 * wait for request slots on the block side.
6072 */
6073 if (ret != -EAGAIN)
6074 break;
6075 cond_resched();
6076 } while (1);
6077 }
Jens Axboe31b51512019-01-18 22:56:34 -07006078
Jens Axboe561fb042019-10-24 07:25:42 -06006079 if (ret) {
Jens Axboe4e88d6e2019-12-07 20:59:47 -07006080 req_set_fail_links(req);
Jens Axboee1e16092020-06-22 09:17:17 -06006081 io_req_complete(req, ret);
Jens Axboeedafcce2019-01-09 09:16:05 -07006082 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07006083
Pavel Begunkovf4db7182020-06-25 18:20:54 +03006084 return io_steal_work(req);
Jens Axboe31b51512019-01-18 22:56:34 -07006085}
Jens Axboe2b188cc2019-01-07 10:46:33 -07006086
Jens Axboe65e19f52019-10-26 07:20:21 -06006087static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
6088 int index)
Jens Axboe09bb8392019-03-13 12:39:28 -06006089{
Jens Axboe65e19f52019-10-26 07:20:21 -06006090 struct fixed_file_table *table;
6091
Jens Axboe05f3fb32019-12-09 11:22:50 -07006092 table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
Xiaoming Ni84695082020-05-11 19:25:43 +08006093 return table->files[index & IORING_FILE_TABLE_MASK];
Jens Axboe65e19f52019-10-26 07:20:21 -06006094}
6095
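/*
 * Resolve @fd to a struct file for the request: either index into the
 * registered (fixed) file table, taking a reference on its percpu refs,
 * or fall back to a normal lookup through __io_file_get().
 */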
Pavel Begunkov8da11c12020-02-24 11:32:44 +03006096static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
6097 int fd, struct file **out_file, bool fixed)
6098{
6099 struct io_ring_ctx *ctx = req->ctx;
6100 struct file *file;
6101
6102 if (fixed) {
6103 if (unlikely(!ctx->file_data ||
6104 (unsigned) fd >= ctx->nr_user_files))
6105 return -EBADF;
6106 fd = array_index_nospec(fd, ctx->nr_user_files);
6107 file = io_file_from_index(ctx, fd);
Jens Axboefd2206e2020-06-02 16:40:47 -06006108 if (file) {
6109 req->fixed_file_refs = ctx->file_data->cur_refs;
6110 percpu_ref_get(req->fixed_file_refs);
6111 }
Pavel Begunkov8da11c12020-02-24 11:32:44 +03006112 } else {
6113 trace_io_uring_file_get(ctx, fd);
6114 file = __io_file_get(state, fd);
Pavel Begunkov8da11c12020-02-24 11:32:44 +03006115 }
6116
Jens Axboefd2206e2020-06-02 16:40:47 -06006117 if (file || io_op_defs[req->opcode].needs_file_no_error) {
6118 *out_file = file;
6119 return 0;
6120 }
6121 return -EBADF;
Pavel Begunkov8da11c12020-02-24 11:32:44 +03006122}
6123
Jens Axboe3529d8c2019-12-19 18:24:38 -07006124static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
Jens Axboe63ff8222020-05-07 14:56:15 -06006125 int fd)
Jens Axboe09bb8392019-03-13 12:39:28 -06006126{
Pavel Begunkov8da11c12020-02-24 11:32:44 +03006127 bool fixed;
Jens Axboe09bb8392019-03-13 12:39:28 -06006128
Jens Axboe63ff8222020-05-07 14:56:15 -06006129 fixed = (req->flags & REQ_F_FIXED_FILE) != 0;
Pavel Begunkov0cdaf762020-05-17 14:13:40 +03006130 if (unlikely(!fixed && io_async_submit(req->ctx)))
Pavel Begunkov8da11c12020-02-24 11:32:44 +03006131 return -EBADF;
Jens Axboe09bb8392019-03-13 12:39:28 -06006132
Pavel Begunkov8da11c12020-02-24 11:32:44 +03006133 return io_file_get(state, req, fd, &req->file, fixed);
Jens Axboe09bb8392019-03-13 12:39:28 -06006134}
6135
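/*
 * Capture the submitting task's files_struct and nsproxy so that work
 * executed later (deferred or via io-wq) can still resolve file
 * descriptors, and track the request on the ctx inflight list.
 */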
Jackie Liua197f662019-11-08 08:09:12 -07006136static int io_grab_files(struct io_kiocb *req)
Jens Axboe2b188cc2019-01-07 10:46:33 -07006137{
Jackie Liua197f662019-11-08 08:09:12 -07006138 struct io_ring_ctx *ctx = req->ctx;
Jens Axboefcb323c2019-10-24 12:39:47 -06006139
Pavel Begunkovf56040b2020-07-23 20:25:21 +03006140 io_req_init_async(req);
6141
Jens Axboe5b0bbee2020-04-27 10:41:22 -06006142 if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE))
Jens Axboef86cd202020-01-29 13:46:44 -07006143 return 0;
Jens Axboeb5dba592019-12-11 14:02:38 -07006144
Jens Axboe0f212202020-09-13 13:09:39 -06006145 req->work.files = get_files_struct(current);
Jens Axboe9b828492020-09-18 20:13:06 -06006146 get_nsproxy(current->nsproxy);
6147 req->work.nsproxy = current->nsproxy;
Jens Axboe0f212202020-09-13 13:09:39 -06006148 req->flags |= REQ_F_INFLIGHT;
6149
Jens Axboefcb323c2019-10-24 12:39:47 -06006150 spin_lock_irq(&ctx->inflight_lock);
Jens Axboe0f212202020-09-13 13:09:39 -06006151 list_add(&req->inflight_entry, &ctx->inflight_list);
Jens Axboefcb323c2019-10-24 12:39:47 -06006152 spin_unlock_irq(&ctx->inflight_lock);
Jens Axboe0f212202020-09-13 13:09:39 -06006153 return 0;
Jens Axboefcb323c2019-10-24 12:39:47 -06006154}
6155
Pavel Begunkovf56040b2020-07-23 20:25:21 +03006156static inline int io_prep_work_files(struct io_kiocb *req)
6157{
6158 if (!io_op_defs[req->opcode].file_table)
6159 return 0;
6160 return io_grab_files(req);
6161}
6162
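/*
 * hrtimer callback for a linked timeout: if the request it is attached to
 * is still in flight, try to cancel that request, completing the timeout
 * with -ETIME on success; otherwise complete the timeout itself with
 * -ETIME.
 */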
Jens Axboe2665abf2019-11-05 12:40:47 -07006163static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
6164{
Jens Axboead8a48a2019-11-15 08:49:11 -07006165 struct io_timeout_data *data = container_of(timer,
6166 struct io_timeout_data, timer);
6167 struct io_kiocb *req = data->req;
Jens Axboe2665abf2019-11-05 12:40:47 -07006168 struct io_ring_ctx *ctx = req->ctx;
6169 struct io_kiocb *prev = NULL;
6170 unsigned long flags;
Jens Axboe2665abf2019-11-05 12:40:47 -07006171
6172 spin_lock_irqsave(&ctx->completion_lock, flags);
6173
6174 /*
 6175 * We don't expect the list to be empty; that will only happen if we
6176 * race with the completion of the linked work.
6177 */
Pavel Begunkov44932332019-12-05 16:16:35 +03006178 if (!list_empty(&req->link_list)) {
6179 prev = list_entry(req->link_list.prev, struct io_kiocb,
6180 link_list);
Jens Axboe5d960722019-11-19 15:31:28 -07006181 if (refcount_inc_not_zero(&prev->refs)) {
Pavel Begunkov44932332019-12-05 16:16:35 +03006182 list_del_init(&req->link_list);
Jens Axboe5d960722019-11-19 15:31:28 -07006183 prev->flags &= ~REQ_F_LINK_TIMEOUT;
6184 } else
Jens Axboe76a46e02019-11-10 23:34:16 -07006185 prev = NULL;
Jens Axboe2665abf2019-11-05 12:40:47 -07006186 }
6187
6188 spin_unlock_irqrestore(&ctx->completion_lock, flags);
6189
6190 if (prev) {
Jens Axboe4e88d6e2019-12-07 20:59:47 -07006191 req_set_fail_links(prev);
Pavel Begunkov014db002020-03-03 21:33:12 +03006192 io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
Jens Axboe76a46e02019-11-10 23:34:16 -07006193 io_put_req(prev);
Jens Axboe47f46762019-11-09 17:43:02 -07006194 } else {
Jens Axboee1e16092020-06-22 09:17:17 -06006195 io_req_complete(req, -ETIME);
Jens Axboe2665abf2019-11-05 12:40:47 -07006196 }
Jens Axboe2665abf2019-11-05 12:40:47 -07006197 return HRTIMER_NORESTART;
6198}
6199
Jens Axboe7271ef32020-08-10 09:55:22 -06006200static void __io_queue_linked_timeout(struct io_kiocb *req)
Jens Axboe2665abf2019-11-05 12:40:47 -07006201{
Jens Axboe76a46e02019-11-10 23:34:16 -07006202 /*
6203 * If the list is now empty, then our linked request finished before
 6204 * we got a chance to set up the timer.
6205 */
Pavel Begunkov44932332019-12-05 16:16:35 +03006206 if (!list_empty(&req->link_list)) {
Jens Axboee8c2bc12020-08-15 18:44:09 -07006207 struct io_timeout_data *data = req->async_data;
Jens Axboe94ae5e72019-11-14 19:39:52 -07006208
Jens Axboead8a48a2019-11-15 08:49:11 -07006209 data->timer.function = io_link_timeout_fn;
6210 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
6211 data->mode);
Jens Axboe2665abf2019-11-05 12:40:47 -07006212 }
Jens Axboe7271ef32020-08-10 09:55:22 -06006213}
6214
6215static void io_queue_linked_timeout(struct io_kiocb *req)
6216{
6217 struct io_ring_ctx *ctx = req->ctx;
6218
6219 spin_lock_irq(&ctx->completion_lock);
6220 __io_queue_linked_timeout(req);
Jens Axboe76a46e02019-11-10 23:34:16 -07006221 spin_unlock_irq(&ctx->completion_lock);
Jens Axboe2665abf2019-11-05 12:40:47 -07006222
Jens Axboe2665abf2019-11-05 12:40:47 -07006223 /* drop submission reference */
Jens Axboe76a46e02019-11-10 23:34:16 -07006224 io_put_req(req);
Jens Axboe2665abf2019-11-05 12:40:47 -07006225}
6226
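/*
 * If the first request linked to @req is an IORING_OP_LINK_TIMEOUT, flag
 * @req and return the timeout so the caller can arm it once @req has been
 * issued; returns NULL when there is no linked timeout to arm.
 */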
Jens Axboead8a48a2019-11-15 08:49:11 -07006227static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
Jens Axboe2665abf2019-11-05 12:40:47 -07006228{
6229 struct io_kiocb *nxt;
Jens Axboe2b188cc2019-01-07 10:46:33 -07006230
Pavel Begunkovdea3b492020-04-12 02:05:04 +03006231 if (!(req->flags & REQ_F_LINK_HEAD))
Jens Axboe2665abf2019-11-05 12:40:47 -07006232 return NULL;
Pavel Begunkov6df1db62020-07-03 22:15:06 +03006233 if (req->flags & REQ_F_LINK_TIMEOUT)
Jens Axboed7718a92020-02-14 22:23:12 -07006234 return NULL;
Jens Axboe2665abf2019-11-05 12:40:47 -07006235
Pavel Begunkov44932332019-12-05 16:16:35 +03006236 nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
6237 link_list);
Jens Axboed625c6e2019-12-17 19:53:05 -07006238 if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
Jens Axboe76a46e02019-11-10 23:34:16 -07006239 return NULL;
Jens Axboe2665abf2019-11-05 12:40:47 -07006240
Jens Axboe76a46e02019-11-10 23:34:16 -07006241 req->flags |= REQ_F_LINK_TIMEOUT;
Jens Axboe76a46e02019-11-10 23:34:16 -07006242 return nxt;
Jens Axboe2665abf2019-11-05 12:40:47 -07006243}
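/*
 * Illustrative userspace sketch (liburing, not part of this file): how a
 * linked timeout is armed from the submission side.  The I/O SQE carries
 * IOSQE_IO_LINK and is immediately followed by an IORING_OP_LINK_TIMEOUT
 * SQE, which is what io_prep_linked_timeout() above looks for on the
 * link_list.  Treat this as a sketch under those assumptions, not a
 * verified build.
 *
 *        struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *        struct io_uring_sqe *sqe;
 *
 *        sqe = io_uring_get_sqe(&ring);
 *        io_uring_prep_readv(sqe, fd, &iov, 1, 0);
 *        sqe->flags |= IOSQE_IO_LINK;
 *
 *        sqe = io_uring_get_sqe(&ring);
 *        io_uring_prep_link_timeout(sqe, &ts, 0);
 *
 *        io_uring_submit(&ring);
 *
 * If the read has not completed within a second, io_link_timeout_fn() fires,
 * the read is cancelled (typically completing with -ECANCELED) and the
 * timeout request itself completes with -ETIME, matching the -ETIME passed
 * to io_async_find_and_cancel() above.
 */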
6244
Jens Axboef13fad72020-06-22 09:34:30 -06006245static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
6246 struct io_comp_state *cs)
Jens Axboe2b188cc2019-01-07 10:46:33 -07006247{
Jens Axboe4a0a7a12019-12-09 20:01:01 -07006248 struct io_kiocb *linked_timeout;
Pavel Begunkov4bc44942020-02-29 22:48:24 +03006249 struct io_kiocb *nxt;
Jens Axboe193155c2020-02-22 23:22:19 -07006250 const struct cred *old_creds = NULL;
Jens Axboe2b188cc2019-01-07 10:46:33 -07006251 int ret;
6252
Jens Axboe4a0a7a12019-12-09 20:01:01 -07006253again:
6254 linked_timeout = io_prep_linked_timeout(req);
6255
Xiaoguang Wang7cdaf582020-06-10 19:41:19 +08006256 if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds &&
6257 req->work.creds != current_cred()) {
Jens Axboe193155c2020-02-22 23:22:19 -07006258 if (old_creds)
6259 revert_creds(old_creds);
6260 if (old_creds == req->work.creds)
6261 old_creds = NULL; /* restored original creds */
6262 else
6263 old_creds = override_creds(req->work.creds);
6264 }
6265
Jens Axboef13fad72020-06-22 09:34:30 -06006266 ret = io_issue_sqe(req, sqe, true, cs);
Jens Axboe491381ce2019-10-17 09:20:46 -06006267
6268 /*
6269 * We async punt it if the file wasn't marked NOWAIT, or if the file
6270 * doesn't support non-blocking read/write attempts
6271 */
Pavel Begunkov24c74672020-06-21 13:09:51 +03006272 if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
Pavel Begunkovf063c542020-07-25 14:41:59 +03006273 if (!io_arm_poll_handler(req)) {
Pavel Begunkov86a761f2020-01-22 23:09:36 +03006274punt:
Pavel Begunkovf063c542020-07-25 14:41:59 +03006275 ret = io_prep_work_files(req);
6276 if (unlikely(ret))
Pavel Begunkovbbad27b2019-11-19 23:32:47 +03006277 goto err;
Pavel Begunkovf063c542020-07-25 14:41:59 +03006278 /*
6279 * Queued up for async execution; the worker will release the
6280 * submit reference when the iocb is actually submitted.
6281 */
6282 io_queue_async_work(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -07006283 }
Pavel Begunkovbbad27b2019-11-19 23:32:47 +03006284
Pavel Begunkovf063c542020-07-25 14:41:59 +03006285 if (linked_timeout)
6286 io_queue_linked_timeout(linked_timeout);
Pavel Begunkov4bc44942020-02-29 22:48:24 +03006287 goto exit;
Jens Axboe2b188cc2019-01-07 10:46:33 -07006288 }
Jens Axboee65ef562019-03-12 10:16:44 -06006289
Pavel Begunkov652532a2020-07-03 22:15:07 +03006290 if (unlikely(ret)) {
Jens Axboefcb323c2019-10-24 12:39:47 -06006291err:
Pavel Begunkov652532a2020-07-03 22:15:07 +03006292		/* un-prep timeout, so it'll be killed like any other linked request */
6293 req->flags &= ~REQ_F_LINK_TIMEOUT;
Jens Axboe4e88d6e2019-12-07 20:59:47 -07006294 req_set_fail_links(req);
Jens Axboee65ef562019-03-12 10:16:44 -06006295 io_put_req(req);
Pavel Begunkov652532a2020-07-03 22:15:07 +03006296 io_req_complete(req, ret);
6297 goto exit;
Jens Axboe9e645e112019-05-10 16:07:28 -06006298 }
Pavel Begunkov652532a2020-07-03 22:15:07 +03006299
Jens Axboe6c271ce2019-01-10 11:22:30 -07006300 /* drop submission reference */
Pavel Begunkov9b5f7bd2020-06-29 13:13:00 +03006301 nxt = io_put_req_find_next(req);
Pavel Begunkov652532a2020-07-03 22:15:07 +03006302 if (linked_timeout)
6303 io_queue_linked_timeout(linked_timeout);
Jens Axboe6c271ce2019-01-10 11:22:30 -07006304
Jens Axboe4a0a7a12019-12-09 20:01:01 -07006305 if (nxt) {
6306 req = nxt;
Pavel Begunkov86a761f2020-01-22 23:09:36 +03006307
6308 if (req->flags & REQ_F_FORCE_ASYNC)
6309 goto punt;
Jens Axboe4a0a7a12019-12-09 20:01:01 -07006310 goto again;
6311 }
Pavel Begunkov4bc44942020-02-29 22:48:24 +03006312exit:
Jens Axboe193155c2020-02-22 23:22:19 -07006313 if (old_creds)
6314 revert_creds(old_creds);
Jens Axboe2b188cc2019-01-07 10:46:33 -07006315}
6316
Jens Axboef13fad72020-06-22 09:34:30 -06006317static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
6318 struct io_comp_state *cs)
Jackie Liu4fe2c962019-09-09 20:50:40 +08006319{
6320 int ret;
6321
Jens Axboe3529d8c2019-12-19 18:24:38 -07006322 ret = io_req_defer(req, sqe);
Jackie Liu4fe2c962019-09-09 20:50:40 +08006323 if (ret) {
6324 if (ret != -EIOCBQUEUED) {
Pavel Begunkov11185912020-01-22 23:09:35 +03006325fail_req:
Jens Axboe4e88d6e2019-12-07 20:59:47 -07006326 req_set_fail_links(req);
Jens Axboee1e16092020-06-22 09:17:17 -06006327 io_put_req(req);
6328 io_req_complete(req, ret);
Jackie Liu4fe2c962019-09-09 20:50:40 +08006329 }
Pavel Begunkov25508782019-12-30 21:24:47 +03006330 } else if (req->flags & REQ_F_FORCE_ASYNC) {
Jens Axboee8c2bc12020-08-15 18:44:09 -07006331 if (!req->async_data) {
Pavel Begunkovbd2ab182020-05-17 14:02:12 +03006332 ret = io_req_defer_prep(req, sqe);
Pavel Begunkov327d6d92020-07-15 12:46:51 +03006333 if (unlikely(ret))
Pavel Begunkovbd2ab182020-05-17 14:02:12 +03006334 goto fail_req;
6335 }
6336
Jens Axboece35a472019-12-17 08:04:44 -07006337 /*
6338 * Never try inline submit of IOSQE_ASYNC is set, go straight
6339 * to async execution.
6340 */
Pavel Begunkov3e863ea2020-07-23 20:17:20 +03006341 io_req_init_async(req);
Jens Axboece35a472019-12-17 08:04:44 -07006342 req->work.flags |= IO_WQ_WORK_CONCURRENT;
6343 io_queue_async_work(req);
6344 } else {
Jens Axboef13fad72020-06-22 09:34:30 -06006345 __io_queue_sqe(req, sqe, cs);
Jens Axboece35a472019-12-17 08:04:44 -07006346 }
Jackie Liu4fe2c962019-09-09 20:50:40 +08006347}
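/*
 * Userspace counterpart of the REQ_F_FORCE_ASYNC path above (illustrative
 * liburing sketch; assumes an already initialised 'ring', 'fd' and 'iov'):
 * setting IOSQE_ASYNC on the SQE skips the inline issue attempt entirely
 * and the request goes straight to io-wq.
 *
 *        sqe = io_uring_get_sqe(&ring);
 *        io_uring_prep_readv(sqe, fd, &iov, 1, 0);
 *        io_uring_sqe_set_flags(sqe, IOSQE_ASYNC);
 *        io_uring_submit(&ring);
 */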
6348
Jens Axboef13fad72020-06-22 09:34:30 -06006349static inline void io_queue_link_head(struct io_kiocb *req,
6350 struct io_comp_state *cs)
Jackie Liu4fe2c962019-09-09 20:50:40 +08006351{
Jens Axboe94ae5e72019-11-14 19:39:52 -07006352 if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
Jens Axboee1e16092020-06-22 09:17:17 -06006353 io_put_req(req);
6354 io_req_complete(req, -ECANCELED);
Pavel Begunkov1b4a51b2019-11-21 11:54:28 +03006355 } else
Jens Axboef13fad72020-06-22 09:34:30 -06006356 io_queue_sqe(req, NULL, cs);
Jackie Liu4fe2c962019-09-09 20:50:40 +08006357}
6358
Pavel Begunkov1d4240c2020-04-12 02:05:03 +03006359static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
Jens Axboef13fad72020-06-22 09:34:30 -06006360 struct io_kiocb **link, struct io_comp_state *cs)
Jens Axboe9e645e112019-05-10 16:07:28 -06006361{
Jackie Liua197f662019-11-08 08:09:12 -07006362 struct io_ring_ctx *ctx = req->ctx;
Pavel Begunkovef4ff582020-04-12 02:05:05 +03006363 int ret;
Jens Axboe9e645e112019-05-10 16:07:28 -06006364
Jens Axboe9e645e112019-05-10 16:07:28 -06006365 /*
6366 * If we already have a head request, queue this one for async
6367 * submittal once the head completes. If we don't have a head but
6368 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
6369 * submitted sync once the chain is complete. If none of those
6370 * conditions are true (normal request), then just queue it.
6371 */
6372 if (*link) {
Pavel Begunkov9d763772019-12-17 02:22:07 +03006373 struct io_kiocb *head = *link;
Jens Axboe9e645e112019-05-10 16:07:28 -06006374
Pavel Begunkov8cdf2192020-01-25 00:40:24 +03006375 /*
6376		 * Given the sequential execution of a link, draining both sides
6377		 * of the link also fulfils IOSQE_IO_DRAIN semantics for all
6378 * requests in the link. So, it drains the head and the
6379 * next after the link request. The last one is done via
6380 * drain_next flag to persist the effect across calls.
6381 */
Pavel Begunkovef4ff582020-04-12 02:05:05 +03006382 if (req->flags & REQ_F_IO_DRAIN) {
Pavel Begunkov711be032020-01-17 03:57:59 +03006383 head->flags |= REQ_F_IO_DRAIN;
6384 ctx->drain_next = 1;
6385 }
Jens Axboe3529d8c2019-12-19 18:24:38 -07006386 ret = io_req_defer_prep(req, sqe);
Pavel Begunkov327d6d92020-07-15 12:46:51 +03006387 if (unlikely(ret)) {
Jens Axboe4e88d6e2019-12-07 20:59:47 -07006388 /* fail even hard links since we don't submit */
Pavel Begunkov9d763772019-12-17 02:22:07 +03006389 head->flags |= REQ_F_FAIL_LINK;
Pavel Begunkov1d4240c2020-04-12 02:05:03 +03006390 return ret;
Jens Axboe2d283902019-12-04 11:08:05 -07006391 }
Pavel Begunkov9d763772019-12-17 02:22:07 +03006392 trace_io_uring_link(ctx, req, head);
6393 list_add_tail(&req->link_list, &head->link_list);
Jens Axboe9e645e112019-05-10 16:07:28 -06006394
Pavel Begunkov32fe5252019-12-17 22:26:58 +03006395 /* last request of a link, enqueue the link */
Pavel Begunkovef4ff582020-04-12 02:05:05 +03006396 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
Jens Axboef13fad72020-06-22 09:34:30 -06006397 io_queue_link_head(head, cs);
Pavel Begunkov32fe5252019-12-17 22:26:58 +03006398 *link = NULL;
6399 }
Jens Axboe9e645e112019-05-10 16:07:28 -06006400 } else {
Pavel Begunkov711be032020-01-17 03:57:59 +03006401 if (unlikely(ctx->drain_next)) {
6402 req->flags |= REQ_F_IO_DRAIN;
Pavel Begunkovef4ff582020-04-12 02:05:05 +03006403 ctx->drain_next = 0;
Pavel Begunkov711be032020-01-17 03:57:59 +03006404 }
Pavel Begunkovef4ff582020-04-12 02:05:05 +03006405 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
Pavel Begunkovdea3b492020-04-12 02:05:04 +03006406 req->flags |= REQ_F_LINK_HEAD;
Pavel Begunkov711be032020-01-17 03:57:59 +03006407 INIT_LIST_HEAD(&req->link_list);
Pavel Begunkovf1d96a82020-03-13 22:29:14 +03006408
Pavel Begunkov711be032020-01-17 03:57:59 +03006409 ret = io_req_defer_prep(req, sqe);
Pavel Begunkov327d6d92020-07-15 12:46:51 +03006410 if (unlikely(ret))
Pavel Begunkov711be032020-01-17 03:57:59 +03006411 req->flags |= REQ_F_FAIL_LINK;
6412 *link = req;
6413 } else {
Jens Axboef13fad72020-06-22 09:34:30 -06006414 io_queue_sqe(req, sqe, cs);
Pavel Begunkov711be032020-01-17 03:57:59 +03006415 }
Jens Axboe9e645e112019-05-10 16:07:28 -06006416 }
Pavel Begunkov2e6e1fd2019-12-05 16:15:45 +03006417
Pavel Begunkov1d4240c2020-04-12 02:05:03 +03006418 return 0;
Jens Axboe9e645e112019-05-10 16:07:28 -06006419}
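/*
 * Example of the link/drain semantics handled above (illustrative liburing
 * sketch, not part of this file): a write hard-linked to an fsync, with the
 * fsync additionally marked IOSQE_IO_DRAIN.  Per the comment in
 * io_submit_sqe(), the drain flag is propagated to the head so the whole
 * chain waits behind prior submissions, and drain_next keeps the effect for
 * the request that follows the chain.
 *
 *        sqe = io_uring_get_sqe(&ring);
 *        io_uring_prep_writev(sqe, fd, &iov, 1, 0);
 *        sqe->flags |= IOSQE_IO_HARDLINK;
 *
 *        sqe = io_uring_get_sqe(&ring);
 *        io_uring_prep_fsync(sqe, fd, 0);
 *        sqe->flags |= IOSQE_IO_DRAIN;
 *
 *        io_uring_submit(&ring);
 */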
6420
Jens Axboe9a56a232019-01-09 09:06:50 -07006421/*
6422 * Batched submission is done, ensure local IO is flushed out.
6423 */
6424static void io_submit_state_end(struct io_submit_state *state)
6425{
Jens Axboef13fad72020-06-22 09:34:30 -06006426 if (!list_empty(&state->comp.list))
6427 io_submit_flush_completions(&state->comp);
Jens Axboe9a56a232019-01-09 09:06:50 -07006428 blk_finish_plug(&state->plug);
Pavel Begunkov9f13c352020-05-17 14:13:41 +03006429 io_state_file_put(state);
Jens Axboe2579f912019-01-09 09:10:43 -07006430 if (state->free_reqs)
Pavel Begunkov6c8a3132020-02-01 03:58:00 +03006431 kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
Jens Axboe9a56a232019-01-09 09:06:50 -07006432}
6433
6434/*
6435 * Start submission side cache.
6436 */
6437static void io_submit_state_start(struct io_submit_state *state,
Jens Axboe013538b2020-06-22 09:29:15 -06006438 struct io_ring_ctx *ctx, unsigned int max_ios)
Jens Axboe9a56a232019-01-09 09:06:50 -07006439{
6440 blk_start_plug(&state->plug);
Jens Axboe013538b2020-06-22 09:29:15 -06006441 state->comp.nr = 0;
6442 INIT_LIST_HEAD(&state->comp.list);
6443 state->comp.ctx = ctx;
Jens Axboe2579f912019-01-09 09:10:43 -07006444 state->free_reqs = 0;
Jens Axboe9a56a232019-01-09 09:06:50 -07006445 state->file = NULL;
6446 state->ios_left = max_ios;
6447}
6448
Jens Axboe2b188cc2019-01-07 10:46:33 -07006449static void io_commit_sqring(struct io_ring_ctx *ctx)
6450{
Hristo Venev75b28af2019-08-26 17:23:46 +00006451 struct io_rings *rings = ctx->rings;
Jens Axboe2b188cc2019-01-07 10:46:33 -07006452
Pavel Begunkovcaf582c2019-12-30 21:24:46 +03006453 /*
6454 * Ensure any loads from the SQEs are done at this point,
6455 * since once we write the new head, the application could
6456 * write new data to them.
6457 */
6458 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
Jens Axboe2b188cc2019-01-07 10:46:33 -07006459}
6460
6461/*
Jens Axboe3529d8c2019-12-19 18:24:38 -07006462 * Fetch an sqe, if one is available. Note that the returned sqe will point to memory
Jens Axboe2b188cc2019-01-07 10:46:33 -07006463 * that is mapped by userspace. This means that care needs to be taken to
6464 * ensure that reads are stable, as we cannot rely on userspace always
6465 * being a good citizen. If members of the sqe are validated and then later
6466 * used, it's important that those reads are done through READ_ONCE() to
6467 * prevent a re-load down the line.
6468 */
Pavel Begunkov709b3022020-04-08 08:58:43 +03006469static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
Jens Axboe2b188cc2019-01-07 10:46:33 -07006470{
Hristo Venev75b28af2019-08-26 17:23:46 +00006471 u32 *sq_array = ctx->sq_array;
Jens Axboe2b188cc2019-01-07 10:46:33 -07006472 unsigned head;
6473
6474 /*
6475 * The cached sq head (or cq tail) serves two purposes:
6476 *
6477 * 1) allows us to batch the cost of updating the user visible
6478	 *    head.
6479 * 2) allows the kernel side to track the head on its own, even
6480 * though the application is the one updating it.
6481 */
Pavel Begunkovee7d46d2019-12-30 21:24:45 +03006482 head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
Pavel Begunkov709b3022020-04-08 08:58:43 +03006483 if (likely(head < ctx->sq_entries))
6484 return &ctx->sq_sqes[head];
Jens Axboe2b188cc2019-01-07 10:46:33 -07006485
6486 /* drop invalid entries */
Jens Axboe498ccd92019-10-25 10:04:25 -06006487 ctx->cached_sq_dropped++;
Pavel Begunkovee7d46d2019-12-30 21:24:45 +03006488 WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
Pavel Begunkov709b3022020-04-08 08:58:43 +03006489 return NULL;
6490}
6491
6492static inline void io_consume_sqe(struct io_ring_ctx *ctx)
6493{
6494 ctx->cached_sq_head++;
Jens Axboe2b188cc2019-01-07 10:46:33 -07006495}
6496
Stefano Garzarella21b55db2020-08-27 16:58:30 +02006497/*
6498 * Check SQE restrictions (opcode and flags).
6499 *
6500 * Returns 'true' if SQE is allowed, 'false' otherwise.
6501 */
6502static inline bool io_check_restriction(struct io_ring_ctx *ctx,
6503 struct io_kiocb *req,
6504 unsigned int sqe_flags)
6505{
6506 if (!ctx->restricted)
6507 return true;
6508
6509 if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
6510 return false;
6511
6512 if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
6513 ctx->restrictions.sqe_flags_required)
6514 return false;
6515
6516 if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
6517 ctx->restrictions.sqe_flags_required))
6518 return false;
6519
6520 return true;
6521}
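/*
 * How these restrictions get set up from userspace (illustrative sketch;
 * the constant and field names below are as the author recalls them from
 * the uapi header added with this feature, so verify them against
 * include/uapi/linux/io_uring.h): the ring is created with
 * IORING_SETUP_R_DISABLED, restrictions are registered while it is
 * disabled, and the ring is then enabled.
 *
 *        struct io_uring_restriction res[2] = {};
 *
 *        res[0].opcode = IORING_RESTRICTION_SQE_OP;
 *        res[0].sqe_op = IORING_OP_READV;
 *        res[1].opcode = IORING_RESTRICTION_SQE_FLAGS_ALLOWED;
 *        res[1].sqe_flags = IOSQE_IO_LINK;
 *
 *        syscall(__NR_io_uring_register, ring_fd,
 *                IORING_REGISTER_RESTRICTIONS, res, 2);
 *        syscall(__NR_io_uring_register, ring_fd,
 *                IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 *
 * After that, io_check_restriction() causes any SQE whose opcode is not
 * READV, or whose flags fall outside IOSQE_IO_LINK, to fail with -EACCES.
 */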
6522
Pavel Begunkovef4ff582020-04-12 02:05:05 +03006523#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
6524 IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
6525 IOSQE_BUFFER_SELECT)
6526
6527static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
6528 const struct io_uring_sqe *sqe,
Pavel Begunkov0cdaf762020-05-17 14:13:40 +03006529 struct io_submit_state *state)
Pavel Begunkov0553b8b2020-04-08 08:58:45 +03006530{
Pavel Begunkovef4ff582020-04-12 02:05:05 +03006531 unsigned int sqe_flags;
Jens Axboe63ff8222020-05-07 14:56:15 -06006532 int id;
Pavel Begunkovef4ff582020-04-12 02:05:05 +03006533
Pavel Begunkov0553b8b2020-04-08 08:58:45 +03006534 req->opcode = READ_ONCE(sqe->opcode);
6535 req->user_data = READ_ONCE(sqe->user_data);
Jens Axboee8c2bc12020-08-15 18:44:09 -07006536 req->async_data = NULL;
Pavel Begunkov0553b8b2020-04-08 08:58:45 +03006537 req->file = NULL;
6538 req->ctx = ctx;
6539 req->flags = 0;
6540 /* one is dropped after submission, the other at completion */
6541 refcount_set(&req->refs, 2);
Pavel Begunkov4dd28242020-06-15 10:33:13 +03006542 req->task = current;
Jens Axboee3bc8e92020-09-24 08:45:57 -06006543 get_task_struct(req->task);
Jens Axboe0f212202020-09-13 13:09:39 -06006544 atomic_long_inc(&req->task->io_uring->req_issue);
Pavel Begunkov0553b8b2020-04-08 08:58:45 +03006545 req->result = 0;
Pavel Begunkovef4ff582020-04-12 02:05:05 +03006546
6547 if (unlikely(req->opcode >= IORING_OP_LAST))
6548 return -EINVAL;
6549
Jens Axboe9d8426a2020-06-16 18:42:49 -06006550 if (unlikely(io_sq_thread_acquire_mm(ctx, req)))
6551 return -EFAULT;
Pavel Begunkovef4ff582020-04-12 02:05:05 +03006552
6553 sqe_flags = READ_ONCE(sqe->flags);
6554 /* enforce forwards compatibility on users */
6555 if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
6556 return -EINVAL;
6557
Stefano Garzarella21b55db2020-08-27 16:58:30 +02006558 if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
6559 return -EACCES;
6560
Pavel Begunkovef4ff582020-04-12 02:05:05 +03006561 if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
6562 !io_op_defs[req->opcode].buffer_select)
6563 return -EOPNOTSUPP;
6564
6565 id = READ_ONCE(sqe->personality);
6566 if (id) {
Xiaoguang Wang7cdaf582020-06-10 19:41:19 +08006567 io_req_init_async(req);
Pavel Begunkovef4ff582020-04-12 02:05:05 +03006568 req->work.creds = idr_find(&ctx->personality_idr, id);
6569 if (unlikely(!req->work.creds))
6570 return -EINVAL;
6571 get_cred(req->work.creds);
6572 }
6573
6574 /* same numerical values with corresponding REQ_F_*, safe to copy */
Pavel Begunkovc11368a52020-05-17 14:13:42 +03006575 req->flags |= sqe_flags;
Pavel Begunkovef4ff582020-04-12 02:05:05 +03006576
Jens Axboe63ff8222020-05-07 14:56:15 -06006577 if (!io_op_defs[req->opcode].needs_file)
6578 return 0;
6579
6580 return io_req_set_file(state, req, READ_ONCE(sqe->fd));
Pavel Begunkov0553b8b2020-04-08 08:58:45 +03006581}
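/*
 * The sqe->personality handling above pairs with IORING_REGISTER_PERSONALITY
 * on the registration side.  A hedged liburing sketch (assuming the
 * io_uring_register_personality() helper provided by liburing and an
 * initialised 'ring'):
 *
 *        int id = io_uring_register_personality(&ring);
 *
 *        sqe = io_uring_get_sqe(&ring);
 *        io_uring_prep_openat(sqe, AT_FDCWD, "/etc/hostname", O_RDONLY, 0);
 *        sqe->personality = id;
 *        io_uring_submit(&ring);
 *
 * The request is then issued with the credentials the task had when the
 * personality was registered (the idr_find() + get_cred() above), rather
 * than the submitter's current credentials.
 */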
6582
Jens Axboe0f212202020-09-13 13:09:39 -06006583static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
Jens Axboe6c271ce2019-01-10 11:22:30 -07006584{
Jens Axboeac8691c2020-06-01 08:30:41 -06006585 struct io_submit_state state;
Jens Axboe9e645e112019-05-10 16:07:28 -06006586 struct io_kiocb *link = NULL;
Jens Axboe9e645e112019-05-10 16:07:28 -06006587 int i, submitted = 0;
Jens Axboe6c271ce2019-01-10 11:22:30 -07006588
Jens Axboec4a2ed72019-11-21 21:01:26 -07006589 /* if we have a backlog and couldn't flush it all, return BUSY */
Jens Axboead3eb2c2019-12-18 17:12:20 -07006590 if (test_bit(0, &ctx->sq_check_overflow)) {
6591 if (!list_empty(&ctx->cq_overflow_list) &&
Jens Axboee6c8aa92020-09-28 13:10:13 -06006592 !io_cqring_overflow_flush(ctx, false, NULL, NULL))
Jens Axboead3eb2c2019-12-18 17:12:20 -07006593 return -EBUSY;
6594 }
Jens Axboe6c271ce2019-01-10 11:22:30 -07006595
Pavel Begunkovee7d46d2019-12-30 21:24:45 +03006596 /* make sure SQ entry isn't read before tail */
6597 nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
Pavel Begunkov9ef4f122019-12-30 21:24:44 +03006598
Pavel Begunkov2b85edf2019-12-28 14:13:03 +03006599 if (!percpu_ref_tryget_many(&ctx->refs, nr))
6600 return -EAGAIN;
Jens Axboe6c271ce2019-01-10 11:22:30 -07006601
Jens Axboe013538b2020-06-22 09:29:15 -06006602 io_submit_state_start(&state, ctx, nr);
Jens Axboe6c271ce2019-01-10 11:22:30 -07006603
6604 for (i = 0; i < nr; i++) {
Jens Axboe3529d8c2019-12-19 18:24:38 -07006605 const struct io_uring_sqe *sqe;
Pavel Begunkov196be952019-11-07 01:41:06 +03006606 struct io_kiocb *req;
Pavel Begunkov1cb1edb2020-02-06 21:16:09 +03006607 int err;
Pavel Begunkovfb5ccc92019-10-25 12:31:30 +03006608
Pavel Begunkovb1e50e52020-04-08 08:58:44 +03006609 sqe = io_get_sqe(ctx);
6610 if (unlikely(!sqe)) {
6611 io_consume_sqe(ctx);
6612 break;
6613 }
Jens Axboeac8691c2020-06-01 08:30:41 -06006614 req = io_alloc_req(ctx, &state);
Pavel Begunkov196be952019-11-07 01:41:06 +03006615 if (unlikely(!req)) {
6616 if (!submitted)
6617 submitted = -EAGAIN;
Pavel Begunkovfb5ccc92019-10-25 12:31:30 +03006618 break;
Jens Axboe9e645e112019-05-10 16:07:28 -06006619 }
Jens Axboe9e645e112019-05-10 16:07:28 -06006620
Jens Axboeac8691c2020-06-01 08:30:41 -06006621 err = io_init_req(ctx, req, sqe, &state);
Pavel Begunkov709b3022020-04-08 08:58:43 +03006622 io_consume_sqe(ctx);
Jens Axboed3656342019-12-18 09:50:26 -07006623 /* will complete beyond this point, count as submitted */
6624 submitted++;
6625
Pavel Begunkovef4ff582020-04-12 02:05:05 +03006626 if (unlikely(err)) {
Pavel Begunkov1cb1edb2020-02-06 21:16:09 +03006627fail_req:
Jens Axboee1e16092020-06-22 09:17:17 -06006628 io_put_req(req);
6629 io_req_complete(req, err);
Jens Axboed3656342019-12-18 09:50:26 -07006630 break;
6631 }
6632
Jens Axboe354420f2020-01-08 18:55:15 -07006633 trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
Pavel Begunkov0cdaf762020-05-17 14:13:40 +03006634 true, io_async_submit(ctx));
Jens Axboef13fad72020-06-22 09:34:30 -06006635 err = io_submit_sqe(req, sqe, &link, &state.comp);
Pavel Begunkov1d4240c2020-04-12 02:05:03 +03006636 if (err)
6637 goto fail_req;
Jens Axboe6c271ce2019-01-10 11:22:30 -07006638 }
6639
Pavel Begunkov9466f432020-01-25 22:34:01 +03006640 if (unlikely(submitted != nr)) {
6641 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
6642
6643 percpu_ref_put_many(&ctx->refs, nr - ref_used);
6644 }
Jens Axboe9e645e112019-05-10 16:07:28 -06006645 if (link)
Jens Axboef13fad72020-06-22 09:34:30 -06006646 io_queue_link_head(link, &state.comp);
Jens Axboeac8691c2020-06-01 08:30:41 -06006647 io_submit_state_end(&state);
Jens Axboe6c271ce2019-01-10 11:22:30 -07006648
Pavel Begunkovae9428c2019-11-06 00:22:14 +03006649 /* Commit SQ ring head once we've consumed and submitted all SQEs */
6650 io_commit_sqring(ctx);
6651
Jens Axboe6c271ce2019-01-10 11:22:30 -07006652 return submitted;
6653}
6654
Xiaoguang Wang23b36282020-07-23 20:57:24 +08006655static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
6656{
6657 /* Tell userspace we may need a wakeup call */
6658 spin_lock_irq(&ctx->completion_lock);
6659 ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
6660 spin_unlock_irq(&ctx->completion_lock);
6661}
6662
6663static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
6664{
6665 spin_lock_irq(&ctx->completion_lock);
6666 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6667 spin_unlock_irq(&ctx->completion_lock);
6668}
6669
Jens Axboe3f0e64d2020-09-02 12:42:47 -06006670static int io_sq_wake_function(struct wait_queue_entry *wqe, unsigned mode,
6671 int sync, void *key)
6672{
6673 struct io_ring_ctx *ctx = container_of(wqe, struct io_ring_ctx, sqo_wait_entry);
6674 int ret;
6675
6676 ret = autoremove_wake_function(wqe, mode, sync, key);
6677 if (ret) {
6678 unsigned long flags;
6679
6680 spin_lock_irqsave(&ctx->completion_lock, flags);
6681 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6682 spin_unlock_irqrestore(&ctx->completion_lock, flags);
6683 }
6684 return ret;
6685}
6686
Jens Axboec8d1ba52020-09-14 11:07:26 -06006687enum sq_ret {
6688 SQT_IDLE = 1,
6689 SQT_SPIN = 2,
6690 SQT_DID_WORK = 4,
6691};
6692
6693static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx,
Jens Axboee95eee22020-09-08 09:11:32 -06006694 unsigned long start_jiffies, bool cap_entries)
Jens Axboec8d1ba52020-09-14 11:07:26 -06006695{
6696 unsigned long timeout = start_jiffies + ctx->sq_thread_idle;
Jens Axboe534ca6d2020-09-02 13:52:19 -06006697 struct io_sq_data *sqd = ctx->sq_data;
Jens Axboec8d1ba52020-09-14 11:07:26 -06006698 unsigned int to_submit;
6699 int ret = 0;
6700
6701again:
6702 if (!list_empty(&ctx->iopoll_list)) {
6703 unsigned nr_events = 0;
6704
6705 mutex_lock(&ctx->uring_lock);
6706 if (!list_empty(&ctx->iopoll_list) && !need_resched())
6707 io_do_iopoll(ctx, &nr_events, 0);
6708 mutex_unlock(&ctx->uring_lock);
6709 }
6710
6711 to_submit = io_sqring_entries(ctx);
6712
6713 /*
6714 * If submit got -EBUSY, flag us as needing the application
6715 * to enter the kernel to reap and flush events.
6716 */
6717 if (!to_submit || ret == -EBUSY || need_resched()) {
6718 /*
6719		 * Drop cur_mm before scheduling; we can't hold it for
6720 * long periods (or over schedule()). Do this before
6721 * adding ourselves to the waitqueue, as the unuse/drop
6722 * may sleep.
6723 */
6724 io_sq_thread_drop_mm();
6725
6726 /*
6727 * We're polling. If we're within the defined idle
6728 * period, then let us spin without work before going
6729 * to sleep. The exception is if we got EBUSY doing
6730		 * more IO; in that case we should wait for the application to
6731 * reap events and wake us up.
6732 */
6733 if (!list_empty(&ctx->iopoll_list) || need_resched() ||
6734 (!time_after(jiffies, timeout) && ret != -EBUSY &&
6735 !percpu_ref_is_dying(&ctx->refs)))
6736 return SQT_SPIN;
6737
Jens Axboe534ca6d2020-09-02 13:52:19 -06006738 prepare_to_wait(&sqd->wait, &ctx->sqo_wait_entry,
Jens Axboec8d1ba52020-09-14 11:07:26 -06006739 TASK_INTERRUPTIBLE);
6740
6741 /*
6742 * While doing polled IO, before going to sleep, we need
6743		 * to check if there are new reqs added to iopoll_list;
6744		 * reqs may have been punted to the io worker and will be
6745		 * added to iopoll_list later, hence check
6746 * the iopoll_list again.
6747 */
6748 if ((ctx->flags & IORING_SETUP_IOPOLL) &&
6749 !list_empty_careful(&ctx->iopoll_list)) {
Jens Axboe534ca6d2020-09-02 13:52:19 -06006750 finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
Jens Axboec8d1ba52020-09-14 11:07:26 -06006751 goto again;
6752 }
6753
Jens Axboec8d1ba52020-09-14 11:07:26 -06006754 to_submit = io_sqring_entries(ctx);
6755 if (!to_submit || ret == -EBUSY)
6756 return SQT_IDLE;
6757 }
6758
Jens Axboe534ca6d2020-09-02 13:52:19 -06006759 finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
Jens Axboec8d1ba52020-09-14 11:07:26 -06006760 io_ring_clear_wakeup_flag(ctx);
6761
Jens Axboee95eee22020-09-08 09:11:32 -06006762 /* if we're handling multiple rings, cap submit size for fairness */
6763 if (cap_entries && to_submit > 8)
6764 to_submit = 8;
6765
Jens Axboec8d1ba52020-09-14 11:07:26 -06006766 mutex_lock(&ctx->uring_lock);
6767 if (likely(!percpu_ref_is_dying(&ctx->refs)))
6768 ret = io_submit_sqes(ctx, to_submit);
6769 mutex_unlock(&ctx->uring_lock);
Jens Axboe90554202020-09-03 12:12:41 -06006770
6771 if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
6772 wake_up(&ctx->sqo_sq_wait);
6773
Jens Axboec8d1ba52020-09-14 11:07:26 -06006774 return SQT_DID_WORK;
6775}
6776
Jens Axboe69fb2132020-09-14 11:16:23 -06006777static void io_sqd_init_new(struct io_sq_data *sqd)
6778{
6779 struct io_ring_ctx *ctx;
6780
6781 while (!list_empty(&sqd->ctx_new_list)) {
6782 ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list);
6783 init_wait(&ctx->sqo_wait_entry);
6784 ctx->sqo_wait_entry.func = io_sq_wake_function;
6785 list_move_tail(&ctx->sqd_list, &sqd->ctx_list);
6786 complete(&ctx->sq_thread_comp);
6787 }
6788}
6789
Jens Axboe6c271ce2019-01-10 11:22:30 -07006790static int io_sq_thread(void *data)
6791{
Jens Axboe69fb2132020-09-14 11:16:23 -06006792 const struct cred *old_cred = NULL;
6793 struct io_sq_data *sqd = data;
6794 struct io_ring_ctx *ctx;
Jens Axboec8d1ba52020-09-14 11:07:26 -06006795 unsigned long start_jiffies;
Jens Axboe6c271ce2019-01-10 11:22:30 -07006796
Jens Axboec8d1ba52020-09-14 11:07:26 -06006797 start_jiffies = jiffies;
Jens Axboe69fb2132020-09-14 11:16:23 -06006798 while (!kthread_should_stop()) {
6799 enum sq_ret ret = 0;
Jens Axboee95eee22020-09-08 09:11:32 -06006800 bool cap_entries;
Jens Axboe6c271ce2019-01-10 11:22:30 -07006801
Jens Axboe69fb2132020-09-14 11:16:23 -06006802 /*
6803 * Any changes to the sqd lists are synchronized through the
6804		 * kthread parking. This synchronizes the thread vs users;
6805 * the users are synchronized on the sqd->ctx_lock.
6806 */
6807 if (kthread_should_park())
6808 kthread_parkme();
6809
6810 if (unlikely(!list_empty(&sqd->ctx_new_list)))
6811 io_sqd_init_new(sqd);
6812
Jens Axboee95eee22020-09-08 09:11:32 -06006813 cap_entries = !list_is_singular(&sqd->ctx_list);
6814
Jens Axboe69fb2132020-09-14 11:16:23 -06006815 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
6816 if (current->cred != ctx->creds) {
6817 if (old_cred)
6818 revert_creds(old_cred);
6819 old_cred = override_creds(ctx->creds);
6820 }
6821
Jens Axboee95eee22020-09-08 09:11:32 -06006822 ret |= __io_sq_thread(ctx, start_jiffies, cap_entries);
Jens Axboe69fb2132020-09-14 11:16:23 -06006823
6824 io_sq_thread_drop_mm();
6825 }
6826
6827 if (ret & SQT_SPIN) {
Jens Axboec8d1ba52020-09-14 11:07:26 -06006828 io_run_task_work();
6829 cond_resched();
Jens Axboe69fb2132020-09-14 11:16:23 -06006830 } else if (ret == SQT_IDLE) {
6831 if (kthread_should_park())
6832 continue;
6833 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6834 io_ring_set_wakeup_flag(ctx);
6835 schedule();
6836 start_jiffies = jiffies;
6837 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6838 io_ring_clear_wakeup_flag(ctx);
Jens Axboe6c271ce2019-01-10 11:22:30 -07006839 }
Jens Axboe6c271ce2019-01-10 11:22:30 -07006840 }
6841
Jens Axboe4c6e2772020-07-01 11:29:10 -06006842 io_run_task_work();
Jens Axboeb41e9852020-02-17 09:52:41 -07006843
Jens Axboe69fb2132020-09-14 11:16:23 -06006844 if (old_cred)
6845 revert_creds(old_cred);
Jens Axboe06058632019-04-13 09:26:03 -06006846
Roman Penyaev2bbcd6d2019-05-16 10:53:57 +02006847 kthread_parkme();
Jens Axboe06058632019-04-13 09:26:03 -06006848
Jens Axboe6c271ce2019-01-10 11:22:30 -07006849 return 0;
6850}
6851
Jens Axboebda52162019-09-24 13:47:15 -06006852struct io_wait_queue {
6853 struct wait_queue_entry wq;
6854 struct io_ring_ctx *ctx;
6855 unsigned to_wait;
6856 unsigned nr_timeouts;
6857};
6858
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07006859static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
Jens Axboebda52162019-09-24 13:47:15 -06006860{
6861 struct io_ring_ctx *ctx = iowq->ctx;
6862
6863 /*
Brian Gianforcarod195a662019-12-13 03:09:50 -08006864 * Wake up if we have enough events, or if a timeout occurred since we
Jens Axboebda52162019-09-24 13:47:15 -06006865 * started waiting. For timeouts, we always want to return to userspace,
6866 * regardless of event count.
6867 */
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07006868 return io_cqring_events(ctx, noflush) >= iowq->to_wait ||
Jens Axboebda52162019-09-24 13:47:15 -06006869 atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
6870}
6871
6872static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
6873 int wake_flags, void *key)
6874{
6875 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
6876 wq);
6877
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07006878 /* use noflush == true, as we can't safely rely on locking context */
6879 if (!io_should_wake(iowq, true))
Jens Axboebda52162019-09-24 13:47:15 -06006880 return -1;
6881
6882 return autoremove_wake_function(curr, mode, wake_flags, key);
6883}
6884
Jens Axboe2b188cc2019-01-07 10:46:33 -07006885/*
6886 * Wait until events become available, if we don't already have some. The
6887 * application must reap them itself, as they reside on the shared cq ring.
6888 */
6889static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
6890 const sigset_t __user *sig, size_t sigsz)
6891{
Jens Axboebda52162019-09-24 13:47:15 -06006892 struct io_wait_queue iowq = {
6893 .wq = {
6894 .private = current,
6895 .func = io_wake_function,
6896 .entry = LIST_HEAD_INIT(iowq.wq.entry),
6897 },
6898 .ctx = ctx,
6899 .to_wait = min_events,
6900 };
Hristo Venev75b28af2019-08-26 17:23:46 +00006901 struct io_rings *rings = ctx->rings;
Jackie Liue9ffa5c2019-10-29 11:16:42 +08006902 int ret = 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07006903
Jens Axboeb41e9852020-02-17 09:52:41 -07006904 do {
6905 if (io_cqring_events(ctx, false) >= min_events)
6906 return 0;
Jens Axboe4c6e2772020-07-01 11:29:10 -06006907 if (!io_run_task_work())
Jens Axboeb41e9852020-02-17 09:52:41 -07006908 break;
Jens Axboeb41e9852020-02-17 09:52:41 -07006909 } while (1);
Jens Axboe2b188cc2019-01-07 10:46:33 -07006910
6911 if (sig) {
Arnd Bergmann9e75ad52019-03-25 15:34:53 +01006912#ifdef CONFIG_COMPAT
6913 if (in_compat_syscall())
6914 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
Oleg Nesterovb7724342019-07-16 16:29:53 -07006915 sigsz);
Arnd Bergmann9e75ad52019-03-25 15:34:53 +01006916 else
6917#endif
Oleg Nesterovb7724342019-07-16 16:29:53 -07006918 ret = set_user_sigmask(sig, sigsz);
Arnd Bergmann9e75ad52019-03-25 15:34:53 +01006919
Jens Axboe2b188cc2019-01-07 10:46:33 -07006920 if (ret)
6921 return ret;
6922 }
6923
Jens Axboebda52162019-09-24 13:47:15 -06006924 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
Dmitrii Dolgovc826bd72019-10-15 19:02:01 +02006925 trace_io_uring_cqring_wait(ctx, min_events);
Jens Axboebda52162019-09-24 13:47:15 -06006926 do {
6927 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
6928 TASK_INTERRUPTIBLE);
Jens Axboece593a62020-06-30 12:39:05 -06006929 /* make sure we run task_work before checking for signals */
Jens Axboe4c6e2772020-07-01 11:29:10 -06006930 if (io_run_task_work())
6931 continue;
Jens Axboece593a62020-06-30 12:39:05 -06006932 if (signal_pending(current)) {
Jens Axboeb7db41c2020-07-04 08:55:50 -06006933 if (current->jobctl & JOBCTL_TASK_WORK) {
6934 spin_lock_irq(&current->sighand->siglock);
6935 current->jobctl &= ~JOBCTL_TASK_WORK;
6936 recalc_sigpending();
6937 spin_unlock_irq(&current->sighand->siglock);
6938 continue;
6939 }
6940 ret = -EINTR;
Jens Axboece593a62020-06-30 12:39:05 -06006941 break;
6942 }
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07006943 if (io_should_wake(&iowq, false))
Jens Axboebda52162019-09-24 13:47:15 -06006944 break;
6945 schedule();
Jens Axboebda52162019-09-24 13:47:15 -06006946 } while (1);
6947 finish_wait(&ctx->wait, &iowq.wq);
6948
Jens Axboeb7db41c2020-07-04 08:55:50 -06006949 restore_saved_sigmask_unless(ret == -EINTR);
Jens Axboe2b188cc2019-01-07 10:46:33 -07006950
Hristo Venev75b28af2019-08-26 17:23:46 +00006951 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07006952}
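/*
 * The typical userspace pairing for io_cqring_wait() (illustrative liburing
 * sketch): wait for at least one completion, consume it, and mark it seen so
 * the CQ head advances.  handle_completion() is a stand-in for application
 * code, not a real helper.
 *
 *        struct io_uring_cqe *cqe;
 *        int ret;
 *
 *        ret = io_uring_wait_cqe(&ring, &cqe);
 *        if (ret == 0) {
 *                handle_completion(cqe->user_data, cqe->res);
 *                io_uring_cqe_seen(&ring, cqe);
 *        }
 *
 * Under the hood this ends up in io_uring_enter(2) with
 * IORING_ENTER_GETEVENTS and min_complete == 1, which is the path that
 * reaches this function.
 */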
6953
Jens Axboe6b063142019-01-10 22:13:58 -07006954static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
6955{
6956#if defined(CONFIG_UNIX)
6957 if (ctx->ring_sock) {
6958 struct sock *sock = ctx->ring_sock->sk;
6959 struct sk_buff *skb;
6960
6961 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
6962 kfree_skb(skb);
6963 }
6964#else
6965 int i;
6966
Jens Axboe65e19f52019-10-26 07:20:21 -06006967 for (i = 0; i < ctx->nr_user_files; i++) {
6968 struct file *file;
6969
6970 file = io_file_from_index(ctx, i);
6971 if (file)
6972 fput(file);
6973 }
Jens Axboe6b063142019-01-10 22:13:58 -07006974#endif
6975}
6976
Jens Axboe05f3fb32019-12-09 11:22:50 -07006977static void io_file_ref_kill(struct percpu_ref *ref)
6978{
6979 struct fixed_file_data *data;
6980
6981 data = container_of(ref, struct fixed_file_data, refs);
6982 complete(&data->done);
6983}
6984
Jens Axboe6b063142019-01-10 22:13:58 -07006985static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
6986{
Jens Axboe05f3fb32019-12-09 11:22:50 -07006987 struct fixed_file_data *data = ctx->file_data;
Xiaoguang Wang05589552020-03-31 14:05:18 +08006988 struct fixed_file_ref_node *ref_node = NULL;
Jens Axboe65e19f52019-10-26 07:20:21 -06006989 unsigned nr_tables, i;
6990
Jens Axboe05f3fb32019-12-09 11:22:50 -07006991 if (!data)
Jens Axboe6b063142019-01-10 22:13:58 -07006992 return -ENXIO;
6993
Jens Axboe6a4d07c2020-05-15 14:30:38 -06006994 spin_lock(&data->lock);
Xiaoguang Wang05589552020-03-31 14:05:18 +08006995 if (!list_empty(&data->ref_list))
6996 ref_node = list_first_entry(&data->ref_list,
6997 struct fixed_file_ref_node, node);
Jens Axboe6a4d07c2020-05-15 14:30:38 -06006998 spin_unlock(&data->lock);
Xiaoguang Wang05589552020-03-31 14:05:18 +08006999 if (ref_node)
7000 percpu_ref_kill(&ref_node->refs);
7001
7002 percpu_ref_kill(&data->refs);
7003
7004 /* wait for all refs nodes to complete */
Jens Axboe4a38aed22020-05-14 17:21:15 -06007005 flush_delayed_work(&ctx->file_put_work);
Jens Axboe2faf8522020-02-04 19:54:55 -07007006 wait_for_completion(&data->done);
Jens Axboe05f3fb32019-12-09 11:22:50 -07007007
Jens Axboe6b063142019-01-10 22:13:58 -07007008 __io_sqe_files_unregister(ctx);
Jens Axboe65e19f52019-10-26 07:20:21 -06007009 nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
7010 for (i = 0; i < nr_tables; i++)
Jens Axboe05f3fb32019-12-09 11:22:50 -07007011 kfree(data->table[i].files);
7012 kfree(data->table);
Xiaoguang Wang05589552020-03-31 14:05:18 +08007013 percpu_ref_exit(&data->refs);
7014 kfree(data);
Jens Axboe05f3fb32019-12-09 11:22:50 -07007015 ctx->file_data = NULL;
Jens Axboe6b063142019-01-10 22:13:58 -07007016 ctx->nr_user_files = 0;
7017 return 0;
7018}
7019
Jens Axboe534ca6d2020-09-02 13:52:19 -06007020static void io_put_sq_data(struct io_sq_data *sqd)
Jens Axboe6c271ce2019-01-10 11:22:30 -07007021{
Jens Axboe534ca6d2020-09-02 13:52:19 -06007022 if (refcount_dec_and_test(&sqd->refs)) {
Roman Penyaev2bbcd6d2019-05-16 10:53:57 +02007023 /*
7024		 * The park is a bit of a work-around; without it we get
7025 * warning spews on shutdown with SQPOLL set and affinity
7026 * set to a single CPU.
7027 */
Jens Axboe534ca6d2020-09-02 13:52:19 -06007028 if (sqd->thread) {
7029 kthread_park(sqd->thread);
7030 kthread_stop(sqd->thread);
7031 }
7032
7033 kfree(sqd);
7034 }
7035}
7036
Jens Axboeaa061652020-09-02 14:50:27 -06007037static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
7038{
7039 struct io_ring_ctx *ctx_attach;
7040 struct io_sq_data *sqd;
7041 struct fd f;
7042
7043 f = fdget(p->wq_fd);
7044 if (!f.file)
7045 return ERR_PTR(-ENXIO);
7046 if (f.file->f_op != &io_uring_fops) {
7047 fdput(f);
7048 return ERR_PTR(-EINVAL);
7049 }
7050
7051 ctx_attach = f.file->private_data;
7052 sqd = ctx_attach->sq_data;
7053 if (!sqd) {
7054 fdput(f);
7055 return ERR_PTR(-EINVAL);
7056 }
7057
7058 refcount_inc(&sqd->refs);
7059 fdput(f);
7060 return sqd;
7061}
7062
Jens Axboe534ca6d2020-09-02 13:52:19 -06007063static struct io_sq_data *io_get_sq_data(struct io_uring_params *p)
7064{
7065 struct io_sq_data *sqd;
7066
Jens Axboeaa061652020-09-02 14:50:27 -06007067 if (p->flags & IORING_SETUP_ATTACH_WQ)
7068 return io_attach_sq_data(p);
7069
Jens Axboe534ca6d2020-09-02 13:52:19 -06007070 sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
7071 if (!sqd)
7072 return ERR_PTR(-ENOMEM);
7073
7074 refcount_set(&sqd->refs, 1);
Jens Axboe69fb2132020-09-14 11:16:23 -06007075 INIT_LIST_HEAD(&sqd->ctx_list);
7076 INIT_LIST_HEAD(&sqd->ctx_new_list);
7077 mutex_init(&sqd->ctx_lock);
7078 mutex_init(&sqd->lock);
Jens Axboe534ca6d2020-09-02 13:52:19 -06007079 init_waitqueue_head(&sqd->wait);
7080 return sqd;
7081}
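/*
 * The attach path above is what IORING_SETUP_ATTACH_WQ uses to share one
 * SQPOLL thread between rings.  Illustrative liburing sketch, assuming
 * 'first' is an already created SQPOLL ring and that io_uring_params has a
 * wq_fd field as in the uapi:
 *
 *        struct io_uring_params p = {};
 *        struct io_uring second;
 *
 *        p.flags = IORING_SETUP_SQPOLL | IORING_SETUP_ATTACH_WQ;
 *        p.wq_fd = first.ring_fd;
 *        io_uring_queue_init_params(64, &second, &p);
 *
 * Both rings then end up on the same io_sq_data, and __io_sq_thread() caps
 * per-ring submissions for fairness once the ctx_list is no longer singular.
 */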
7082
Jens Axboe69fb2132020-09-14 11:16:23 -06007083static void io_sq_thread_unpark(struct io_sq_data *sqd)
7084 __releases(&sqd->lock)
7085{
7086 if (!sqd->thread)
7087 return;
7088 kthread_unpark(sqd->thread);
7089 mutex_unlock(&sqd->lock);
7090}
7091
7092static void io_sq_thread_park(struct io_sq_data *sqd)
7093 __acquires(&sqd->lock)
7094{
7095 if (!sqd->thread)
7096 return;
7097 mutex_lock(&sqd->lock);
7098 kthread_park(sqd->thread);
7099}
7100
Jens Axboe534ca6d2020-09-02 13:52:19 -06007101static void io_sq_thread_stop(struct io_ring_ctx *ctx)
7102{
7103 struct io_sq_data *sqd = ctx->sq_data;
7104
7105 if (sqd) {
7106 if (sqd->thread) {
7107 /*
7108 * We may arrive here from the error branch in
7109 * io_sq_offload_create() where the kthread is created
7110			 * without being woken up, thus wake it up now to make
7111 * sure the wait will complete.
7112 */
7113 wake_up_process(sqd->thread);
7114 wait_for_completion(&ctx->sq_thread_comp);
Jens Axboe69fb2132020-09-14 11:16:23 -06007115
7116 io_sq_thread_park(sqd);
7117 }
7118
7119 mutex_lock(&sqd->ctx_lock);
7120 list_del(&ctx->sqd_list);
7121 mutex_unlock(&sqd->ctx_lock);
7122
7123 if (sqd->thread) {
7124 finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
7125 io_sq_thread_unpark(sqd);
Jens Axboe534ca6d2020-09-02 13:52:19 -06007126 }
7127
7128 io_put_sq_data(sqd);
7129 ctx->sq_data = NULL;
Jens Axboe6c271ce2019-01-10 11:22:30 -07007130 }
7131}
7132
Jens Axboe6b063142019-01-10 22:13:58 -07007133static void io_finish_async(struct io_ring_ctx *ctx)
7134{
Jens Axboe6c271ce2019-01-10 11:22:30 -07007135 io_sq_thread_stop(ctx);
7136
Jens Axboe561fb042019-10-24 07:25:42 -06007137 if (ctx->io_wq) {
7138 io_wq_destroy(ctx->io_wq);
7139 ctx->io_wq = NULL;
Jens Axboe6b063142019-01-10 22:13:58 -07007140 }
7141}
7142
7143#if defined(CONFIG_UNIX)
Jens Axboe6b063142019-01-10 22:13:58 -07007144/*
7145 * Ensure the UNIX gc is aware of our file set, so we are certain that
7146 * the io_uring can be safely unregistered on process exit, even if we have
7147 * loops in the file referencing.
7148 */
7149static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
7150{
7151 struct sock *sk = ctx->ring_sock->sk;
7152 struct scm_fp_list *fpl;
7153 struct sk_buff *skb;
Jens Axboe08a45172019-10-03 08:11:03 -06007154 int i, nr_files;
Jens Axboe6b063142019-01-10 22:13:58 -07007155
Jens Axboe6b063142019-01-10 22:13:58 -07007156 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
7157 if (!fpl)
7158 return -ENOMEM;
7159
7160 skb = alloc_skb(0, GFP_KERNEL);
7161 if (!skb) {
7162 kfree(fpl);
7163 return -ENOMEM;
7164 }
7165
7166 skb->sk = sk;
Jens Axboe6b063142019-01-10 22:13:58 -07007167
Jens Axboe08a45172019-10-03 08:11:03 -06007168 nr_files = 0;
Jens Axboe6b063142019-01-10 22:13:58 -07007169 fpl->user = get_uid(ctx->user);
7170 for (i = 0; i < nr; i++) {
Jens Axboe65e19f52019-10-26 07:20:21 -06007171 struct file *file = io_file_from_index(ctx, i + offset);
7172
7173 if (!file)
Jens Axboe08a45172019-10-03 08:11:03 -06007174 continue;
Jens Axboe65e19f52019-10-26 07:20:21 -06007175 fpl->fp[nr_files] = get_file(file);
Jens Axboe08a45172019-10-03 08:11:03 -06007176 unix_inflight(fpl->user, fpl->fp[nr_files]);
7177 nr_files++;
Jens Axboe6b063142019-01-10 22:13:58 -07007178 }
7179
Jens Axboe08a45172019-10-03 08:11:03 -06007180 if (nr_files) {
7181 fpl->max = SCM_MAX_FD;
7182 fpl->count = nr_files;
7183 UNIXCB(skb).fp = fpl;
Jens Axboe05f3fb32019-12-09 11:22:50 -07007184 skb->destructor = unix_destruct_scm;
Jens Axboe08a45172019-10-03 08:11:03 -06007185 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
7186 skb_queue_head(&sk->sk_receive_queue, skb);
Jens Axboe6b063142019-01-10 22:13:58 -07007187
Jens Axboe08a45172019-10-03 08:11:03 -06007188 for (i = 0; i < nr_files; i++)
7189 fput(fpl->fp[i]);
7190 } else {
7191 kfree_skb(skb);
7192 kfree(fpl);
7193 }
Jens Axboe6b063142019-01-10 22:13:58 -07007194
7195 return 0;
7196}
7197
7198/*
7199 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
7200 * causes regular reference counting to break down. We rely on the UNIX
7201 * garbage collection to take care of this problem for us.
7202 */
7203static int io_sqe_files_scm(struct io_ring_ctx *ctx)
7204{
7205 unsigned left, total;
7206 int ret = 0;
7207
7208 total = 0;
7209 left = ctx->nr_user_files;
7210 while (left) {
7211 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
Jens Axboe6b063142019-01-10 22:13:58 -07007212
7213 ret = __io_sqe_files_scm(ctx, this_files, total);
7214 if (ret)
7215 break;
7216 left -= this_files;
7217 total += this_files;
7218 }
7219
7220 if (!ret)
7221 return 0;
7222
7223 while (total < ctx->nr_user_files) {
Jens Axboe65e19f52019-10-26 07:20:21 -06007224 struct file *file = io_file_from_index(ctx, total);
7225
7226 if (file)
7227 fput(file);
Jens Axboe6b063142019-01-10 22:13:58 -07007228 total++;
7229 }
7230
7231 return ret;
7232}
7233#else
7234static int io_sqe_files_scm(struct io_ring_ctx *ctx)
7235{
7236 return 0;
7237}
7238#endif
7239
Jens Axboe65e19f52019-10-26 07:20:21 -06007240static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
7241 unsigned nr_files)
7242{
7243 int i;
7244
7245 for (i = 0; i < nr_tables; i++) {
Jens Axboe05f3fb32019-12-09 11:22:50 -07007246 struct fixed_file_table *table = &ctx->file_data->table[i];
Jens Axboe65e19f52019-10-26 07:20:21 -06007247 unsigned this_files;
7248
7249 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
7250 table->files = kcalloc(this_files, sizeof(struct file *),
7251 GFP_KERNEL);
7252 if (!table->files)
7253 break;
7254 nr_files -= this_files;
7255 }
7256
7257 if (i == nr_tables)
7258 return 0;
7259
7260 for (i = 0; i < nr_tables; i++) {
Jens Axboe05f3fb32019-12-09 11:22:50 -07007261 struct fixed_file_table *table = &ctx->file_data->table[i];
Jens Axboe65e19f52019-10-26 07:20:21 -06007262 kfree(table->files);
7263 }
7264 return 1;
7265}
7266
Jens Axboe05f3fb32019-12-09 11:22:50 -07007267static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
Jens Axboec3a31e62019-10-03 13:59:56 -06007268{
7269#if defined(CONFIG_UNIX)
Jens Axboec3a31e62019-10-03 13:59:56 -06007270 struct sock *sock = ctx->ring_sock->sk;
7271 struct sk_buff_head list, *head = &sock->sk_receive_queue;
7272 struct sk_buff *skb;
7273 int i;
7274
7275 __skb_queue_head_init(&list);
7276
7277 /*
7278 * Find the skb that holds this file in its SCM_RIGHTS. When found,
7279 * remove this entry and rearrange the file array.
7280 */
7281 skb = skb_dequeue(head);
7282 while (skb) {
7283 struct scm_fp_list *fp;
7284
7285 fp = UNIXCB(skb).fp;
7286 for (i = 0; i < fp->count; i++) {
7287 int left;
7288
7289 if (fp->fp[i] != file)
7290 continue;
7291
7292 unix_notinflight(fp->user, fp->fp[i]);
7293 left = fp->count - 1 - i;
7294 if (left) {
7295 memmove(&fp->fp[i], &fp->fp[i + 1],
7296 left * sizeof(struct file *));
7297 }
7298 fp->count--;
7299 if (!fp->count) {
7300 kfree_skb(skb);
7301 skb = NULL;
7302 } else {
7303 __skb_queue_tail(&list, skb);
7304 }
7305 fput(file);
7306 file = NULL;
7307 break;
7308 }
7309
7310 if (!file)
7311 break;
7312
7313 __skb_queue_tail(&list, skb);
7314
7315 skb = skb_dequeue(head);
7316 }
7317
7318 if (skb_peek(&list)) {
7319 spin_lock_irq(&head->lock);
7320 while ((skb = __skb_dequeue(&list)) != NULL)
7321 __skb_queue_tail(head, skb);
7322 spin_unlock_irq(&head->lock);
7323 }
7324#else
Jens Axboe05f3fb32019-12-09 11:22:50 -07007325 fput(file);
Jens Axboec3a31e62019-10-03 13:59:56 -06007326#endif
7327}
7328
Jens Axboe05f3fb32019-12-09 11:22:50 -07007329struct io_file_put {
Xiaoguang Wang05589552020-03-31 14:05:18 +08007330 struct list_head list;
Jens Axboe05f3fb32019-12-09 11:22:50 -07007331 struct file *file;
Jens Axboe05f3fb32019-12-09 11:22:50 -07007332};
7333
Jens Axboe4a38aed22020-05-14 17:21:15 -06007334static void __io_file_put_work(struct fixed_file_ref_node *ref_node)
Jens Axboe05f3fb32019-12-09 11:22:50 -07007335{
Jens Axboe4a38aed22020-05-14 17:21:15 -06007336 struct fixed_file_data *file_data = ref_node->file_data;
7337 struct io_ring_ctx *ctx = file_data->ctx;
Jens Axboe05f3fb32019-12-09 11:22:50 -07007338 struct io_file_put *pfile, *tmp;
Xiaoguang Wang05589552020-03-31 14:05:18 +08007339
7340 list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) {
Jens Axboe6a4d07c2020-05-15 14:30:38 -06007341 list_del(&pfile->list);
Xiaoguang Wang05589552020-03-31 14:05:18 +08007342 io_ring_file_put(ctx, pfile->file);
7343 kfree(pfile);
Jens Axboe05f3fb32019-12-09 11:22:50 -07007344 }
7345
Jens Axboe6a4d07c2020-05-15 14:30:38 -06007346 spin_lock(&file_data->lock);
7347 list_del(&ref_node->node);
7348 spin_unlock(&file_data->lock);
Jens Axboe2faf8522020-02-04 19:54:55 -07007349
Xiaoguang Wang05589552020-03-31 14:05:18 +08007350 percpu_ref_exit(&ref_node->refs);
7351 kfree(ref_node);
7352 percpu_ref_put(&file_data->refs);
Jens Axboe05f3fb32019-12-09 11:22:50 -07007353}
7354
Jens Axboe4a38aed22020-05-14 17:21:15 -06007355static void io_file_put_work(struct work_struct *work)
7356{
7357 struct io_ring_ctx *ctx;
7358 struct llist_node *node;
7359
7360 ctx = container_of(work, struct io_ring_ctx, file_put_work.work);
7361 node = llist_del_all(&ctx->file_put_llist);
7362
7363 while (node) {
7364 struct fixed_file_ref_node *ref_node;
7365 struct llist_node *next = node->next;
7366
7367 ref_node = llist_entry(node, struct fixed_file_ref_node, llist);
7368 __io_file_put_work(ref_node);
7369 node = next;
7370 }
7371}
7372
Jens Axboe05f3fb32019-12-09 11:22:50 -07007373static void io_file_data_ref_zero(struct percpu_ref *ref)
7374{
Xiaoguang Wang05589552020-03-31 14:05:18 +08007375 struct fixed_file_ref_node *ref_node;
Jens Axboe4a38aed22020-05-14 17:21:15 -06007376 struct io_ring_ctx *ctx;
7377 bool first_add;
7378 int delay = HZ;
Jens Axboe05f3fb32019-12-09 11:22:50 -07007379
Xiaoguang Wang05589552020-03-31 14:05:18 +08007380 ref_node = container_of(ref, struct fixed_file_ref_node, refs);
Jens Axboe4a38aed22020-05-14 17:21:15 -06007381 ctx = ref_node->file_data->ctx;
Jens Axboe05f3fb32019-12-09 11:22:50 -07007382
Jens Axboe4a38aed22020-05-14 17:21:15 -06007383 if (percpu_ref_is_dying(&ctx->file_data->refs))
7384 delay = 0;
7385
7386 first_add = llist_add(&ref_node->llist, &ctx->file_put_llist);
7387 if (!delay)
7388 mod_delayed_work(system_wq, &ctx->file_put_work, 0);
7389 else if (first_add)
7390 queue_delayed_work(system_wq, &ctx->file_put_work, delay);
Xiaoguang Wang05589552020-03-31 14:05:18 +08007391}
7392
7393static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
7394 struct io_ring_ctx *ctx)
7395{
7396 struct fixed_file_ref_node *ref_node;
7397
7398 ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
7399 if (!ref_node)
7400 return ERR_PTR(-ENOMEM);
7401
7402 if (percpu_ref_init(&ref_node->refs, io_file_data_ref_zero,
7403 0, GFP_KERNEL)) {
7404 kfree(ref_node);
7405 return ERR_PTR(-ENOMEM);
7406 }
7407 INIT_LIST_HEAD(&ref_node->node);
7408 INIT_LIST_HEAD(&ref_node->file_list);
Xiaoguang Wang05589552020-03-31 14:05:18 +08007409 ref_node->file_data = ctx->file_data;
7410 return ref_node;
Xiaoguang Wang05589552020-03-31 14:05:18 +08007411}
7412
7413static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node)
7414{
7415 percpu_ref_exit(&ref_node->refs);
7416 kfree(ref_node);
Jens Axboe05f3fb32019-12-09 11:22:50 -07007417}
7418
7419static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
7420 unsigned nr_args)
7421{
7422 __s32 __user *fds = (__s32 __user *) arg;
7423 unsigned nr_tables;
7424 struct file *file;
7425 int fd, ret = 0;
7426 unsigned i;
Xiaoguang Wang05589552020-03-31 14:05:18 +08007427 struct fixed_file_ref_node *ref_node;
Jens Axboe05f3fb32019-12-09 11:22:50 -07007428
7429 if (ctx->file_data)
7430 return -EBUSY;
7431 if (!nr_args)
7432 return -EINVAL;
7433 if (nr_args > IORING_MAX_FIXED_FILES)
7434 return -EMFILE;
7435
7436 ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
7437 if (!ctx->file_data)
7438 return -ENOMEM;
7439 ctx->file_data->ctx = ctx;
7440 init_completion(&ctx->file_data->done);
Xiaoguang Wang05589552020-03-31 14:05:18 +08007441 INIT_LIST_HEAD(&ctx->file_data->ref_list);
Xiaoguang Wangf7fe9342020-04-07 20:02:31 +08007442 spin_lock_init(&ctx->file_data->lock);
Jens Axboe05f3fb32019-12-09 11:22:50 -07007443
7444 nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
7445 ctx->file_data->table = kcalloc(nr_tables,
7446 sizeof(struct fixed_file_table),
7447 GFP_KERNEL);
7448 if (!ctx->file_data->table) {
7449 kfree(ctx->file_data);
7450 ctx->file_data = NULL;
7451 return -ENOMEM;
7452 }
7453
Xiaoguang Wang05589552020-03-31 14:05:18 +08007454 if (percpu_ref_init(&ctx->file_data->refs, io_file_ref_kill,
Jens Axboe05f3fb32019-12-09 11:22:50 -07007455 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
7456 kfree(ctx->file_data->table);
7457 kfree(ctx->file_data);
7458 ctx->file_data = NULL;
7459 return -ENOMEM;
7460 }
Jens Axboe05f3fb32019-12-09 11:22:50 -07007461
7462 if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
7463 percpu_ref_exit(&ctx->file_data->refs);
7464 kfree(ctx->file_data->table);
7465 kfree(ctx->file_data);
7466 ctx->file_data = NULL;
7467 return -ENOMEM;
7468 }
7469
7470 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
7471 struct fixed_file_table *table;
7472 unsigned index;
7473
7474 ret = -EFAULT;
7475 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
7476 break;
7477 /* allow sparse sets */
7478 if (fd == -1) {
7479 ret = 0;
7480 continue;
7481 }
7482
7483 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
7484 index = i & IORING_FILE_TABLE_MASK;
7485 file = fget(fd);
7486
7487 ret = -EBADF;
7488 if (!file)
7489 break;
7490
7491 /*
7492 * Don't allow io_uring instances to be registered. If UNIX
7493 * isn't enabled, then this causes a reference cycle and this
7494 * instance can never get freed. If UNIX is enabled we'll
7495 * handle it just fine, but there's still no point in allowing
7496 * a ring fd as it doesn't support regular read/write anyway.
7497 */
7498 if (file->f_op == &io_uring_fops) {
7499 fput(file);
7500 break;
7501 }
7502 ret = 0;
7503 table->files[index] = file;
7504 }
7505
7506 if (ret) {
7507 for (i = 0; i < ctx->nr_user_files; i++) {
7508 file = io_file_from_index(ctx, i);
7509 if (file)
7510 fput(file);
7511 }
7512 for (i = 0; i < nr_tables; i++)
7513 kfree(ctx->file_data->table[i].files);
7514
Yang Yingliang667e57d2020-07-10 14:14:20 +00007515 percpu_ref_exit(&ctx->file_data->refs);
Jens Axboe05f3fb32019-12-09 11:22:50 -07007516 kfree(ctx->file_data->table);
7517 kfree(ctx->file_data);
7518 ctx->file_data = NULL;
7519 ctx->nr_user_files = 0;
7520 return ret;
7521 }
7522
7523 ret = io_sqe_files_scm(ctx);
Xiaoguang Wang05589552020-03-31 14:05:18 +08007524 if (ret) {
Jens Axboe05f3fb32019-12-09 11:22:50 -07007525 io_sqe_files_unregister(ctx);
Xiaoguang Wang05589552020-03-31 14:05:18 +08007526 return ret;
7527 }
Jens Axboe05f3fb32019-12-09 11:22:50 -07007528
Xiaoguang Wang05589552020-03-31 14:05:18 +08007529 ref_node = alloc_fixed_file_ref_node(ctx);
7530 if (IS_ERR(ref_node)) {
7531 io_sqe_files_unregister(ctx);
7532 return PTR_ERR(ref_node);
7533 }
7534
7535 ctx->file_data->cur_refs = &ref_node->refs;
Jens Axboe6a4d07c2020-05-15 14:30:38 -06007536 spin_lock(&ctx->file_data->lock);
Xiaoguang Wang05589552020-03-31 14:05:18 +08007537 list_add(&ref_node->node, &ctx->file_data->ref_list);
Jens Axboe6a4d07c2020-05-15 14:30:38 -06007538 spin_unlock(&ctx->file_data->lock);
Xiaoguang Wang05589552020-03-31 14:05:18 +08007539 percpu_ref_get(&ctx->file_data->refs);
Jens Axboe05f3fb32019-12-09 11:22:50 -07007540 return ret;
7541}
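/*
 * Userspace counterpart of io_sqe_files_register() (illustrative liburing
 * sketch): register a fixed file table once, then reference entries by index
 * with IOSQE_FIXED_FILE instead of passing a real fd on every SQE.  A -1
 * entry leaves a sparse slot, as the loop above allows.
 *
 *        int fds[2] = { open("a", O_RDONLY), -1 };
 *
 *        io_uring_register_files(&ring, fds, 2);
 *
 *        sqe = io_uring_get_sqe(&ring);
 *        io_uring_prep_readv(sqe, 0, &iov, 1, 0);    // 0 is a table index here
 *        sqe->flags |= IOSQE_FIXED_FILE;
 *        io_uring_submit(&ring);
 */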
7542
Jens Axboec3a31e62019-10-03 13:59:56 -06007543static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
7544 int index)
7545{
7546#if defined(CONFIG_UNIX)
7547 struct sock *sock = ctx->ring_sock->sk;
7548 struct sk_buff_head *head = &sock->sk_receive_queue;
7549 struct sk_buff *skb;
7550
7551 /*
7552 * See if we can merge this file into an existing skb SCM_RIGHTS
7553 * file set. If there's no room, fall back to allocating a new skb
7554 * and filling it in.
7555 */
7556 spin_lock_irq(&head->lock);
7557 skb = skb_peek(head);
7558 if (skb) {
7559 struct scm_fp_list *fpl = UNIXCB(skb).fp;
7560
7561 if (fpl->count < SCM_MAX_FD) {
7562 __skb_unlink(skb, head);
7563 spin_unlock_irq(&head->lock);
7564 fpl->fp[fpl->count] = get_file(file);
7565 unix_inflight(fpl->user, fpl->fp[fpl->count]);
7566 fpl->count++;
7567 spin_lock_irq(&head->lock);
7568 __skb_queue_head(head, skb);
7569 } else {
7570 skb = NULL;
7571 }
7572 }
7573 spin_unlock_irq(&head->lock);
7574
7575 if (skb) {
7576 fput(file);
7577 return 0;
7578 }
7579
7580 return __io_sqe_files_scm(ctx, 1, index);
7581#else
7582 return 0;
7583#endif
7584}
7585
Hillf Dantona5318d32020-03-23 17:47:15 +08007586static int io_queue_file_removal(struct fixed_file_data *data,
Xiaoguang Wang05589552020-03-31 14:05:18 +08007587 struct file *file)
Jens Axboe05f3fb32019-12-09 11:22:50 -07007588{
Hillf Dantona5318d32020-03-23 17:47:15 +08007589 struct io_file_put *pfile;
Xiaoguang Wang05589552020-03-31 14:05:18 +08007590 struct percpu_ref *refs = data->cur_refs;
7591 struct fixed_file_ref_node *ref_node;
Jens Axboe05f3fb32019-12-09 11:22:50 -07007592
Jens Axboe05f3fb32019-12-09 11:22:50 -07007593 pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
Hillf Dantona5318d32020-03-23 17:47:15 +08007594 if (!pfile)
7595 return -ENOMEM;
Jens Axboe05f3fb32019-12-09 11:22:50 -07007596
Xiaoguang Wang05589552020-03-31 14:05:18 +08007597 ref_node = container_of(refs, struct fixed_file_ref_node, refs);
Jens Axboe05f3fb32019-12-09 11:22:50 -07007598 pfile->file = file;
Xiaoguang Wang05589552020-03-31 14:05:18 +08007599 list_add(&pfile->list, &ref_node->file_list);
7600
Hillf Dantona5318d32020-03-23 17:47:15 +08007601 return 0;
Jens Axboe05f3fb32019-12-09 11:22:50 -07007602}
7603
7604static int __io_sqe_files_update(struct io_ring_ctx *ctx,
7605 struct io_uring_files_update *up,
7606 unsigned nr_args)
7607{
7608 struct fixed_file_data *data = ctx->file_data;
Xiaoguang Wang05589552020-03-31 14:05:18 +08007609 struct fixed_file_ref_node *ref_node;
Jens Axboe05f3fb32019-12-09 11:22:50 -07007610 struct file *file;
Jens Axboec3a31e62019-10-03 13:59:56 -06007611 __s32 __user *fds;
7612 int fd, i, err;
7613 __u32 done;
Xiaoguang Wang05589552020-03-31 14:05:18 +08007614 bool needs_switch = false;
Jens Axboec3a31e62019-10-03 13:59:56 -06007615
Jens Axboe05f3fb32019-12-09 11:22:50 -07007616 if (check_add_overflow(up->offset, nr_args, &done))
Jens Axboec3a31e62019-10-03 13:59:56 -06007617 return -EOVERFLOW;
7618 if (done > ctx->nr_user_files)
7619 return -EINVAL;
7620
Xiaoguang Wang05589552020-03-31 14:05:18 +08007621 ref_node = alloc_fixed_file_ref_node(ctx);
7622 if (IS_ERR(ref_node))
7623 return PTR_ERR(ref_node);
7624
Jens Axboec3a31e62019-10-03 13:59:56 -06007625 done = 0;
Jens Axboe05f3fb32019-12-09 11:22:50 -07007626 fds = u64_to_user_ptr(up->fds);
Jens Axboec3a31e62019-10-03 13:59:56 -06007627 while (nr_args) {
Jens Axboe65e19f52019-10-26 07:20:21 -06007628 struct fixed_file_table *table;
7629 unsigned index;
7630
Jens Axboec3a31e62019-10-03 13:59:56 -06007631 err = 0;
7632 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
7633 err = -EFAULT;
7634 break;
7635 }
Jens Axboe05f3fb32019-12-09 11:22:50 -07007636 i = array_index_nospec(up->offset, ctx->nr_user_files);
7637 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
Jens Axboe65e19f52019-10-26 07:20:21 -06007638 index = i & IORING_FILE_TABLE_MASK;
7639 if (table->files[index]) {
Jiufei Xue98dfd502020-09-01 13:35:02 +08007640 file = table->files[index];
Hillf Dantona5318d32020-03-23 17:47:15 +08007641 err = io_queue_file_removal(data, file);
7642 if (err)
7643 break;
Jens Axboe65e19f52019-10-26 07:20:21 -06007644 table->files[index] = NULL;
Xiaoguang Wang05589552020-03-31 14:05:18 +08007645 needs_switch = true;
Jens Axboec3a31e62019-10-03 13:59:56 -06007646 }
7647 if (fd != -1) {
Jens Axboec3a31e62019-10-03 13:59:56 -06007648 file = fget(fd);
7649 if (!file) {
7650 err = -EBADF;
7651 break;
7652 }
7653 /*
7654 * Don't allow io_uring instances to be registered. If
7655 * UNIX isn't enabled, then this causes a reference
7656 * cycle and this instance can never get freed. If UNIX
7657 * is enabled we'll handle it just fine, but there's
7658 * still no point in allowing a ring fd as it doesn't
7659 * support regular read/write anyway.
7660 */
7661 if (file->f_op == &io_uring_fops) {
7662 fput(file);
7663 err = -EBADF;
7664 break;
7665 }
Jens Axboe65e19f52019-10-26 07:20:21 -06007666 table->files[index] = file;
Jens Axboec3a31e62019-10-03 13:59:56 -06007667 err = io_sqe_file_register(ctx, file, i);
Yang Yingliangf3bd9da2020-07-09 10:11:41 +00007668 if (err) {
Jiufei Xue95d1c8e2020-09-02 17:59:39 +08007669 table->files[index] = NULL;
Yang Yingliangf3bd9da2020-07-09 10:11:41 +00007670 fput(file);
Jens Axboec3a31e62019-10-03 13:59:56 -06007671 break;
Yang Yingliangf3bd9da2020-07-09 10:11:41 +00007672 }
Jens Axboec3a31e62019-10-03 13:59:56 -06007673 }
7674 nr_args--;
7675 done++;
Jens Axboe05f3fb32019-12-09 11:22:50 -07007676 up->offset++;
7677 }
7678
Xiaoguang Wang05589552020-03-31 14:05:18 +08007679 if (needs_switch) {
7680 percpu_ref_kill(data->cur_refs);
Jens Axboe6a4d07c2020-05-15 14:30:38 -06007681 spin_lock(&data->lock);
Xiaoguang Wang05589552020-03-31 14:05:18 +08007682 list_add(&ref_node->node, &data->ref_list);
7683 data->cur_refs = &ref_node->refs;
Jens Axboe6a4d07c2020-05-15 14:30:38 -06007684 spin_unlock(&data->lock);
Xiaoguang Wang05589552020-03-31 14:05:18 +08007685 percpu_ref_get(&ctx->file_data->refs);
7686 } else
7687 destroy_fixed_file_ref_node(ref_node);
Jens Axboec3a31e62019-10-03 13:59:56 -06007688
7689 return done ? done : err;
7690}
Xiaoguang Wang05589552020-03-31 14:05:18 +08007691
Jens Axboe05f3fb32019-12-09 11:22:50 -07007692static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
7693 unsigned nr_args)
7694{
7695 struct io_uring_files_update up;
7696
7697 if (!ctx->file_data)
7698 return -ENXIO;
7699 if (!nr_args)
7700 return -EINVAL;
7701 if (copy_from_user(&up, arg, sizeof(up)))
7702 return -EFAULT;
7703 if (up.resv)
7704 return -EINVAL;
7705
7706 return __io_sqe_files_update(ctx, &up, nr_args);
7707}
Jens Axboec3a31e62019-10-03 13:59:56 -06007708
Pavel Begunkove9fd9392020-03-04 16:14:12 +03007709static void io_free_work(struct io_wq_work *work)
Jens Axboe7d723062019-11-12 22:31:31 -07007710{
7711 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
7712
Pavel Begunkove9fd9392020-03-04 16:14:12 +03007713 /* Consider that io_steal_work() relies on this ref */
Jens Axboe7d723062019-11-12 22:31:31 -07007714 io_put_req(req);
7715}
7716
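/*
 * Summary of the setup below: create the async offload workqueue. By
 * default a new io-wq is created with concurrency
 * min(SQ entries, 4 * online CPUs); if IORING_SETUP_ATTACH_WQ is set, the
 * io-wq of the ring referred to by p->wq_fd is shared instead.
 */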
Pavel Begunkov24369c22020-01-28 03:15:48 +03007717static int io_init_wq_offload(struct io_ring_ctx *ctx,
7718 struct io_uring_params *p)
7719{
7720 struct io_wq_data data;
7721 struct fd f;
7722 struct io_ring_ctx *ctx_attach;
7723 unsigned int concurrency;
7724 int ret = 0;
7725
7726 data.user = ctx->user;
Pavel Begunkove9fd9392020-03-04 16:14:12 +03007727 data.free_work = io_free_work;
Pavel Begunkovf5fa38c2020-06-08 21:08:20 +03007728 data.do_work = io_wq_submit_work;
Pavel Begunkov24369c22020-01-28 03:15:48 +03007729
7730 if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
7731		/* Do QD, or 4 * CPUS, whichever is smaller */
7732 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
7733
7734 ctx->io_wq = io_wq_create(concurrency, &data);
7735 if (IS_ERR(ctx->io_wq)) {
7736 ret = PTR_ERR(ctx->io_wq);
7737 ctx->io_wq = NULL;
7738 }
7739 return ret;
7740 }
7741
7742 f = fdget(p->wq_fd);
7743 if (!f.file)
7744 return -EBADF;
7745
7746 if (f.file->f_op != &io_uring_fops) {
7747 ret = -EINVAL;
7748 goto out_fput;
7749 }
7750
7751 ctx_attach = f.file->private_data;
7752 /* @io_wq is protected by holding the fd */
7753 if (!io_wq_get(ctx_attach->io_wq, &data)) {
7754 ret = -EINVAL;
7755 goto out_fput;
7756 }
7757
7758 ctx->io_wq = ctx_attach->io_wq;
7759out_fput:
7760 fdput(f);
7761 return ret;
7762}
7763
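/*
 * Editor's note: the function below allocates the per-task io_uring
 * state, i.e. an xarray of ring files the task has used (for cancelation
 * on exit) plus issue/complete counters used to detect when the task has
 * gone idle.
 */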
Jens Axboe0f212202020-09-13 13:09:39 -06007764static int io_uring_alloc_task_context(struct task_struct *task)
7765{
7766 struct io_uring_task *tctx;
7767
7768 tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
7769 if (unlikely(!tctx))
7770 return -ENOMEM;
7771
7772 xa_init(&tctx->xa);
7773 init_waitqueue_head(&tctx->wait);
7774 tctx->last = NULL;
7775 tctx->in_idle = 0;
7776 atomic_long_set(&tctx->req_issue, 0);
7777 atomic_long_set(&tctx->req_complete, 0);
7778 task->io_uring = tctx;
7779 return 0;
7780}
7781
7782void __io_uring_free(struct task_struct *tsk)
7783{
7784 struct io_uring_task *tctx = tsk->io_uring;
7785
7786 WARN_ON_ONCE(!xa_empty(&tctx->xa));
7787 xa_destroy(&tctx->xa);
7788 kfree(tctx);
7789 tsk->io_uring = NULL;
7790}
7791
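/*
 * Summary of the code below: set up the SQPOLL machinery (when
 * IORING_SETUP_SQPOLL is requested, which requires CAP_SYS_ADMIN) and the
 * io-wq offload. The sq_data may be shared with other rings, in which
 * case an already running poll thread is reused instead of spawning a
 * new one.
 */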
Stefano Garzarella7e84e1c2020-08-27 16:58:31 +02007792static int io_sq_offload_create(struct io_ring_ctx *ctx,
7793 struct io_uring_params *p)
Jens Axboe2b188cc2019-01-07 10:46:33 -07007794{
7795 int ret;
7796
Jens Axboe6c271ce2019-01-10 11:22:30 -07007797 if (ctx->flags & IORING_SETUP_SQPOLL) {
Jens Axboe534ca6d2020-09-02 13:52:19 -06007798 struct io_sq_data *sqd;
7799
Jens Axboe3ec482d2019-04-08 10:51:01 -06007800 ret = -EPERM;
7801 if (!capable(CAP_SYS_ADMIN))
7802 goto err;
7803
Jens Axboe534ca6d2020-09-02 13:52:19 -06007804 sqd = io_get_sq_data(p);
7805 if (IS_ERR(sqd)) {
7806 ret = PTR_ERR(sqd);
7807 goto err;
7808 }
Jens Axboe69fb2132020-09-14 11:16:23 -06007809
Jens Axboe534ca6d2020-09-02 13:52:19 -06007810 ctx->sq_data = sqd;
Jens Axboe69fb2132020-09-14 11:16:23 -06007811 io_sq_thread_park(sqd);
7812 mutex_lock(&sqd->ctx_lock);
7813 list_add(&ctx->sqd_list, &sqd->ctx_new_list);
7814 mutex_unlock(&sqd->ctx_lock);
7815 io_sq_thread_unpark(sqd);
Jens Axboe534ca6d2020-09-02 13:52:19 -06007816
Jens Axboe917257d2019-04-13 09:28:55 -06007817 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
7818 if (!ctx->sq_thread_idle)
7819 ctx->sq_thread_idle = HZ;
7820
Jens Axboeaa061652020-09-02 14:50:27 -06007821 if (sqd->thread)
7822 goto done;
7823
Jens Axboe6c271ce2019-01-10 11:22:30 -07007824 if (p->flags & IORING_SETUP_SQ_AFF) {
Jens Axboe44a9bd12019-05-14 20:00:30 -06007825 int cpu = p->sq_thread_cpu;
Jens Axboe6c271ce2019-01-10 11:22:30 -07007826
Jens Axboe917257d2019-04-13 09:28:55 -06007827 ret = -EINVAL;
Jens Axboe44a9bd12019-05-14 20:00:30 -06007828 if (cpu >= nr_cpu_ids)
7829 goto err;
Shenghui Wang7889f442019-05-07 16:03:19 +08007830 if (!cpu_online(cpu))
Jens Axboe917257d2019-04-13 09:28:55 -06007831 goto err;
7832
Jens Axboe69fb2132020-09-14 11:16:23 -06007833 sqd->thread = kthread_create_on_cpu(io_sq_thread, sqd,
Jens Axboe534ca6d2020-09-02 13:52:19 -06007834 cpu, "io_uring-sq");
Jens Axboe6c271ce2019-01-10 11:22:30 -07007835 } else {
Jens Axboe69fb2132020-09-14 11:16:23 -06007836 sqd->thread = kthread_create(io_sq_thread, sqd,
Jens Axboe6c271ce2019-01-10 11:22:30 -07007837 "io_uring-sq");
7838 }
Jens Axboe534ca6d2020-09-02 13:52:19 -06007839 if (IS_ERR(sqd->thread)) {
7840 ret = PTR_ERR(sqd->thread);
7841 sqd->thread = NULL;
Jens Axboe6c271ce2019-01-10 11:22:30 -07007842 goto err;
7843 }
Jens Axboe534ca6d2020-09-02 13:52:19 -06007844 ret = io_uring_alloc_task_context(sqd->thread);
Jens Axboe0f212202020-09-13 13:09:39 -06007845 if (ret)
7846 goto err;
Jens Axboe6c271ce2019-01-10 11:22:30 -07007847 } else if (p->flags & IORING_SETUP_SQ_AFF) {
7848 /* Can't have SQ_AFF without SQPOLL */
7849 ret = -EINVAL;
7850 goto err;
7851 }
7852
Jens Axboeaa061652020-09-02 14:50:27 -06007853done:
Pavel Begunkov24369c22020-01-28 03:15:48 +03007854 ret = io_init_wq_offload(ctx, p);
7855 if (ret)
Jens Axboe2b188cc2019-01-07 10:46:33 -07007856 goto err;
Jens Axboe2b188cc2019-01-07 10:46:33 -07007857
7858 return 0;
7859err:
Jens Axboe54a91f32019-09-10 09:15:04 -06007860 io_finish_async(ctx);
Jens Axboe2b188cc2019-01-07 10:46:33 -07007861 return ret;
7862}
7863
Stefano Garzarella7e84e1c2020-08-27 16:58:31 +02007864static void io_sq_offload_start(struct io_ring_ctx *ctx)
7865{
Jens Axboe534ca6d2020-09-02 13:52:19 -06007866 struct io_sq_data *sqd = ctx->sq_data;
7867
7868 if ((ctx->flags & IORING_SETUP_SQPOLL) && sqd->thread)
7869 wake_up_process(sqd->thread);
Stefano Garzarella7e84e1c2020-08-27 16:58:31 +02007870}
7871
Bijan Mottahedeha087e2b2020-06-16 16:36:07 -07007872static inline void __io_unaccount_mem(struct user_struct *user,
7873 unsigned long nr_pages)
Jens Axboe2b188cc2019-01-07 10:46:33 -07007874{
7875 atomic_long_sub(nr_pages, &user->locked_vm);
7876}
7877
Bijan Mottahedeha087e2b2020-06-16 16:36:07 -07007878static inline int __io_account_mem(struct user_struct *user,
7879 unsigned long nr_pages)
Jens Axboe2b188cc2019-01-07 10:46:33 -07007880{
7881 unsigned long page_limit, cur_pages, new_pages;
7882
7883 /* Don't allow more pages than we can safely lock */
7884 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
7885
7886 do {
7887 cur_pages = atomic_long_read(&user->locked_vm);
7888 new_pages = cur_pages + nr_pages;
7889 if (new_pages > page_limit)
7890 return -ENOMEM;
7891 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
7892 new_pages) != cur_pages);
7893
7894 return 0;
7895}
7896
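/*
 * Editor's note on the accounting helpers below: memory is (un)charged in
 * two places - against the user's RLIMIT_MEMLOCK when ctx->limit_mem is
 * set, and against the saved mm's locked_vm (ACCT_LOCKED, used for the
 * rings) or pinned_vm (ACCT_PINNED, used for registered buffers)
 * counters.
 */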
Bijan Mottahedeh2e0464d2020-06-16 16:36:10 -07007897static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
7898 enum io_mem_account acct)
Bijan Mottahedeha087e2b2020-06-16 16:36:07 -07007899{
Bijan Mottahedehaad5d8d2020-06-16 16:36:08 -07007900 if (ctx->limit_mem)
Bijan Mottahedeha087e2b2020-06-16 16:36:07 -07007901 __io_unaccount_mem(ctx->user, nr_pages);
Bijan Mottahedeh30975822020-06-16 16:36:09 -07007902
Jens Axboe2aede0e2020-09-14 10:45:53 -06007903 if (ctx->mm_account) {
Bijan Mottahedeh2e0464d2020-06-16 16:36:10 -07007904 if (acct == ACCT_LOCKED)
Jens Axboe2aede0e2020-09-14 10:45:53 -06007905 ctx->mm_account->locked_vm -= nr_pages;
Bijan Mottahedeh2e0464d2020-06-16 16:36:10 -07007906 else if (acct == ACCT_PINNED)
Jens Axboe2aede0e2020-09-14 10:45:53 -06007907 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
Bijan Mottahedeh2e0464d2020-06-16 16:36:10 -07007908 }
Bijan Mottahedeha087e2b2020-06-16 16:36:07 -07007909}
7910
Bijan Mottahedeh2e0464d2020-06-16 16:36:10 -07007911static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
7912 enum io_mem_account acct)
Bijan Mottahedeha087e2b2020-06-16 16:36:07 -07007913{
Bijan Mottahedeh30975822020-06-16 16:36:09 -07007914 int ret;
7915
7916 if (ctx->limit_mem) {
7917 ret = __io_account_mem(ctx->user, nr_pages);
7918 if (ret)
7919 return ret;
7920 }
7921
Jens Axboe2aede0e2020-09-14 10:45:53 -06007922 if (ctx->mm_account) {
Bijan Mottahedeh2e0464d2020-06-16 16:36:10 -07007923 if (acct == ACCT_LOCKED)
Jens Axboe2aede0e2020-09-14 10:45:53 -06007924 ctx->mm_account->locked_vm += nr_pages;
Bijan Mottahedeh2e0464d2020-06-16 16:36:10 -07007925 else if (acct == ACCT_PINNED)
Jens Axboe2aede0e2020-09-14 10:45:53 -06007926 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
Bijan Mottahedeh2e0464d2020-06-16 16:36:10 -07007927 }
Bijan Mottahedeha087e2b2020-06-16 16:36:07 -07007928
7929 return 0;
7930}
7931
Jens Axboe2b188cc2019-01-07 10:46:33 -07007932static void io_mem_free(void *ptr)
7933{
Mark Rutland52e04ef2019-04-30 17:30:21 +01007934 struct page *page;
Jens Axboe2b188cc2019-01-07 10:46:33 -07007935
Mark Rutland52e04ef2019-04-30 17:30:21 +01007936 if (!ptr)
7937 return;
7938
7939 page = virt_to_head_page(ptr);
Jens Axboe2b188cc2019-01-07 10:46:33 -07007940 if (put_page_testzero(page))
7941 free_compound_page(page);
7942}
7943
7944static void *io_mem_alloc(size_t size)
7945{
7946 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
7947 __GFP_NORETRY;
7948
7949 return (void *) __get_free_pages(gfp_flags, get_order(size));
7950}
7951
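/*
 * Summary of the layout computed below: struct io_rings followed by the
 * CQE array, then (cache-line aligned on SMP) the SQ index array whose
 * offset is returned through *sq_offset. SIZE_MAX signals arithmetic
 * overflow.
 */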
Hristo Venev75b28af2019-08-26 17:23:46 +00007952static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
7953 size_t *sq_offset)
7954{
7955 struct io_rings *rings;
7956 size_t off, sq_array_size;
7957
7958 off = struct_size(rings, cqes, cq_entries);
7959 if (off == SIZE_MAX)
7960 return SIZE_MAX;
7961
7962#ifdef CONFIG_SMP
7963 off = ALIGN(off, SMP_CACHE_BYTES);
7964 if (off == 0)
7965 return SIZE_MAX;
7966#endif
7967
Dmitry Vyukovb36200f2020-07-11 11:31:11 +02007968 if (sq_offset)
7969 *sq_offset = off;
7970
Hristo Venev75b28af2019-08-26 17:23:46 +00007971 sq_array_size = array_size(sizeof(u32), sq_entries);
7972 if (sq_array_size == SIZE_MAX)
7973 return SIZE_MAX;
7974
7975 if (check_add_overflow(off, sq_array_size, &off))
7976 return SIZE_MAX;
7977
Hristo Venev75b28af2019-08-26 17:23:46 +00007978 return off;
7979}
7980
Jens Axboe2b188cc2019-01-07 10:46:33 -07007981static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
7982{
Hristo Venev75b28af2019-08-26 17:23:46 +00007983 size_t pages;
Jens Axboe2b188cc2019-01-07 10:46:33 -07007984
Hristo Venev75b28af2019-08-26 17:23:46 +00007985 pages = (size_t)1 << get_order(
7986 rings_size(sq_entries, cq_entries, NULL));
7987 pages += (size_t)1 << get_order(
7988 array_size(sizeof(struct io_uring_sqe), sq_entries));
Jens Axboe2b188cc2019-01-07 10:46:33 -07007989
Hristo Venev75b28af2019-08-26 17:23:46 +00007990 return pages;
Jens Axboe2b188cc2019-01-07 10:46:33 -07007991}
7992
Jens Axboeedafcce2019-01-09 09:16:05 -07007993static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
7994{
7995 int i, j;
7996
7997 if (!ctx->user_bufs)
7998 return -ENXIO;
7999
8000 for (i = 0; i < ctx->nr_user_bufs; i++) {
8001 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8002
8003 for (j = 0; j < imu->nr_bvecs; j++)
John Hubbardf1f6a7d2020-01-30 22:13:35 -08008004 unpin_user_page(imu->bvec[j].bv_page);
Jens Axboeedafcce2019-01-09 09:16:05 -07008005
Jens Axboede293932020-09-17 16:19:16 -06008006 if (imu->acct_pages)
8007 io_unaccount_mem(ctx, imu->acct_pages, ACCT_PINNED);
Mark Rutlandd4ef6472019-05-01 16:59:16 +01008008 kvfree(imu->bvec);
Jens Axboeedafcce2019-01-09 09:16:05 -07008009 imu->nr_bvecs = 0;
8010 }
8011
8012 kfree(ctx->user_bufs);
8013 ctx->user_bufs = NULL;
8014 ctx->nr_user_bufs = 0;
8015 return 0;
8016}
8017
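/*
 * Editor's note: the helper below copies the index'th iovec from the
 * userspace array, converting from the compat layout if the ring was set
 * up by a 32-bit task.
 */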
8018static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
8019 void __user *arg, unsigned index)
8020{
8021 struct iovec __user *src;
8022
8023#ifdef CONFIG_COMPAT
8024 if (ctx->compat) {
8025 struct compat_iovec __user *ciovs;
8026 struct compat_iovec ciov;
8027
8028 ciovs = (struct compat_iovec __user *) arg;
8029 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
8030 return -EFAULT;
8031
Jens Axboed55e5f52019-12-11 16:12:15 -07008032 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
Jens Axboeedafcce2019-01-09 09:16:05 -07008033 dst->iov_len = ciov.iov_len;
8034 return 0;
8035 }
8036#endif
8037 src = (struct iovec __user *) arg;
8038 if (copy_from_user(dst, &src[index], sizeof(*dst)))
8039 return -EFAULT;
8040 return 0;
8041}
8042
Jens Axboede293932020-09-17 16:19:16 -06008043/*
8044 * Not super efficient, but this only happens at registration time. And we do cache
8045 * the last compound head, so generally we'll only do a full search if we don't
8046 * match that one.
8047 *
8048 * We check if the given compound head page has already been accounted, to
8049 * avoid double accounting it. This allows us to account the full size of the
8050 * page, not just the constituent pages of a huge page.
8051 */
8052static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
8053 int nr_pages, struct page *hpage)
8054{
8055 int i, j;
8056
8057 /* check current page array */
8058 for (i = 0; i < nr_pages; i++) {
8059 if (!PageCompound(pages[i]))
8060 continue;
8061 if (compound_head(pages[i]) == hpage)
8062 return true;
8063 }
8064
8065 /* check previously registered pages */
8066 for (i = 0; i < ctx->nr_user_bufs; i++) {
8067 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8068
8069 for (j = 0; j < imu->nr_bvecs; j++) {
8070 if (!PageCompound(imu->bvec[j].bv_page))
8071 continue;
8072 if (compound_head(imu->bvec[j].bv_page) == hpage)
8073 return true;
8074 }
8075 }
8076
8077 return false;
8078}
8079
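/*
 * Summary of the accounting below: work out how many pages to charge for
 * a pinned buffer. Normal pages count one each; a compound (huge) page is
 * charged once at its full size, and only if its head page has not been
 * accounted already.
 */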
8080static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
8081 int nr_pages, struct io_mapped_ubuf *imu,
8082 struct page **last_hpage)
8083{
8084 int i, ret;
8085
8086 for (i = 0; i < nr_pages; i++) {
8087 if (!PageCompound(pages[i])) {
8088 imu->acct_pages++;
8089 } else {
8090 struct page *hpage;
8091
8092 hpage = compound_head(pages[i]);
8093 if (hpage == *last_hpage)
8094 continue;
8095 *last_hpage = hpage;
8096 if (headpage_already_acct(ctx, pages, i, hpage))
8097 continue;
8098 imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
8099 }
8100 }
8101
8102 if (!imu->acct_pages)
8103 return 0;
8104
8105 ret = io_account_mem(ctx, imu->acct_pages, ACCT_PINNED);
8106 if (ret)
8107 imu->acct_pages = 0;
8108 return ret;
8109}
8110
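/*
 * Editor's summary of buffer registration below: each iovec (at most
 * UIO_MAXIOV of them, each capped at 1GB) is pinned with
 * FOLL_WRITE | FOLL_LONGTERM, rejected if it maps file-backed memory
 * other than hugetlbfs, accounted, and recorded as a bvec array for later
 * fixed reads and writes.
 */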
Jens Axboeedafcce2019-01-09 09:16:05 -07008111static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
8112 unsigned nr_args)
8113{
8114 struct vm_area_struct **vmas = NULL;
8115 struct page **pages = NULL;
Jens Axboede293932020-09-17 16:19:16 -06008116 struct page *last_hpage = NULL;
Jens Axboeedafcce2019-01-09 09:16:05 -07008117 int i, j, got_pages = 0;
8118 int ret = -EINVAL;
8119
8120 if (ctx->user_bufs)
8121 return -EBUSY;
8122 if (!nr_args || nr_args > UIO_MAXIOV)
8123 return -EINVAL;
8124
8125 ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
8126 GFP_KERNEL);
8127 if (!ctx->user_bufs)
8128 return -ENOMEM;
8129
8130 for (i = 0; i < nr_args; i++) {
8131 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8132 unsigned long off, start, end, ubuf;
8133 int pret, nr_pages;
8134 struct iovec iov;
8135 size_t size;
8136
8137 ret = io_copy_iov(ctx, &iov, arg, i);
8138 if (ret)
Pavel Begunkova2786822019-05-26 12:35:47 +03008139 goto err;
Jens Axboeedafcce2019-01-09 09:16:05 -07008140
8141 /*
8142 * Don't impose further limits on the size and buffer
8143		 * constraints here; we'll return -EINVAL later, when the IO is
8144		 * submitted, if they are wrong.
8145 */
8146 ret = -EFAULT;
8147 if (!iov.iov_base || !iov.iov_len)
8148 goto err;
8149
8150 /* arbitrary limit, but we need something */
8151 if (iov.iov_len > SZ_1G)
8152 goto err;
8153
8154 ubuf = (unsigned long) iov.iov_base;
8155 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
8156 start = ubuf >> PAGE_SHIFT;
8157 nr_pages = end - start;
8158
Jens Axboeedafcce2019-01-09 09:16:05 -07008159 ret = 0;
8160 if (!pages || nr_pages > got_pages) {
Denis Efremova8c73c12020-06-05 12:32:03 +03008161 kvfree(vmas);
8162 kvfree(pages);
Mark Rutlandd4ef6472019-05-01 16:59:16 +01008163 pages = kvmalloc_array(nr_pages, sizeof(struct page *),
Jens Axboeedafcce2019-01-09 09:16:05 -07008164 GFP_KERNEL);
Mark Rutlandd4ef6472019-05-01 16:59:16 +01008165 vmas = kvmalloc_array(nr_pages,
Jens Axboeedafcce2019-01-09 09:16:05 -07008166 sizeof(struct vm_area_struct *),
8167 GFP_KERNEL);
8168 if (!pages || !vmas) {
8169 ret = -ENOMEM;
Jens Axboeedafcce2019-01-09 09:16:05 -07008170 goto err;
8171 }
8172 got_pages = nr_pages;
8173 }
8174
Mark Rutlandd4ef6472019-05-01 16:59:16 +01008175 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
Jens Axboeedafcce2019-01-09 09:16:05 -07008176 GFP_KERNEL);
8177 ret = -ENOMEM;
Jens Axboede293932020-09-17 16:19:16 -06008178 if (!imu->bvec)
Jens Axboeedafcce2019-01-09 09:16:05 -07008179 goto err;
Jens Axboeedafcce2019-01-09 09:16:05 -07008180
8181 ret = 0;
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07008182 mmap_read_lock(current->mm);
John Hubbard2113b052020-01-30 22:13:13 -08008183 pret = pin_user_pages(ubuf, nr_pages,
Ira Weiny932f4a62019-05-13 17:17:03 -07008184 FOLL_WRITE | FOLL_LONGTERM,
8185 pages, vmas);
Jens Axboeedafcce2019-01-09 09:16:05 -07008186 if (pret == nr_pages) {
8187 /* don't support file backed memory */
8188 for (j = 0; j < nr_pages; j++) {
8189 struct vm_area_struct *vma = vmas[j];
8190
8191 if (vma->vm_file &&
8192 !is_file_hugepages(vma->vm_file)) {
8193 ret = -EOPNOTSUPP;
8194 break;
8195 }
8196 }
8197 } else {
8198 ret = pret < 0 ? pret : -EFAULT;
8199 }
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07008200 mmap_read_unlock(current->mm);
Jens Axboeedafcce2019-01-09 09:16:05 -07008201 if (ret) {
8202 /*
8203 * if we did partial map, or found file backed vmas,
8204 * release any pages we did get
8205 */
John Hubbard27c4d3a2019-08-04 19:32:06 -07008206 if (pret > 0)
John Hubbardf1f6a7d2020-01-30 22:13:35 -08008207 unpin_user_pages(pages, pret);
Jens Axboede293932020-09-17 16:19:16 -06008208 kvfree(imu->bvec);
8209 goto err;
8210 }
8211
8212 ret = io_buffer_account_pin(ctx, pages, pret, imu, &last_hpage);
8213 if (ret) {
8214 unpin_user_pages(pages, pret);
Mark Rutlandd4ef6472019-05-01 16:59:16 +01008215 kvfree(imu->bvec);
Jens Axboeedafcce2019-01-09 09:16:05 -07008216 goto err;
8217 }
8218
8219 off = ubuf & ~PAGE_MASK;
8220 size = iov.iov_len;
8221 for (j = 0; j < nr_pages; j++) {
8222 size_t vec_len;
8223
8224 vec_len = min_t(size_t, size, PAGE_SIZE - off);
8225 imu->bvec[j].bv_page = pages[j];
8226 imu->bvec[j].bv_len = vec_len;
8227 imu->bvec[j].bv_offset = off;
8228 off = 0;
8229 size -= vec_len;
8230 }
8231 /* store original address for later verification */
8232 imu->ubuf = ubuf;
8233 imu->len = iov.iov_len;
8234 imu->nr_bvecs = nr_pages;
8235
8236 ctx->nr_user_bufs++;
8237 }
Mark Rutlandd4ef6472019-05-01 16:59:16 +01008238 kvfree(pages);
8239 kvfree(vmas);
Jens Axboeedafcce2019-01-09 09:16:05 -07008240 return 0;
8241err:
Mark Rutlandd4ef6472019-05-01 16:59:16 +01008242 kvfree(pages);
8243 kvfree(vmas);
Jens Axboeedafcce2019-01-09 09:16:05 -07008244 io_sqe_buffer_unregister(ctx);
8245 return ret;
8246}
8247
Jens Axboe9b402842019-04-11 11:45:41 -06008248static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
8249{
8250 __s32 __user *fds = arg;
8251 int fd;
8252
8253 if (ctx->cq_ev_fd)
8254 return -EBUSY;
8255
8256 if (copy_from_user(&fd, fds, sizeof(*fds)))
8257 return -EFAULT;
8258
8259 ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
8260 if (IS_ERR(ctx->cq_ev_fd)) {
8261 int ret = PTR_ERR(ctx->cq_ev_fd);
8262 ctx->cq_ev_fd = NULL;
8263 return ret;
8264 }
8265
8266 return 0;
8267}
8268
8269static int io_eventfd_unregister(struct io_ring_ctx *ctx)
8270{
8271 if (ctx->cq_ev_fd) {
8272 eventfd_ctx_put(ctx->cq_ev_fd);
8273 ctx->cq_ev_fd = NULL;
8274 return 0;
8275 }
8276
8277 return -ENXIO;
8278}
8279
Jens Axboe5a2e7452020-02-23 16:23:11 -07008280static int __io_destroy_buffers(int id, void *p, void *data)
8281{
8282 struct io_ring_ctx *ctx = data;
8283 struct io_buffer *buf = p;
8284
Jens Axboe067524e2020-03-02 16:32:28 -07008285 __io_remove_buffers(ctx, buf, id, -1U);
Jens Axboe5a2e7452020-02-23 16:23:11 -07008286 return 0;
8287}
8288
8289static void io_destroy_buffers(struct io_ring_ctx *ctx)
8290{
8291 idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
8292 idr_destroy(&ctx->io_buffer_idr);
8293}
8294
Jens Axboe2b188cc2019-01-07 10:46:33 -07008295static void io_ring_ctx_free(struct io_ring_ctx *ctx)
8296{
Jens Axboe6b063142019-01-10 22:13:58 -07008297 io_finish_async(ctx);
Jens Axboeedafcce2019-01-09 09:16:05 -07008298 io_sqe_buffer_unregister(ctx);
Jens Axboe2aede0e2020-09-14 10:45:53 -06008299
8300 if (ctx->sqo_task) {
8301 put_task_struct(ctx->sqo_task);
8302 ctx->sqo_task = NULL;
8303 mmdrop(ctx->mm_account);
8304 ctx->mm_account = NULL;
Bijan Mottahedeh30975822020-06-16 16:36:09 -07008305 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07008306
Jens Axboe6b063142019-01-10 22:13:58 -07008307 io_sqe_files_unregister(ctx);
Jens Axboe9b402842019-04-11 11:45:41 -06008308 io_eventfd_unregister(ctx);
Jens Axboe5a2e7452020-02-23 16:23:11 -07008309 io_destroy_buffers(ctx);
Jens Axboe41726c92020-02-23 13:11:42 -07008310 idr_destroy(&ctx->personality_idr);
Jens Axboedef596e2019-01-09 08:59:42 -07008311
Jens Axboe2b188cc2019-01-07 10:46:33 -07008312#if defined(CONFIG_UNIX)
Eric Biggers355e8d22019-06-12 14:58:43 -07008313 if (ctx->ring_sock) {
8314 ctx->ring_sock->file = NULL; /* so that iput() is called */
Jens Axboe2b188cc2019-01-07 10:46:33 -07008315 sock_release(ctx->ring_sock);
Eric Biggers355e8d22019-06-12 14:58:43 -07008316 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07008317#endif
8318
Hristo Venev75b28af2019-08-26 17:23:46 +00008319 io_mem_free(ctx->rings);
Jens Axboe2b188cc2019-01-07 10:46:33 -07008320 io_mem_free(ctx->sq_sqes);
Jens Axboe2b188cc2019-01-07 10:46:33 -07008321
8322 percpu_ref_exit(&ctx->refs);
Jens Axboe2b188cc2019-01-07 10:46:33 -07008323 free_uid(ctx->user);
Jens Axboe181e4482019-11-25 08:52:30 -07008324 put_cred(ctx->creds);
Jens Axboe78076bb2019-12-04 19:56:40 -07008325 kfree(ctx->cancel_hash);
Jens Axboe0ddf92e2019-11-08 08:52:53 -07008326 kmem_cache_free(req_cachep, ctx->fallback_req);
Jens Axboe2b188cc2019-01-07 10:46:33 -07008327 kfree(ctx);
8328}
8329
8330static __poll_t io_uring_poll(struct file *file, poll_table *wait)
8331{
8332 struct io_ring_ctx *ctx = file->private_data;
8333 __poll_t mask = 0;
8334
8335 poll_wait(file, &ctx->cq_wait, wait);
Stefan Bühler4f7067c2019-04-24 23:54:17 +02008336 /*
8337 * synchronizes with barrier from wq_has_sleeper call in
8338 * io_commit_cqring
8339 */
Jens Axboe2b188cc2019-01-07 10:46:33 -07008340 smp_rmb();
Jens Axboe90554202020-09-03 12:12:41 -06008341 if (!io_sqring_full(ctx))
Jens Axboe2b188cc2019-01-07 10:46:33 -07008342 mask |= EPOLLOUT | EPOLLWRNORM;
Stefano Garzarella63e5d812020-02-07 13:18:28 +01008343 if (io_cqring_events(ctx, false))
Jens Axboe2b188cc2019-01-07 10:46:33 -07008344 mask |= EPOLLIN | EPOLLRDNORM;
8345
8346 return mask;
8347}
8348
8349static int io_uring_fasync(int fd, struct file *file, int on)
8350{
8351 struct io_ring_ctx *ctx = file->private_data;
8352
8353 return fasync_helper(fd, file, on, &ctx->cq_fasync);
8354}
8355
Jens Axboe071698e2020-01-28 10:04:42 -07008356static int io_remove_personalities(int id, void *p, void *data)
8357{
8358 struct io_ring_ctx *ctx = data;
8359 const struct cred *cred;
8360
8361 cred = idr_remove(&ctx->personality_idr, id);
8362 if (cred)
8363 put_cred(cred);
8364 return 0;
8365}
8366
Jens Axboe85faa7b2020-04-09 18:14:00 -06008367static void io_ring_exit_work(struct work_struct *work)
8368{
Pavel Begunkovb2edc0a2020-07-07 16:36:22 +03008369 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
8370 exit_work);
Jens Axboe85faa7b2020-04-09 18:14:00 -06008371
Jens Axboe56952e92020-06-17 15:00:04 -06008372 /*
8373 * If we're doing polled IO and end up having requests being
8374 * submitted async (out-of-line), then completions can come in while
8375 * we're waiting for refs to drop. We need to reap these manually,
8376 * as nobody else will be looking for them.
8377 */
Pavel Begunkovb2edc0a2020-07-07 16:36:22 +03008378 do {
Jens Axboe56952e92020-06-17 15:00:04 -06008379 if (ctx->rings)
Jens Axboee6c8aa92020-09-28 13:10:13 -06008380 io_cqring_overflow_flush(ctx, true, NULL, NULL);
Pavel Begunkovb2edc0a2020-07-07 16:36:22 +03008381 io_iopoll_try_reap_events(ctx);
8382 } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
Jens Axboe85faa7b2020-04-09 18:14:00 -06008383 io_ring_ctx_free(ctx);
8384}
8385
Jens Axboe2b188cc2019-01-07 10:46:33 -07008386static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
8387{
8388 mutex_lock(&ctx->uring_lock);
8389 percpu_ref_kill(&ctx->refs);
8390 mutex_unlock(&ctx->uring_lock);
8391
Jens Axboef3606e32020-09-22 08:18:24 -06008392 io_kill_timeouts(ctx, NULL);
8393 io_poll_remove_all(ctx, NULL);
Jens Axboe561fb042019-10-24 07:25:42 -06008394
8395 if (ctx->io_wq)
8396 io_wq_cancel_all(ctx->io_wq);
8397
Jens Axboe15dff282019-11-13 09:09:23 -07008398 /* if we failed setting up the ctx, we might not have any rings */
8399 if (ctx->rings)
Jens Axboee6c8aa92020-09-28 13:10:13 -06008400 io_cqring_overflow_flush(ctx, true, NULL, NULL);
Pavel Begunkovb2edc0a2020-07-07 16:36:22 +03008401 io_iopoll_try_reap_events(ctx);
Jens Axboe071698e2020-01-28 10:04:42 -07008402 idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
Jens Axboe309fc032020-07-10 09:13:34 -06008403
8404 /*
8405 * Do this upfront, so we won't have a grace period where the ring
8406	 * is closed but resources aren't reaped yet. Such a grace period can
8407	 * cause spurious failures when setting up a new ring.
8408 */
Jens Axboe760618f2020-07-24 12:53:31 -06008409 io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries),
8410 ACCT_LOCKED);
Jens Axboe309fc032020-07-10 09:13:34 -06008411
Jens Axboe85faa7b2020-04-09 18:14:00 -06008412 INIT_WORK(&ctx->exit_work, io_ring_exit_work);
Jens Axboefc666772020-08-19 11:10:51 -06008413 /*
8414 * Use system_unbound_wq to avoid spawning tons of event kworkers
8415 * if we're exiting a ton of rings at the same time. It just adds
8416	 * noise and overhead; there's no discernible change in runtime
8417 * over using system_wq.
8418 */
8419 queue_work(system_unbound_wq, &ctx->exit_work);
Jens Axboe2b188cc2019-01-07 10:46:33 -07008420}
8421
8422static int io_uring_release(struct inode *inode, struct file *file)
8423{
8424 struct io_ring_ctx *ctx = file->private_data;
8425
8426 file->private_data = NULL;
8427 io_ring_ctx_wait_and_kill(ctx);
8428 return 0;
8429}
8430
Pavel Begunkov67c4d9e2020-06-15 10:24:05 +03008431static bool io_wq_files_match(struct io_wq_work *work, void *data)
8432{
8433 struct files_struct *files = data;
8434
Jens Axboe0f212202020-09-13 13:09:39 -06008435 return !files || work->files == files;
Pavel Begunkov67c4d9e2020-06-15 10:24:05 +03008436}
8437
Jens Axboef254ac02020-08-12 17:33:30 -06008438/*
8439 * Returns true if 'preq' is the link parent of 'req'
8440 */
8441static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req)
8442{
8443 struct io_kiocb *link;
8444
8445 if (!(preq->flags & REQ_F_LINK_HEAD))
8446 return false;
8447
8448 list_for_each_entry(link, &preq->link_list, link_list) {
8449 if (link == req)
8450 return true;
8451 }
8452
8453 return false;
8454}
8455
Pavel Begunkovc127a2a2020-09-06 00:45:15 +03008456static bool io_match_link_files(struct io_kiocb *req,
8457 struct files_struct *files)
8458{
8459 struct io_kiocb *link;
8460
8461 if (io_match_files(req, files))
8462 return true;
8463 if (req->flags & REQ_F_LINK_HEAD) {
8464 list_for_each_entry(link, &req->link_list, link_list) {
8465 if (io_match_files(link, files))
8466 return true;
8467 }
8468 }
8469 return false;
8470}
8471
Jens Axboef254ac02020-08-12 17:33:30 -06008472/*
8473 * We're looking to cancel 'req' because it's holding on to our files, but
8474 * 'req' could be a link to another request. See if it is, and cancel that
8475 * parent request if so.
8476 */
8477static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req)
8478{
8479 struct hlist_node *tmp;
8480 struct io_kiocb *preq;
8481 bool found = false;
8482 int i;
8483
8484 spin_lock_irq(&ctx->completion_lock);
8485 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
8486 struct hlist_head *list;
8487
8488 list = &ctx->cancel_hash[i];
8489 hlist_for_each_entry_safe(preq, tmp, list, hash_node) {
8490 found = io_match_link(preq, req);
8491 if (found) {
8492 io_poll_remove_one(preq);
8493 break;
8494 }
8495 }
8496 }
8497 spin_unlock_irq(&ctx->completion_lock);
8498 return found;
8499}
8500
8501static bool io_timeout_remove_link(struct io_ring_ctx *ctx,
8502 struct io_kiocb *req)
8503{
8504 struct io_kiocb *preq;
8505 bool found = false;
8506
8507 spin_lock_irq(&ctx->completion_lock);
8508 list_for_each_entry(preq, &ctx->timeout_list, timeout.list) {
8509 found = io_match_link(preq, req);
8510 if (found) {
8511 __io_timeout_cancel(preq);
8512 break;
8513 }
8514 }
8515 spin_unlock_irq(&ctx->completion_lock);
8516 return found;
8517}
8518
Jens Axboeb711d4e2020-08-16 08:23:05 -07008519static bool io_cancel_link_cb(struct io_wq_work *work, void *data)
8520{
8521 return io_match_link(container_of(work, struct io_kiocb, work), data);
8522}
8523
8524static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
8525{
8526 enum io_wq_cancel cret;
8527
8528 /* cancel this particular work, if it's running */
8529 cret = io_wq_cancel_work(ctx->io_wq, &req->work);
8530 if (cret != IO_WQ_CANCEL_NOTFOUND)
8531 return;
8532
8533 /* find links that hold this pending, cancel those */
8534 cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true);
8535 if (cret != IO_WQ_CANCEL_NOTFOUND)
8536 return;
8537
8538 /* if we have a poll link holding this pending, cancel that */
8539 if (io_poll_remove_link(ctx, req))
8540 return;
8541
8542 /* final option, timeout link is holding this req pending */
8543 io_timeout_remove_link(ctx, req);
8544}
8545
Pavel Begunkovb7ddce32020-09-06 00:45:14 +03008546static void io_cancel_defer_files(struct io_ring_ctx *ctx,
8547 struct files_struct *files)
8548{
8549 struct io_defer_entry *de = NULL;
8550 LIST_HEAD(list);
8551
8552 spin_lock_irq(&ctx->completion_lock);
8553 list_for_each_entry_reverse(de, &ctx->defer_list, list) {
Pavel Begunkovc127a2a2020-09-06 00:45:15 +03008554 if (io_match_link_files(de->req, files)) {
Pavel Begunkovb7ddce32020-09-06 00:45:14 +03008555 list_cut_position(&list, &ctx->defer_list, &de->list);
8556 break;
8557 }
8558 }
8559 spin_unlock_irq(&ctx->completion_lock);
8560
8561 while (!list_empty(&list)) {
8562 de = list_first_entry(&list, struct io_defer_entry, list);
8563 list_del_init(&de->list);
8564 req_set_fail_links(de->req);
8565 io_put_req(de->req);
8566 io_req_complete(de->req, -ECANCELED);
8567 kfree(de);
8568 }
8569}
8570
Jens Axboe76e1b642020-09-26 15:05:03 -06008571/*
8572 * Returns true if we found and killed one or more files pinning requests
8573 */
8574static bool io_uring_cancel_files(struct io_ring_ctx *ctx,
Jens Axboefcb323c2019-10-24 12:39:47 -06008575 struct files_struct *files)
8576{
Pavel Begunkov67c4d9e2020-06-15 10:24:05 +03008577 if (list_empty_careful(&ctx->inflight_list))
Jens Axboe76e1b642020-09-26 15:05:03 -06008578 return false;
Pavel Begunkov67c4d9e2020-06-15 10:24:05 +03008579
Pavel Begunkovb7ddce32020-09-06 00:45:14 +03008580 io_cancel_defer_files(ctx, files);
Pavel Begunkov67c4d9e2020-06-15 10:24:05 +03008581	/* cancel all at once, should be faster than doing it one by one */
8582 io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true);
8583
Jens Axboefcb323c2019-10-24 12:39:47 -06008584 while (!list_empty_careful(&ctx->inflight_list)) {
Xiaoguang Wangd8f1b972020-04-26 15:54:43 +08008585 struct io_kiocb *cancel_req = NULL, *req;
8586 DEFINE_WAIT(wait);
Jens Axboefcb323c2019-10-24 12:39:47 -06008587
8588 spin_lock_irq(&ctx->inflight_lock);
8589 list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
Jens Axboe0f212202020-09-13 13:09:39 -06008590 if (files && req->work.files != files)
Jens Axboe768134d2019-11-10 20:30:53 -07008591 continue;
8592 /* req is being completed, ignore */
8593 if (!refcount_inc_not_zero(&req->refs))
8594 continue;
8595 cancel_req = req;
8596 break;
Jens Axboefcb323c2019-10-24 12:39:47 -06008597 }
Jens Axboe768134d2019-11-10 20:30:53 -07008598 if (cancel_req)
Jens Axboefcb323c2019-10-24 12:39:47 -06008599 prepare_to_wait(&ctx->inflight_wait, &wait,
Jens Axboe768134d2019-11-10 20:30:53 -07008600 TASK_UNINTERRUPTIBLE);
Jens Axboefcb323c2019-10-24 12:39:47 -06008601 spin_unlock_irq(&ctx->inflight_lock);
8602
Jens Axboe768134d2019-11-10 20:30:53 -07008603 /* We need to keep going until we don't find a matching req */
8604 if (!cancel_req)
Jens Axboefcb323c2019-10-24 12:39:47 -06008605 break;
Pavel Begunkovbb175342020-08-20 11:33:35 +03008606 /* cancel this request, or head link requests */
8607 io_attempt_cancel(ctx, cancel_req);
8608 io_put_req(cancel_req);
Jens Axboe6200b0a2020-09-13 14:38:30 -06008609 /* cancellations _may_ trigger task work */
8610 io_run_task_work();
Jens Axboefcb323c2019-10-24 12:39:47 -06008611 schedule();
Xiaoguang Wangd8f1b972020-04-26 15:54:43 +08008612 finish_wait(&ctx->inflight_wait, &wait);
Jens Axboefcb323c2019-10-24 12:39:47 -06008613 }
Jens Axboe76e1b642020-09-26 15:05:03 -06008614
8615 return true;
Jens Axboefcb323c2019-10-24 12:39:47 -06008616}
8617
Pavel Begunkov801dd572020-06-15 10:33:14 +03008618static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
Pavel Begunkov44e728b2020-06-15 10:24:04 +03008619{
Pavel Begunkov801dd572020-06-15 10:33:14 +03008620 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
8621 struct task_struct *task = data;
Pavel Begunkov44e728b2020-06-15 10:24:04 +03008622
Jens Axboef3606e32020-09-22 08:18:24 -06008623 return io_task_match(req, task);
Pavel Begunkov44e728b2020-06-15 10:24:04 +03008624}
8625
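/*
 * Summary of the cancelation helper below: requests pinning one of @files
 * are always canceled; when no files filter is given, io-wq work, iopoll
 * leftovers, poll requests and timeouts belonging to @task are swept as
 * well. Returns true if anything was found and canceled.
 */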
Jens Axboe0f212202020-09-13 13:09:39 -06008626static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
8627 struct task_struct *task,
8628 struct files_struct *files)
8629{
8630 bool ret;
8631
8632 ret = io_uring_cancel_files(ctx, files);
8633 if (!files) {
8634 enum io_wq_cancel cret;
8635
8636 cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true);
8637 if (cret != IO_WQ_CANCEL_NOTFOUND)
8638 ret = true;
8639
8640 /* SQPOLL thread does its own polling */
8641 if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
8642 while (!list_empty_careful(&ctx->iopoll_list)) {
8643 io_iopoll_try_reap_events(ctx);
8644 ret = true;
8645 }
8646 }
8647
8648 ret |= io_poll_remove_all(ctx, task);
8649 ret |= io_kill_timeouts(ctx, task);
8650 }
8651
8652 return ret;
8653}
8654
8655/*
8656 * We need to iteratively cancel requests, in case a request has dependent
8657 * hard links. These persist even if a cancelation fails, so keep
8658 * looping until none are found.
8659 */
8660static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
8661 struct files_struct *files)
8662{
8663 struct task_struct *task = current;
8664
Jens Axboe534ca6d2020-09-02 13:52:19 -06008665 if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data)
8666 task = ctx->sq_data->thread;
Jens Axboe0f212202020-09-13 13:09:39 -06008667
8668 io_cqring_overflow_flush(ctx, true, task, files);
8669
8670 while (__io_uring_cancel_task_requests(ctx, task, files)) {
8671 io_run_task_work();
8672 cond_resched();
8673 }
8674}
8675
8676/*
8677 * Note that this task has used io_uring. We use it for cancelation purposes.
8678 */
8679static int io_uring_add_task_file(struct file *file)
8680{
8681 if (unlikely(!current->io_uring)) {
8682 int ret;
8683
8684 ret = io_uring_alloc_task_context(current);
8685 if (unlikely(ret))
8686 return ret;
8687 }
8688 if (current->io_uring->last != file) {
8689 XA_STATE(xas, &current->io_uring->xa, (unsigned long) file);
8690 void *old;
8691
8692 rcu_read_lock();
8693 old = xas_load(&xas);
8694 if (old != file) {
8695 get_file(file);
8696 xas_lock(&xas);
8697 xas_store(&xas, file);
8698 xas_unlock(&xas);
8699 }
8700 rcu_read_unlock();
8701 current->io_uring->last = file;
8702 }
8703
8704 return 0;
8705}
8706
8707/*
8708 * Remove this io_uring_file -> task mapping.
8709 */
8710static void io_uring_del_task_file(struct file *file)
8711{
8712 struct io_uring_task *tctx = current->io_uring;
8713 XA_STATE(xas, &tctx->xa, (unsigned long) file);
8714
8715 if (tctx->last == file)
8716 tctx->last = NULL;
8717
8718 xas_lock(&xas);
8719 file = xas_store(&xas, NULL);
8720 xas_unlock(&xas);
8721
8722 if (file)
8723 fput(file);
8724}
8725
8726static void __io_uring_attempt_task_drop(struct file *file)
8727{
8728 XA_STATE(xas, &current->io_uring->xa, (unsigned long) file);
8729 struct file *old;
8730
8731 rcu_read_lock();
8732 old = xas_load(&xas);
8733 rcu_read_unlock();
8734
8735 if (old == file)
8736 io_uring_del_task_file(file);
8737}
8738
8739/*
8740 * Drop task note for this file if we're the only ones that hold it after
8741 * pending fput()
8742 */
8743static void io_uring_attempt_task_drop(struct file *file, bool exiting)
8744{
8745 if (!current->io_uring)
8746 return;
8747 /*
8748	 * fput() is pending; the count will be 2 if the only other ref is our
8749	 * potential task file note. If the task is exiting, drop regardless of count.
8750 */
8751 if (!exiting && atomic_long_read(&file->f_count) != 2)
8752 return;
8753
8754 __io_uring_attempt_task_drop(file);
8755}
8756
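/*
 * Editor's note: the function below cancels this task's requests on every
 * ring it holds a task file note for. With a non-NULL @files the note is
 * dropped as well; overflow events are discarded while in_idle is set.
 */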
8757void __io_uring_files_cancel(struct files_struct *files)
8758{
8759 struct io_uring_task *tctx = current->io_uring;
8760 XA_STATE(xas, &tctx->xa, 0);
8761
8762 /* make sure overflow events are dropped */
8763 tctx->in_idle = true;
8764
8765 do {
8766 struct io_ring_ctx *ctx;
8767 struct file *file;
8768
8769 xas_lock(&xas);
8770 file = xas_next_entry(&xas, ULONG_MAX);
8771 xas_unlock(&xas);
8772
8773 if (!file)
8774 break;
8775
8776 ctx = file->private_data;
8777
8778 io_uring_cancel_task_requests(ctx, files);
8779 if (files)
8780 io_uring_del_task_file(file);
8781 } while (1);
8782}
8783
8784static inline bool io_uring_task_idle(struct io_uring_task *tctx)
8785{
8786 return atomic_long_read(&tctx->req_issue) ==
8787 atomic_long_read(&tctx->req_complete);
8788}
8789
8790/*
8791 * Find any io_uring fd that this task has registered or done IO on, and cancel
8792 * requests.
8793 */
8794void __io_uring_task_cancel(void)
8795{
8796 struct io_uring_task *tctx = current->io_uring;
8797 DEFINE_WAIT(wait);
8798 long completions;
8799
8800 /* make sure overflow events are dropped */
8801 tctx->in_idle = true;
8802
8803 while (!io_uring_task_idle(tctx)) {
8804 /* read completions before cancelations */
8805 completions = atomic_long_read(&tctx->req_complete);
8806 __io_uring_files_cancel(NULL);
8807
8808 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
8809
8810 /*
8811 * If we've seen completions, retry. This avoids a race where
8812 * a completion comes in before we did prepare_to_wait().
8813 */
8814 if (completions != atomic_long_read(&tctx->req_complete))
8815 continue;
8816 if (io_uring_task_idle(tctx))
8817 break;
8818 schedule();
8819 }
8820
8821 finish_wait(&tctx->wait, &wait);
8822 tctx->in_idle = false;
8823}
8824
Jens Axboefcb323c2019-10-24 12:39:47 -06008825static int io_uring_flush(struct file *file, void *data)
8826{
8827 struct io_ring_ctx *ctx = file->private_data;
8828
Jens Axboe6ab23142020-02-08 20:23:59 -07008829 /*
8830 * If the task is going away, cancel work it may have pending
8831 */
Pavel Begunkov801dd572020-06-15 10:33:14 +03008832 if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
Jens Axboe0f212202020-09-13 13:09:39 -06008833 data = NULL;
Jens Axboe6ab23142020-02-08 20:23:59 -07008834
Jens Axboe0f212202020-09-13 13:09:39 -06008835 io_uring_cancel_task_requests(ctx, data);
8836 io_uring_attempt_task_drop(file, !data);
Jens Axboefcb323c2019-10-24 12:39:47 -06008837 return 0;
8838}
8839
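/*
 * Summary of the helper below: translate an mmap offset
 * (IORING_OFF_SQ_RING, IORING_OFF_CQ_RING or IORING_OFF_SQES) into the
 * kernel address backing that region, checking that the requested size
 * fits within the underlying allocation.
 */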
Roman Penyaev6c5c2402019-11-28 12:53:22 +01008840static void *io_uring_validate_mmap_request(struct file *file,
8841 loff_t pgoff, size_t sz)
Jens Axboe2b188cc2019-01-07 10:46:33 -07008842{
Jens Axboe2b188cc2019-01-07 10:46:33 -07008843 struct io_ring_ctx *ctx = file->private_data;
Roman Penyaev6c5c2402019-11-28 12:53:22 +01008844 loff_t offset = pgoff << PAGE_SHIFT;
Jens Axboe2b188cc2019-01-07 10:46:33 -07008845 struct page *page;
8846 void *ptr;
8847
8848 switch (offset) {
8849 case IORING_OFF_SQ_RING:
Hristo Venev75b28af2019-08-26 17:23:46 +00008850 case IORING_OFF_CQ_RING:
8851 ptr = ctx->rings;
Jens Axboe2b188cc2019-01-07 10:46:33 -07008852 break;
8853 case IORING_OFF_SQES:
8854 ptr = ctx->sq_sqes;
8855 break;
Jens Axboe2b188cc2019-01-07 10:46:33 -07008856 default:
Roman Penyaev6c5c2402019-11-28 12:53:22 +01008857 return ERR_PTR(-EINVAL);
Jens Axboe2b188cc2019-01-07 10:46:33 -07008858 }
8859
8860 page = virt_to_head_page(ptr);
Matthew Wilcox (Oracle)a50b8542019-09-23 15:34:25 -07008861 if (sz > page_size(page))
Roman Penyaev6c5c2402019-11-28 12:53:22 +01008862 return ERR_PTR(-EINVAL);
8863
8864 return ptr;
8865}
8866
8867#ifdef CONFIG_MMU
8868
8869static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
8870{
8871 size_t sz = vma->vm_end - vma->vm_start;
8872 unsigned long pfn;
8873 void *ptr;
8874
8875 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
8876 if (IS_ERR(ptr))
8877 return PTR_ERR(ptr);
Jens Axboe2b188cc2019-01-07 10:46:33 -07008878
8879 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
8880 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
8881}
8882
Roman Penyaev6c5c2402019-11-28 12:53:22 +01008883#else /* !CONFIG_MMU */
8884
8885static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
8886{
8887 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
8888}
8889
8890static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
8891{
8892 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
8893}
8894
8895static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
8896 unsigned long addr, unsigned long len,
8897 unsigned long pgoff, unsigned long flags)
8898{
8899 void *ptr;
8900
8901 ptr = io_uring_validate_mmap_request(file, pgoff, len);
8902 if (IS_ERR(ptr))
8903 return PTR_ERR(ptr);
8904
8905 return (unsigned long) ptr;
8906}
8907
8908#endif /* !CONFIG_MMU */
8909
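/*
 * Editor's note: the wait loop below blocks until the SQ ring has free
 * space again, backing IORING_ENTER_SQ_WAIT for SQPOLL rings. The wait is
 * interruptible and bails out if a signal is pending.
 */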
Jens Axboe90554202020-09-03 12:12:41 -06008910static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
8911{
8912 DEFINE_WAIT(wait);
8913
8914 do {
8915 if (!io_sqring_full(ctx))
8916 break;
8917
8918 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
8919
8920 if (!io_sqring_full(ctx))
8921 break;
8922
8923 schedule();
8924 } while (!signal_pending(current));
8925
8926 finish_wait(&ctx->sqo_sq_wait, &wait);
8927}
8928
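/*
 * Summary of the syscall below: io_uring_enter(2) submits SQEs and/or
 * waits for completions. For SQPOLL rings the poll thread does the actual
 * submission, so this path only wakes it (IORING_ENTER_SQ_WAKEUP) or
 * waits for SQ space (IORING_ENTER_SQ_WAIT) and reports to_submit as
 * submitted. With IORING_ENTER_GETEVENTS it then waits for, or iopolls,
 * at least min_complete completions.
 */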
Jens Axboe2b188cc2019-01-07 10:46:33 -07008929SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
8930 u32, min_complete, u32, flags, const sigset_t __user *, sig,
8931 size_t, sigsz)
8932{
8933 struct io_ring_ctx *ctx;
8934 long ret = -EBADF;
8935 int submitted = 0;
8936 struct fd f;
8937
Jens Axboe4c6e2772020-07-01 11:29:10 -06008938 io_run_task_work();
Jens Axboeb41e9852020-02-17 09:52:41 -07008939
Jens Axboe90554202020-09-03 12:12:41 -06008940 if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
8941 IORING_ENTER_SQ_WAIT))
Jens Axboe2b188cc2019-01-07 10:46:33 -07008942 return -EINVAL;
8943
8944 f = fdget(fd);
8945 if (!f.file)
8946 return -EBADF;
8947
8948 ret = -EOPNOTSUPP;
8949 if (f.file->f_op != &io_uring_fops)
8950 goto out_fput;
8951
8952 ret = -ENXIO;
8953 ctx = f.file->private_data;
8954 if (!percpu_ref_tryget(&ctx->refs))
8955 goto out_fput;
8956
Stefano Garzarella7e84e1c2020-08-27 16:58:31 +02008957 ret = -EBADFD;
8958 if (ctx->flags & IORING_SETUP_R_DISABLED)
8959 goto out;
8960
Jens Axboe6c271ce2019-01-10 11:22:30 -07008961 /*
8962 * For SQ polling, the thread will do all submissions and completions.
8963 * Just return the requested submit count, and wake the thread if
8964 * we were asked to.
8965 */
Jens Axboeb2a9ead2019-09-12 14:19:16 -06008966 ret = 0;
Jens Axboe6c271ce2019-01-10 11:22:30 -07008967 if (ctx->flags & IORING_SETUP_SQPOLL) {
Jens Axboec1edbf52019-11-10 16:56:04 -07008968 if (!list_empty_careful(&ctx->cq_overflow_list))
Jens Axboee6c8aa92020-09-28 13:10:13 -06008969 io_cqring_overflow_flush(ctx, false, NULL, NULL);
Jens Axboe6c271ce2019-01-10 11:22:30 -07008970 if (flags & IORING_ENTER_SQ_WAKEUP)
Jens Axboe534ca6d2020-09-02 13:52:19 -06008971 wake_up(&ctx->sq_data->wait);
Jens Axboe90554202020-09-03 12:12:41 -06008972 if (flags & IORING_ENTER_SQ_WAIT)
8973 io_sqpoll_wait_sq(ctx);
Jens Axboe6c271ce2019-01-10 11:22:30 -07008974 submitted = to_submit;
Jens Axboeb2a9ead2019-09-12 14:19:16 -06008975 } else if (to_submit) {
Jens Axboe0f212202020-09-13 13:09:39 -06008976 ret = io_uring_add_task_file(f.file);
8977 if (unlikely(ret))
8978 goto out;
Jens Axboe2b188cc2019-01-07 10:46:33 -07008979 mutex_lock(&ctx->uring_lock);
Jens Axboe0f212202020-09-13 13:09:39 -06008980 submitted = io_submit_sqes(ctx, to_submit);
Jens Axboe2b188cc2019-01-07 10:46:33 -07008981 mutex_unlock(&ctx->uring_lock);
Pavel Begunkov7c504e652019-12-18 19:53:45 +03008982
8983 if (submitted != to_submit)
8984 goto out;
Jens Axboe2b188cc2019-01-07 10:46:33 -07008985 }
8986 if (flags & IORING_ENTER_GETEVENTS) {
8987 min_complete = min(min_complete, ctx->cq_entries);
8988
Xiaoguang Wang32b22442020-03-11 09:26:09 +08008989 /*
8990		 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user space
8991		 * applications don't need to poll for completion events themselves;
8992		 * they can rely on io_sq_thread to do that polling, which reduces
8993		 * CPU usage and uring_lock contention.
8994 */
8995 if (ctx->flags & IORING_SETUP_IOPOLL &&
8996 !(ctx->flags & IORING_SETUP_SQPOLL)) {
Pavel Begunkov7668b922020-07-07 16:36:21 +03008997 ret = io_iopoll_check(ctx, min_complete);
Jens Axboedef596e2019-01-09 08:59:42 -07008998 } else {
8999 ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
9000 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07009001 }
9002
Pavel Begunkov7c504e652019-12-18 19:53:45 +03009003out:
Pavel Begunkov6805b322019-10-08 02:18:42 +03009004 percpu_ref_put(&ctx->refs);
Jens Axboe2b188cc2019-01-07 10:46:33 -07009005out_fput:
9006 fdput(f);
9007 return submitted ? submitted : ret;
9008}
9009
Tobias Klauserbebdb652020-02-26 18:38:32 +01009010#ifdef CONFIG_PROC_FS
Jens Axboe87ce9552020-01-30 08:25:34 -07009011static int io_uring_show_cred(int id, void *p, void *data)
9012{
9013 const struct cred *cred = p;
9014 struct seq_file *m = data;
9015 struct user_namespace *uns = seq_user_ns(m);
9016 struct group_info *gi;
9017 kernel_cap_t cap;
9018 unsigned __capi;
9019 int g;
9020
9021 seq_printf(m, "%5d\n", id);
9022 seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
9023 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
9024 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
9025 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
9026 seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
9027 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
9028 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
9029 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
9030 seq_puts(m, "\n\tGroups:\t");
9031 gi = cred->group_info;
9032 for (g = 0; g < gi->ngroups; g++) {
9033 seq_put_decimal_ull(m, g ? " " : "",
9034 from_kgid_munged(uns, gi->gid[g]));
9035 }
9036 seq_puts(m, "\n\tCapEff:\t");
9037 cap = cred->cap_effective;
9038 CAP_FOR_EACH_U32(__capi)
9039 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
9040 seq_putc(m, '\n');
9041 return 0;
9042}
9043
9044static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
9045{
Jens Axboefad8e0d2020-09-28 08:57:48 -06009046 bool has_lock;
Jens Axboe87ce9552020-01-30 08:25:34 -07009047 int i;
9048
Jens Axboefad8e0d2020-09-28 08:57:48 -06009049 /*
9050 * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
9051 * since fdinfo case grabs it in the opposite direction of normal use
9052 * cases. If we fail to get the lock, we just don't iterate any
9053 * structures that could be going away outside the io_uring mutex.
9054 */
9055 has_lock = mutex_trylock(&ctx->uring_lock);
9056
Jens Axboe87ce9552020-01-30 08:25:34 -07009057 seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
Jens Axboefad8e0d2020-09-28 08:57:48 -06009058 for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
Jens Axboe87ce9552020-01-30 08:25:34 -07009059 struct fixed_file_table *table;
9060 struct file *f;
9061
9062 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
9063 f = table->files[i & IORING_FILE_TABLE_MASK];
9064 if (f)
9065 seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
9066 else
9067 seq_printf(m, "%5u: <none>\n", i);
9068 }
9069 seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
Jens Axboefad8e0d2020-09-28 08:57:48 -06009070 for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
Jens Axboe87ce9552020-01-30 08:25:34 -07009071 struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
9072
9073 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
9074 (unsigned int) buf->len);
9075 }
Jens Axboefad8e0d2020-09-28 08:57:48 -06009076 if (has_lock && !idr_is_empty(&ctx->personality_idr)) {
Jens Axboe87ce9552020-01-30 08:25:34 -07009077 seq_printf(m, "Personalities:\n");
9078 idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
9079 }
Jens Axboed7718a92020-02-14 22:23:12 -07009080 seq_printf(m, "PollList:\n");
9081 spin_lock_irq(&ctx->completion_lock);
9082 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
9083 struct hlist_head *list = &ctx->cancel_hash[i];
9084 struct io_kiocb *req;
9085
9086 hlist_for_each_entry(req, list, hash_node)
9087 seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
9088 req->task->task_works != NULL);
9089 }
9090 spin_unlock_irq(&ctx->completion_lock);
Jens Axboefad8e0d2020-09-28 08:57:48 -06009091 if (has_lock)
9092 mutex_unlock(&ctx->uring_lock);
Jens Axboe87ce9552020-01-30 08:25:34 -07009093}
9094
9095static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
9096{
9097 struct io_ring_ctx *ctx = f->private_data;
9098
9099 if (percpu_ref_tryget(&ctx->refs)) {
9100 __io_uring_show_fdinfo(ctx, m);
9101 percpu_ref_put(&ctx->refs);
9102 }
9103}
Tobias Klauserbebdb652020-02-26 18:38:32 +01009104#endif
Jens Axboe87ce9552020-01-30 08:25:34 -07009105
Jens Axboe2b188cc2019-01-07 10:46:33 -07009106static const struct file_operations io_uring_fops = {
9107 .release = io_uring_release,
Jens Axboefcb323c2019-10-24 12:39:47 -06009108 .flush = io_uring_flush,
Jens Axboe2b188cc2019-01-07 10:46:33 -07009109 .mmap = io_uring_mmap,
Roman Penyaev6c5c2402019-11-28 12:53:22 +01009110#ifndef CONFIG_MMU
9111 .get_unmapped_area = io_uring_nommu_get_unmapped_area,
9112 .mmap_capabilities = io_uring_nommu_mmap_capabilities,
9113#endif
Jens Axboe2b188cc2019-01-07 10:46:33 -07009114 .poll = io_uring_poll,
9115 .fasync = io_uring_fasync,
Tobias Klauserbebdb652020-02-26 18:38:32 +01009116#ifdef CONFIG_PROC_FS
Jens Axboe87ce9552020-01-30 08:25:34 -07009117 .show_fdinfo = io_uring_show_fdinfo,
Tobias Klauserbebdb652020-02-26 18:38:32 +01009118#endif
Jens Axboe2b188cc2019-01-07 10:46:33 -07009119};
9120
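/*
 * Editor's note: the function below allocates the SQ/CQ ring area and the
 * SQE array that the application will mmap, and initializes the ring
 * masks and entry counts.
 */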
9121static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
9122 struct io_uring_params *p)
9123{
Hristo Venev75b28af2019-08-26 17:23:46 +00009124 struct io_rings *rings;
9125 size_t size, sq_array_offset;
Jens Axboe2b188cc2019-01-07 10:46:33 -07009126
Jens Axboebd740482020-08-05 12:58:23 -06009127 /* make sure these are sane, as we already accounted them */
9128 ctx->sq_entries = p->sq_entries;
9129 ctx->cq_entries = p->cq_entries;
9130
Hristo Venev75b28af2019-08-26 17:23:46 +00009131 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
9132 if (size == SIZE_MAX)
9133 return -EOVERFLOW;
9134
9135 rings = io_mem_alloc(size);
9136 if (!rings)
Jens Axboe2b188cc2019-01-07 10:46:33 -07009137 return -ENOMEM;
9138
Hristo Venev75b28af2019-08-26 17:23:46 +00009139 ctx->rings = rings;
9140 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
9141 rings->sq_ring_mask = p->sq_entries - 1;
9142 rings->cq_ring_mask = p->cq_entries - 1;
9143 rings->sq_ring_entries = p->sq_entries;
9144 rings->cq_ring_entries = p->cq_entries;
9145 ctx->sq_mask = rings->sq_ring_mask;
9146 ctx->cq_mask = rings->cq_ring_mask;
Jens Axboe2b188cc2019-01-07 10:46:33 -07009147
9148 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
Jens Axboeeb065d32019-11-20 09:26:29 -07009149 if (size == SIZE_MAX) {
9150 io_mem_free(ctx->rings);
9151 ctx->rings = NULL;
Jens Axboe2b188cc2019-01-07 10:46:33 -07009152 return -EOVERFLOW;
Jens Axboeeb065d32019-11-20 09:26:29 -07009153 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07009154
9155 ctx->sq_sqes = io_mem_alloc(size);
Jens Axboeeb065d32019-11-20 09:26:29 -07009156 if (!ctx->sq_sqes) {
9157 io_mem_free(ctx->rings);
9158 ctx->rings = NULL;
Jens Axboe2b188cc2019-01-07 10:46:33 -07009159 return -ENOMEM;
Jens Axboeeb065d32019-11-20 09:26:29 -07009160 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07009161
Jens Axboe2b188cc2019-01-07 10:46:33 -07009162 return 0;
9163}
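
/*
 * Rough size sketch (illustrative): with entries == 100 the setup path below
 * rounds the SQ ring up to 128 entries and defaults the CQ ring to 256, so
 * the first allocation here covers struct io_rings plus 256 16-byte CQEs plus
 * a 128-entry u32 SQ index array in one region, and the second allocation is
 * a separate region of 128 * 64-byte SQEs.
 */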

/*
 * Allocate an anonymous fd; this is what constitutes the application-visible
 * backing of an io_uring instance. The application mmaps this fd to gain
 * access to the SQ/CQ ring details. If UNIX sockets are enabled, we have to
 * tie this fd to a socket for file garbage collection purposes.
 */
static int io_uring_get_fd(struct io_ring_ctx *ctx)
{
	struct file *file;
	int ret;

#if defined(CONFIG_UNIX)
	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
				&ctx->ring_sock);
	if (ret)
		return ret;
#endif

	ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
	if (ret < 0)
		goto err;

	file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
					O_RDWR | O_CLOEXEC);
	if (IS_ERR(file)) {
err_fd:
		put_unused_fd(ret);
		ret = PTR_ERR(file);
		goto err;
	}

#if defined(CONFIG_UNIX)
	ctx->ring_sock->file = file;
#endif
	if (unlikely(io_uring_add_task_file(file))) {
		file = ERR_PTR(-ENOMEM);
		goto err_fd;
	}
	fd_install(ret, file);
	return ret;
err:
#if defined(CONFIG_UNIX)
	sock_release(ctx->ring_sock);
	ctx->ring_sock = NULL;
#endif
	return ret;
}
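
/*
 * Note: io_uring_add_task_file() has to run before fd_install(). As soon as
 * the descriptor is installed another thread could close it (or the task
 * could exit), so the per-task ring tracking must already be in place, and
 * any failure must be unwound via err_fd, while the file is still private to
 * this path.
 */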

static int io_uring_create(unsigned entries, struct io_uring_params *p,
			   struct io_uring_params __user *params)
{
	struct user_struct *user = NULL;
	struct io_ring_ctx *ctx;
	bool limit_mem;
	int ret;

	if (!entries)
		return -EINVAL;
	if (entries > IORING_MAX_ENTRIES) {
		if (!(p->flags & IORING_SETUP_CLAMP))
			return -EINVAL;
		entries = IORING_MAX_ENTRIES;
	}

	/*
	 * Use twice as many entries for the CQ ring. It's possible for the
	 * application to drive a higher depth than the size of the SQ ring,
	 * since the sqes are only used at submission time. This allows for
	 * some flexibility in overcommitting a bit. If the application has
	 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
	 * of CQ ring entries manually.
	 */
	p->sq_entries = roundup_pow_of_two(entries);
	if (p->flags & IORING_SETUP_CQSIZE) {
		/*
		 * If IORING_SETUP_CQSIZE is set, we do the same roundup
		 * to a power-of-two, if it isn't already. Beyond requiring
		 * at least as many CQ entries as SQ entries, we do NOT
		 * impose any other relative cq vs sq ring sizing.
		 */
		if (p->cq_entries < p->sq_entries)
			return -EINVAL;
		if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
			if (!(p->flags & IORING_SETUP_CLAMP))
				return -EINVAL;
			p->cq_entries = IORING_MAX_CQ_ENTRIES;
		}
		p->cq_entries = roundup_pow_of_two(p->cq_entries);
	} else {
		p->cq_entries = 2 * p->sq_entries;
	}

	user = get_uid(current_user());
	limit_mem = !capable(CAP_IPC_LOCK);

	if (limit_mem) {
		ret = __io_account_mem(user,
				ring_pages(p->sq_entries, p->cq_entries));
		if (ret) {
			free_uid(user);
			return ret;
		}
	}

	ctx = io_ring_ctx_alloc(p);
	if (!ctx) {
		if (limit_mem)
			__io_unaccount_mem(user, ring_pages(p->sq_entries,
								p->cq_entries));
		free_uid(user);
		return -ENOMEM;
	}
	ctx->compat = in_compat_syscall();
	ctx->user = user;
	ctx->creds = get_current_cred();

	ctx->sqo_task = get_task_struct(current);

	/*
	 * This is just grabbed for accounting purposes. When a process exits,
	 * the mm is exited and dropped before the files, hence we need to hang
	 * on to this mm purely for the purposes of being able to unaccount
	 * memory (locked/pinned vm). It's not used for anything else.
	 */
	mmgrab(current->mm);
	ctx->mm_account = current->mm;

	/*
	 * Account memory _before_ installing the file descriptor. Once
	 * the descriptor is installed, it can get closed at any time. Also
	 * do this before hitting the general error path, as ring freeing
	 * will un-account as well.
	 */
	io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries),
		       ACCT_LOCKED);
	ctx->limit_mem = limit_mem;

	ret = io_allocate_scq_urings(ctx, p);
	if (ret)
		goto err;

	ret = io_sq_offload_create(ctx, p);
	if (ret)
		goto err;

	if (!(p->flags & IORING_SETUP_R_DISABLED))
		io_sq_offload_start(ctx);

	memset(&p->sq_off, 0, sizeof(p->sq_off));
	p->sq_off.head = offsetof(struct io_rings, sq.head);
	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
	p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;

	memset(&p->cq_off, 0, sizeof(p->cq_off));
	p->cq_off.head = offsetof(struct io_rings, cq.head);
	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
	p->cq_off.cqes = offsetof(struct io_rings, cqes);
	p->cq_off.flags = offsetof(struct io_rings, cq_flags);

	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
			IORING_FEAT_POLL_32BITS;

	if (copy_to_user(params, p, sizeof(*p))) {
		ret = -EFAULT;
		goto err;
	}

	/*
	 * Install ring fd as the very last thing, so we don't risk someone
	 * having closed it before we finish setup
	 */
	ret = io_uring_get_fd(ctx);
	if (ret < 0)
		goto err;

	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
	return ret;
err:
	io_ring_ctx_wait_and_kill(ctx);
	return ret;
}

/*
 * Sets up an io_uring context and returns the fd. The application asks for a
 * ring size; we return the actual sq/cq ring sizes (among other things) in
 * the params structure passed in.
 */
static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
{
	struct io_uring_params p;
	int i;

	if (copy_from_user(&p, params, sizeof(p)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
		if (p.resv[i])
			return -EINVAL;
	}

	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
			IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
			IORING_SETUP_R_DISABLED))
		return -EINVAL;

	return io_uring_create(entries, &p, params);
}

SYSCALL_DEFINE2(io_uring_setup, u32, entries,
		struct io_uring_params __user *, params)
{
	return io_uring_setup(entries, params);
}
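
/*
 * Userspace sketch (illustrative only, not part of this file): setting up a
 * ring with the raw syscall and mapping the regions described by the offsets
 * that io_uring_create() filled in above. Error handling is omitted and the
 * IORING_FEAT_SINGLE_MMAP fast path (one shared mapping for SQ and CQ ring)
 * is ignored for brevity.
 *
 *	#include <linux/io_uring.h>
 *	#include <sys/mman.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	struct io_uring_params p = { 0 };
 *	int fd = syscall(__NR_io_uring_setup, 128, &p);
 *
 *	size_t sq_sz = p.sq_off.array + p.sq_entries * sizeof(__u32);
 *	size_t cq_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
 *
 *	void *sq = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
 *			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
 *	void *cq = mmap(NULL, cq_sz, PROT_READ | PROT_WRITE,
 *			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
 *	struct io_uring_sqe *sqes = mmap(NULL,
 *			p.sq_entries * sizeof(struct io_uring_sqe),
 *			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *			fd, IORING_OFF_SQES);
 */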

static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	size = struct_size(p, ops, nr_args);
	if (size == SIZE_MAX)
		return -EOVERFLOW;
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;
	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (!io_op_defs[i].not_supported)
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}
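
/*
 * Userspace sketch (illustrative only; ring_fd stands for an already created
 * ring fd): probing which opcodes this kernel supports. The probe buffer must
 * be zeroed before the call, as enforced by memchr_inv() above.
 *
 *	struct io_uring_probe *probe;
 *	size_t len = sizeof(*probe) + 256 * sizeof(struct io_uring_probe_op);
 *
 *	probe = calloc(1, len);
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
 *		probe, 256);
 *	bool have_readv = probe->ops_len > IORING_OP_READV &&
 *		(probe->ops[IORING_OP_READV].flags & IO_URING_OP_SUPPORTED);
 */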

static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds = get_current_cred();
	int id;

	id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
				USHRT_MAX, GFP_KERNEL);
	if (id < 0)
		put_cred(creds);
	return id;
}

static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *old_creds;

	old_creds = idr_remove(&ctx->personality_idr, id);
	if (old_creds) {
		put_cred(old_creds);
		return 0;
	}

	return -EINVAL;
}
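
/*
 * Userspace sketch (illustrative only; ring_fd and sqe are assumed to be an
 * existing ring fd and a prepared SQE): a task can snapshot its current
 * credentials and later submit individual requests under them by id.
 *
 *	int id = syscall(__NR_io_uring_register, ring_fd,
 *			 IORING_REGISTER_PERSONALITY, NULL, 0);
 *
 *	sqe->personality = id;	// this request runs with the registered creds
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_UNREGISTER_PERSONALITY, NULL, id);
 */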

static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
				    unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions are only allowed if the ring was created disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* Only a single restrictions registration is allowed */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}
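
/*
 * Userspace sketch (illustrative only; ring_fd stands for a ring created with
 * IORING_SETUP_R_DISABLED): locking the ring down to a small allow-list
 * before it is enabled, e.g. permitting only vectored reads/writes and the
 * files-update registration:
 *
 *	struct io_uring_restriction res[4] = { 0 };
 *
 *	res[0].opcode = IORING_RESTRICTION_SQE_OP;
 *	res[0].sqe_op = IORING_OP_READV;
 *	res[1].opcode = IORING_RESTRICTION_SQE_OP;
 *	res[1].sqe_op = IORING_OP_WRITEV;
 *	res[2].opcode = IORING_RESTRICTION_REGISTER_OP;
 *	res[2].register_op = IORING_REGISTER_FILES_UPDATE;
 *	res[3].opcode = IORING_RESTRICTION_SQE_FLAGS_ALLOWED;
 *	res[3].sqe_flags = IOSQE_FIXED_FILE | IOSQE_IO_LINK;
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_RESTRICTIONS, res, 4);
 */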

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;

	io_sq_offload_start(ctx);

	return 0;
}
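
/*
 * Userspace sketch (illustrative only): the intended R_DISABLED flow is to
 * create the ring disabled, register restrictions (and any files or buffers)
 * while no submissions are possible, and only then enable it:
 *
 *	struct io_uring_params p = { 0 };
 *
 *	p.flags = IORING_SETUP_R_DISABLED;
 *	int fd = syscall(__NR_io_uring_setup, 64, &p);
 *
 *	(register restrictions / files / buffers here)
 *
 *	syscall(__NR_io_uring_register, fd, IORING_REGISTER_ENABLE_RINGS,
 *		NULL, 0);
 */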

static bool io_register_op_must_quiesce(int op)
{
	switch (op) {
	case IORING_UNREGISTER_FILES:
	case IORING_REGISTER_FILES_UPDATE:
	case IORING_REGISTER_PROBE:
	case IORING_REGISTER_PERSONALITY:
	case IORING_UNREGISTER_PERSONALITY:
		return false;
	default:
		return true;
	}
}
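
/*
 * Note: the registration opcodes listed above can be applied to a live ring;
 * they only touch state that is safe to update under ctx->uring_lock (which
 * __io_uring_register() holds) or that carries its own reference counting.
 * Every other opcode goes through the kill/wait/reinit cycle on ctx->refs in
 * __io_uring_register() below, so no requests are in flight while the
 * registered state changes.
 */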

static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We're inside the ring mutex; if the ref is already dying, then
	 * someone else killed the ctx or is already going through
	 * io_uring_register().
	 */
	if (percpu_ref_is_dying(&ctx->refs))
		return -ENXIO;

	if (io_register_op_must_quiesce(opcode)) {
		percpu_ref_kill(&ctx->refs);

		/*
		 * Drop uring mutex before waiting for references to exit. If
		 * another thread is currently inside io_uring_enter() it might
		 * need to grab the uring_lock to make progress. If we hold it
		 * here across the drain wait, then we can deadlock. It's safe
		 * to drop the mutex here, since no new references will come in
		 * after we've killed the percpu ref.
		 */
		mutex_unlock(&ctx->uring_lock);
		ret = wait_for_completion_interruptible(&ctx->ref_comp);
		mutex_lock(&ctx->uring_lock);
		if (ret) {
			percpu_ref_resurrect(&ctx->refs);
			ret = -EINTR;
			goto out_quiesce;
		}
	}

	if (ctx->restricted) {
		if (opcode >= IORING_REGISTER_LAST) {
			ret = -EINVAL;
			goto out;
		}

		if (!test_bit(opcode, ctx->restrictions.register_op)) {
			ret = -EACCES;
			goto out;
		}
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = io_sqe_buffer_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffer_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = io_sqe_files_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_sqe_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg);
		if (ret)
			break;
		if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
			ctx->eventfd_async = 1;
		else
			ctx->eventfd_async = 0;
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	default:
		ret = -EINVAL;
		break;
	}

out:
	if (io_register_op_must_quiesce(opcode)) {
		/* bring the ctx back to life */
		percpu_ref_reinit(&ctx->refs);
out_quiesce:
		reinit_completion(&ctx->ref_comp);
	}
	return ret;
}

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct fd f;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	ret = -EOPNOTSUPP;
	if (f.file->f_op != &io_uring_fops)
		goto out_fput;

	ctx = f.file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
							ctx->cq_ev_fd != NULL, ret);
out_fput:
	fdput(f);
	return ret;
}
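
/*
 * Userspace sketch (illustrative only; ring_fd stands for an existing ring
 * fd): registering an eventfd for completion notifications goes through the
 * same syscall, with nr_args acting as the element count (it must be 1 here,
 * as enforced in __io_uring_register() above):
 *
 *	int efd = eventfd(0, EFD_CLOEXEC);
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_EVENTFD,
 *		&efd, 1);
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_UNREGISTER_EVENTFD,
 *		NULL, 0);
 */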

static int __init io_uring_init(void)
{
#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
} while (0)

#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
	__BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
	BUILD_BUG_SQE_ELEM(0, __u8, opcode);
	BUILD_BUG_SQE_ELEM(1, __u8, flags);
	BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
	BUILD_BUG_SQE_ELEM(4, __s32, fd);
	BUILD_BUG_SQE_ELEM(8, __u64, off);
	BUILD_BUG_SQE_ELEM(8, __u64, addr2);
	BUILD_BUG_SQE_ELEM(16, __u64, addr);
	BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
	BUILD_BUG_SQE_ELEM(24, __u32, len);
	BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
	BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
	BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
	BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
	BUILD_BUG_SQE_ELEM(32, __u64, user_data);
	BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
	BUILD_BUG_SQE_ELEM(42, __u16, personality);
	BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);

	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
	BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
	return 0;
};
__initcall(io_uring_init);