Jens Axboe2b188cc2019-01-07 10:46:33 -07001// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared application/kernel submission and completion ring pairs, for
4 * supporting fast/efficient IO.
5 *
6 * A note on the read/write ordering memory barriers that are matched between
Stefan Bühler1e84b972019-04-24 23:54:16 +02007 * the application and kernel side.
8 *
9 * After the application reads the CQ ring tail, it must use an
10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
11 * before writing the tail (using smp_load_acquire to read the tail will
12 * do). It also needs a smp_mb() before updating CQ head (ordering the
13 * entry load(s) with the head store), pairing with an implicit barrier
14 * through a control-dependency in io_get_cqring (smp_store_release to
15 * store head will do). Failure to do so could lead to reading invalid
16 * CQ entries.
17 *
18 * Likewise, the application must use an appropriate smp_wmb() before
19 * writing the SQ tail (ordering SQ entry stores with the tail store),
20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
21 * to store the tail will do). And it needs a barrier ordering the SQ
22 * head load before writing new SQ entries (smp_load_acquire to read
23 * head will do).
24 *
25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
27 * updating the SQ tail; a full memory barrier smp_mb() is needed
 28 * in between.
Jens Axboe2b188cc2019-01-07 10:46:33 -070029 *
30 * Also see the examples in the liburing library:
31 *
32 * git://git.kernel.dk/liburing
33 *
34 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
35 * from data shared between the kernel and application. This is done both
 36 * for ordering purposes and to ensure that once a value is loaded from
37 * data that the application could potentially modify, it remains stable.
38 *
39 * Copyright (C) 2018-2019 Jens Axboe
Christoph Hellwigc992fe22019-01-11 09:43:02 -070040 * Copyright (c) 2018-2019 Christoph Hellwig
Jens Axboe2b188cc2019-01-07 10:46:33 -070041 */
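
/*
 * Illustrative sketch, not kernel code: roughly how an application driving
 * the rings directly (without liburing) might honour the ordering rules
 * above. The acquire/release helper names and the sq_tail/sq_array/cq_head/
 * cq_tail/cqes/sq_mask/cq_mask pointers are assumptions for the example,
 * taken from whatever atomics library and ring mapping the application
 * uses; they are not an API provided by this file.
 *
 *	// submission side: fill the SQ slot first, then publish the tail
 *	unsigned tail = *sq_tail;
 *	sq_array[tail & *sq_mask] = sqe_index;
 *	atomic_store_release(sq_tail, tail + 1);
 *
 *	// completion side: acquire the tail, read the CQE, release the head
 *	unsigned head = *cq_head;
 *	if (head != atomic_load_acquire(cq_tail)) {
 *		struct io_uring_cqe *cqe = &cqes[head & *cq_mask];
 *		handle_cqe(cqe);
 *		atomic_store_release(cq_head, head + 1);
 *	}
 */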
42#include <linux/kernel.h>
43#include <linux/init.h>
44#include <linux/errno.h>
45#include <linux/syscalls.h>
46#include <linux/compat.h>
47#include <linux/refcount.h>
48#include <linux/uio.h>
49
50#include <linux/sched/signal.h>
51#include <linux/fs.h>
52#include <linux/file.h>
53#include <linux/fdtable.h>
54#include <linux/mm.h>
55#include <linux/mman.h>
56#include <linux/mmu_context.h>
57#include <linux/percpu.h>
58#include <linux/slab.h>
Jens Axboe6c271ce2019-01-10 11:22:30 -070059#include <linux/kthread.h>
Jens Axboe2b188cc2019-01-07 10:46:33 -070060#include <linux/blkdev.h>
Jens Axboeedafcce2019-01-09 09:16:05 -070061#include <linux/bvec.h>
Jens Axboe2b188cc2019-01-07 10:46:33 -070062#include <linux/net.h>
63#include <net/sock.h>
64#include <net/af_unix.h>
Jens Axboe6b063142019-01-10 22:13:58 -070065#include <net/scm.h>
Jens Axboe2b188cc2019-01-07 10:46:33 -070066#include <linux/anon_inodes.h>
67#include <linux/sched/mm.h>
68#include <linux/uaccess.h>
69#include <linux/nospec.h>
Jens Axboeedafcce2019-01-09 09:16:05 -070070#include <linux/sizes.h>
71#include <linux/hugetlb.h>
Jens Axboeaa4c3962019-11-29 10:14:00 -070072#include <linux/highmem.h>
Jens Axboe2b188cc2019-01-07 10:46:33 -070073
Dmitrii Dolgovc826bd72019-10-15 19:02:01 +020074#define CREATE_TRACE_POINTS
75#include <trace/events/io_uring.h>
76
Jens Axboe2b188cc2019-01-07 10:46:33 -070077#include <uapi/linux/io_uring.h>
78
79#include "internal.h"
Jens Axboe561fb042019-10-24 07:25:42 -060080#include "io-wq.h"
Jens Axboe2b188cc2019-01-07 10:46:33 -070081
Daniel Xu5277dea2019-09-14 14:23:45 -070082#define IORING_MAX_ENTRIES 32768
Jens Axboe33a107f2019-10-04 12:10:03 -060083#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
Jens Axboe65e19f52019-10-26 07:20:21 -060084
85/*
86 * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
87 */
88#define IORING_FILE_TABLE_SHIFT 9
89#define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT)
90#define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1)
91#define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE)
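
/*
 * For illustration: the fixed file helpers later in this file resolve a
 * registered file index roughly as
 *
 *	table = &ctx->file_table[index >> IORING_FILE_TABLE_SHIFT];
 *	file  = table->files[index & IORING_FILE_TABLE_MASK];
 *
 * so IORING_MAX_FIXED_FILES (64 * 512) files fit in 64 page-sized tables.
 */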
Jens Axboe2b188cc2019-01-07 10:46:33 -070092
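/*
 * head and tail sit on separate cachelines so that the producer side and the
 * consumer side (one of which is always the application) do not false-share
 * a line while both ends of the ring are being updated.
 */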
93struct io_uring {
94 u32 head ____cacheline_aligned_in_smp;
95 u32 tail ____cacheline_aligned_in_smp;
96};
97
Stefan Bühler1e84b972019-04-24 23:54:16 +020098/*
Hristo Venev75b28af2019-08-26 17:23:46 +000099 * This data is shared with the application through the mmap at offsets
100 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
Stefan Bühler1e84b972019-04-24 23:54:16 +0200101 *
102 * The offsets to the member fields are published through struct
103 * io_sqring_offsets when calling io_uring_setup.
104 */
Hristo Venev75b28af2019-08-26 17:23:46 +0000105struct io_rings {
Stefan Bühler1e84b972019-04-24 23:54:16 +0200106 /*
107 * Head and tail offsets into the ring; the offsets need to be
108 * masked to get valid indices.
109 *
Hristo Venev75b28af2019-08-26 17:23:46 +0000110 * The kernel controls the head of the sq ring and the tail of the cq ring,
 111 * and the application controls the tail of the sq ring and the head of the
112 * cq ring.
Stefan Bühler1e84b972019-04-24 23:54:16 +0200113 */
Hristo Venev75b28af2019-08-26 17:23:46 +0000114 struct io_uring sq, cq;
Stefan Bühler1e84b972019-04-24 23:54:16 +0200115 /*
Hristo Venev75b28af2019-08-26 17:23:46 +0000116 * Bitmasks to apply to head and tail offsets (constant, equals
Stefan Bühler1e84b972019-04-24 23:54:16 +0200117 * ring_entries - 1)
118 */
Hristo Venev75b28af2019-08-26 17:23:46 +0000119 u32 sq_ring_mask, cq_ring_mask;
120 /* Ring sizes (constant, power of 2) */
121 u32 sq_ring_entries, cq_ring_entries;
Stefan Bühler1e84b972019-04-24 23:54:16 +0200122 /*
123 * Number of invalid entries dropped by the kernel due to
124 * invalid index stored in array
125 *
126 * Written by the kernel, shouldn't be modified by the
127 * application (i.e. get number of "new events" by comparing to
128 * cached value).
129 *
130 * After a new SQ head value was read by the application this
131 * counter includes all submissions that were dropped reaching
132 * the new SQ head (and possibly more).
133 */
Hristo Venev75b28af2019-08-26 17:23:46 +0000134 u32 sq_dropped;
Stefan Bühler1e84b972019-04-24 23:54:16 +0200135 /*
136 * Runtime flags
137 *
138 * Written by the kernel, shouldn't be modified by the
139 * application.
140 *
141 * The application needs a full memory barrier before checking
142 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
143 */
Hristo Venev75b28af2019-08-26 17:23:46 +0000144 u32 sq_flags;
Stefan Bühler1e84b972019-04-24 23:54:16 +0200145 /*
146 * Number of completion events lost because the queue was full;
147 * this should be avoided by the application by making sure
LimingWu0b4295b2019-12-05 20:18:18 +0800148 * there are not more requests pending than there is space in
Stefan Bühler1e84b972019-04-24 23:54:16 +0200149 * the completion queue.
150 *
151 * Written by the kernel, shouldn't be modified by the
152 * application (i.e. get number of "new events" by comparing to
153 * cached value).
154 *
155 * As completion events come in out of order this counter is not
156 * ordered with any other data.
157 */
Hristo Venev75b28af2019-08-26 17:23:46 +0000158 u32 cq_overflow;
Stefan Bühler1e84b972019-04-24 23:54:16 +0200159 /*
160 * Ring buffer of completion events.
161 *
162 * The kernel writes completion events fresh every time they are
163 * produced, so the application is allowed to modify pending
164 * entries.
165 */
Hristo Venev75b28af2019-08-26 17:23:46 +0000166 struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700167};
168
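/*
 * Illustrative sketch (userspace, not kernel code) of how the published
 * offsets are typically consumed: io_uring_setup() fills struct
 * io_uring_params 'p', and the application derives its ring pointers from
 * the sq_off/cq_off members, e.g.
 *
 *	void *sq_ptr = mmap(NULL, sq_ring_sz, PROT_READ | PROT_WRITE,
 *			    MAP_SHARED | MAP_POPULATE, ring_fd,
 *			    IORING_OFF_SQ_RING);
 *	unsigned *sq_head  = sq_ptr + p.sq_off.head;
 *	unsigned *sq_tail  = sq_ptr + p.sq_off.tail;
 *	unsigned *sq_mask  = sq_ptr + p.sq_off.ring_mask;
 *	unsigned *sq_array = sq_ptr + p.sq_off.array;
 *
 * with the CQ ring mapped the same way at IORING_OFF_CQ_RING and decoded
 * through p.cq_off.
 */
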
Jens Axboeedafcce2019-01-09 09:16:05 -0700169struct io_mapped_ubuf {
170 u64 ubuf;
171 size_t len;
172 struct bio_vec *bvec;
173 unsigned int nr_bvecs;
174};
175
Jens Axboe65e19f52019-10-26 07:20:21 -0600176struct fixed_file_table {
177 struct file **files;
Jens Axboe31b51512019-01-18 22:56:34 -0700178};
179
Jens Axboe2b188cc2019-01-07 10:46:33 -0700180struct io_ring_ctx {
181 struct {
182 struct percpu_ref refs;
183 } ____cacheline_aligned_in_smp;
184
185 struct {
186 unsigned int flags;
187 bool compat;
188 bool account_mem;
Jens Axboe1d7bb1d2019-11-06 11:31:17 -0700189 bool cq_overflow_flushed;
Pavel Begunkov1b4a51b2019-11-21 11:54:28 +0300190 bool drain_next;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700191
Hristo Venev75b28af2019-08-26 17:23:46 +0000192 /*
193 * Ring buffer of indices into array of io_uring_sqe, which is
194 * mmapped by the application using the IORING_OFF_SQES offset.
195 *
196 * This indirection could e.g. be used to assign fixed
197 * io_uring_sqe entries to operations and only submit them to
198 * the queue when needed.
199 *
200 * The kernel modifies neither the indices array nor the entries
201 * array.
202 */
203 u32 *sq_array;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700204 unsigned cached_sq_head;
205 unsigned sq_entries;
206 unsigned sq_mask;
Jens Axboe6c271ce2019-01-10 11:22:30 -0700207 unsigned sq_thread_idle;
Jens Axboe498ccd92019-10-25 10:04:25 -0600208 unsigned cached_sq_dropped;
Jens Axboe206aefd2019-11-07 18:27:42 -0700209 atomic_t cached_cq_overflow;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700210 struct io_uring_sqe *sq_sqes;
Jens Axboede0617e2019-04-06 21:51:27 -0600211
212 struct list_head defer_list;
Jens Axboe5262f562019-09-17 12:26:57 -0600213 struct list_head timeout_list;
Jens Axboe1d7bb1d2019-11-06 11:31:17 -0700214 struct list_head cq_overflow_list;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700215
Jens Axboefcb323c2019-10-24 12:39:47 -0600216 wait_queue_head_t inflight_wait;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700217 } ____cacheline_aligned_in_smp;
218
Hristo Venev75b28af2019-08-26 17:23:46 +0000219 struct io_rings *rings;
220
Jens Axboe2b188cc2019-01-07 10:46:33 -0700221 /* IO offload */
Jens Axboe561fb042019-10-24 07:25:42 -0600222 struct io_wq *io_wq;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700223 struct task_struct *sqo_thread; /* if using sq thread polling */
224 struct mm_struct *sqo_mm;
225 wait_queue_head_t sqo_wait;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700226
Jens Axboe6b063142019-01-10 22:13:58 -0700227 /*
228 * If used, fixed file set. Writers must ensure that ->refs is dead,
229 * readers must ensure that ->refs is alive as long as the file* is
230 * used. Only updated through io_uring_register(2).
231 */
Jens Axboe65e19f52019-10-26 07:20:21 -0600232 struct fixed_file_table *file_table;
Jens Axboe6b063142019-01-10 22:13:58 -0700233 unsigned nr_user_files;
234
Jens Axboeedafcce2019-01-09 09:16:05 -0700235 /* if used, fixed mapped user buffers */
236 unsigned nr_user_bufs;
237 struct io_mapped_ubuf *user_bufs;
238
Jens Axboe2b188cc2019-01-07 10:46:33 -0700239 struct user_struct *user;
240
Jens Axboe0b8c0ec2019-12-02 08:50:00 -0700241 const struct cred *creds;
Jens Axboe181e4482019-11-25 08:52:30 -0700242
Jens Axboe206aefd2019-11-07 18:27:42 -0700243 /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
244 struct completion *completions;
245
Jens Axboe0ddf92e2019-11-08 08:52:53 -0700246 /* if all else fails... */
247 struct io_kiocb *fallback_req;
248
Jens Axboe206aefd2019-11-07 18:27:42 -0700249#if defined(CONFIG_UNIX)
250 struct socket *ring_sock;
251#endif
252
253 struct {
254 unsigned cached_cq_tail;
255 unsigned cq_entries;
256 unsigned cq_mask;
257 atomic_t cq_timeouts;
258 struct wait_queue_head cq_wait;
259 struct fasync_struct *cq_fasync;
260 struct eventfd_ctx *cq_ev_fd;
261 } ____cacheline_aligned_in_smp;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700262
263 struct {
264 struct mutex uring_lock;
265 wait_queue_head_t wait;
266 } ____cacheline_aligned_in_smp;
267
268 struct {
269 spinlock_t completion_lock;
Jens Axboedef596e2019-01-09 08:59:42 -0700270 bool poll_multi_file;
271 /*
272 * ->poll_list is protected by the ctx->uring_lock for
273 * io_uring instances that don't use IORING_SETUP_SQPOLL.
274 * For SQPOLL, only the single threaded io_sq_thread() will
275 * manipulate the list, hence no extra locking is needed there.
276 */
277 struct list_head poll_list;
Jens Axboe78076bb2019-12-04 19:56:40 -0700278 struct hlist_head *cancel_hash;
279 unsigned cancel_hash_bits;
Jens Axboefcb323c2019-10-24 12:39:47 -0600280
281 spinlock_t inflight_lock;
282 struct list_head inflight_list;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700283 } ____cacheline_aligned_in_smp;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700284};
285
Jens Axboe09bb8392019-03-13 12:39:28 -0600286/*
287 * First field must be the file pointer in all the
288 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
289 */
Jens Axboe221c5eb2019-01-17 09:41:58 -0700290struct io_poll_iocb {
291 struct file *file;
Jens Axboe0969e782019-12-17 18:40:57 -0700292 union {
293 struct wait_queue_head *head;
294 u64 addr;
295 };
Jens Axboe221c5eb2019-01-17 09:41:58 -0700296 __poll_t events;
Jens Axboe8c838782019-03-12 15:48:16 -0600297 bool done;
Jens Axboe221c5eb2019-01-17 09:41:58 -0700298 bool canceled;
Jens Axboe392edb42019-12-09 17:52:20 -0700299 struct wait_queue_entry wait;
Jens Axboe221c5eb2019-01-17 09:41:58 -0700300};
301
Jens Axboead8a48a2019-11-15 08:49:11 -0700302struct io_timeout_data {
303 struct io_kiocb *req;
304 struct hrtimer timer;
305 struct timespec64 ts;
306 enum hrtimer_mode mode;
Pavel Begunkovcc42e0a2019-11-25 23:14:38 +0300307 u32 seq_offset;
Jens Axboead8a48a2019-11-15 08:49:11 -0700308};
309
Jens Axboe8ed8d3c2019-12-16 11:55:28 -0700310struct io_accept {
311 struct file *file;
312 struct sockaddr __user *addr;
313 int __user *addr_len;
314 int flags;
315};
316
317struct io_sync {
318 struct file *file;
319 loff_t len;
320 loff_t off;
321 int flags;
322};
323
Jens Axboefbf23842019-12-17 18:45:56 -0700324struct io_cancel {
325 struct file *file;
326 u64 addr;
327};
328
Jens Axboeb29472e2019-12-17 18:50:29 -0700329struct io_timeout {
330 struct file *file;
331 u64 addr;
332 int flags;
333};
334
Jens Axboe9adbd452019-12-20 08:45:55 -0700335struct io_rw {
336 /* NOTE: kiocb has the file as the first member, so don't do it here */
337 struct kiocb kiocb;
338 u64 addr;
339 u64 len;
340};
341
Jens Axboef499a022019-12-02 16:28:46 -0700342struct io_async_connect {
343 struct sockaddr_storage address;
344};
345
Jens Axboe03b12302019-12-02 18:50:25 -0700346struct io_async_msghdr {
347 struct iovec fast_iov[UIO_FASTIOV];
348 struct iovec *iov;
349 struct sockaddr __user *uaddr;
350 struct msghdr msg;
351};
352
Jens Axboef67676d2019-12-02 11:03:47 -0700353struct io_async_rw {
354 struct iovec fast_iov[UIO_FASTIOV];
355 struct iovec *iov;
356 ssize_t nr_segs;
357 ssize_t size;
358};
359
Jens Axboe1a6b74f2019-12-02 10:33:15 -0700360struct io_async_ctx {
361 struct io_uring_sqe sqe;
Jens Axboef67676d2019-12-02 11:03:47 -0700362 union {
363 struct io_async_rw rw;
Jens Axboe03b12302019-12-02 18:50:25 -0700364 struct io_async_msghdr msg;
Jens Axboef499a022019-12-02 16:28:46 -0700365 struct io_async_connect connect;
Jens Axboe2d283902019-12-04 11:08:05 -0700366 struct io_timeout_data timeout;
Jens Axboef67676d2019-12-02 11:03:47 -0700367 };
Jens Axboe1a6b74f2019-12-02 10:33:15 -0700368};
369
Jens Axboe09bb8392019-03-13 12:39:28 -0600370/*
371 * NOTE! Each of the iocb union members has the file pointer
372 * as the first entry in their struct definition. So you can
373 * access the file pointer through any of the sub-structs,
374 * or directly as just 'ki_filp' in this struct.
375 */
Jens Axboe2b188cc2019-01-07 10:46:33 -0700376struct io_kiocb {
Jens Axboe221c5eb2019-01-17 09:41:58 -0700377 union {
Jens Axboe09bb8392019-03-13 12:39:28 -0600378 struct file *file;
Jens Axboe9adbd452019-12-20 08:45:55 -0700379 struct io_rw rw;
Jens Axboe221c5eb2019-01-17 09:41:58 -0700380 struct io_poll_iocb poll;
Jens Axboe8ed8d3c2019-12-16 11:55:28 -0700381 struct io_accept accept;
382 struct io_sync sync;
Jens Axboefbf23842019-12-17 18:45:56 -0700383 struct io_cancel cancel;
Jens Axboeb29472e2019-12-17 18:50:29 -0700384 struct io_timeout timeout;
Jens Axboe221c5eb2019-01-17 09:41:58 -0700385 };
Jens Axboe2b188cc2019-01-07 10:46:33 -0700386
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +0300387 const struct io_uring_sqe *sqe;
Jens Axboe1a6b74f2019-12-02 10:33:15 -0700388 struct io_async_ctx *io;
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +0300389 struct file *ring_file;
390 int ring_fd;
391 bool has_user;
392 bool in_async;
393 bool needs_fixed_file;
Jens Axboed625c6e2019-12-17 19:53:05 -0700394 u8 opcode;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700395
396 struct io_ring_ctx *ctx;
Jens Axboeeac406c2019-11-14 12:09:58 -0700397 union {
398 struct list_head list;
Jens Axboe78076bb2019-12-04 19:56:40 -0700399 struct hlist_node hash_node;
Jens Axboeeac406c2019-11-14 12:09:58 -0700400 };
Jens Axboe9e645e112019-05-10 16:07:28 -0600401 struct list_head link_list;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700402 unsigned int flags;
Jens Axboec16361c2019-01-17 08:39:48 -0700403 refcount_t refs;
Stefan Bühler8449eed2019-04-27 20:34:19 +0200404#define REQ_F_NOWAIT 1 /* must not punt to workers */
Jens Axboedef596e2019-01-09 08:59:42 -0700405#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
Jens Axboe6b063142019-01-10 22:13:58 -0700406#define REQ_F_FIXED_FILE 4 /* ctx owns file */
Jens Axboe4d7dd462019-11-20 13:03:52 -0700407#define REQ_F_LINK_NEXT 8 /* already grabbed next link */
Stefan Bühlere2033e32019-05-11 19:08:01 +0200408#define REQ_F_IO_DRAIN 16 /* drain existing IO first */
409#define REQ_F_IO_DRAINED 32 /* drain done */
Jens Axboe9e645e112019-05-10 16:07:28 -0600410#define REQ_F_LINK 64 /* linked sqes */
Jens Axboe2665abf2019-11-05 12:40:47 -0700411#define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */
Zhengyuan Liuf7b76ac2019-07-16 23:26:14 +0800412#define REQ_F_FAIL_LINK 256 /* fail rest of links */
Pavel Begunkov1b4a51b2019-11-21 11:54:28 +0300413#define REQ_F_DRAIN_LINK 512 /* link should be fully drained */
Jens Axboe5262f562019-09-17 12:26:57 -0600414#define REQ_F_TIMEOUT 1024 /* timeout request */
Jens Axboe491381ce2019-10-17 09:20:46 -0600415#define REQ_F_ISREG 2048 /* regular file */
416#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */
Jens Axboe93bd25b2019-11-11 23:34:31 -0700417#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */
Linus Torvaldsfb4b3d32019-11-25 10:40:27 -0800418#define REQ_F_INFLIGHT 16384 /* on inflight list */
419#define REQ_F_COMP_LOCKED 32768 /* completion under lock */
Jens Axboe4e88d6e2019-12-07 20:59:47 -0700420#define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */
Jens Axboe8ed8d3c2019-12-16 11:55:28 -0700421#define REQ_F_PREPPED 131072 /* request already opcode prepared */
Jens Axboe2b188cc2019-01-07 10:46:33 -0700422 u64 user_data;
Jens Axboe9e645e112019-05-10 16:07:28 -0600423 u32 result;
Jens Axboede0617e2019-04-06 21:51:27 -0600424 u32 sequence;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700425
Jens Axboefcb323c2019-10-24 12:39:47 -0600426 struct list_head inflight_entry;
427
Jens Axboe561fb042019-10-24 07:25:42 -0600428 struct io_wq_work work;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700429};
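
/*
 * Illustrative note on linked requests: REQ_F_LINK chains are strung
 * together through link_list at submission time. The head of the chain is
 * issued first; on completion io_req_find_next() hands the next request in
 * link_list back to the caller (or to io-wq), while a head that failed with
 * REQ_F_FAIL_LINK set has the remainder of its chain cancelled with
 * -ECANCELED via io_fail_links().
 */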
430
431#define IO_PLUG_THRESHOLD 2
Jens Axboedef596e2019-01-09 08:59:42 -0700432#define IO_IOPOLL_BATCH 8
Jens Axboe2b188cc2019-01-07 10:46:33 -0700433
Jens Axboe9a56a232019-01-09 09:06:50 -0700434struct io_submit_state {
435 struct blk_plug plug;
436
437 /*
Jens Axboe2579f912019-01-09 09:10:43 -0700438 * io_kiocb alloc cache
439 */
440 void *reqs[IO_IOPOLL_BATCH];
441 unsigned int free_reqs;
442 unsigned int cur_req;
443
444 /*
Jens Axboe9a56a232019-01-09 09:06:50 -0700445 * File reference cache
446 */
447 struct file *file;
448 unsigned int fd;
449 unsigned int has_refs;
450 unsigned int used_refs;
451 unsigned int ios_left;
452};
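
/*
 * Illustrative summary, based on the submission helpers later in this file:
 * an io_submit_state is set up on the stack for larger submission batches.
 * Requests are bulk-allocated into reqs[] up front, and file/fd/has_refs/
 * used_refs cache the most recent fget() so that consecutive SQEs against
 * the same fd can share one batched file reference.
 */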
453
Jens Axboe561fb042019-10-24 07:25:42 -0600454static void io_wq_submit_work(struct io_wq_work **workptr);
Jens Axboe78e19bb2019-11-06 15:21:34 -0700455static void io_cqring_fill_event(struct io_kiocb *req, long res);
Jackie Liu4fe2c962019-09-09 20:50:40 +0800456static void __io_free_req(struct io_kiocb *req);
Jackie Liuec9c02a2019-11-08 23:50:36 +0800457static void io_put_req(struct io_kiocb *req);
Jens Axboe78e19bb2019-11-06 15:21:34 -0700458static void io_double_put_req(struct io_kiocb *req);
Jens Axboe978db572019-11-14 22:39:04 -0700459static void __io_double_put_req(struct io_kiocb *req);
Jens Axboe94ae5e72019-11-14 19:39:52 -0700460static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
461static void io_queue_linked_timeout(struct io_kiocb *req);
Jens Axboede0617e2019-04-06 21:51:27 -0600462
Jens Axboe2b188cc2019-01-07 10:46:33 -0700463static struct kmem_cache *req_cachep;
464
465static const struct file_operations io_uring_fops;
466
467struct sock *io_uring_get_socket(struct file *file)
468{
469#if defined(CONFIG_UNIX)
470 if (file->f_op == &io_uring_fops) {
471 struct io_ring_ctx *ctx = file->private_data;
472
473 return ctx->ring_sock->sk;
474 }
475#endif
476 return NULL;
477}
478EXPORT_SYMBOL(io_uring_get_socket);
479
480static void io_ring_ctx_ref_free(struct percpu_ref *ref)
481{
482 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
483
Jens Axboe206aefd2019-11-07 18:27:42 -0700484 complete(&ctx->completions[0]);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700485}
486
487static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
488{
489 struct io_ring_ctx *ctx;
Jens Axboe78076bb2019-12-04 19:56:40 -0700490 int hash_bits;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700491
492 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
493 if (!ctx)
494 return NULL;
495
Jens Axboe0ddf92e2019-11-08 08:52:53 -0700496 ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
497 if (!ctx->fallback_req)
498 goto err;
499
Jens Axboe206aefd2019-11-07 18:27:42 -0700500 ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL);
501 if (!ctx->completions)
502 goto err;
503
Jens Axboe78076bb2019-12-04 19:56:40 -0700504 /*
 505 * Use 5 bits less than the max cq entries; that should give us around
506 * 32 entries per hash list if totally full and uniformly spread.
507 */
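	/*
	 * Worked example (illustrative): cq_entries == 4096 gives
	 * ilog2() == 12 and hash_bits == 7, i.e. 128 buckets and roughly
	 * 4096 / 128 == 32 requests per bucket when completely full.
	 */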
508 hash_bits = ilog2(p->cq_entries);
509 hash_bits -= 5;
510 if (hash_bits <= 0)
511 hash_bits = 1;
512 ctx->cancel_hash_bits = hash_bits;
513 ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
514 GFP_KERNEL);
515 if (!ctx->cancel_hash)
516 goto err;
517 __hash_init(ctx->cancel_hash, 1U << hash_bits);
518
Roman Gushchin21482892019-05-07 10:01:48 -0700519 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
Jens Axboe206aefd2019-11-07 18:27:42 -0700520 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
521 goto err;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700522
523 ctx->flags = p->flags;
524 init_waitqueue_head(&ctx->cq_wait);
Jens Axboe1d7bb1d2019-11-06 11:31:17 -0700525 INIT_LIST_HEAD(&ctx->cq_overflow_list);
Jens Axboe206aefd2019-11-07 18:27:42 -0700526 init_completion(&ctx->completions[0]);
527 init_completion(&ctx->completions[1]);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700528 mutex_init(&ctx->uring_lock);
529 init_waitqueue_head(&ctx->wait);
530 spin_lock_init(&ctx->completion_lock);
Jens Axboedef596e2019-01-09 08:59:42 -0700531 INIT_LIST_HEAD(&ctx->poll_list);
Jens Axboede0617e2019-04-06 21:51:27 -0600532 INIT_LIST_HEAD(&ctx->defer_list);
Jens Axboe5262f562019-09-17 12:26:57 -0600533 INIT_LIST_HEAD(&ctx->timeout_list);
Jens Axboefcb323c2019-10-24 12:39:47 -0600534 init_waitqueue_head(&ctx->inflight_wait);
535 spin_lock_init(&ctx->inflight_lock);
536 INIT_LIST_HEAD(&ctx->inflight_list);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700537 return ctx;
Jens Axboe206aefd2019-11-07 18:27:42 -0700538err:
Jens Axboe0ddf92e2019-11-08 08:52:53 -0700539 if (ctx->fallback_req)
540 kmem_cache_free(req_cachep, ctx->fallback_req);
Jens Axboe206aefd2019-11-07 18:27:42 -0700541 kfree(ctx->completions);
Jens Axboe78076bb2019-12-04 19:56:40 -0700542 kfree(ctx->cancel_hash);
Jens Axboe206aefd2019-11-07 18:27:42 -0700543 kfree(ctx);
544 return NULL;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700545}
546
Bob Liu9d858b22019-11-13 18:06:25 +0800547static inline bool __req_need_defer(struct io_kiocb *req)
Jens Axboede0617e2019-04-06 21:51:27 -0600548{
Jackie Liua197f662019-11-08 08:09:12 -0700549 struct io_ring_ctx *ctx = req->ctx;
550
Jens Axboe498ccd92019-10-25 10:04:25 -0600551 return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
552 + atomic_read(&ctx->cached_cq_overflow);
Jens Axboede0617e2019-04-06 21:51:27 -0600553}
554
Bob Liu9d858b22019-11-13 18:06:25 +0800555static inline bool req_need_defer(struct io_kiocb *req)
Jens Axboe7adf4ea2019-10-10 21:42:58 -0600556{
Bob Liu9d858b22019-11-13 18:06:25 +0800557 if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) == REQ_F_IO_DRAIN)
558 return __req_need_defer(req);
Jens Axboe7adf4ea2019-10-10 21:42:58 -0600559
Bob Liu9d858b22019-11-13 18:06:25 +0800560 return false;
Jens Axboe7adf4ea2019-10-10 21:42:58 -0600561}
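
/*
 * For illustration: a request carrying IOSQE_IO_DRAIN stays deferred until
 * the number of posted completions (cached_cq_tail), dropped SQEs
 * (cached_sq_dropped) and overflowed completions (cached_cq_overflow) adds
 * up to its own sequence number, i.e. roughly until everything submitted
 * ahead of it has been accounted for one way or another.
 */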
562
563static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
Jens Axboede0617e2019-04-06 21:51:27 -0600564{
565 struct io_kiocb *req;
566
Jens Axboe7adf4ea2019-10-10 21:42:58 -0600567 req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
Bob Liu9d858b22019-11-13 18:06:25 +0800568 if (req && !req_need_defer(req)) {
Jens Axboede0617e2019-04-06 21:51:27 -0600569 list_del_init(&req->list);
570 return req;
571 }
572
573 return NULL;
574}
575
Jens Axboe5262f562019-09-17 12:26:57 -0600576static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
577{
Jens Axboe7adf4ea2019-10-10 21:42:58 -0600578 struct io_kiocb *req;
579
580 req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
Jens Axboe93bd25b2019-11-11 23:34:31 -0700581 if (req) {
582 if (req->flags & REQ_F_TIMEOUT_NOSEQ)
583 return NULL;
Linus Torvaldsfb4b3d32019-11-25 10:40:27 -0800584 if (!__req_need_defer(req)) {
Jens Axboe93bd25b2019-11-11 23:34:31 -0700585 list_del_init(&req->list);
586 return req;
587 }
Jens Axboe7adf4ea2019-10-10 21:42:58 -0600588 }
589
590 return NULL;
Jens Axboe5262f562019-09-17 12:26:57 -0600591}
592
Jens Axboede0617e2019-04-06 21:51:27 -0600593static void __io_commit_cqring(struct io_ring_ctx *ctx)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700594{
Hristo Venev75b28af2019-08-26 17:23:46 +0000595 struct io_rings *rings = ctx->rings;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700596
Hristo Venev75b28af2019-08-26 17:23:46 +0000597 if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) {
Jens Axboe2b188cc2019-01-07 10:46:33 -0700598 /* order cqe stores with ring update */
Hristo Venev75b28af2019-08-26 17:23:46 +0000599 smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700600
Jens Axboe2b188cc2019-01-07 10:46:33 -0700601 if (wq_has_sleeper(&ctx->cq_wait)) {
602 wake_up_interruptible(&ctx->cq_wait);
603 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
604 }
605 }
606}
607
Jens Axboed625c6e2019-12-17 19:53:05 -0700608static inline bool io_req_needs_user(struct io_kiocb *req)
Jens Axboe18d9be12019-09-10 09:13:05 -0600609{
Jens Axboed625c6e2019-12-17 19:53:05 -0700610 return !(req->opcode == IORING_OP_READ_FIXED ||
611 req->opcode == IORING_OP_WRITE_FIXED);
Jens Axboe561fb042019-10-24 07:25:42 -0600612}
613
Jens Axboe94ae5e72019-11-14 19:39:52 -0700614static inline bool io_prep_async_work(struct io_kiocb *req,
615 struct io_kiocb **link)
Jens Axboe561fb042019-10-24 07:25:42 -0600616{
617 bool do_hashed = false;
Jens Axboe54a91f32019-09-10 09:15:04 -0600618
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +0300619 if (req->sqe) {
Jens Axboed625c6e2019-12-17 19:53:05 -0700620 switch (req->opcode) {
Jens Axboe6cc47d12019-09-18 11:18:23 -0600621 case IORING_OP_WRITEV:
622 case IORING_OP_WRITE_FIXED:
Jens Axboe53108d42019-12-09 20:12:38 -0700623 /* only regular files should be hashed for writes */
624 if (req->flags & REQ_F_ISREG)
625 do_hashed = true;
Jens Axboe5f8fd2d2019-11-07 10:57:36 -0700626 /* fall-through */
627 case IORING_OP_READV:
628 case IORING_OP_READ_FIXED:
629 case IORING_OP_SENDMSG:
630 case IORING_OP_RECVMSG:
631 case IORING_OP_ACCEPT:
632 case IORING_OP_POLL_ADD:
Jens Axboef8e85cf2019-11-23 14:24:24 -0700633 case IORING_OP_CONNECT:
Jens Axboe5f8fd2d2019-11-07 10:57:36 -0700634 /*
635 * We know REQ_F_ISREG is not set on some of these
636 * opcodes, but this enables us to keep the check in
637 * just one place.
638 */
639 if (!(req->flags & REQ_F_ISREG))
640 req->work.flags |= IO_WQ_WORK_UNBOUND;
Jens Axboe6cc47d12019-09-18 11:18:23 -0600641 break;
642 }
Jens Axboed625c6e2019-12-17 19:53:05 -0700643 if (io_req_needs_user(req))
Jens Axboe561fb042019-10-24 07:25:42 -0600644 req->work.flags |= IO_WQ_WORK_NEEDS_USER;
Jens Axboe54a91f32019-09-10 09:15:04 -0600645 }
646
Jens Axboe94ae5e72019-11-14 19:39:52 -0700647 *link = io_prep_linked_timeout(req);
Jens Axboe561fb042019-10-24 07:25:42 -0600648 return do_hashed;
649}
650
Jackie Liua197f662019-11-08 08:09:12 -0700651static inline void io_queue_async_work(struct io_kiocb *req)
Jens Axboe561fb042019-10-24 07:25:42 -0600652{
Jackie Liua197f662019-11-08 08:09:12 -0700653 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe94ae5e72019-11-14 19:39:52 -0700654 struct io_kiocb *link;
655 bool do_hashed;
656
657 do_hashed = io_prep_async_work(req, &link);
Jens Axboe561fb042019-10-24 07:25:42 -0600658
659 trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
660 req->flags);
661 if (!do_hashed) {
662 io_wq_enqueue(ctx->io_wq, &req->work);
663 } else {
664 io_wq_enqueue_hashed(ctx->io_wq, &req->work,
665 file_inode(req->file));
666 }
Jens Axboe94ae5e72019-11-14 19:39:52 -0700667
668 if (link)
669 io_queue_linked_timeout(link);
Jens Axboe18d9be12019-09-10 09:13:05 -0600670}
671
Jens Axboe5262f562019-09-17 12:26:57 -0600672static void io_kill_timeout(struct io_kiocb *req)
673{
674 int ret;
675
Jens Axboe2d283902019-12-04 11:08:05 -0700676 ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
Jens Axboe5262f562019-09-17 12:26:57 -0600677 if (ret != -1) {
678 atomic_inc(&req->ctx->cq_timeouts);
Jens Axboe842f9612019-10-29 12:34:10 -0600679 list_del_init(&req->list);
Jens Axboe78e19bb2019-11-06 15:21:34 -0700680 io_cqring_fill_event(req, 0);
Jackie Liuec9c02a2019-11-08 23:50:36 +0800681 io_put_req(req);
Jens Axboe5262f562019-09-17 12:26:57 -0600682 }
683}
684
685static void io_kill_timeouts(struct io_ring_ctx *ctx)
686{
687 struct io_kiocb *req, *tmp;
688
689 spin_lock_irq(&ctx->completion_lock);
690 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
691 io_kill_timeout(req);
692 spin_unlock_irq(&ctx->completion_lock);
693}
694
Jens Axboede0617e2019-04-06 21:51:27 -0600695static void io_commit_cqring(struct io_ring_ctx *ctx)
696{
697 struct io_kiocb *req;
698
Jens Axboe5262f562019-09-17 12:26:57 -0600699 while ((req = io_get_timeout_req(ctx)) != NULL)
700 io_kill_timeout(req);
701
Jens Axboede0617e2019-04-06 21:51:27 -0600702 __io_commit_cqring(ctx);
703
704 while ((req = io_get_deferred_req(ctx)) != NULL) {
705 req->flags |= REQ_F_IO_DRAINED;
Jackie Liua197f662019-11-08 08:09:12 -0700706 io_queue_async_work(req);
Jens Axboede0617e2019-04-06 21:51:27 -0600707 }
708}
709
Jens Axboe2b188cc2019-01-07 10:46:33 -0700710static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
711{
Hristo Venev75b28af2019-08-26 17:23:46 +0000712 struct io_rings *rings = ctx->rings;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700713 unsigned tail;
714
715 tail = ctx->cached_cq_tail;
Stefan Bühler115e12e2019-04-24 23:54:18 +0200716 /*
717 * writes to the cq entry need to come after reading head; the
718 * control dependency is enough as we're using WRITE_ONCE to
719 * fill the cq entry
720 */
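	/*
	 * head and tail are free-running u32 counters that are only masked
	 * when indexing, so the fullness check below stays correct across
	 * wraparound: e.g. with cq_ring_entries == 4, head == 0xfffffffe and
	 * tail == 0x2, tail - head == 4 and the ring is full.
	 */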
Hristo Venev75b28af2019-08-26 17:23:46 +0000721 if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700722 return NULL;
723
724 ctx->cached_cq_tail++;
Hristo Venev75b28af2019-08-26 17:23:46 +0000725 return &rings->cqes[tail & ctx->cq_mask];
Jens Axboe2b188cc2019-01-07 10:46:33 -0700726}
727
Jens Axboe8c838782019-03-12 15:48:16 -0600728static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
729{
730 if (waitqueue_active(&ctx->wait))
731 wake_up(&ctx->wait);
732 if (waitqueue_active(&ctx->sqo_wait))
733 wake_up(&ctx->sqo_wait);
Jens Axboe9b402842019-04-11 11:45:41 -0600734 if (ctx->cq_ev_fd)
735 eventfd_signal(ctx->cq_ev_fd, 1);
Jens Axboe8c838782019-03-12 15:48:16 -0600736}
737
Jens Axboec4a2ed72019-11-21 21:01:26 -0700738/* Returns true if there are no backlogged entries after the flush */
739static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700740{
Jens Axboe1d7bb1d2019-11-06 11:31:17 -0700741 struct io_rings *rings = ctx->rings;
742 struct io_uring_cqe *cqe;
743 struct io_kiocb *req;
744 unsigned long flags;
745 LIST_HEAD(list);
746
747 if (!force) {
748 if (list_empty_careful(&ctx->cq_overflow_list))
Jens Axboec4a2ed72019-11-21 21:01:26 -0700749 return true;
Jens Axboe1d7bb1d2019-11-06 11:31:17 -0700750 if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
751 rings->cq_ring_entries))
Jens Axboec4a2ed72019-11-21 21:01:26 -0700752 return false;
Jens Axboe1d7bb1d2019-11-06 11:31:17 -0700753 }
754
755 spin_lock_irqsave(&ctx->completion_lock, flags);
756
757 /* if force is set, the ring is going away. always drop after that */
758 if (force)
759 ctx->cq_overflow_flushed = true;
760
Jens Axboec4a2ed72019-11-21 21:01:26 -0700761 cqe = NULL;
Jens Axboe1d7bb1d2019-11-06 11:31:17 -0700762 while (!list_empty(&ctx->cq_overflow_list)) {
763 cqe = io_get_cqring(ctx);
764 if (!cqe && !force)
765 break;
766
767 req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
768 list);
769 list_move(&req->list, &list);
770 if (cqe) {
771 WRITE_ONCE(cqe->user_data, req->user_data);
772 WRITE_ONCE(cqe->res, req->result);
773 WRITE_ONCE(cqe->flags, 0);
774 } else {
775 WRITE_ONCE(ctx->rings->cq_overflow,
776 atomic_inc_return(&ctx->cached_cq_overflow));
777 }
778 }
779
780 io_commit_cqring(ctx);
781 spin_unlock_irqrestore(&ctx->completion_lock, flags);
782 io_cqring_ev_posted(ctx);
783
784 while (!list_empty(&list)) {
785 req = list_first_entry(&list, struct io_kiocb, list);
786 list_del(&req->list);
Jackie Liuec9c02a2019-11-08 23:50:36 +0800787 io_put_req(req);
Jens Axboe1d7bb1d2019-11-06 11:31:17 -0700788 }
Jens Axboec4a2ed72019-11-21 21:01:26 -0700789
790 return cqe != NULL;
Jens Axboe1d7bb1d2019-11-06 11:31:17 -0700791}
792
Jens Axboe78e19bb2019-11-06 15:21:34 -0700793static void io_cqring_fill_event(struct io_kiocb *req, long res)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700794{
Jens Axboe78e19bb2019-11-06 15:21:34 -0700795 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700796 struct io_uring_cqe *cqe;
797
Jens Axboe78e19bb2019-11-06 15:21:34 -0700798 trace_io_uring_complete(ctx, req->user_data, res);
Jens Axboe51c3ff62019-11-03 06:52:50 -0700799
Jens Axboe2b188cc2019-01-07 10:46:33 -0700800 /*
801 * If we can't get a cq entry, userspace overflowed the
802 * submission (by quite a lot). Increment the overflow count in
803 * the ring.
804 */
805 cqe = io_get_cqring(ctx);
Jens Axboe1d7bb1d2019-11-06 11:31:17 -0700806 if (likely(cqe)) {
Jens Axboe78e19bb2019-11-06 15:21:34 -0700807 WRITE_ONCE(cqe->user_data, req->user_data);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700808 WRITE_ONCE(cqe->res, res);
809 WRITE_ONCE(cqe->flags, 0);
Jens Axboe1d7bb1d2019-11-06 11:31:17 -0700810 } else if (ctx->cq_overflow_flushed) {
Jens Axboe2b188cc2019-01-07 10:46:33 -0700811 WRITE_ONCE(ctx->rings->cq_overflow,
812 atomic_inc_return(&ctx->cached_cq_overflow));
Jens Axboe1d7bb1d2019-11-06 11:31:17 -0700813 } else {
814 refcount_inc(&req->refs);
815 req->result = res;
816 list_add_tail(&req->list, &ctx->cq_overflow_list);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700817 }
818}
819
Jens Axboe78e19bb2019-11-06 15:21:34 -0700820static void io_cqring_add_event(struct io_kiocb *req, long res)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700821{
Jens Axboe78e19bb2019-11-06 15:21:34 -0700822 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700823 unsigned long flags;
824
825 spin_lock_irqsave(&ctx->completion_lock, flags);
Jens Axboe78e19bb2019-11-06 15:21:34 -0700826 io_cqring_fill_event(req, res);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700827 io_commit_cqring(ctx);
828 spin_unlock_irqrestore(&ctx->completion_lock, flags);
829
Jens Axboe8c838782019-03-12 15:48:16 -0600830 io_cqring_ev_posted(ctx);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700831}
832
Jens Axboe0ddf92e2019-11-08 08:52:53 -0700833static inline bool io_is_fallback_req(struct io_kiocb *req)
834{
835 return req == (struct io_kiocb *)
836 ((unsigned long) req->ctx->fallback_req & ~1UL);
837}
838
839static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
840{
841 struct io_kiocb *req;
842
843 req = ctx->fallback_req;
844 if (!test_and_set_bit_lock(0, (unsigned long *) ctx->fallback_req))
845 return req;
846
847 return NULL;
848}
849
Jens Axboe2579f912019-01-09 09:10:43 -0700850static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
851 struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700852{
Jens Axboefd6fab22019-03-14 16:30:06 -0600853 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700854 struct io_kiocb *req;
855
856 if (!percpu_ref_tryget(&ctx->refs))
857 return NULL;
858
Jens Axboe2579f912019-01-09 09:10:43 -0700859 if (!state) {
Jens Axboefd6fab22019-03-14 16:30:06 -0600860 req = kmem_cache_alloc(req_cachep, gfp);
Jens Axboe2579f912019-01-09 09:10:43 -0700861 if (unlikely(!req))
Jens Axboe0ddf92e2019-11-08 08:52:53 -0700862 goto fallback;
Jens Axboe2579f912019-01-09 09:10:43 -0700863 } else if (!state->free_reqs) {
864 size_t sz;
865 int ret;
866
867 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
Jens Axboefd6fab22019-03-14 16:30:06 -0600868 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
869
870 /*
871 * Bulk alloc is all-or-nothing. If we fail to get a batch,
872 * retry single alloc to be on the safe side.
873 */
874 if (unlikely(ret <= 0)) {
875 state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
876 if (!state->reqs[0])
Jens Axboe0ddf92e2019-11-08 08:52:53 -0700877 goto fallback;
Jens Axboefd6fab22019-03-14 16:30:06 -0600878 ret = 1;
879 }
Jens Axboe2579f912019-01-09 09:10:43 -0700880 state->free_reqs = ret - 1;
881 state->cur_req = 1;
882 req = state->reqs[0];
883 } else {
884 req = state->reqs[state->cur_req];
885 state->free_reqs--;
886 state->cur_req++;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700887 }
888
Jens Axboe0ddf92e2019-11-08 08:52:53 -0700889got_it:
Jens Axboe1a6b74f2019-12-02 10:33:15 -0700890 req->io = NULL;
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +0300891 req->ring_file = NULL;
Jens Axboe60c112b2019-06-21 10:20:18 -0600892 req->file = NULL;
Jens Axboe2579f912019-01-09 09:10:43 -0700893 req->ctx = ctx;
894 req->flags = 0;
Jens Axboee65ef562019-03-12 10:16:44 -0600895 /* one is dropped after submission, the other at completion */
896 refcount_set(&req->refs, 2);
Jens Axboe9e645e112019-05-10 16:07:28 -0600897 req->result = 0;
Jens Axboe561fb042019-10-24 07:25:42 -0600898 INIT_IO_WORK(&req->work, io_wq_submit_work);
Jens Axboe2579f912019-01-09 09:10:43 -0700899 return req;
Jens Axboe0ddf92e2019-11-08 08:52:53 -0700900fallback:
901 req = io_get_fallback_req(ctx);
902 if (req)
903 goto got_it;
Pavel Begunkov6805b322019-10-08 02:18:42 +0300904 percpu_ref_put(&ctx->refs);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700905 return NULL;
906}
907
Jens Axboedef596e2019-01-09 08:59:42 -0700908static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
909{
910 if (*nr) {
911 kmem_cache_free_bulk(req_cachep, *nr, reqs);
Pavel Begunkov6805b322019-10-08 02:18:42 +0300912 percpu_ref_put_many(&ctx->refs, *nr);
Jens Axboedef596e2019-01-09 08:59:42 -0700913 *nr = 0;
914 }
915}
916
Jens Axboe9e645e112019-05-10 16:07:28 -0600917static void __io_free_req(struct io_kiocb *req)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700918{
Jens Axboefcb323c2019-10-24 12:39:47 -0600919 struct io_ring_ctx *ctx = req->ctx;
920
Jens Axboe1a6b74f2019-12-02 10:33:15 -0700921 if (req->io)
922 kfree(req->io);
Jens Axboe09bb8392019-03-13 12:39:28 -0600923 if (req->file && !(req->flags & REQ_F_FIXED_FILE))
924 fput(req->file);
Jens Axboefcb323c2019-10-24 12:39:47 -0600925 if (req->flags & REQ_F_INFLIGHT) {
926 unsigned long flags;
927
928 spin_lock_irqsave(&ctx->inflight_lock, flags);
929 list_del(&req->inflight_entry);
930 if (waitqueue_active(&ctx->inflight_wait))
931 wake_up(&ctx->inflight_wait);
932 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
933 }
934 percpu_ref_put(&ctx->refs);
Jens Axboe0ddf92e2019-11-08 08:52:53 -0700935 if (likely(!io_is_fallback_req(req)))
936 kmem_cache_free(req_cachep, req);
937 else
938 clear_bit_unlock(0, (unsigned long *) ctx->fallback_req);
Jens Axboee65ef562019-03-12 10:16:44 -0600939}
940
Jackie Liua197f662019-11-08 08:09:12 -0700941static bool io_link_cancel_timeout(struct io_kiocb *req)
Jens Axboe9e645e112019-05-10 16:07:28 -0600942{
Jackie Liua197f662019-11-08 08:09:12 -0700943 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe2665abf2019-11-05 12:40:47 -0700944 int ret;
945
Jens Axboe2d283902019-12-04 11:08:05 -0700946 ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
Jens Axboe2665abf2019-11-05 12:40:47 -0700947 if (ret != -1) {
Jens Axboe78e19bb2019-11-06 15:21:34 -0700948 io_cqring_fill_event(req, -ECANCELED);
Jens Axboe2665abf2019-11-05 12:40:47 -0700949 io_commit_cqring(ctx);
950 req->flags &= ~REQ_F_LINK;
Jackie Liuec9c02a2019-11-08 23:50:36 +0800951 io_put_req(req);
Jens Axboe2665abf2019-11-05 12:40:47 -0700952 return true;
953 }
954
955 return false;
956}
957
Jens Axboeba816ad2019-09-28 11:36:45 -0600958static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
Jens Axboe9e645e112019-05-10 16:07:28 -0600959{
Jens Axboe2665abf2019-11-05 12:40:47 -0700960 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe2665abf2019-11-05 12:40:47 -0700961 bool wake_ev = false;
Jens Axboe9e645e112019-05-10 16:07:28 -0600962
Jens Axboe4d7dd462019-11-20 13:03:52 -0700963 /* Already got next link */
964 if (req->flags & REQ_F_LINK_NEXT)
965 return;
966
Jens Axboe9e645e112019-05-10 16:07:28 -0600967 /*
 968 * The list should never be empty when we are called here. But it could
 969 * potentially happen if the chain is messed up, so check to be on the
970 * safe side.
971 */
Pavel Begunkov44932332019-12-05 16:16:35 +0300972 while (!list_empty(&req->link_list)) {
973 struct io_kiocb *nxt = list_first_entry(&req->link_list,
974 struct io_kiocb, link_list);
Jens Axboe94ae5e72019-11-14 19:39:52 -0700975
Pavel Begunkov44932332019-12-05 16:16:35 +0300976 if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) &&
977 (nxt->flags & REQ_F_TIMEOUT))) {
978 list_del_init(&nxt->link_list);
Jens Axboe94ae5e72019-11-14 19:39:52 -0700979 wake_ev |= io_link_cancel_timeout(nxt);
Jens Axboe94ae5e72019-11-14 19:39:52 -0700980 req->flags &= ~REQ_F_LINK_TIMEOUT;
981 continue;
982 }
Jens Axboe9e645e112019-05-10 16:07:28 -0600983
Pavel Begunkov44932332019-12-05 16:16:35 +0300984 list_del_init(&req->link_list);
985 if (!list_empty(&nxt->link_list))
986 nxt->flags |= REQ_F_LINK;
Pavel Begunkovb18fdf72019-11-21 23:21:02 +0300987 *nxtptr = nxt;
Jens Axboe94ae5e72019-11-14 19:39:52 -0700988 break;
Jens Axboe9e645e112019-05-10 16:07:28 -0600989 }
Jens Axboe2665abf2019-11-05 12:40:47 -0700990
Jens Axboe4d7dd462019-11-20 13:03:52 -0700991 req->flags |= REQ_F_LINK_NEXT;
Jens Axboe2665abf2019-11-05 12:40:47 -0700992 if (wake_ev)
993 io_cqring_ev_posted(ctx);
Jens Axboe9e645e112019-05-10 16:07:28 -0600994}
995
996/*
997 * Called if REQ_F_LINK is set, and we fail the head request
998 */
999static void io_fail_links(struct io_kiocb *req)
1000{
Jens Axboe2665abf2019-11-05 12:40:47 -07001001 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe2665abf2019-11-05 12:40:47 -07001002 unsigned long flags;
1003
1004 spin_lock_irqsave(&ctx->completion_lock, flags);
Jens Axboe9e645e112019-05-10 16:07:28 -06001005
1006 while (!list_empty(&req->link_list)) {
Pavel Begunkov44932332019-12-05 16:16:35 +03001007 struct io_kiocb *link = list_first_entry(&req->link_list,
1008 struct io_kiocb, link_list);
Jens Axboe9e645e112019-05-10 16:07:28 -06001009
Pavel Begunkov44932332019-12-05 16:16:35 +03001010 list_del_init(&link->link_list);
Dmitrii Dolgovc826bd72019-10-15 19:02:01 +02001011 trace_io_uring_fail_link(req, link);
Jens Axboe2665abf2019-11-05 12:40:47 -07001012
1013 if ((req->flags & REQ_F_LINK_TIMEOUT) &&
Jens Axboed625c6e2019-12-17 19:53:05 -07001014 link->opcode == IORING_OP_LINK_TIMEOUT) {
Jackie Liua197f662019-11-08 08:09:12 -07001015 io_link_cancel_timeout(link);
Jens Axboe2665abf2019-11-05 12:40:47 -07001016 } else {
Jens Axboe78e19bb2019-11-06 15:21:34 -07001017 io_cqring_fill_event(link, -ECANCELED);
Jens Axboe978db572019-11-14 22:39:04 -07001018 __io_double_put_req(link);
Jens Axboe2665abf2019-11-05 12:40:47 -07001019 }
Jens Axboe5d960722019-11-19 15:31:28 -07001020 req->flags &= ~REQ_F_LINK_TIMEOUT;
Jens Axboe9e645e112019-05-10 16:07:28 -06001021 }
Jens Axboe2665abf2019-11-05 12:40:47 -07001022
1023 io_commit_cqring(ctx);
1024 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1025 io_cqring_ev_posted(ctx);
Jens Axboe9e645e112019-05-10 16:07:28 -06001026}
1027
Jens Axboe4d7dd462019-11-20 13:03:52 -07001028static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
Jens Axboe9e645e112019-05-10 16:07:28 -06001029{
Jens Axboe4d7dd462019-11-20 13:03:52 -07001030 if (likely(!(req->flags & REQ_F_LINK)))
Jens Axboe2665abf2019-11-05 12:40:47 -07001031 return;
Jens Axboe2665abf2019-11-05 12:40:47 -07001032
Jens Axboe9e645e112019-05-10 16:07:28 -06001033 /*
1034 * If LINK is set, we have dependent requests in this chain. If we
1035 * didn't fail this request, queue the first one up, moving any other
1036 * dependencies to the next request. In case of failure, fail the rest
1037 * of the chain.
1038 */
Jens Axboe2665abf2019-11-05 12:40:47 -07001039 if (req->flags & REQ_F_FAIL_LINK) {
1040 io_fail_links(req);
Jens Axboe7c9e7f02019-11-12 08:15:53 -07001041 } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
1042 REQ_F_LINK_TIMEOUT) {
Jens Axboe2665abf2019-11-05 12:40:47 -07001043 struct io_ring_ctx *ctx = req->ctx;
1044 unsigned long flags;
1045
1046 /*
1047 * If this is a timeout link, we could be racing with the
1048 * timeout timer. Grab the completion lock for this case to
Jens Axboe7c9e7f02019-11-12 08:15:53 -07001049 * protect against that.
Jens Axboe2665abf2019-11-05 12:40:47 -07001050 */
1051 spin_lock_irqsave(&ctx->completion_lock, flags);
1052 io_req_link_next(req, nxt);
1053 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1054 } else {
1055 io_req_link_next(req, nxt);
Jens Axboe9e645e112019-05-10 16:07:28 -06001056 }
Jens Axboe4d7dd462019-11-20 13:03:52 -07001057}
Jens Axboe9e645e112019-05-10 16:07:28 -06001058
Jackie Liuc69f8db2019-11-09 11:00:08 +08001059static void io_free_req(struct io_kiocb *req)
1060{
Pavel Begunkov944e58b2019-11-21 23:21:01 +03001061 struct io_kiocb *nxt = NULL;
1062
1063 io_req_find_next(req, &nxt);
Pavel Begunkov70cf9f32019-11-21 23:21:00 +03001064 __io_free_req(req);
Pavel Begunkov944e58b2019-11-21 23:21:01 +03001065
1066 if (nxt)
1067 io_queue_async_work(nxt);
Jackie Liuc69f8db2019-11-09 11:00:08 +08001068}
1069
Jens Axboeba816ad2019-09-28 11:36:45 -06001070/*
1071 * Drop reference to request, return next in chain (if there is one) if this
1072 * was the last reference to this request.
1073 */
Pavel Begunkovf9bd67f2019-11-21 23:21:03 +03001074__attribute__((nonnull))
Jackie Liuec9c02a2019-11-08 23:50:36 +08001075static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
Jens Axboee65ef562019-03-12 10:16:44 -06001076{
Pavel Begunkovf9bd67f2019-11-21 23:21:03 +03001077 io_req_find_next(req, nxtptr);
Jens Axboe4d7dd462019-11-20 13:03:52 -07001078
Jens Axboee65ef562019-03-12 10:16:44 -06001079 if (refcount_dec_and_test(&req->refs))
Jens Axboe4d7dd462019-11-20 13:03:52 -07001080 __io_free_req(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001081}
1082
Jens Axboe2b188cc2019-01-07 10:46:33 -07001083static void io_put_req(struct io_kiocb *req)
1084{
Jens Axboedef596e2019-01-09 08:59:42 -07001085 if (refcount_dec_and_test(&req->refs))
1086 io_free_req(req);
1087}
1088
Jens Axboe978db572019-11-14 22:39:04 -07001089/*
1090 * Must only be used if we don't need to care about links, usually from
1091 * within the completion handling itself.
1092 */
1093static void __io_double_put_req(struct io_kiocb *req)
Jens Axboea3a0e432019-08-20 11:03:11 -06001094{
Jens Axboe78e19bb2019-11-06 15:21:34 -07001095 /* drop both submit and complete references */
1096 if (refcount_sub_and_test(2, &req->refs))
1097 __io_free_req(req);
1098}
1099
Jens Axboe978db572019-11-14 22:39:04 -07001100static void io_double_put_req(struct io_kiocb *req)
1101{
1102 /* drop both submit and complete references */
1103 if (refcount_sub_and_test(2, &req->refs))
1104 io_free_req(req);
1105}
1106
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07001107static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
Jens Axboea3a0e432019-08-20 11:03:11 -06001108{
Jens Axboe84f97dc2019-11-06 11:27:53 -07001109 struct io_rings *rings = ctx->rings;
1110
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07001111 /*
1112 * noflush == true is from the waitqueue handler, just ensure we wake
1113 * up the task, and the next invocation will flush the entries. We
 1114 * cannot safely do it from here.
1115 */
1116 if (noflush && !list_empty(&ctx->cq_overflow_list))
1117 return -1U;
1118
1119 io_cqring_overflow_flush(ctx, false);
1120
Jens Axboea3a0e432019-08-20 11:03:11 -06001121 /* See comment at the top of this file */
1122 smp_rmb();
Hristo Venev75b28af2019-08-26 17:23:46 +00001123 return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
Jens Axboea3a0e432019-08-20 11:03:11 -06001124}
1125
Pavel Begunkovfb5ccc92019-10-25 12:31:30 +03001126static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
1127{
1128 struct io_rings *rings = ctx->rings;
1129
1130 /* make sure SQ entry isn't read before tail */
1131 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
1132}
1133
Jens Axboedef596e2019-01-09 08:59:42 -07001134/*
1135 * Find and free completed poll iocbs
1136 */
1137static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
1138 struct list_head *done)
1139{
1140 void *reqs[IO_IOPOLL_BATCH];
1141 struct io_kiocb *req;
Jens Axboe09bb8392019-03-13 12:39:28 -06001142 int to_free;
Jens Axboedef596e2019-01-09 08:59:42 -07001143
Jens Axboe09bb8392019-03-13 12:39:28 -06001144 to_free = 0;
Jens Axboedef596e2019-01-09 08:59:42 -07001145 while (!list_empty(done)) {
1146 req = list_first_entry(done, struct io_kiocb, list);
1147 list_del(&req->list);
1148
Jens Axboe78e19bb2019-11-06 15:21:34 -07001149 io_cqring_fill_event(req, req->result);
Jens Axboedef596e2019-01-09 08:59:42 -07001150 (*nr_events)++;
1151
Jens Axboe09bb8392019-03-13 12:39:28 -06001152 if (refcount_dec_and_test(&req->refs)) {
1153 /* If we're not using fixed files, we have to pair the
1154 * completion part with the file put. Use regular
1155 * completions for those, only batch free for fixed
Jens Axboe9e645e112019-05-10 16:07:28 -06001156 * file and non-linked commands.
Jens Axboe09bb8392019-03-13 12:39:28 -06001157 */
Jens Axboe1a6b74f2019-12-02 10:33:15 -07001158 if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
1159 REQ_F_FIXED_FILE) && !io_is_fallback_req(req) &&
1160 !req->io) {
Jens Axboe09bb8392019-03-13 12:39:28 -06001161 reqs[to_free++] = req;
1162 if (to_free == ARRAY_SIZE(reqs))
1163 io_free_req_many(ctx, reqs, &to_free);
Jens Axboe6b063142019-01-10 22:13:58 -07001164 } else {
Jens Axboe09bb8392019-03-13 12:39:28 -06001165 io_free_req(req);
Jens Axboe6b063142019-01-10 22:13:58 -07001166 }
Jens Axboe9a56a232019-01-09 09:06:50 -07001167 }
Jens Axboedef596e2019-01-09 08:59:42 -07001168 }
Jens Axboedef596e2019-01-09 08:59:42 -07001169
Jens Axboe09bb8392019-03-13 12:39:28 -06001170 io_commit_cqring(ctx);
Jens Axboedef596e2019-01-09 08:59:42 -07001171 io_free_req_many(ctx, reqs, &to_free);
1172}
1173
1174static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
1175 long min)
1176{
1177 struct io_kiocb *req, *tmp;
1178 LIST_HEAD(done);
1179 bool spin;
1180 int ret;
1181
1182 /*
1183 * Only spin for completions if we don't have multiple devices hanging
1184 * off our complete list, and we're under the requested amount.
1185 */
1186 spin = !ctx->poll_multi_file && *nr_events < min;
1187
1188 ret = 0;
1189 list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
Jens Axboe9adbd452019-12-20 08:45:55 -07001190 struct kiocb *kiocb = &req->rw.kiocb;
Jens Axboedef596e2019-01-09 08:59:42 -07001191
1192 /*
1193 * Move completed entries to our local list. If we find a
1194 * request that requires polling, break out and complete
1195 * the done list first, if we have entries there.
1196 */
1197 if (req->flags & REQ_F_IOPOLL_COMPLETED) {
1198 list_move_tail(&req->list, &done);
1199 continue;
1200 }
1201 if (!list_empty(&done))
1202 break;
1203
1204 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
1205 if (ret < 0)
1206 break;
1207
1208 if (ret && spin)
1209 spin = false;
1210 ret = 0;
1211 }
1212
1213 if (!list_empty(&done))
1214 io_iopoll_complete(ctx, nr_events, &done);
1215
1216 return ret;
1217}
1218
1219/*
Brian Gianforcarod195a662019-12-13 03:09:50 -08001220 * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
Jens Axboedef596e2019-01-09 08:59:42 -07001221 * non-spinning poll check - we'll still enter the driver poll loop, but only
1222 * as a non-spinning completion check.
1223 */
1224static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
1225 long min)
1226{
Jens Axboe08f54392019-08-21 22:19:11 -06001227 while (!list_empty(&ctx->poll_list) && !need_resched()) {
Jens Axboedef596e2019-01-09 08:59:42 -07001228 int ret;
1229
1230 ret = io_do_iopoll(ctx, nr_events, min);
1231 if (ret < 0)
1232 return ret;
1233 if (!min || *nr_events >= min)
1234 return 0;
1235 }
1236
1237 return 1;
1238}
1239
1240/*
1241 * We can't just wait for polled events to come to us, we have to actively
1242 * find and complete them.
1243 */
1244static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
1245{
1246 if (!(ctx->flags & IORING_SETUP_IOPOLL))
1247 return;
1248
1249 mutex_lock(&ctx->uring_lock);
1250 while (!list_empty(&ctx->poll_list)) {
1251 unsigned int nr_events = 0;
1252
1253 io_iopoll_getevents(ctx, &nr_events, 1);
Jens Axboe08f54392019-08-21 22:19:11 -06001254
1255 /*
1256 * Ensure we allow local-to-the-cpu processing to take place,
1257 * in this case we need to ensure that we reap all events.
1258 */
1259 cond_resched();
Jens Axboedef596e2019-01-09 08:59:42 -07001260 }
1261 mutex_unlock(&ctx->uring_lock);
1262}
1263
Jens Axboe2b2ed972019-10-25 10:06:15 -06001264static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
1265 long min)
Jens Axboedef596e2019-01-09 08:59:42 -07001266{
Jens Axboe2b2ed972019-10-25 10:06:15 -06001267 int iters = 0, ret = 0;
Jens Axboedef596e2019-01-09 08:59:42 -07001268
1269 do {
1270 int tmin = 0;
1271
Jens Axboe500f9fb2019-08-19 12:15:59 -06001272 /*
Jens Axboea3a0e432019-08-20 11:03:11 -06001273 * Don't enter poll loop if we already have events pending.
1274 * If we do, we can potentially be spinning for commands that
1275 * already triggered a CQE (eg in error).
1276 */
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07001277 if (io_cqring_events(ctx, false))
Jens Axboea3a0e432019-08-20 11:03:11 -06001278 break;
1279
1280 /*
Jens Axboe500f9fb2019-08-19 12:15:59 -06001281 * If a submit got punted to a workqueue, we can have the
1282 * application entering polling for a command before it gets
1283 * issued. That app will hold the uring_lock for the duration
1284 * of the poll right here, so we need to take a breather every
1285 * now and then to ensure that the issue has a chance to add
1286 * the poll to the issued list. Otherwise we can spin here
1287 * forever, while the workqueue is stuck trying to acquire the
1288 * very same mutex.
1289 */
1290 if (!(++iters & 7)) {
1291 mutex_unlock(&ctx->uring_lock);
1292 mutex_lock(&ctx->uring_lock);
1293 }
1294
Jens Axboedef596e2019-01-09 08:59:42 -07001295 if (*nr_events < min)
1296 tmin = min - *nr_events;
1297
1298 ret = io_iopoll_getevents(ctx, nr_events, tmin);
1299 if (ret <= 0)
1300 break;
1301 ret = 0;
1302 } while (min && !*nr_events && !need_resched());
1303
Jens Axboe2b2ed972019-10-25 10:06:15 -06001304 return ret;
1305}
1306
1307static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
1308 long min)
1309{
1310 int ret;
1311
1312 /*
1313 * We disallow the app entering submit/complete with polling, but we
1314 * still need to lock the ring to prevent racing with polled issue
1315 * that got punted to a workqueue.
1316 */
1317 mutex_lock(&ctx->uring_lock);
1318 ret = __io_iopoll_check(ctx, nr_events, min);
Jens Axboe500f9fb2019-08-19 12:15:59 -06001319 mutex_unlock(&ctx->uring_lock);
Jens Axboedef596e2019-01-09 08:59:42 -07001320 return ret;
1321}
1322
Jens Axboe491381ce2019-10-17 09:20:46 -06001323static void kiocb_end_write(struct io_kiocb *req)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001324{
Jens Axboe491381ce2019-10-17 09:20:46 -06001325 /*
 1326 * Tell lockdep we inherited freeze protection from the submission
 1327 * thread.
1328 */
1329 if (req->flags & REQ_F_ISREG) {
1330 struct inode *inode = file_inode(req->file);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001331
Jens Axboe491381ce2019-10-17 09:20:46 -06001332 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001333 }
Jens Axboe491381ce2019-10-17 09:20:46 -06001334 file_end_write(req->file);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001335}
1336
Jens Axboe4e88d6e2019-12-07 20:59:47 -07001337static inline void req_set_fail_links(struct io_kiocb *req)
1338{
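	/*
	 * Editorial note (added): only plain REQ_F_LINK requests are marked
	 * failed here; REQ_F_HARDLINK links are deliberately excluded, as
	 * they are meant to proceed regardless of this request's result.
	 */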
1339 if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1340 req->flags |= REQ_F_FAIL_LINK;
1341}
1342
Jens Axboeba816ad2019-09-28 11:36:45 -06001343static void io_complete_rw_common(struct kiocb *kiocb, long res)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001344{
Jens Axboe9adbd452019-12-20 08:45:55 -07001345 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001346
Jens Axboe491381ce2019-10-17 09:20:46 -06001347 if (kiocb->ki_flags & IOCB_WRITE)
1348 kiocb_end_write(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001349
Jens Axboe4e88d6e2019-12-07 20:59:47 -07001350 if (res != req->result)
1351 req_set_fail_links(req);
Jens Axboe78e19bb2019-11-06 15:21:34 -07001352 io_cqring_add_event(req, res);
Jens Axboeba816ad2019-09-28 11:36:45 -06001353}
1354
1355static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
1356{
Jens Axboe9adbd452019-12-20 08:45:55 -07001357 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
Jens Axboeba816ad2019-09-28 11:36:45 -06001358
1359 io_complete_rw_common(kiocb, res);
Jens Axboee65ef562019-03-12 10:16:44 -06001360 io_put_req(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001361}
1362
Jens Axboeba816ad2019-09-28 11:36:45 -06001363static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res)
1364{
Jens Axboe9adbd452019-12-20 08:45:55 -07001365 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
Jackie Liuec9c02a2019-11-08 23:50:36 +08001366 struct io_kiocb *nxt = NULL;
Jens Axboeba816ad2019-09-28 11:36:45 -06001367
1368 io_complete_rw_common(kiocb, res);
Jackie Liuec9c02a2019-11-08 23:50:36 +08001369 io_put_req_find_next(req, &nxt);
1370
1371 return nxt;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001372}
1373
Jens Axboedef596e2019-01-09 08:59:42 -07001374static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
1375{
Jens Axboe9adbd452019-12-20 08:45:55 -07001376 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
Jens Axboedef596e2019-01-09 08:59:42 -07001377
Jens Axboe491381ce2019-10-17 09:20:46 -06001378 if (kiocb->ki_flags & IOCB_WRITE)
1379 kiocb_end_write(req);
Jens Axboedef596e2019-01-09 08:59:42 -07001380
Jens Axboe4e88d6e2019-12-07 20:59:47 -07001381 if (res != req->result)
1382 req_set_fail_links(req);
Jens Axboe9e645e112019-05-10 16:07:28 -06001383 req->result = res;
Jens Axboedef596e2019-01-09 08:59:42 -07001384 if (res != -EAGAIN)
1385 req->flags |= REQ_F_IOPOLL_COMPLETED;
1386}
1387
1388/*
1389 * After the iocb has been issued, it's safe to be found on the poll list.
1390 * Adding the kiocb to the list AFTER submission ensures that we don't
 1391 * find it from an io_iopoll_getevents() thread before the issuer is done
1392 * accessing the kiocb cookie.
1393 */
1394static void io_iopoll_req_issued(struct io_kiocb *req)
1395{
1396 struct io_ring_ctx *ctx = req->ctx;
1397
1398 /*
 1399 * Track whether we have multiple files in our lists. This will impact
 1400 * how we do the polling later on: we avoid spinning if the requests are
 1401 * on potentially different devices.
1402 */
1403 if (list_empty(&ctx->poll_list)) {
1404 ctx->poll_multi_file = false;
1405 } else if (!ctx->poll_multi_file) {
1406 struct io_kiocb *list_req;
1407
1408 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
1409 list);
Jens Axboe9adbd452019-12-20 08:45:55 -07001410 if (list_req->file != req->file)
Jens Axboedef596e2019-01-09 08:59:42 -07001411 ctx->poll_multi_file = true;
1412 }
1413
1414 /*
1415 * For fast devices, IO may have already completed. If it has, add
1416 * it to the front so we find it first.
1417 */
1418 if (req->flags & REQ_F_IOPOLL_COMPLETED)
1419 list_add(&req->list, &ctx->poll_list);
1420 else
1421 list_add_tail(&req->list, &ctx->poll_list);
1422}
1423
Jens Axboe3d6770f2019-04-13 11:50:54 -06001424static void io_file_put(struct io_submit_state *state)
Jens Axboe9a56a232019-01-09 09:06:50 -07001425{
Jens Axboe3d6770f2019-04-13 11:50:54 -06001426 if (state->file) {
Jens Axboe9a56a232019-01-09 09:06:50 -07001427 int diff = state->has_refs - state->used_refs;
1428
1429 if (diff)
1430 fput_many(state->file, diff);
1431 state->file = NULL;
1432 }
1433}
1434
1435/*
1436 * Get as many references to a file as we have IOs left in this submission,
1437 * assuming most submissions are for one file, or at least that each file
1438 * has more than one submission.
1439 */
1440static struct file *io_file_get(struct io_submit_state *state, int fd)
1441{
1442 if (!state)
1443 return fget(fd);
1444
1445 if (state->file) {
1446 if (state->fd == fd) {
1447 state->used_refs++;
1448 state->ios_left--;
1449 return state->file;
1450 }
Jens Axboe3d6770f2019-04-13 11:50:54 -06001451 io_file_put(state);
Jens Axboe9a56a232019-01-09 09:06:50 -07001452 }
1453 state->file = fget_many(fd, state->ios_left);
1454 if (!state->file)
1455 return NULL;
1456
1457 state->fd = fd;
1458 state->has_refs = state->ios_left;
1459 state->used_refs = 1;
1460 state->ios_left--;
1461 return state->file;
1462}
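/*
 * Editorial sketch (added, not part of the original source): with three
 * SQEs against the same fd in one submission batch, the calls go roughly
 *
 *	state->ios_left = 3;
 *	io_file_get(state, fd);		// fget_many(fd, 3), used_refs = 1
 *	io_file_get(state, fd);		// cached file, used_refs = 2
 *	io_file_get(state, fd);		// cached file, used_refs = 3
 *	io_file_put(state);		// has_refs - used_refs == 0, no fput
 *
 * so a single fget_many()/fput_many() pair covers the whole batch instead
 * of one get/put per request.
 */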
1463
Jens Axboe2b188cc2019-01-07 10:46:33 -07001464/*
1465 * If we tracked the file through the SCM inflight mechanism, we could support
1466 * any file. For now, just ensure that anything potentially problematic is done
1467 * inline.
1468 */
1469static bool io_file_supports_async(struct file *file)
1470{
1471 umode_t mode = file_inode(file)->i_mode;
1472
Jens Axboe10d59342019-12-09 20:16:22 -07001473 if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode))
Jens Axboe2b188cc2019-01-07 10:46:33 -07001474 return true;
1475 if (S_ISREG(mode) && file->f_op != &io_uring_fops)
1476 return true;
1477
1478 return false;
1479}
1480
Pavel Begunkov267bc902019-11-07 01:41:08 +03001481static int io_prep_rw(struct io_kiocb *req, bool force_nonblock)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001482{
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03001483 const struct io_uring_sqe *sqe = req->sqe;
Jens Axboedef596e2019-01-09 08:59:42 -07001484 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe9adbd452019-12-20 08:45:55 -07001485 struct kiocb *kiocb = &req->rw.kiocb;
Jens Axboe09bb8392019-03-13 12:39:28 -06001486 unsigned ioprio;
1487 int ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001488
Jens Axboe09bb8392019-03-13 12:39:28 -06001489 if (!req->file)
1490 return -EBADF;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001491
Jens Axboe491381ce2019-10-17 09:20:46 -06001492 if (S_ISREG(file_inode(req->file)->i_mode))
1493 req->flags |= REQ_F_ISREG;
1494
Jens Axboe2b188cc2019-01-07 10:46:33 -07001495 kiocb->ki_pos = READ_ONCE(sqe->off);
1496 kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
1497 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
1498
1499 ioprio = READ_ONCE(sqe->ioprio);
1500 if (ioprio) {
1501 ret = ioprio_check_cap(ioprio);
1502 if (ret)
Jens Axboe09bb8392019-03-13 12:39:28 -06001503 return ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001504
1505 kiocb->ki_ioprio = ioprio;
1506 } else
1507 kiocb->ki_ioprio = get_current_ioprio();
1508
1509 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
1510 if (unlikely(ret))
Jens Axboe09bb8392019-03-13 12:39:28 -06001511 return ret;
Stefan Bühler8449eed2019-04-27 20:34:19 +02001512
1513 /* don't allow async punt if RWF_NOWAIT was requested */
Jens Axboe491381ce2019-10-17 09:20:46 -06001514 if ((kiocb->ki_flags & IOCB_NOWAIT) ||
1515 (req->file->f_flags & O_NONBLOCK))
Stefan Bühler8449eed2019-04-27 20:34:19 +02001516 req->flags |= REQ_F_NOWAIT;
1517
1518 if (force_nonblock)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001519 kiocb->ki_flags |= IOCB_NOWAIT;
Stefan Bühler8449eed2019-04-27 20:34:19 +02001520
Jens Axboedef596e2019-01-09 08:59:42 -07001521 if (ctx->flags & IORING_SETUP_IOPOLL) {
Jens Axboedef596e2019-01-09 08:59:42 -07001522 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
1523 !kiocb->ki_filp->f_op->iopoll)
Jens Axboe09bb8392019-03-13 12:39:28 -06001524 return -EOPNOTSUPP;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001525
Jens Axboedef596e2019-01-09 08:59:42 -07001526 kiocb->ki_flags |= IOCB_HIPRI;
1527 kiocb->ki_complete = io_complete_rw_iopoll;
Jens Axboe6873e0b2019-10-30 13:53:09 -06001528 req->result = 0;
Jens Axboedef596e2019-01-09 08:59:42 -07001529 } else {
Jens Axboe09bb8392019-03-13 12:39:28 -06001530 if (kiocb->ki_flags & IOCB_HIPRI)
1531 return -EINVAL;
Jens Axboedef596e2019-01-09 08:59:42 -07001532 kiocb->ki_complete = io_complete_rw;
1533 }
Jens Axboe9adbd452019-12-20 08:45:55 -07001534
1535 req->rw.addr = READ_ONCE(req->sqe->addr);
1536 req->rw.len = READ_ONCE(req->sqe->len);
1537 /* we own ->private, reuse it for the buffer index */
1538 req->rw.kiocb.private = (void *) (unsigned long)
1539 READ_ONCE(req->sqe->buf_index);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001540 return 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001541}
1542
1543static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
1544{
1545 switch (ret) {
1546 case -EIOCBQUEUED:
1547 break;
1548 case -ERESTARTSYS:
1549 case -ERESTARTNOINTR:
1550 case -ERESTARTNOHAND:
1551 case -ERESTART_RESTARTBLOCK:
1552 /*
1553 * We can't just restart the syscall, since previously
1554 * submitted sqes may already be in progress. Just fail this
1555 * IO with EINTR.
1556 */
1557 ret = -EINTR;
1558 /* fall through */
1559 default:
1560 kiocb->ki_complete(kiocb, ret, 0);
1561 }
1562}
1563
Jens Axboeba816ad2019-09-28 11:36:45 -06001564static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
1565 bool in_async)
1566{
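	/*
	 * Editorial note (added): when running from async context and the
	 * request completed (ret >= 0) through the regular, non-iopoll
	 * completion path, finish it inline and hand any linked request
	 * back via *nxt; otherwise fall back to the normal ki_complete
	 * handling in io_rw_done().
	 */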
Pavel Begunkovf9bd67f2019-11-21 23:21:03 +03001567 if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw)
Jens Axboeba816ad2019-09-28 11:36:45 -06001568 *nxt = __io_complete_rw(kiocb, ret);
1569 else
1570 io_rw_done(kiocb, ret);
1571}
1572
Jens Axboe9adbd452019-12-20 08:45:55 -07001573static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
Pavel Begunkov7d009162019-11-25 23:14:40 +03001574 struct iov_iter *iter)
Jens Axboeedafcce2019-01-09 09:16:05 -07001575{
Jens Axboe9adbd452019-12-20 08:45:55 -07001576 struct io_ring_ctx *ctx = req->ctx;
1577 size_t len = req->rw.len;
Jens Axboeedafcce2019-01-09 09:16:05 -07001578 struct io_mapped_ubuf *imu;
1579 unsigned index, buf_index;
1580 size_t offset;
1581 u64 buf_addr;
1582
1583 /* attempt to use fixed buffers without having provided iovecs */
1584 if (unlikely(!ctx->user_bufs))
1585 return -EFAULT;
1586
Jens Axboe9adbd452019-12-20 08:45:55 -07001587 buf_index = (unsigned long) req->rw.kiocb.private;
Jens Axboeedafcce2019-01-09 09:16:05 -07001588 if (unlikely(buf_index >= ctx->nr_user_bufs))
1589 return -EFAULT;
1590
1591 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
1592 imu = &ctx->user_bufs[index];
Jens Axboe9adbd452019-12-20 08:45:55 -07001593 buf_addr = req->rw.addr;
Jens Axboeedafcce2019-01-09 09:16:05 -07001594
1595 /* overflow */
1596 if (buf_addr + len < buf_addr)
1597 return -EFAULT;
1598 /* not inside the mapped region */
1599 if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
1600 return -EFAULT;
1601
1602 /*
 1603 * This may not be the start of the buffer; set the size appropriately
 1604 * and advance the iterator to the beginning.
1605 */
1606 offset = buf_addr - imu->ubuf;
1607 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
Jens Axboebd11b3a2019-07-20 08:37:31 -06001608
1609 if (offset) {
1610 /*
 1611 * Don't use iov_iter_advance() here, as it's really slow when
 1612 * using the latter parts of a big fixed buffer - it iterates
1613 * over each segment manually. We can cheat a bit here, because
1614 * we know that:
1615 *
1616 * 1) it's a BVEC iter, we set it up
1617 * 2) all bvecs are PAGE_SIZE in size, except potentially the
1618 * first and last bvec
1619 *
1620 * So just find our index, and adjust the iterator afterwards.
 1621 * If the offset is within the first bvec (or covers the whole first
 1622 * bvec), just use iov_iter_advance(). This makes it easier
1623 * since we can just skip the first segment, which may not
1624 * be PAGE_SIZE aligned.
1625 */
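		/*
		 * Editorial example (added, assuming PAGE_SIZE == 4096): if
		 * the first bvec holds 2048 bytes and offset == 10000, we
		 * take the else branch below: offset becomes 7952, seg_skip
		 * = 1 + (7952 >> PAGE_SHIFT) = 2, so the iterator starts at
		 * bvec[2] with iov_offset = 7952 & ~PAGE_MASK = 3856.
		 */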
1626 const struct bio_vec *bvec = imu->bvec;
1627
1628 if (offset <= bvec->bv_len) {
1629 iov_iter_advance(iter, offset);
1630 } else {
1631 unsigned long seg_skip;
1632
1633 /* skip first vec */
1634 offset -= bvec->bv_len;
1635 seg_skip = 1 + (offset >> PAGE_SHIFT);
1636
1637 iter->bvec = bvec + seg_skip;
1638 iter->nr_segs -= seg_skip;
Aleix Roca Nonell99c79f62019-08-15 14:03:22 +02001639 iter->count -= bvec->bv_len + offset;
Jens Axboebd11b3a2019-07-20 08:37:31 -06001640 iter->iov_offset = offset & ~PAGE_MASK;
Jens Axboebd11b3a2019-07-20 08:37:31 -06001641 }
1642 }
1643
Jens Axboe5e559562019-11-13 16:12:46 -07001644 return len;
Jens Axboeedafcce2019-01-09 09:16:05 -07001645}
1646
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03001647static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
1648 struct iovec **iovec, struct iov_iter *iter)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001649{
Jens Axboe9adbd452019-12-20 08:45:55 -07001650 void __user *buf = u64_to_user_ptr(req->rw.addr);
1651 size_t sqe_len = req->rw.len;
Jens Axboeedafcce2019-01-09 09:16:05 -07001652 u8 opcode;
1653
Jens Axboed625c6e2019-12-17 19:53:05 -07001654 opcode = req->opcode;
Pavel Begunkov7d009162019-11-25 23:14:40 +03001655 if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
Jens Axboeedafcce2019-01-09 09:16:05 -07001656 *iovec = NULL;
Jens Axboe9adbd452019-12-20 08:45:55 -07001657 return io_import_fixed(req, rw, iter);
Jens Axboeedafcce2019-01-09 09:16:05 -07001658 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07001659
Jens Axboe9adbd452019-12-20 08:45:55 -07001660 /* buffer index only valid with fixed read/write */
1661 if (req->rw.kiocb.private)
1662 return -EINVAL;
1663
Jens Axboef67676d2019-12-02 11:03:47 -07001664 if (req->io) {
1665 struct io_async_rw *iorw = &req->io->rw;
1666
1667 *iovec = iorw->iov;
1668 iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
1669 if (iorw->iov == iorw->fast_iov)
1670 *iovec = NULL;
1671 return iorw->size;
1672 }
1673
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03001674 if (!req->has_user)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001675 return -EFAULT;
1676
1677#ifdef CONFIG_COMPAT
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03001678 if (req->ctx->compat)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001679 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
1680 iovec, iter);
1681#endif
1682
1683 return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
1684}
1685
Jens Axboe32960612019-09-23 11:05:34 -06001686/*
1687 * For files that don't have ->read_iter() and ->write_iter(), handle them
1688 * by looping over ->read() or ->write() manually.
1689 */
1690static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
1691 struct iov_iter *iter)
1692{
1693 ssize_t ret = 0;
1694
1695 /*
1696 * Don't support polled IO through this interface, and we can't
1697 * support non-blocking either. For the latter, this just causes
1698 * the kiocb to be handled from an async context.
1699 */
1700 if (kiocb->ki_flags & IOCB_HIPRI)
1701 return -EOPNOTSUPP;
1702 if (kiocb->ki_flags & IOCB_NOWAIT)
1703 return -EAGAIN;
1704
1705 while (iov_iter_count(iter)) {
Pavel Begunkov311ae9e2019-11-24 11:58:24 +03001706 struct iovec iovec;
Jens Axboe32960612019-09-23 11:05:34 -06001707 ssize_t nr;
1708
Pavel Begunkov311ae9e2019-11-24 11:58:24 +03001709 if (!iov_iter_is_bvec(iter)) {
1710 iovec = iov_iter_iovec(iter);
1711 } else {
1712 /* fixed buffers import bvec */
1713 iovec.iov_base = kmap(iter->bvec->bv_page)
1714 + iter->iov_offset;
1715 iovec.iov_len = min(iter->count,
1716 iter->bvec->bv_len - iter->iov_offset);
1717 }
1718
Jens Axboe32960612019-09-23 11:05:34 -06001719 if (rw == READ) {
1720 nr = file->f_op->read(file, iovec.iov_base,
1721 iovec.iov_len, &kiocb->ki_pos);
1722 } else {
1723 nr = file->f_op->write(file, iovec.iov_base,
1724 iovec.iov_len, &kiocb->ki_pos);
1725 }
1726
Pavel Begunkov311ae9e2019-11-24 11:58:24 +03001727 if (iov_iter_is_bvec(iter))
1728 kunmap(iter->bvec->bv_page);
1729
Jens Axboe32960612019-09-23 11:05:34 -06001730 if (nr < 0) {
1731 if (!ret)
1732 ret = nr;
1733 break;
1734 }
1735 ret += nr;
1736 if (nr != iovec.iov_len)
1737 break;
1738 iov_iter_advance(iter, nr);
1739 }
1740
1741 return ret;
1742}
1743
Jens Axboeb7bb4f72019-12-15 22:13:43 -07001744static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
Jens Axboef67676d2019-12-02 11:03:47 -07001745 struct iovec *iovec, struct iovec *fast_iov,
1746 struct iov_iter *iter)
1747{
1748 req->io->rw.nr_segs = iter->nr_segs;
1749 req->io->rw.size = io_size;
1750 req->io->rw.iov = iovec;
1751 if (!req->io->rw.iov) {
1752 req->io->rw.iov = req->io->rw.fast_iov;
1753 memcpy(req->io->rw.iov, fast_iov,
1754 sizeof(struct iovec) * iter->nr_segs);
1755 }
1756}
1757
Jens Axboeb7bb4f72019-12-15 22:13:43 -07001758static int io_alloc_async_ctx(struct io_kiocb *req)
Jens Axboef67676d2019-12-02 11:03:47 -07001759{
1760 req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
1761 if (req->io) {
Jens Axboef67676d2019-12-02 11:03:47 -07001762 memcpy(&req->io->sqe, req->sqe, sizeof(req->io->sqe));
1763 req->sqe = &req->io->sqe;
1764 return 0;
1765 }
1766
Jens Axboeb7bb4f72019-12-15 22:13:43 -07001767 return 1;
1768}
1769
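/*
 * Editorial note (added): wrapper for punted reads/writes; it runs the
 * request through the normal io-wq submission path and then frees the
 * iovec copy held in the async context, if one was allocated beyond the
 * inline fast_iov.
 */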
1770static void io_rw_async(struct io_wq_work **workptr)
1771{
1772 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
1773 struct iovec *iov = NULL;
1774
1775 if (req->io->rw.iov != req->io->rw.fast_iov)
1776 iov = req->io->rw.iov;
1777 io_wq_submit_work(workptr);
1778 kfree(iov);
1779}
1780
1781static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
1782 struct iovec *iovec, struct iovec *fast_iov,
1783 struct iov_iter *iter)
1784{
1785 if (!req->io && io_alloc_async_ctx(req))
1786 return -ENOMEM;
1787
1788 io_req_map_rw(req, io_size, iovec, fast_iov, iter);
1789 req->work.func = io_rw_async;
1790 return 0;
Jens Axboef67676d2019-12-02 11:03:47 -07001791}
1792
1793static int io_read_prep(struct io_kiocb *req, struct iovec **iovec,
1794 struct iov_iter *iter, bool force_nonblock)
1795{
1796 ssize_t ret;
1797
1798 ret = io_prep_rw(req, force_nonblock);
1799 if (ret)
1800 return ret;
1801
1802 if (unlikely(!(req->file->f_mode & FMODE_READ)))
1803 return -EBADF;
1804
1805 return io_import_iovec(READ, req, iovec, iter);
1806}
1807
Pavel Begunkov267bc902019-11-07 01:41:08 +03001808static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
Jens Axboe8358e3a2019-04-23 08:17:58 -06001809 bool force_nonblock)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001810{
1811 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
Jens Axboe9adbd452019-12-20 08:45:55 -07001812 struct kiocb *kiocb = &req->rw.kiocb;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001813 struct iov_iter iter;
Jens Axboe31b51512019-01-18 22:56:34 -07001814 size_t iov_count;
Jens Axboef67676d2019-12-02 11:03:47 -07001815 ssize_t io_size, ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001816
Jens Axboef67676d2019-12-02 11:03:47 -07001817 if (!req->io) {
1818 ret = io_read_prep(req, &iovec, &iter, force_nonblock);
1819 if (ret < 0)
1820 return ret;
1821 } else {
1822 ret = io_import_iovec(READ, req, &iovec, &iter);
1823 if (ret < 0)
1824 return ret;
1825 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07001826
Jens Axboefd6c2e42019-12-18 12:19:41 -07001827 /* Ensure we clear previously set non-block flag */
1828 if (!force_nonblock)
Jens Axboe9adbd452019-12-20 08:45:55 -07001829 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
Jens Axboefd6c2e42019-12-18 12:19:41 -07001830
Jens Axboef67676d2019-12-02 11:03:47 -07001831 io_size = ret;
Jens Axboe9e645e112019-05-10 16:07:28 -06001832 if (req->flags & REQ_F_LINK)
Jens Axboef67676d2019-12-02 11:03:47 -07001833 req->result = io_size;
1834
1835 /*
1836 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
 1837 * we know to async punt it even if it was opened O_NONBLOCK.
1838 */
Jens Axboe9adbd452019-12-20 08:45:55 -07001839 if (force_nonblock && !io_file_supports_async(req->file)) {
Jens Axboef67676d2019-12-02 11:03:47 -07001840 req->flags |= REQ_F_MUST_PUNT;
1841 goto copy_iov;
1842 }
Jens Axboe9e645e112019-05-10 16:07:28 -06001843
Jens Axboe31b51512019-01-18 22:56:34 -07001844 iov_count = iov_iter_count(&iter);
Jens Axboe9adbd452019-12-20 08:45:55 -07001845 ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001846 if (!ret) {
1847 ssize_t ret2;
1848
Jens Axboe9adbd452019-12-20 08:45:55 -07001849 if (req->file->f_op->read_iter)
1850 ret2 = call_read_iter(req->file, kiocb, &iter);
Jens Axboe32960612019-09-23 11:05:34 -06001851 else
Jens Axboe9adbd452019-12-20 08:45:55 -07001852 ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
Jens Axboe32960612019-09-23 11:05:34 -06001853
Jens Axboe9d93a3f2019-05-15 13:53:07 -06001854 /*
1855 * In case of a short read, punt to async. This can happen
1856 * if we have data partially cached. Alternatively we can
1857 * return the short read, in which case the application will
1858 * need to issue another SQE and wait for it. That SQE will
1859 * need async punt anyway, so it's more efficient to do it
1860 * here.
1861 */
Jens Axboe491381ce2019-10-17 09:20:46 -06001862 if (force_nonblock && !(req->flags & REQ_F_NOWAIT) &&
1863 (req->flags & REQ_F_ISREG) &&
Jens Axboef67676d2019-12-02 11:03:47 -07001864 ret2 > 0 && ret2 < io_size)
Jens Axboe9d93a3f2019-05-15 13:53:07 -06001865 ret2 = -EAGAIN;
1866 /* Catch -EAGAIN return for forced non-blocking submission */
Jens Axboef67676d2019-12-02 11:03:47 -07001867 if (!force_nonblock || ret2 != -EAGAIN) {
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03001868 kiocb_done(kiocb, ret2, nxt, req->in_async);
Jens Axboef67676d2019-12-02 11:03:47 -07001869 } else {
1870copy_iov:
Jens Axboeb7bb4f72019-12-15 22:13:43 -07001871 ret = io_setup_async_rw(req, io_size, iovec,
Jens Axboef67676d2019-12-02 11:03:47 -07001872 inline_vecs, &iter);
1873 if (ret)
1874 goto out_free;
1875 return -EAGAIN;
1876 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07001877 }
Jens Axboef67676d2019-12-02 11:03:47 -07001878out_free:
Jens Axboeb7bb4f72019-12-15 22:13:43 -07001879 if (!io_wq_current_is_worker())
1880 kfree(iovec);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001881 return ret;
1882}
1883
Jens Axboef67676d2019-12-02 11:03:47 -07001884static int io_write_prep(struct io_kiocb *req, struct iovec **iovec,
1885 struct iov_iter *iter, bool force_nonblock)
1886{
1887 ssize_t ret;
1888
1889 ret = io_prep_rw(req, force_nonblock);
1890 if (ret)
1891 return ret;
1892
1893 if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
1894 return -EBADF;
1895
1896 return io_import_iovec(WRITE, req, iovec, iter);
1897}
1898
Pavel Begunkov267bc902019-11-07 01:41:08 +03001899static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
Jens Axboe8358e3a2019-04-23 08:17:58 -06001900 bool force_nonblock)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001901{
1902 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
Jens Axboe9adbd452019-12-20 08:45:55 -07001903 struct kiocb *kiocb = &req->rw.kiocb;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001904 struct iov_iter iter;
Jens Axboe31b51512019-01-18 22:56:34 -07001905 size_t iov_count;
Jens Axboef67676d2019-12-02 11:03:47 -07001906 ssize_t ret, io_size;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001907
Jens Axboef67676d2019-12-02 11:03:47 -07001908 if (!req->io) {
1909 ret = io_write_prep(req, &iovec, &iter, force_nonblock);
1910 if (ret < 0)
1911 return ret;
1912 } else {
1913 ret = io_import_iovec(WRITE, req, &iovec, &iter);
1914 if (ret < 0)
1915 return ret;
1916 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07001917
Jens Axboefd6c2e42019-12-18 12:19:41 -07001918 /* Ensure we clear previously set non-block flag */
1919 if (!force_nonblock)
Jens Axboe9adbd452019-12-20 08:45:55 -07001920 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
Jens Axboefd6c2e42019-12-18 12:19:41 -07001921
Jens Axboef67676d2019-12-02 11:03:47 -07001922 io_size = ret;
Jens Axboe9e645e112019-05-10 16:07:28 -06001923 if (req->flags & REQ_F_LINK)
Jens Axboef67676d2019-12-02 11:03:47 -07001924 req->result = io_size;
1925
1926 /*
1927 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
 1928 * we know to async punt it even if it was opened O_NONBLOCK.
1929 */
1930 if (force_nonblock && !io_file_supports_async(req->file)) {
1931 req->flags |= REQ_F_MUST_PUNT;
1932 goto copy_iov;
1933 }
1934
Jens Axboe10d59342019-12-09 20:16:22 -07001935 /* file path doesn't support NOWAIT for non-direct IO */
1936 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
1937 (req->flags & REQ_F_ISREG))
Jens Axboef67676d2019-12-02 11:03:47 -07001938 goto copy_iov;
Jens Axboe9e645e112019-05-10 16:07:28 -06001939
Jens Axboe31b51512019-01-18 22:56:34 -07001940 iov_count = iov_iter_count(&iter);
Jens Axboe9adbd452019-12-20 08:45:55 -07001941 ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001942 if (!ret) {
Roman Penyaev9bf79332019-03-25 20:09:24 +01001943 ssize_t ret2;
1944
Jens Axboe2b188cc2019-01-07 10:46:33 -07001945 /*
1946 * Open-code file_start_write here to grab freeze protection,
1947 * which will be released by another thread in
1948 * io_complete_rw(). Fool lockdep by telling it the lock got
1949 * released so that it doesn't complain about the held lock when
1950 * we return to userspace.
1951 */
Jens Axboe491381ce2019-10-17 09:20:46 -06001952 if (req->flags & REQ_F_ISREG) {
Jens Axboe9adbd452019-12-20 08:45:55 -07001953 __sb_start_write(file_inode(req->file)->i_sb,
Jens Axboe2b188cc2019-01-07 10:46:33 -07001954 SB_FREEZE_WRITE, true);
Jens Axboe9adbd452019-12-20 08:45:55 -07001955 __sb_writers_release(file_inode(req->file)->i_sb,
Jens Axboe2b188cc2019-01-07 10:46:33 -07001956 SB_FREEZE_WRITE);
1957 }
1958 kiocb->ki_flags |= IOCB_WRITE;
Roman Penyaev9bf79332019-03-25 20:09:24 +01001959
Jens Axboe9adbd452019-12-20 08:45:55 -07001960 if (req->file->f_op->write_iter)
1961 ret2 = call_write_iter(req->file, kiocb, &iter);
Jens Axboe32960612019-09-23 11:05:34 -06001962 else
Jens Axboe9adbd452019-12-20 08:45:55 -07001963 ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
Jens Axboef67676d2019-12-02 11:03:47 -07001964 if (!force_nonblock || ret2 != -EAGAIN) {
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03001965 kiocb_done(kiocb, ret2, nxt, req->in_async);
Jens Axboef67676d2019-12-02 11:03:47 -07001966 } else {
1967copy_iov:
Jens Axboeb7bb4f72019-12-15 22:13:43 -07001968 ret = io_setup_async_rw(req, io_size, iovec,
Jens Axboef67676d2019-12-02 11:03:47 -07001969 inline_vecs, &iter);
1970 if (ret)
1971 goto out_free;
1972 return -EAGAIN;
1973 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07001974 }
Jens Axboe31b51512019-01-18 22:56:34 -07001975out_free:
Jens Axboeb7bb4f72019-12-15 22:13:43 -07001976 if (!io_wq_current_is_worker())
1977 kfree(iovec);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001978 return ret;
1979}
1980
1981/*
1982 * IORING_OP_NOP just posts a completion event, nothing else.
1983 */
Jens Axboe78e19bb2019-11-06 15:21:34 -07001984static int io_nop(struct io_kiocb *req)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001985{
1986 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001987
Jens Axboedef596e2019-01-09 08:59:42 -07001988 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
1989 return -EINVAL;
1990
Jens Axboe78e19bb2019-11-06 15:21:34 -07001991 io_cqring_add_event(req, 0);
Jens Axboee65ef562019-03-12 10:16:44 -06001992 io_put_req(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001993 return 0;
1994}
1995
Jens Axboefc4df992019-12-10 14:38:45 -07001996static int io_prep_fsync(struct io_kiocb *req)
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001997{
Jens Axboefc4df992019-12-10 14:38:45 -07001998 const struct io_uring_sqe *sqe = req->sqe;
Jens Axboe6b063142019-01-10 22:13:58 -07001999 struct io_ring_ctx *ctx = req->ctx;
Christoph Hellwigc992fe22019-01-11 09:43:02 -07002000
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002001 if (req->flags & REQ_F_PREPPED)
2002 return 0;
Jens Axboe09bb8392019-03-13 12:39:28 -06002003 if (!req->file)
2004 return -EBADF;
Christoph Hellwigc992fe22019-01-11 09:43:02 -07002005
Jens Axboe6b063142019-01-10 22:13:58 -07002006 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
Jens Axboedef596e2019-01-09 08:59:42 -07002007 return -EINVAL;
Jens Axboeedafcce2019-01-09 09:16:05 -07002008 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
Christoph Hellwigc992fe22019-01-11 09:43:02 -07002009 return -EINVAL;
2010
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002011 req->sync.flags = READ_ONCE(sqe->fsync_flags);
2012 if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
2013 return -EINVAL;
2014
2015 req->sync.off = READ_ONCE(sqe->off);
2016 req->sync.len = READ_ONCE(sqe->len);
2017 req->flags |= REQ_F_PREPPED;
Christoph Hellwigc992fe22019-01-11 09:43:02 -07002018 return 0;
2019}
2020
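/*
 * Editorial note (added): helper for punted work handlers; if io-wq has
 * flagged the work as cancelled, mark the link chain as failed, post
 * -ECANCELED for the request, drop its reference and tell the caller to
 * bail out.
 */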
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002021static bool io_req_cancelled(struct io_kiocb *req)
2022{
2023 if (req->work.flags & IO_WQ_WORK_CANCEL) {
2024 req_set_fail_links(req);
2025 io_cqring_add_event(req, -ECANCELED);
2026 io_put_req(req);
2027 return true;
2028 }
2029
2030 return false;
2031}
2032
2033static void io_fsync_finish(struct io_wq_work **workptr)
2034{
2035 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2036 loff_t end = req->sync.off + req->sync.len;
2037 struct io_kiocb *nxt = NULL;
2038 int ret;
2039
2040 if (io_req_cancelled(req))
2041 return;
2042
Jens Axboe9adbd452019-12-20 08:45:55 -07002043 ret = vfs_fsync_range(req->file, req->sync.off,
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002044 end > 0 ? end : LLONG_MAX,
2045 req->sync.flags & IORING_FSYNC_DATASYNC);
2046 if (ret < 0)
2047 req_set_fail_links(req);
2048 io_cqring_add_event(req, ret);
2049 io_put_req_find_next(req, &nxt);
2050 if (nxt)
2051 *workptr = &nxt->work;
2052}
2053
Jens Axboefc4df992019-12-10 14:38:45 -07002054static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt,
2055 bool force_nonblock)
Christoph Hellwigc992fe22019-01-11 09:43:02 -07002056{
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002057 struct io_wq_work *work, *old_work;
Christoph Hellwigc992fe22019-01-11 09:43:02 -07002058 int ret;
2059
Jens Axboefc4df992019-12-10 14:38:45 -07002060 ret = io_prep_fsync(req);
Christoph Hellwigc992fe22019-01-11 09:43:02 -07002061 if (ret)
2062 return ret;
2063
2064 /* fsync always requires a blocking context */
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002065 if (force_nonblock) {
2066 io_put_req(req);
2067 req->work.func = io_fsync_finish;
Christoph Hellwigc992fe22019-01-11 09:43:02 -07002068 return -EAGAIN;
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002069 }
Christoph Hellwigc992fe22019-01-11 09:43:02 -07002070
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002071 work = old_work = &req->work;
2072 io_fsync_finish(&work);
2073 if (work && work != old_work)
2074 *nxt = container_of(work, struct io_kiocb, work);
Christoph Hellwigc992fe22019-01-11 09:43:02 -07002075 return 0;
2076}
2077
Jens Axboefc4df992019-12-10 14:38:45 -07002078static int io_prep_sfr(struct io_kiocb *req)
Jens Axboe5d17b4a2019-04-09 14:56:44 -06002079{
Jens Axboefc4df992019-12-10 14:38:45 -07002080 const struct io_uring_sqe *sqe = req->sqe;
Jens Axboe5d17b4a2019-04-09 14:56:44 -06002081 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe5d17b4a2019-04-09 14:56:44 -06002082
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002083 if (req->flags & REQ_F_PREPPED)
2084 return 0;
Jens Axboe5d17b4a2019-04-09 14:56:44 -06002085 if (!req->file)
2086 return -EBADF;
Jens Axboe5d17b4a2019-04-09 14:56:44 -06002087
2088 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2089 return -EINVAL;
2090 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
2091 return -EINVAL;
2092
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002093 req->sync.off = READ_ONCE(sqe->off);
2094 req->sync.len = READ_ONCE(sqe->len);
2095 req->sync.flags = READ_ONCE(sqe->sync_range_flags);
2096 req->flags |= REQ_F_PREPPED;
2097 return 0;
2098}
2099
2100static void io_sync_file_range_finish(struct io_wq_work **workptr)
2101{
2102 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2103 struct io_kiocb *nxt = NULL;
2104 int ret;
2105
2106 if (io_req_cancelled(req))
2107 return;
2108
Jens Axboe9adbd452019-12-20 08:45:55 -07002109 ret = sync_file_range(req->file, req->sync.off, req->sync.len,
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002110 req->sync.flags);
2111 if (ret < 0)
2112 req_set_fail_links(req);
2113 io_cqring_add_event(req, ret);
2114 io_put_req_find_next(req, &nxt);
2115 if (nxt)
2116 *workptr = &nxt->work;
Jens Axboe5d17b4a2019-04-09 14:56:44 -06002117}
2118
Jens Axboefc4df992019-12-10 14:38:45 -07002119static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt,
Jens Axboe5d17b4a2019-04-09 14:56:44 -06002120 bool force_nonblock)
2121{
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002122 struct io_wq_work *work, *old_work;
Jens Axboe5d17b4a2019-04-09 14:56:44 -06002123 int ret;
2124
Jens Axboefc4df992019-12-10 14:38:45 -07002125 ret = io_prep_sfr(req);
Jens Axboe5d17b4a2019-04-09 14:56:44 -06002126 if (ret)
2127 return ret;
2128
2129 /* sync_file_range always requires a blocking context */
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002130 if (force_nonblock) {
2131 io_put_req(req);
2132 req->work.func = io_sync_file_range_finish;
Jens Axboe5d17b4a2019-04-09 14:56:44 -06002133 return -EAGAIN;
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002134 }
Jens Axboe5d17b4a2019-04-09 14:56:44 -06002135
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002136 work = old_work = &req->work;
2137 io_sync_file_range_finish(&work);
2138 if (work && work != old_work)
2139 *nxt = container_of(work, struct io_kiocb, work);
Jens Axboe5d17b4a2019-04-09 14:56:44 -06002140 return 0;
2141}
2142
Jens Axboeb7bb4f72019-12-15 22:13:43 -07002143#if defined(CONFIG_NET)
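/*
 * Editorial note (added): like io_rw_async() above, this wrapper runs
 * punted send/receive work through the normal io-wq path and then frees
 * the iovec copy held in the async context, if one was allocated beyond
 * the inline fast_iov.
 */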
2144static void io_sendrecv_async(struct io_wq_work **workptr)
2145{
2146 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2147 struct iovec *iov = NULL;
2148
2149 if (req->io->rw.iov != req->io->rw.fast_iov)
2150 iov = req->io->msg.iov;
2151 io_wq_submit_work(workptr);
2152 kfree(iov);
2153}
2154#endif
2155
Jens Axboe03b12302019-12-02 18:50:25 -07002156static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io)
Jens Axboeaa1fa282019-04-19 13:38:09 -06002157{
Jens Axboe03b12302019-12-02 18:50:25 -07002158#if defined(CONFIG_NET)
2159 const struct io_uring_sqe *sqe = req->sqe;
2160 struct user_msghdr __user *msg;
2161 unsigned flags;
2162
2163 flags = READ_ONCE(sqe->msg_flags);
Jens Axboed55e5f52019-12-11 16:12:15 -07002164 msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
Jens Axboed9688562019-12-09 19:35:20 -07002165 io->msg.iov = io->msg.fast_iov;
Jens Axboe03b12302019-12-02 18:50:25 -07002166 return sendmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.iov);
2167#else
2168 return 0;
2169#endif
2170}
2171
Jens Axboefc4df992019-12-10 14:38:45 -07002172static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
2173 bool force_nonblock)
Jens Axboe03b12302019-12-02 18:50:25 -07002174{
2175#if defined(CONFIG_NET)
Jens Axboefc4df992019-12-10 14:38:45 -07002176 const struct io_uring_sqe *sqe = req->sqe;
Jens Axboe0b416c32019-12-15 10:57:46 -07002177 struct io_async_msghdr *kmsg = NULL;
Jens Axboe03b12302019-12-02 18:50:25 -07002178 struct socket *sock;
2179 int ret;
2180
2181 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2182 return -EINVAL;
2183
2184 sock = sock_from_file(req->file, &ret);
2185 if (sock) {
Jens Axboeb7bb4f72019-12-15 22:13:43 -07002186 struct io_async_ctx io;
Jens Axboe03b12302019-12-02 18:50:25 -07002187 struct sockaddr_storage addr;
Jens Axboe03b12302019-12-02 18:50:25 -07002188 unsigned flags;
2189
2190 flags = READ_ONCE(sqe->msg_flags);
2191 if (flags & MSG_DONTWAIT)
2192 req->flags |= REQ_F_NOWAIT;
2193 else if (force_nonblock)
2194 flags |= MSG_DONTWAIT;
2195
2196 if (req->io) {
Jens Axboe0b416c32019-12-15 10:57:46 -07002197 kmsg = &req->io->msg;
2198 kmsg->msg.msg_name = &addr;
2199 /* if iov is set, it's allocated already */
2200 if (!kmsg->iov)
2201 kmsg->iov = kmsg->fast_iov;
2202 kmsg->msg.msg_iter.iov = kmsg->iov;
Jens Axboe03b12302019-12-02 18:50:25 -07002203 } else {
Jens Axboe0b416c32019-12-15 10:57:46 -07002204 kmsg = &io.msg;
2205 kmsg->msg.msg_name = &addr;
Jens Axboe03b12302019-12-02 18:50:25 -07002206 ret = io_sendmsg_prep(req, &io);
2207 if (ret)
2208 goto out;
2209 }
2210
Jens Axboe0b416c32019-12-15 10:57:46 -07002211 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
Jens Axboe03b12302019-12-02 18:50:25 -07002212 if (force_nonblock && ret == -EAGAIN) {
Jens Axboeb7bb4f72019-12-15 22:13:43 -07002213 if (req->io)
2214 return -EAGAIN;
2215 if (io_alloc_async_ctx(req))
2216 return -ENOMEM;
2217 memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
2218 req->work.func = io_sendrecv_async;
Jens Axboe0b416c32019-12-15 10:57:46 -07002219 return -EAGAIN;
Jens Axboe03b12302019-12-02 18:50:25 -07002220 }
2221 if (ret == -ERESTARTSYS)
2222 ret = -EINTR;
2223 }
2224
2225out:
Jens Axboeb7bb4f72019-12-15 22:13:43 -07002226 if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov)
Jens Axboe0b416c32019-12-15 10:57:46 -07002227 kfree(kmsg->iov);
Jens Axboe03b12302019-12-02 18:50:25 -07002228 io_cqring_add_event(req, ret);
Jens Axboe4e88d6e2019-12-07 20:59:47 -07002229 if (ret < 0)
2230 req_set_fail_links(req);
Jens Axboe03b12302019-12-02 18:50:25 -07002231 io_put_req_find_next(req, nxt);
2232 return 0;
2233#else
2234 return -EOPNOTSUPP;
2235#endif
2236}
2237
2238static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io)
2239{
2240#if defined(CONFIG_NET)
2241 const struct io_uring_sqe *sqe = req->sqe;
2242 struct user_msghdr __user *msg;
2243 unsigned flags;
2244
2245 flags = READ_ONCE(sqe->msg_flags);
Jens Axboed55e5f52019-12-11 16:12:15 -07002246 msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
Jens Axboed9688562019-12-09 19:35:20 -07002247 io->msg.iov = io->msg.fast_iov;
Jens Axboe03b12302019-12-02 18:50:25 -07002248 return recvmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.uaddr,
2249 &io->msg.iov);
2250#else
2251 return 0;
2252#endif
2253}
2254
Jens Axboefc4df992019-12-10 14:38:45 -07002255static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
2256 bool force_nonblock)
Jens Axboe03b12302019-12-02 18:50:25 -07002257{
2258#if defined(CONFIG_NET)
Jens Axboefc4df992019-12-10 14:38:45 -07002259 const struct io_uring_sqe *sqe = req->sqe;
Jens Axboe0b416c32019-12-15 10:57:46 -07002260 struct io_async_msghdr *kmsg = NULL;
Jens Axboe0fa03c62019-04-19 13:34:07 -06002261 struct socket *sock;
2262 int ret;
2263
2264 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2265 return -EINVAL;
2266
2267 sock = sock_from_file(req->file, &ret);
2268 if (sock) {
2269 struct user_msghdr __user *msg;
Jens Axboeb7bb4f72019-12-15 22:13:43 -07002270 struct io_async_ctx io;
Jens Axboe03b12302019-12-02 18:50:25 -07002271 struct sockaddr_storage addr;
Jens Axboe0fa03c62019-04-19 13:34:07 -06002272 unsigned flags;
2273
2274 flags = READ_ONCE(sqe->msg_flags);
2275 if (flags & MSG_DONTWAIT)
2276 req->flags |= REQ_F_NOWAIT;
2277 else if (force_nonblock)
2278 flags |= MSG_DONTWAIT;
2279
Jens Axboed55e5f52019-12-11 16:12:15 -07002280 msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
Jens Axboe03b12302019-12-02 18:50:25 -07002281 if (req->io) {
Jens Axboe0b416c32019-12-15 10:57:46 -07002282 kmsg = &req->io->msg;
2283 kmsg->msg.msg_name = &addr;
2284 /* if iov is set, it's allocated already */
2285 if (!kmsg->iov)
2286 kmsg->iov = kmsg->fast_iov;
2287 kmsg->msg.msg_iter.iov = kmsg->iov;
Jens Axboe03b12302019-12-02 18:50:25 -07002288 } else {
Jens Axboe0b416c32019-12-15 10:57:46 -07002289 kmsg = &io.msg;
2290 kmsg->msg.msg_name = &addr;
Jens Axboe03b12302019-12-02 18:50:25 -07002291 ret = io_recvmsg_prep(req, &io);
2292 if (ret)
2293 goto out;
2294 }
Jens Axboe0fa03c62019-04-19 13:34:07 -06002295
Jens Axboe0b416c32019-12-15 10:57:46 -07002296 ret = __sys_recvmsg_sock(sock, &kmsg->msg, msg, kmsg->uaddr, flags);
Jens Axboe03b12302019-12-02 18:50:25 -07002297 if (force_nonblock && ret == -EAGAIN) {
Jens Axboeb7bb4f72019-12-15 22:13:43 -07002298 if (req->io)
2299 return -EAGAIN;
2300 if (io_alloc_async_ctx(req))
2301 return -ENOMEM;
2302 memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
2303 req->work.func = io_sendrecv_async;
Jens Axboe0b416c32019-12-15 10:57:46 -07002304 return -EAGAIN;
Jens Axboe03b12302019-12-02 18:50:25 -07002305 }
Jens Axboe441cdbd2019-12-02 18:49:10 -07002306 if (ret == -ERESTARTSYS)
2307 ret = -EINTR;
Jens Axboe0fa03c62019-04-19 13:34:07 -06002308 }
2309
Jens Axboe03b12302019-12-02 18:50:25 -07002310out:
Jens Axboeb7bb4f72019-12-15 22:13:43 -07002311 if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov)
Jens Axboe0b416c32019-12-15 10:57:46 -07002312 kfree(kmsg->iov);
Jens Axboe78e19bb2019-11-06 15:21:34 -07002313 io_cqring_add_event(req, ret);
Jens Axboe4e88d6e2019-12-07 20:59:47 -07002314 if (ret < 0)
2315 req_set_fail_links(req);
Jackie Liuec9c02a2019-11-08 23:50:36 +08002316 io_put_req_find_next(req, nxt);
Jens Axboe0fa03c62019-04-19 13:34:07 -06002317 return 0;
2318#else
2319 return -EOPNOTSUPP;
2320#endif
2321}
2322
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002323static int io_accept_prep(struct io_kiocb *req)
Jens Axboe17f2fe32019-10-17 14:42:58 -06002324{
2325#if defined(CONFIG_NET)
Jens Axboefc4df992019-12-10 14:38:45 -07002326 const struct io_uring_sqe *sqe = req->sqe;
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002327 struct io_accept *accept = &req->accept;
2328
2329 if (req->flags & REQ_F_PREPPED)
2330 return 0;
Jens Axboe17f2fe32019-10-17 14:42:58 -06002331
2332 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
2333 return -EINVAL;
Hrvoje Zeba8042d6c2019-11-25 14:40:22 -05002334 if (sqe->ioprio || sqe->len || sqe->buf_index)
Jens Axboe17f2fe32019-10-17 14:42:58 -06002335 return -EINVAL;
2336
Jens Axboed55e5f52019-12-11 16:12:15 -07002337 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
2338 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002339 accept->flags = READ_ONCE(sqe->accept_flags);
2340 req->flags |= REQ_F_PREPPED;
2341 return 0;
2342#else
2343 return -EOPNOTSUPP;
2344#endif
2345}
Jens Axboe17f2fe32019-10-17 14:42:58 -06002346
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002347#if defined(CONFIG_NET)
2348static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
2349 bool force_nonblock)
2350{
2351 struct io_accept *accept = &req->accept;
2352 unsigned file_flags;
2353 int ret;
2354
2355 file_flags = force_nonblock ? O_NONBLOCK : 0;
2356 ret = __sys_accept4_file(req->file, file_flags, accept->addr,
2357 accept->addr_len, accept->flags);
2358 if (ret == -EAGAIN && force_nonblock)
Jens Axboe17f2fe32019-10-17 14:42:58 -06002359 return -EAGAIN;
Jens Axboe8e3cca12019-11-09 19:52:33 -07002360 if (ret == -ERESTARTSYS)
2361 ret = -EINTR;
Jens Axboe4e88d6e2019-12-07 20:59:47 -07002362 if (ret < 0)
2363 req_set_fail_links(req);
Jens Axboe78e19bb2019-11-06 15:21:34 -07002364 io_cqring_add_event(req, ret);
Jackie Liuec9c02a2019-11-08 23:50:36 +08002365 io_put_req_find_next(req, nxt);
Jens Axboe17f2fe32019-10-17 14:42:58 -06002366 return 0;
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002367}
2368
2369static void io_accept_finish(struct io_wq_work **workptr)
2370{
2371 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2372 struct io_kiocb *nxt = NULL;
2373
2374 if (io_req_cancelled(req))
2375 return;
2376 __io_accept(req, &nxt, false);
2377 if (nxt)
2378 *workptr = &nxt->work;
2379}
2380#endif
2381
2382static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
2383 bool force_nonblock)
2384{
2385#if defined(CONFIG_NET)
2386 int ret;
2387
2388 ret = io_accept_prep(req);
2389 if (ret)
2390 return ret;
2391
2392 ret = __io_accept(req, nxt, force_nonblock);
2393 if (ret == -EAGAIN && force_nonblock) {
2394 req->work.func = io_accept_finish;
2395 req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
2396 io_put_req(req);
2397 return -EAGAIN;
2398 }
2399 return 0;
Jens Axboe17f2fe32019-10-17 14:42:58 -06002400#else
2401 return -EOPNOTSUPP;
2402#endif
2403}
2404
Jens Axboef499a022019-12-02 16:28:46 -07002405static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io)
2406{
2407#if defined(CONFIG_NET)
2408 const struct io_uring_sqe *sqe = req->sqe;
2409 struct sockaddr __user *addr;
2410 int addr_len;
2411
Jens Axboed55e5f52019-12-11 16:12:15 -07002412 addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
Jens Axboef499a022019-12-02 16:28:46 -07002413 addr_len = READ_ONCE(sqe->addr2);
2414 return move_addr_to_kernel(addr, addr_len, &io->connect.address);
2415#else
2416 return 0;
2417#endif
2418}
2419
Jens Axboefc4df992019-12-10 14:38:45 -07002420static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt,
2421 bool force_nonblock)
Jens Axboef8e85cf2019-11-23 14:24:24 -07002422{
2423#if defined(CONFIG_NET)
Jens Axboefc4df992019-12-10 14:38:45 -07002424 const struct io_uring_sqe *sqe = req->sqe;
Jens Axboef499a022019-12-02 16:28:46 -07002425 struct io_async_ctx __io, *io;
Jens Axboef8e85cf2019-11-23 14:24:24 -07002426 unsigned file_flags;
2427 int addr_len, ret;
2428
2429 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
2430 return -EINVAL;
2431 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
2432 return -EINVAL;
2433
Jens Axboef8e85cf2019-11-23 14:24:24 -07002434 addr_len = READ_ONCE(sqe->addr2);
2435 file_flags = force_nonblock ? O_NONBLOCK : 0;
2436
Jens Axboef499a022019-12-02 16:28:46 -07002437 if (req->io) {
2438 io = req->io;
2439 } else {
2440 ret = io_connect_prep(req, &__io);
2441 if (ret)
2442 goto out;
2443 io = &__io;
2444 }
2445
2446 ret = __sys_connect_file(req->file, &io->connect.address, addr_len,
2447 file_flags);
Jens Axboe87f80d62019-12-03 11:23:54 -07002448 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
Jens Axboeb7bb4f72019-12-15 22:13:43 -07002449 if (req->io)
2450 return -EAGAIN;
2451 if (io_alloc_async_ctx(req)) {
Jens Axboef499a022019-12-02 16:28:46 -07002452 ret = -ENOMEM;
2453 goto out;
2454 }
Jens Axboeb7bb4f72019-12-15 22:13:43 -07002455 memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect));
Jens Axboef8e85cf2019-11-23 14:24:24 -07002456 return -EAGAIN;
Jens Axboef499a022019-12-02 16:28:46 -07002457 }
Jens Axboef8e85cf2019-11-23 14:24:24 -07002458 if (ret == -ERESTARTSYS)
2459 ret = -EINTR;
Jens Axboef499a022019-12-02 16:28:46 -07002460out:
Jens Axboe4e88d6e2019-12-07 20:59:47 -07002461 if (ret < 0)
2462 req_set_fail_links(req);
Jens Axboef8e85cf2019-11-23 14:24:24 -07002463 io_cqring_add_event(req, ret);
2464 io_put_req_find_next(req, nxt);
2465 return 0;
2466#else
2467 return -EOPNOTSUPP;
2468#endif
2469}
2470
Jens Axboe221c5eb2019-01-17 09:41:58 -07002471static void io_poll_remove_one(struct io_kiocb *req)
2472{
2473 struct io_poll_iocb *poll = &req->poll;
2474
2475 spin_lock(&poll->head->lock);
2476 WRITE_ONCE(poll->canceled, true);
Jens Axboe392edb42019-12-09 17:52:20 -07002477 if (!list_empty(&poll->wait.entry)) {
2478 list_del_init(&poll->wait.entry);
Jackie Liua197f662019-11-08 08:09:12 -07002479 io_queue_async_work(req);
Jens Axboe221c5eb2019-01-17 09:41:58 -07002480 }
2481 spin_unlock(&poll->head->lock);
Jens Axboe78076bb2019-12-04 19:56:40 -07002482 hash_del(&req->hash_node);
Jens Axboe221c5eb2019-01-17 09:41:58 -07002483}
2484
2485static void io_poll_remove_all(struct io_ring_ctx *ctx)
2486{
Jens Axboe78076bb2019-12-04 19:56:40 -07002487 struct hlist_node *tmp;
Jens Axboe221c5eb2019-01-17 09:41:58 -07002488 struct io_kiocb *req;
Jens Axboe78076bb2019-12-04 19:56:40 -07002489 int i;
Jens Axboe221c5eb2019-01-17 09:41:58 -07002490
2491 spin_lock_irq(&ctx->completion_lock);
Jens Axboe78076bb2019-12-04 19:56:40 -07002492 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
2493 struct hlist_head *list;
2494
2495 list = &ctx->cancel_hash[i];
2496 hlist_for_each_entry_safe(req, tmp, list, hash_node)
2497 io_poll_remove_one(req);
Jens Axboe221c5eb2019-01-17 09:41:58 -07002498 }
2499 spin_unlock_irq(&ctx->completion_lock);
2500}
2501
Jens Axboe47f46762019-11-09 17:43:02 -07002502static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
2503{
Jens Axboe78076bb2019-12-04 19:56:40 -07002504 struct hlist_head *list;
Jens Axboe47f46762019-11-09 17:43:02 -07002505 struct io_kiocb *req;
2506
Jens Axboe78076bb2019-12-04 19:56:40 -07002507 list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
2508 hlist_for_each_entry(req, list, hash_node) {
2509 if (sqe_addr == req->user_data) {
Jens Axboeeac406c2019-11-14 12:09:58 -07002510 io_poll_remove_one(req);
2511 return 0;
2512 }
Jens Axboe47f46762019-11-09 17:43:02 -07002513 }
2514
2515 return -ENOENT;
2516}
2517
Jens Axboe0969e782019-12-17 18:40:57 -07002518static int io_poll_remove_prep(struct io_kiocb *req)
Jens Axboe221c5eb2019-01-17 09:41:58 -07002519{
Jens Axboefc4df992019-12-10 14:38:45 -07002520 const struct io_uring_sqe *sqe = req->sqe;
Jens Axboe221c5eb2019-01-17 09:41:58 -07002521
Jens Axboe0969e782019-12-17 18:40:57 -07002522 if (req->flags & REQ_F_PREPPED)
2523 return 0;
Jens Axboe221c5eb2019-01-17 09:41:58 -07002524 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2525 return -EINVAL;
2526 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
2527 sqe->poll_events)
2528 return -EINVAL;
2529
Jens Axboe0969e782019-12-17 18:40:57 -07002530 req->poll.addr = READ_ONCE(sqe->addr);
2531 req->flags |= REQ_F_PREPPED;
2532 return 0;
2533}
2534
2535/*
2536 * Find a running poll command that matches one specified in sqe->addr,
2537 * and remove it if found.
2538 */
2539static int io_poll_remove(struct io_kiocb *req)
2540{
2541 struct io_ring_ctx *ctx = req->ctx;
2542 u64 addr;
2543 int ret;
2544
2545 ret = io_poll_remove_prep(req);
2546 if (ret)
2547 return ret;
2548
2549 addr = req->poll.addr;
Jens Axboe221c5eb2019-01-17 09:41:58 -07002550 spin_lock_irq(&ctx->completion_lock);
Jens Axboe0969e782019-12-17 18:40:57 -07002551 ret = io_poll_cancel(ctx, addr);
Jens Axboe221c5eb2019-01-17 09:41:58 -07002552 spin_unlock_irq(&ctx->completion_lock);
2553
Jens Axboe78e19bb2019-11-06 15:21:34 -07002554 io_cqring_add_event(req, ret);
Jens Axboe4e88d6e2019-12-07 20:59:47 -07002555 if (ret < 0)
2556 req_set_fail_links(req);
Jens Axboee65ef562019-03-12 10:16:44 -06002557 io_put_req(req);
Jens Axboe221c5eb2019-01-17 09:41:58 -07002558 return 0;
2559}
2560
Jens Axboeb0dd8a42019-11-18 12:14:54 -07002561static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
Jens Axboe221c5eb2019-01-17 09:41:58 -07002562{
Jackie Liua197f662019-11-08 08:09:12 -07002563 struct io_ring_ctx *ctx = req->ctx;
2564
Jens Axboe8c838782019-03-12 15:48:16 -06002565 req->poll.done = true;
Jens Axboeb0dd8a42019-11-18 12:14:54 -07002566 if (error)
2567 io_cqring_fill_event(req, error);
2568 else
2569 io_cqring_fill_event(req, mangle_poll(mask));
Jens Axboe8c838782019-03-12 15:48:16 -06002570 io_commit_cqring(ctx);
Jens Axboe221c5eb2019-01-17 09:41:58 -07002571}
2572
Jens Axboe561fb042019-10-24 07:25:42 -06002573static void io_poll_complete_work(struct io_wq_work **workptr)
Jens Axboe221c5eb2019-01-17 09:41:58 -07002574{
Jens Axboe561fb042019-10-24 07:25:42 -06002575 struct io_wq_work *work = *workptr;
Jens Axboe221c5eb2019-01-17 09:41:58 -07002576 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
2577 struct io_poll_iocb *poll = &req->poll;
2578 struct poll_table_struct pt = { ._key = poll->events };
2579 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe89723d02019-11-05 15:32:58 -07002580 struct io_kiocb *nxt = NULL;
Jens Axboe221c5eb2019-01-17 09:41:58 -07002581 __poll_t mask = 0;
Jens Axboeb0dd8a42019-11-18 12:14:54 -07002582 int ret = 0;
Jens Axboe221c5eb2019-01-17 09:41:58 -07002583
Jens Axboeb0dd8a42019-11-18 12:14:54 -07002584 if (work->flags & IO_WQ_WORK_CANCEL) {
Jens Axboe561fb042019-10-24 07:25:42 -06002585 WRITE_ONCE(poll->canceled, true);
Jens Axboeb0dd8a42019-11-18 12:14:54 -07002586 ret = -ECANCELED;
2587 } else if (READ_ONCE(poll->canceled)) {
2588 ret = -ECANCELED;
2589 }
Jens Axboe561fb042019-10-24 07:25:42 -06002590
Jens Axboeb0dd8a42019-11-18 12:14:54 -07002591 if (ret != -ECANCELED)
Jens Axboe221c5eb2019-01-17 09:41:58 -07002592 mask = vfs_poll(poll->file, &pt) & poll->events;
2593
2594 /*
2595 * Note that ->ki_cancel callers also delete iocb from active_reqs after
2596 * calling ->ki_cancel. We need the ctx_lock roundtrip here to
2597 * synchronize with them. In the cancellation case the list_del_init
 2598 * itself is not actually needed, but it is harmless, so we keep it in
 2599 * to avoid further branches in the fast path.
2600 */
2601 spin_lock_irq(&ctx->completion_lock);
Jens Axboeb0dd8a42019-11-18 12:14:54 -07002602 if (!mask && ret != -ECANCELED) {
Jens Axboe392edb42019-12-09 17:52:20 -07002603 add_wait_queue(poll->head, &poll->wait);
Jens Axboe221c5eb2019-01-17 09:41:58 -07002604 spin_unlock_irq(&ctx->completion_lock);
2605 return;
2606 }
Jens Axboe78076bb2019-12-04 19:56:40 -07002607 hash_del(&req->hash_node);
Jens Axboeb0dd8a42019-11-18 12:14:54 -07002608 io_poll_complete(req, mask, ret);
Jens Axboe221c5eb2019-01-17 09:41:58 -07002609 spin_unlock_irq(&ctx->completion_lock);
2610
Jens Axboe8c838782019-03-12 15:48:16 -06002611 io_cqring_ev_posted(ctx);
Jens Axboe89723d02019-11-05 15:32:58 -07002612
Jens Axboe4e88d6e2019-12-07 20:59:47 -07002613 if (ret < 0)
2614 req_set_fail_links(req);
Jackie Liuec9c02a2019-11-08 23:50:36 +08002615 io_put_req_find_next(req, &nxt);
Jens Axboe89723d02019-11-05 15:32:58 -07002616 if (nxt)
2617 *workptr = &nxt->work;
Jens Axboe221c5eb2019-01-17 09:41:58 -07002618}
2619
2620static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
2621 void *key)
2622{
Jens Axboee9444752019-11-26 15:02:04 -07002623 struct io_poll_iocb *poll = wait->private;
Jens Axboe221c5eb2019-01-17 09:41:58 -07002624 struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
2625 struct io_ring_ctx *ctx = req->ctx;
2626 __poll_t mask = key_to_poll(key);
Jens Axboe8c838782019-03-12 15:48:16 -06002627 unsigned long flags;
Jens Axboe221c5eb2019-01-17 09:41:58 -07002628
 2629 /* for instances that support it, check for an event match first: */
Jens Axboe8c838782019-03-12 15:48:16 -06002630 if (mask && !(mask & poll->events))
2631 return 0;
Jens Axboe221c5eb2019-01-17 09:41:58 -07002632
Jens Axboe392edb42019-12-09 17:52:20 -07002633 list_del_init(&poll->wait.entry);
Jens Axboe8c838782019-03-12 15:48:16 -06002634
Jens Axboe7c9e7f02019-11-12 08:15:53 -07002635 /*
2636 * Run completion inline if we can. We're using trylock here because
2637 * we are violating the completion_lock -> poll wq lock ordering.
 2638 * If we have a link timeout we're going to need the completion_lock
 2639 * for finalizing the request, so mark us as having grabbed it already.
2640 */
Jens Axboe8c838782019-03-12 15:48:16 -06002641 if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
Jens Axboe78076bb2019-12-04 19:56:40 -07002642 hash_del(&req->hash_node);
Jens Axboeb0dd8a42019-11-18 12:14:54 -07002643 io_poll_complete(req, mask, 0);
Jens Axboe7c9e7f02019-11-12 08:15:53 -07002644 req->flags |= REQ_F_COMP_LOCKED;
2645 io_put_req(req);
Jens Axboe8c838782019-03-12 15:48:16 -06002646 spin_unlock_irqrestore(&ctx->completion_lock, flags);
2647
2648 io_cqring_ev_posted(ctx);
Jens Axboe8c838782019-03-12 15:48:16 -06002649 } else {
Jackie Liua197f662019-11-08 08:09:12 -07002650 io_queue_async_work(req);
Jens Axboe8c838782019-03-12 15:48:16 -06002651 }
2652
Jens Axboe221c5eb2019-01-17 09:41:58 -07002653 return 1;
2654}
2655
2656struct io_poll_table {
2657 struct poll_table_struct pt;
2658 struct io_kiocb *req;
2659 int error;
2660};
2661
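/*
 * Editorial note (added): this is the poll_table ->_qproc callback invoked
 * from vfs_poll(). It records the wait queue head for the request and
 * queues the request's wait entry on it; seeing a second head is treated
 * as an error, since a single poll request only tracks one wait queue.
 */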
2662static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
2663 struct poll_table_struct *p)
2664{
2665 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
2666
2667 if (unlikely(pt->req->poll.head)) {
2668 pt->error = -EINVAL;
2669 return;
2670 }
2671
2672 pt->error = 0;
2673 pt->req->poll.head = head;
Jens Axboe392edb42019-12-09 17:52:20 -07002674 add_wait_queue(head, &pt->req->poll.wait);
Jens Axboe221c5eb2019-01-17 09:41:58 -07002675}
2676
Jens Axboeeac406c2019-11-14 12:09:58 -07002677static void io_poll_req_insert(struct io_kiocb *req)
2678{
2679 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe78076bb2019-12-04 19:56:40 -07002680 struct hlist_head *list;
Jens Axboeeac406c2019-11-14 12:09:58 -07002681
Jens Axboe78076bb2019-12-04 19:56:40 -07002682 list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
2683 hlist_add_head(&req->hash_node, list);
Jens Axboeeac406c2019-11-14 12:09:58 -07002684}
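
/*
 * Poll requests are bucketed by user_data so that cancellation can find
 * them again later. A minimal sketch of such a lookup, mirroring the
 * insertion above (the helper itself is hypothetical; callers in this
 * file open-code the walk):
 *
 *	static struct io_kiocb *__find_poll_req(struct io_ring_ctx *ctx,
 *						__u64 user_data)
 *	{
 *		struct hlist_head *list;
 *		struct io_kiocb *req;
 *
 *		list = &ctx->cancel_hash[hash_long(user_data,
 *						   ctx->cancel_hash_bits)];
 *		hlist_for_each_entry(req, list, hash_node)
 *			if (req->user_data == user_data)
 *				return req;
 *		return NULL;
 *	}
 */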
2685
Jens Axboe0969e782019-12-17 18:40:57 -07002686static int io_poll_add_prep(struct io_kiocb *req)
Jens Axboe221c5eb2019-01-17 09:41:58 -07002687{
Jens Axboefc4df992019-12-10 14:38:45 -07002688 const struct io_uring_sqe *sqe = req->sqe;
Jens Axboe221c5eb2019-01-17 09:41:58 -07002689 struct io_poll_iocb *poll = &req->poll;
Jens Axboe221c5eb2019-01-17 09:41:58 -07002690 u16 events;
Jens Axboe221c5eb2019-01-17 09:41:58 -07002691
Jens Axboe0969e782019-12-17 18:40:57 -07002692 if (req->flags & REQ_F_PREPPED)
2693 return 0;
Jens Axboe221c5eb2019-01-17 09:41:58 -07002694 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2695 return -EINVAL;
2696 if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
2697 return -EINVAL;
Jens Axboe09bb8392019-03-13 12:39:28 -06002698 if (!poll->file)
2699 return -EBADF;
Jens Axboe221c5eb2019-01-17 09:41:58 -07002700
Jens Axboe0969e782019-12-17 18:40:57 -07002701 req->flags |= REQ_F_PREPPED;
Jens Axboe221c5eb2019-01-17 09:41:58 -07002702 events = READ_ONCE(sqe->poll_events);
2703 poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
Jens Axboe0969e782019-12-17 18:40:57 -07002704 return 0;
2705}
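
/*
 * For reference, a minimal userspace sketch of a poll sqe that passes the
 * prep checks above (the sqe pointer and sock_fd are assumed to come from
 * the application's own ring setup): only fd, poll_events and user_data
 * are meaningful, everything else must stay zero.
 *
 *	memset(sqe, 0, sizeof(*sqe));
 *	sqe->opcode = IORING_OP_POLL_ADD;
 *	sqe->fd = sock_fd;
 *	sqe->poll_events = POLLIN;
 *	sqe->user_data = 0x1234;
 *
 * The poll is single-shot: one CQE is posted with the triggered events in
 * cqe->res once the file becomes ready.
 */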
2706
2707static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt)
2708{
2709 struct io_poll_iocb *poll = &req->poll;
2710 struct io_ring_ctx *ctx = req->ctx;
2711 struct io_poll_table ipt;
2712 bool cancel = false;
2713 __poll_t mask;
2714 int ret;
2715
2716 ret = io_poll_add_prep(req);
2717 if (ret)
2718 return ret;
2719
2720 INIT_IO_WORK(&req->work, io_poll_complete_work);
Jens Axboe78076bb2019-12-04 19:56:40 -07002721 INIT_HLIST_NODE(&req->hash_node);
Jens Axboe221c5eb2019-01-17 09:41:58 -07002722
Jens Axboe221c5eb2019-01-17 09:41:58 -07002723 poll->head = NULL;
Jens Axboe8c838782019-03-12 15:48:16 -06002724 poll->done = false;
Jens Axboe221c5eb2019-01-17 09:41:58 -07002725 poll->canceled = false;
2726
2727 ipt.pt._qproc = io_poll_queue_proc;
2728 ipt.pt._key = poll->events;
2729 ipt.req = req;
2730 ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
2731
2732	/* initialize the list so that we can do list_empty checks */
Jens Axboe392edb42019-12-09 17:52:20 -07002733 INIT_LIST_HEAD(&poll->wait.entry);
2734 init_waitqueue_func_entry(&poll->wait, io_poll_wake);
2735 poll->wait.private = poll;
Jens Axboe221c5eb2019-01-17 09:41:58 -07002736
Jens Axboe36703242019-07-25 10:20:18 -06002737 INIT_LIST_HEAD(&req->list);
2738
Jens Axboe221c5eb2019-01-17 09:41:58 -07002739 mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
Jens Axboe221c5eb2019-01-17 09:41:58 -07002740
2741 spin_lock_irq(&ctx->completion_lock);
Jens Axboe8c838782019-03-12 15:48:16 -06002742 if (likely(poll->head)) {
2743 spin_lock(&poll->head->lock);
Jens Axboe392edb42019-12-09 17:52:20 -07002744 if (unlikely(list_empty(&poll->wait.entry))) {
Jens Axboe8c838782019-03-12 15:48:16 -06002745 if (ipt.error)
2746 cancel = true;
2747 ipt.error = 0;
2748 mask = 0;
2749 }
2750 if (mask || ipt.error)
Jens Axboe392edb42019-12-09 17:52:20 -07002751 list_del_init(&poll->wait.entry);
Jens Axboe8c838782019-03-12 15:48:16 -06002752 else if (cancel)
2753 WRITE_ONCE(poll->canceled, true);
2754 else if (!poll->done) /* actually waiting for an event */
Jens Axboeeac406c2019-11-14 12:09:58 -07002755 io_poll_req_insert(req);
Jens Axboe8c838782019-03-12 15:48:16 -06002756 spin_unlock(&poll->head->lock);
Jens Axboe221c5eb2019-01-17 09:41:58 -07002757 }
Jens Axboe8c838782019-03-12 15:48:16 -06002758 if (mask) { /* no async, we'd stolen it */
Jens Axboe8c838782019-03-12 15:48:16 -06002759 ipt.error = 0;
Jens Axboeb0dd8a42019-11-18 12:14:54 -07002760 io_poll_complete(req, mask, 0);
Jens Axboe8c838782019-03-12 15:48:16 -06002761 }
Jens Axboe221c5eb2019-01-17 09:41:58 -07002762 spin_unlock_irq(&ctx->completion_lock);
2763
Jens Axboe8c838782019-03-12 15:48:16 -06002764 if (mask) {
2765 io_cqring_ev_posted(ctx);
Jackie Liuec9c02a2019-11-08 23:50:36 +08002766 io_put_req_find_next(req, nxt);
Jens Axboe221c5eb2019-01-17 09:41:58 -07002767 }
Jens Axboe8c838782019-03-12 15:48:16 -06002768 return ipt.error;
Jens Axboe221c5eb2019-01-17 09:41:58 -07002769}
2770
Jens Axboe5262f562019-09-17 12:26:57 -06002771static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
2772{
Jens Axboead8a48a2019-11-15 08:49:11 -07002773 struct io_timeout_data *data = container_of(timer,
2774 struct io_timeout_data, timer);
2775 struct io_kiocb *req = data->req;
2776 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe5262f562019-09-17 12:26:57 -06002777 unsigned long flags;
2778
Jens Axboe5262f562019-09-17 12:26:57 -06002779 atomic_inc(&ctx->cq_timeouts);
2780
2781 spin_lock_irqsave(&ctx->completion_lock, flags);
zhangyi (F)ef036812019-10-23 15:10:08 +08002782 /*
Jens Axboe11365042019-10-16 09:08:32 -06002783 * We could be racing with timeout deletion. If the list is empty,
2784 * then timeout lookup already found it and will be handling it.
zhangyi (F)ef036812019-10-23 15:10:08 +08002785 */
Jens Axboe842f9612019-10-29 12:34:10 -06002786 if (!list_empty(&req->list)) {
Jens Axboe11365042019-10-16 09:08:32 -06002787 struct io_kiocb *prev;
Jens Axboe5262f562019-09-17 12:26:57 -06002788
Jens Axboe11365042019-10-16 09:08:32 -06002789 /*
2790	 * Adjust the sequence of requests before the current one, because
Brian Gianforcarod195a662019-12-13 03:09:50 -08002791	 * this timeout will consume a slot in the cq_ring and the cq_tail
Jens Axboe11365042019-10-16 09:08:32 -06002792	 * pointer will be advanced; otherwise other timeout reqs may
2793	 * complete early without waiting for enough wait_nr events.
2794 */
2795 prev = req;
2796 list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
2797 prev->sequence++;
Jens Axboe11365042019-10-16 09:08:32 -06002798 list_del_init(&req->list);
Jens Axboe11365042019-10-16 09:08:32 -06002799 }
Jens Axboe842f9612019-10-29 12:34:10 -06002800
Jens Axboe78e19bb2019-11-06 15:21:34 -07002801 io_cqring_fill_event(req, -ETIME);
Jens Axboe5262f562019-09-17 12:26:57 -06002802 io_commit_cqring(ctx);
2803 spin_unlock_irqrestore(&ctx->completion_lock, flags);
2804
2805 io_cqring_ev_posted(ctx);
Jens Axboe4e88d6e2019-12-07 20:59:47 -07002806 req_set_fail_links(req);
Jens Axboe5262f562019-09-17 12:26:57 -06002807 io_put_req(req);
2808 return HRTIMER_NORESTART;
2809}
2810
Jens Axboe47f46762019-11-09 17:43:02 -07002811static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
2812{
2813 struct io_kiocb *req;
2814 int ret = -ENOENT;
2815
2816 list_for_each_entry(req, &ctx->timeout_list, list) {
2817 if (user_data == req->user_data) {
2818 list_del_init(&req->list);
2819 ret = 0;
2820 break;
2821 }
2822 }
2823
2824 if (ret == -ENOENT)
2825 return ret;
2826
Jens Axboe2d283902019-12-04 11:08:05 -07002827 ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
Jens Axboe47f46762019-11-09 17:43:02 -07002828 if (ret == -1)
2829 return -EALREADY;
2830
Jens Axboe4e88d6e2019-12-07 20:59:47 -07002831 req_set_fail_links(req);
Jens Axboe47f46762019-11-09 17:43:02 -07002832 io_cqring_fill_event(req, -ECANCELED);
2833 io_put_req(req);
2834 return 0;
2835}
2836
Jens Axboeb29472e2019-12-17 18:50:29 -07002837static int io_timeout_remove_prep(struct io_kiocb *req)
2838{
2839 const struct io_uring_sqe *sqe = req->sqe;
2840
2841 if (req->flags & REQ_F_PREPPED)
2842 return 0;
2843 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2844 return -EINVAL;
2845 if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
2846 return -EINVAL;
2847
2848 req->timeout.addr = READ_ONCE(sqe->addr);
2849 req->timeout.flags = READ_ONCE(sqe->timeout_flags);
2850 if (req->timeout.flags)
2851 return -EINVAL;
2852
2853 req->flags |= REQ_F_PREPPED;
2854 return 0;
2855}
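
/*
 * A minimal userspace sketch of the matching removal request (ring setup
 * assumed): sqe->addr carries the user_data of the timeout to remove, and
 * no timeout_flags are currently accepted.
 *
 *	memset(sqe, 0, sizeof(*sqe));
 *	sqe->opcode = IORING_OP_TIMEOUT_REMOVE;
 *	sqe->addr = 0xcafe;	/* user_data of the original timeout */
 *
 * The removal completes with 0 on success, -ENOENT if no such timeout was
 * found, or -EALREADY if the timer is already firing; the removed timeout
 * itself completes with -ECANCELED (see io_timeout_cancel() above).
 */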
2856
Jens Axboe11365042019-10-16 09:08:32 -06002857/*
2858 * Remove or update an existing timeout command
2859 */
Jens Axboefc4df992019-12-10 14:38:45 -07002860static int io_timeout_remove(struct io_kiocb *req)
Jens Axboe11365042019-10-16 09:08:32 -06002861{
2862 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe47f46762019-11-09 17:43:02 -07002863 int ret;
Jens Axboe11365042019-10-16 09:08:32 -06002864
Jens Axboeb29472e2019-12-17 18:50:29 -07002865 ret = io_timeout_remove_prep(req);
2866 if (ret)
2867 return ret;
Jens Axboe11365042019-10-16 09:08:32 -06002868
Jens Axboe11365042019-10-16 09:08:32 -06002869 spin_lock_irq(&ctx->completion_lock);
Jens Axboeb29472e2019-12-17 18:50:29 -07002870 ret = io_timeout_cancel(ctx, req->timeout.addr);
Jens Axboe11365042019-10-16 09:08:32 -06002871
Jens Axboe47f46762019-11-09 17:43:02 -07002872 io_cqring_fill_event(req, ret);
Jens Axboe11365042019-10-16 09:08:32 -06002873 io_commit_cqring(ctx);
2874 spin_unlock_irq(&ctx->completion_lock);
Jens Axboe5262f562019-09-17 12:26:57 -06002875 io_cqring_ev_posted(ctx);
Jens Axboe4e88d6e2019-12-07 20:59:47 -07002876 if (ret < 0)
2877 req_set_fail_links(req);
Jackie Liuec9c02a2019-11-08 23:50:36 +08002878 io_put_req(req);
Jens Axboe11365042019-10-16 09:08:32 -06002879 return 0;
Jens Axboe5262f562019-09-17 12:26:57 -06002880}
2881
Jens Axboe2d283902019-12-04 11:08:05 -07002882static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io,
2883 bool is_timeout_link)
Jens Axboe5262f562019-09-17 12:26:57 -06002884{
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03002885 const struct io_uring_sqe *sqe = req->sqe;
Jens Axboead8a48a2019-11-15 08:49:11 -07002886 struct io_timeout_data *data;
Jens Axboea41525a2019-10-15 16:48:15 -06002887 unsigned flags;
Jens Axboe5262f562019-09-17 12:26:57 -06002888
Jens Axboead8a48a2019-11-15 08:49:11 -07002889 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
Jens Axboe5262f562019-09-17 12:26:57 -06002890 return -EINVAL;
Jens Axboead8a48a2019-11-15 08:49:11 -07002891 if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
Jens Axboea41525a2019-10-15 16:48:15 -06002892 return -EINVAL;
Jens Axboe2d283902019-12-04 11:08:05 -07002893 if (sqe->off && is_timeout_link)
2894 return -EINVAL;
Jens Axboea41525a2019-10-15 16:48:15 -06002895 flags = READ_ONCE(sqe->timeout_flags);
2896 if (flags & ~IORING_TIMEOUT_ABS)
Jens Axboe5262f562019-09-17 12:26:57 -06002897 return -EINVAL;
Arnd Bergmannbdf20072019-10-01 09:53:29 -06002898
Jens Axboe2d283902019-12-04 11:08:05 -07002899 data = &io->timeout;
Jens Axboead8a48a2019-11-15 08:49:11 -07002900 data->req = req;
Jens Axboead8a48a2019-11-15 08:49:11 -07002901 req->flags |= REQ_F_TIMEOUT;
2902
2903 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
Jens Axboe5262f562019-09-17 12:26:57 -06002904 return -EFAULT;
2905
Jens Axboe11365042019-10-16 09:08:32 -06002906 if (flags & IORING_TIMEOUT_ABS)
Jens Axboead8a48a2019-11-15 08:49:11 -07002907 data->mode = HRTIMER_MODE_ABS;
Jens Axboe11365042019-10-16 09:08:32 -06002908 else
Jens Axboead8a48a2019-11-15 08:49:11 -07002909 data->mode = HRTIMER_MODE_REL;
Jens Axboe11365042019-10-16 09:08:32 -06002910
Jens Axboead8a48a2019-11-15 08:49:11 -07002911 hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
2912 return 0;
2913}
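
/*
 * A minimal userspace sketch of a timeout sqe accepted by the prep above
 * (ring setup assumed): addr points at a struct __kernel_timespec, len
 * must be 1, off optionally carries a completion count, and timeout_flags
 * is either 0 (relative) or IORING_TIMEOUT_ABS.
 *
 *	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *
 *	memset(sqe, 0, sizeof(*sqe));
 *	sqe->opcode = IORING_OP_TIMEOUT;
 *	sqe->addr = (unsigned long) &ts;
 *	sqe->len = 1;
 *	sqe->off = 0;		/* pure timeout, no completion count */
 *	sqe->user_data = 0xcafe;
 *
 * The request completes with -ETIME when the timer fires; if it is
 * removed first it completes with -ECANCELED instead (see
 * io_timeout_cancel() above).
 */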
2914
Jens Axboefc4df992019-12-10 14:38:45 -07002915static int io_timeout(struct io_kiocb *req)
Jens Axboead8a48a2019-11-15 08:49:11 -07002916{
Jens Axboefc4df992019-12-10 14:38:45 -07002917 const struct io_uring_sqe *sqe = req->sqe;
Jens Axboead8a48a2019-11-15 08:49:11 -07002918 unsigned count;
2919 struct io_ring_ctx *ctx = req->ctx;
2920 struct io_timeout_data *data;
2921 struct list_head *entry;
2922 unsigned span = 0;
Jens Axboeb7bb4f72019-12-15 22:13:43 -07002923 int ret;
Jens Axboead8a48a2019-11-15 08:49:11 -07002924
Jens Axboeb7bb4f72019-12-15 22:13:43 -07002925 if (!req->io) {
2926 if (io_alloc_async_ctx(req))
Jens Axboe2d283902019-12-04 11:08:05 -07002927 return -ENOMEM;
Jens Axboeb7bb4f72019-12-15 22:13:43 -07002928 ret = io_timeout_prep(req, req->io, false);
2929 if (ret)
Jens Axboe2d283902019-12-04 11:08:05 -07002930 return ret;
Jens Axboe2d283902019-12-04 11:08:05 -07002931 }
2932 data = &req->io->timeout;
Jens Axboe93bd25b2019-11-11 23:34:31 -07002933
Jens Axboe5262f562019-09-17 12:26:57 -06002934 /*
2935	 * sqe->off holds how many events need to occur for this
Jens Axboe93bd25b2019-11-11 23:34:31 -07002936	 * timeout event to be satisfied. If it isn't set, then this is
2937	 * a pure timeout request and the sequence isn't used.
Jens Axboe5262f562019-09-17 12:26:57 -06002938 */
2939 count = READ_ONCE(sqe->off);
Jens Axboe93bd25b2019-11-11 23:34:31 -07002940 if (!count) {
2941 req->flags |= REQ_F_TIMEOUT_NOSEQ;
2942 spin_lock_irq(&ctx->completion_lock);
2943 entry = ctx->timeout_list.prev;
2944 goto add;
2945 }
Jens Axboe5262f562019-09-17 12:26:57 -06002946
2947 req->sequence = ctx->cached_sq_head + count - 1;
Jens Axboe2d283902019-12-04 11:08:05 -07002948 data->seq_offset = count;
Jens Axboe5262f562019-09-17 12:26:57 -06002949
2950 /*
2951 * Insertion sort, ensuring the first entry in the list is always
2952 * the one we need first.
2953 */
Jens Axboe5262f562019-09-17 12:26:57 -06002954 spin_lock_irq(&ctx->completion_lock);
2955 list_for_each_prev(entry, &ctx->timeout_list) {
2956 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
yangerkun5da0fb12019-10-15 21:59:29 +08002957 unsigned nxt_sq_head;
2958 long long tmp, tmp_nxt;
Jens Axboe2d283902019-12-04 11:08:05 -07002959 u32 nxt_offset = nxt->io->timeout.seq_offset;
Jens Axboe5262f562019-09-17 12:26:57 -06002960
Jens Axboe93bd25b2019-11-11 23:34:31 -07002961 if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
2962 continue;
2963
yangerkun5da0fb12019-10-15 21:59:29 +08002964 /*
2965 * Since cached_sq_head + count - 1 can overflow, use type long
2966 * long to store it.
2967 */
2968 tmp = (long long)ctx->cached_sq_head + count - 1;
Pavel Begunkovcc42e0a2019-11-25 23:14:38 +03002969 nxt_sq_head = nxt->sequence - nxt_offset + 1;
2970 tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1;
yangerkun5da0fb12019-10-15 21:59:29 +08002971
2972 /*
2973	 * cached_sq_head may overflow, but it will never overflow twice
2974	 * while there is still a valid timeout req pending.
2975 */
2976 if (ctx->cached_sq_head < nxt_sq_head)
yangerkun8b07a652019-10-17 12:12:35 +08002977 tmp += UINT_MAX;
yangerkun5da0fb12019-10-15 21:59:29 +08002978
zhangyi (F)a1f58ba2019-10-23 15:10:09 +08002979 if (tmp > tmp_nxt)
Jens Axboe5262f562019-09-17 12:26:57 -06002980 break;
zhangyi (F)a1f58ba2019-10-23 15:10:09 +08002981
2982 /*
2983	 * The sequence of the reqs after the inserted one, and the inserted
2984	 * one itself, must be adjusted because each timeout req consumes a slot.
2985 */
2986 span++;
2987 nxt->sequence++;
Jens Axboe5262f562019-09-17 12:26:57 -06002988 }
zhangyi (F)a1f58ba2019-10-23 15:10:09 +08002989 req->sequence -= span;
Jens Axboe93bd25b2019-11-11 23:34:31 -07002990add:
Jens Axboe5262f562019-09-17 12:26:57 -06002991 list_add(&req->list, entry);
Jens Axboead8a48a2019-11-15 08:49:11 -07002992 data->timer.function = io_timeout_fn;
2993 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
Jens Axboe842f9612019-10-29 12:34:10 -06002994 spin_unlock_irq(&ctx->completion_lock);
Jens Axboe5262f562019-09-17 12:26:57 -06002995 return 0;
2996}
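
/*
 * A worked example of the sequence arithmetic above, assuming a 32-bit
 * cached_sq_head: with cached_sq_head = 4294967294 (UINT_MAX - 1) and
 * count = 4, the target sequence is 4294967297, which would wrap to 1 in
 * a u32. Comparing in long long, and adding UINT_MAX to tmp whenever
 * cached_sq_head has wrapped and is now numerically below an existing
 * entry's nxt_sq_head, keeps the insertion sort ordered across a single
 * wrap of the SQ head.
 */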
2997
Jens Axboe62755e32019-10-28 21:49:21 -06002998static bool io_cancel_cb(struct io_wq_work *work, void *data)
Jens Axboede0617e2019-04-06 21:51:27 -06002999{
Jens Axboe62755e32019-10-28 21:49:21 -06003000 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
Jens Axboede0617e2019-04-06 21:51:27 -06003001
Jens Axboe62755e32019-10-28 21:49:21 -06003002 return req->user_data == (unsigned long) data;
3003}
3004
Jens Axboee977d6d2019-11-05 12:39:45 -07003005static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
Jens Axboe62755e32019-10-28 21:49:21 -06003006{
Jens Axboe62755e32019-10-28 21:49:21 -06003007 enum io_wq_cancel cancel_ret;
Jens Axboe62755e32019-10-28 21:49:21 -06003008 int ret = 0;
3009
Jens Axboe62755e32019-10-28 21:49:21 -06003010 cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
3011 switch (cancel_ret) {
3012 case IO_WQ_CANCEL_OK:
3013 ret = 0;
3014 break;
3015 case IO_WQ_CANCEL_RUNNING:
3016 ret = -EALREADY;
3017 break;
3018 case IO_WQ_CANCEL_NOTFOUND:
3019 ret = -ENOENT;
3020 break;
3021 }
3022
Jens Axboee977d6d2019-11-05 12:39:45 -07003023 return ret;
3024}
3025
Jens Axboe47f46762019-11-09 17:43:02 -07003026static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
3027 struct io_kiocb *req, __u64 sqe_addr,
Jens Axboeb0dd8a42019-11-18 12:14:54 -07003028 struct io_kiocb **nxt, int success_ret)
Jens Axboe47f46762019-11-09 17:43:02 -07003029{
3030 unsigned long flags;
3031 int ret;
3032
3033 ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
3034 if (ret != -ENOENT) {
3035 spin_lock_irqsave(&ctx->completion_lock, flags);
3036 goto done;
3037 }
3038
3039 spin_lock_irqsave(&ctx->completion_lock, flags);
3040 ret = io_timeout_cancel(ctx, sqe_addr);
3041 if (ret != -ENOENT)
3042 goto done;
3043 ret = io_poll_cancel(ctx, sqe_addr);
3044done:
Jens Axboeb0dd8a42019-11-18 12:14:54 -07003045 if (!ret)
3046 ret = success_ret;
Jens Axboe47f46762019-11-09 17:43:02 -07003047 io_cqring_fill_event(req, ret);
3048 io_commit_cqring(ctx);
3049 spin_unlock_irqrestore(&ctx->completion_lock, flags);
3050 io_cqring_ev_posted(ctx);
3051
Jens Axboe4e88d6e2019-12-07 20:59:47 -07003052 if (ret < 0)
3053 req_set_fail_links(req);
Jens Axboe47f46762019-11-09 17:43:02 -07003054 io_put_req_find_next(req, nxt);
3055}
3056
Jens Axboefbf23842019-12-17 18:45:56 -07003057static int io_async_cancel_prep(struct io_kiocb *req)
Jens Axboee977d6d2019-11-05 12:39:45 -07003058{
Jens Axboefc4df992019-12-10 14:38:45 -07003059 const struct io_uring_sqe *sqe = req->sqe;
Jens Axboee977d6d2019-11-05 12:39:45 -07003060
Jens Axboefbf23842019-12-17 18:45:56 -07003061 if (req->flags & REQ_F_PREPPED)
3062 return 0;
3063 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
Jens Axboee977d6d2019-11-05 12:39:45 -07003064 return -EINVAL;
3065 if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
3066 sqe->cancel_flags)
3067 return -EINVAL;
3068
Jens Axboefbf23842019-12-17 18:45:56 -07003069 req->flags |= REQ_F_PREPPED;
3070 req->cancel.addr = READ_ONCE(sqe->addr);
3071 return 0;
3072}
3073
3074static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt)
3075{
3076 struct io_ring_ctx *ctx = req->ctx;
3077 int ret;
3078
3079 ret = io_async_cancel_prep(req);
3080 if (ret)
3081 return ret;
3082
3083 io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0);
Jens Axboe62755e32019-10-28 21:49:21 -06003084 return 0;
3085}
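
/*
 * A minimal userspace sketch of the cancel request accepted by the prep
 * above (ring setup assumed): only addr is used, carrying the user_data
 * of the request to cancel.
 *
 *	memset(sqe, 0, sizeof(*sqe));
 *	sqe->opcode = IORING_OP_ASYNC_CANCEL;
 *	sqe->addr = 0x1234;	/* user_data of the request to cancel */
 *
 * The cancel request completes with 0 if the target was cancelled,
 * -ENOENT if nothing matched, or -EALREADY if the target is already
 * running; the cancelled request itself completes with -ECANCELED.
 */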
3086
Jens Axboeb7bb4f72019-12-15 22:13:43 -07003087static int io_req_defer_prep(struct io_kiocb *req)
Jens Axboef67676d2019-12-02 11:03:47 -07003088{
3089 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
Jens Axboeb7bb4f72019-12-15 22:13:43 -07003090 struct io_async_ctx *io = req->io;
Jens Axboef67676d2019-12-02 11:03:47 -07003091 struct iov_iter iter;
Jens Axboee7815732019-12-17 19:45:06 -07003092 ssize_t ret = 0;
Jens Axboef67676d2019-12-02 11:03:47 -07003093
Jens Axboed625c6e2019-12-17 19:53:05 -07003094 switch (req->opcode) {
Jens Axboee7815732019-12-17 19:45:06 -07003095 case IORING_OP_NOP:
3096 break;
Jens Axboef67676d2019-12-02 11:03:47 -07003097 case IORING_OP_READV:
3098 case IORING_OP_READ_FIXED:
Jens Axboeb7bb4f72019-12-15 22:13:43 -07003099		/* clear req->io so prep actually imports the iovec */
3100 req->io = NULL;
Jens Axboef67676d2019-12-02 11:03:47 -07003101 ret = io_read_prep(req, &iovec, &iter, true);
Jens Axboeb7bb4f72019-12-15 22:13:43 -07003102 req->io = io;
3103 if (ret < 0)
3104 break;
3105 io_req_map_rw(req, ret, iovec, inline_vecs, &iter);
3106 ret = 0;
Jens Axboef67676d2019-12-02 11:03:47 -07003107 break;
3108 case IORING_OP_WRITEV:
3109 case IORING_OP_WRITE_FIXED:
Jens Axboeb7bb4f72019-12-15 22:13:43 -07003110		/* clear req->io so prep actually imports the iovec */
3111 req->io = NULL;
Jens Axboef67676d2019-12-02 11:03:47 -07003112 ret = io_write_prep(req, &iovec, &iter, true);
Jens Axboeb7bb4f72019-12-15 22:13:43 -07003113 req->io = io;
3114 if (ret < 0)
3115 break;
3116 io_req_map_rw(req, ret, iovec, inline_vecs, &iter);
3117 ret = 0;
Jens Axboef67676d2019-12-02 11:03:47 -07003118 break;
Jens Axboe0969e782019-12-17 18:40:57 -07003119 case IORING_OP_POLL_ADD:
3120 ret = io_poll_add_prep(req);
3121 break;
3122 case IORING_OP_POLL_REMOVE:
3123 ret = io_poll_remove_prep(req);
3124 break;
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003125 case IORING_OP_FSYNC:
3126 ret = io_prep_fsync(req);
3127 break;
3128 case IORING_OP_SYNC_FILE_RANGE:
3129 ret = io_prep_sfr(req);
3130 break;
Jens Axboe03b12302019-12-02 18:50:25 -07003131 case IORING_OP_SENDMSG:
3132 ret = io_sendmsg_prep(req, io);
3133 break;
3134 case IORING_OP_RECVMSG:
3135 ret = io_recvmsg_prep(req, io);
3136 break;
Jens Axboef499a022019-12-02 16:28:46 -07003137 case IORING_OP_CONNECT:
3138 ret = io_connect_prep(req, io);
3139 break;
Jens Axboe2d283902019-12-04 11:08:05 -07003140 case IORING_OP_TIMEOUT:
Jens Axboeb7bb4f72019-12-15 22:13:43 -07003141 ret = io_timeout_prep(req, io, false);
3142 break;
Jens Axboeb29472e2019-12-17 18:50:29 -07003143 case IORING_OP_TIMEOUT_REMOVE:
3144 ret = io_timeout_remove_prep(req);
3145 break;
Jens Axboefbf23842019-12-17 18:45:56 -07003146 case IORING_OP_ASYNC_CANCEL:
3147 ret = io_async_cancel_prep(req);
3148 break;
Jens Axboe2d283902019-12-04 11:08:05 -07003149 case IORING_OP_LINK_TIMEOUT:
Jens Axboeb7bb4f72019-12-15 22:13:43 -07003150 ret = io_timeout_prep(req, io, true);
3151 break;
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003152 case IORING_OP_ACCEPT:
3153 ret = io_accept_prep(req);
3154 break;
Jens Axboef67676d2019-12-02 11:03:47 -07003155 default:
Jens Axboee7815732019-12-17 19:45:06 -07003156 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
3157 req->opcode);
3158 ret = -EINVAL;
Jens Axboeb7bb4f72019-12-15 22:13:43 -07003159 break;
Jens Axboef67676d2019-12-02 11:03:47 -07003160 }
3161
Jens Axboeb7bb4f72019-12-15 22:13:43 -07003162 return ret;
Jens Axboef67676d2019-12-02 11:03:47 -07003163}
3164
Jackie Liua197f662019-11-08 08:09:12 -07003165static int io_req_defer(struct io_kiocb *req)
Jens Axboede0617e2019-04-06 21:51:27 -06003166{
Jackie Liua197f662019-11-08 08:09:12 -07003167 struct io_ring_ctx *ctx = req->ctx;
Jens Axboef67676d2019-12-02 11:03:47 -07003168 int ret;
Jens Axboede0617e2019-04-06 21:51:27 -06003169
Bob Liu9d858b22019-11-13 18:06:25 +08003170 /* Still need defer if there is pending req in defer list. */
3171 if (!req_need_defer(req) && list_empty(&ctx->defer_list))
Jens Axboede0617e2019-04-06 21:51:27 -06003172 return 0;
3173
Jens Axboeb7bb4f72019-12-15 22:13:43 -07003174 if (io_alloc_async_ctx(req))
Jens Axboede0617e2019-04-06 21:51:27 -06003175 return -EAGAIN;
3176
Jens Axboeb7bb4f72019-12-15 22:13:43 -07003177 ret = io_req_defer_prep(req);
3178 if (ret < 0)
Jens Axboe2d283902019-12-04 11:08:05 -07003179 return ret;
Jens Axboe2d283902019-12-04 11:08:05 -07003180
Jens Axboede0617e2019-04-06 21:51:27 -06003181 spin_lock_irq(&ctx->completion_lock);
Bob Liu9d858b22019-11-13 18:06:25 +08003182 if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
Jens Axboede0617e2019-04-06 21:51:27 -06003183 spin_unlock_irq(&ctx->completion_lock);
Jens Axboede0617e2019-04-06 21:51:27 -06003184 return 0;
3185 }
3186
Jens Axboe915967f2019-11-21 09:01:20 -07003187 trace_io_uring_defer(ctx, req, req->user_data);
Jens Axboede0617e2019-04-06 21:51:27 -06003188 list_add_tail(&req->list, &ctx->defer_list);
3189 spin_unlock_irq(&ctx->completion_lock);
3190 return -EIOCBQUEUED;
3191}
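
/*
 * From the submission side, draining is requested per sqe; a sketch (ring
 * setup assumed):
 *
 *	sqe->flags |= IOSQE_IO_DRAIN;
 *
 * Such a request is parked on ctx->defer_list by the function above and
 * is only issued once all previously submitted requests have completed.
 */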
3192
Pavel Begunkovf9bd67f2019-11-21 23:21:03 +03003193__attribute__((nonnull))
Pavel Begunkovd7324472019-11-21 21:24:36 +03003194static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
3195 bool force_nonblock)
Jens Axboe2b188cc2019-01-07 10:46:33 -07003196{
Jackie Liua197f662019-11-08 08:09:12 -07003197 struct io_ring_ctx *ctx = req->ctx;
Jens Axboed625c6e2019-12-17 19:53:05 -07003198 int ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -07003199
Jens Axboed625c6e2019-12-17 19:53:05 -07003200 switch (req->opcode) {
Jens Axboe2b188cc2019-01-07 10:46:33 -07003201 case IORING_OP_NOP:
Jens Axboe78e19bb2019-11-06 15:21:34 -07003202 ret = io_nop(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -07003203 break;
3204 case IORING_OP_READV:
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03003205 if (unlikely(req->sqe->buf_index))
Jens Axboeedafcce2019-01-09 09:16:05 -07003206 return -EINVAL;
Pavel Begunkov267bc902019-11-07 01:41:08 +03003207 ret = io_read(req, nxt, force_nonblock);
Jens Axboe2b188cc2019-01-07 10:46:33 -07003208 break;
3209 case IORING_OP_WRITEV:
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03003210 if (unlikely(req->sqe->buf_index))
Jens Axboeedafcce2019-01-09 09:16:05 -07003211 return -EINVAL;
Pavel Begunkov267bc902019-11-07 01:41:08 +03003212 ret = io_write(req, nxt, force_nonblock);
Jens Axboeedafcce2019-01-09 09:16:05 -07003213 break;
3214 case IORING_OP_READ_FIXED:
Pavel Begunkov267bc902019-11-07 01:41:08 +03003215 ret = io_read(req, nxt, force_nonblock);
Jens Axboeedafcce2019-01-09 09:16:05 -07003216 break;
3217 case IORING_OP_WRITE_FIXED:
Pavel Begunkov267bc902019-11-07 01:41:08 +03003218 ret = io_write(req, nxt, force_nonblock);
Jens Axboe2b188cc2019-01-07 10:46:33 -07003219 break;
Christoph Hellwigc992fe22019-01-11 09:43:02 -07003220 case IORING_OP_FSYNC:
Jens Axboefc4df992019-12-10 14:38:45 -07003221 ret = io_fsync(req, nxt, force_nonblock);
Christoph Hellwigc992fe22019-01-11 09:43:02 -07003222 break;
Jens Axboe221c5eb2019-01-17 09:41:58 -07003223 case IORING_OP_POLL_ADD:
Jens Axboefc4df992019-12-10 14:38:45 -07003224 ret = io_poll_add(req, nxt);
Jens Axboe221c5eb2019-01-17 09:41:58 -07003225 break;
3226 case IORING_OP_POLL_REMOVE:
Jens Axboefc4df992019-12-10 14:38:45 -07003227 ret = io_poll_remove(req);
Jens Axboe221c5eb2019-01-17 09:41:58 -07003228 break;
Jens Axboe5d17b4a2019-04-09 14:56:44 -06003229 case IORING_OP_SYNC_FILE_RANGE:
Jens Axboefc4df992019-12-10 14:38:45 -07003230 ret = io_sync_file_range(req, nxt, force_nonblock);
Jens Axboe5d17b4a2019-04-09 14:56:44 -06003231 break;
Jens Axboe0fa03c62019-04-19 13:34:07 -06003232 case IORING_OP_SENDMSG:
Jens Axboefc4df992019-12-10 14:38:45 -07003233 ret = io_sendmsg(req, nxt, force_nonblock);
Jens Axboe0fa03c62019-04-19 13:34:07 -06003234 break;
Jens Axboeaa1fa282019-04-19 13:38:09 -06003235 case IORING_OP_RECVMSG:
Jens Axboefc4df992019-12-10 14:38:45 -07003236 ret = io_recvmsg(req, nxt, force_nonblock);
Jens Axboeaa1fa282019-04-19 13:38:09 -06003237 break;
Jens Axboe5262f562019-09-17 12:26:57 -06003238 case IORING_OP_TIMEOUT:
Jens Axboefc4df992019-12-10 14:38:45 -07003239 ret = io_timeout(req);
Jens Axboe5262f562019-09-17 12:26:57 -06003240 break;
Jens Axboe11365042019-10-16 09:08:32 -06003241 case IORING_OP_TIMEOUT_REMOVE:
Jens Axboefc4df992019-12-10 14:38:45 -07003242 ret = io_timeout_remove(req);
Jens Axboe11365042019-10-16 09:08:32 -06003243 break;
Jens Axboe17f2fe32019-10-17 14:42:58 -06003244 case IORING_OP_ACCEPT:
Jens Axboefc4df992019-12-10 14:38:45 -07003245 ret = io_accept(req, nxt, force_nonblock);
Jens Axboe17f2fe32019-10-17 14:42:58 -06003246 break;
Jens Axboef8e85cf2019-11-23 14:24:24 -07003247 case IORING_OP_CONNECT:
Jens Axboefc4df992019-12-10 14:38:45 -07003248 ret = io_connect(req, nxt, force_nonblock);
Jens Axboef8e85cf2019-11-23 14:24:24 -07003249 break;
Jens Axboe62755e32019-10-28 21:49:21 -06003250 case IORING_OP_ASYNC_CANCEL:
Jens Axboefc4df992019-12-10 14:38:45 -07003251 ret = io_async_cancel(req, nxt);
Jens Axboe62755e32019-10-28 21:49:21 -06003252 break;
Jens Axboe2b188cc2019-01-07 10:46:33 -07003253 default:
3254 ret = -EINVAL;
3255 break;
3256 }
3257
Jens Axboedef596e2019-01-09 08:59:42 -07003258 if (ret)
3259 return ret;
3260
3261 if (ctx->flags & IORING_SETUP_IOPOLL) {
Jens Axboe9e645e112019-05-10 16:07:28 -06003262 if (req->result == -EAGAIN)
Jens Axboedef596e2019-01-09 08:59:42 -07003263 return -EAGAIN;
3264
Jens Axboedef596e2019-01-09 08:59:42 -07003265 io_iopoll_req_issued(req);
Jens Axboedef596e2019-01-09 08:59:42 -07003266 }
3267
3268 return 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07003269}
3270
Jens Axboeb76da702019-11-20 13:05:32 -07003271static void io_link_work_cb(struct io_wq_work **workptr)
3272{
3273 struct io_wq_work *work = *workptr;
3274 struct io_kiocb *link = work->data;
3275
3276 io_queue_linked_timeout(link);
3277 work->func = io_wq_submit_work;
3278}
3279
Jens Axboe561fb042019-10-24 07:25:42 -06003280static void io_wq_submit_work(struct io_wq_work **workptr)
Jens Axboe31b51512019-01-18 22:56:34 -07003281{
Jens Axboe561fb042019-10-24 07:25:42 -06003282 struct io_wq_work *work = *workptr;
Jens Axboe2b188cc2019-01-07 10:46:33 -07003283 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
Jens Axboe561fb042019-10-24 07:25:42 -06003284 struct io_kiocb *nxt = NULL;
3285 int ret = 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07003286
Jens Axboe561fb042019-10-24 07:25:42 -06003287 if (work->flags & IO_WQ_WORK_CANCEL)
3288 ret = -ECANCELED;
Jens Axboe31b51512019-01-18 22:56:34 -07003289
Jens Axboe561fb042019-10-24 07:25:42 -06003290 if (!ret) {
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03003291 req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
3292 req->in_async = true;
Jens Axboe561fb042019-10-24 07:25:42 -06003293 do {
Pavel Begunkovd7324472019-11-21 21:24:36 +03003294 ret = io_issue_sqe(req, &nxt, false);
Jens Axboe561fb042019-10-24 07:25:42 -06003295 /*
3296 * We can get EAGAIN for polled IO even though we're
3297 * forcing a sync submission from here, since we can't
3298 * wait for request slots on the block side.
3299 */
3300 if (ret != -EAGAIN)
3301 break;
3302 cond_resched();
3303 } while (1);
3304 }
Jens Axboe31b51512019-01-18 22:56:34 -07003305
Jens Axboe561fb042019-10-24 07:25:42 -06003306 /* drop submission reference */
Jackie Liuec9c02a2019-11-08 23:50:36 +08003307 io_put_req(req);
Jens Axboe817869d2019-04-30 14:44:05 -06003308
Jens Axboe561fb042019-10-24 07:25:42 -06003309 if (ret) {
Jens Axboe4e88d6e2019-12-07 20:59:47 -07003310 req_set_fail_links(req);
Jens Axboe78e19bb2019-11-06 15:21:34 -07003311 io_cqring_add_event(req, ret);
Jens Axboe817869d2019-04-30 14:44:05 -06003312 io_put_req(req);
Jens Axboeedafcce2019-01-09 09:16:05 -07003313 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07003314
Jens Axboe561fb042019-10-24 07:25:42 -06003315 /* if a dependent link is ready, pass it back */
3316 if (!ret && nxt) {
Jens Axboe94ae5e72019-11-14 19:39:52 -07003317 struct io_kiocb *link;
3318
3319 io_prep_async_work(nxt, &link);
Jens Axboe561fb042019-10-24 07:25:42 -06003320 *workptr = &nxt->work;
Jens Axboeb76da702019-11-20 13:05:32 -07003321 if (link) {
3322 nxt->work.flags |= IO_WQ_WORK_CB;
3323 nxt->work.func = io_link_work_cb;
3324 nxt->work.data = link;
3325 }
Jens Axboeedafcce2019-01-09 09:16:05 -07003326 }
Jens Axboe31b51512019-01-18 22:56:34 -07003327}
Jens Axboe2b188cc2019-01-07 10:46:33 -07003328
Jens Axboe9e3aa612019-12-11 15:55:43 -07003329static bool io_req_op_valid(int op)
3330{
3331 return op >= IORING_OP_NOP && op < IORING_OP_LAST;
3332}
3333
Jens Axboed625c6e2019-12-17 19:53:05 -07003334static int io_req_needs_file(struct io_kiocb *req)
Jens Axboe09bb8392019-03-13 12:39:28 -06003335{
Jens Axboed625c6e2019-12-17 19:53:05 -07003336 switch (req->opcode) {
Jens Axboe09bb8392019-03-13 12:39:28 -06003337 case IORING_OP_NOP:
3338 case IORING_OP_POLL_REMOVE:
Pavel Begunkov5683e542019-11-14 00:59:19 +03003339 case IORING_OP_TIMEOUT:
Pavel Begunkova320e9f2019-11-14 00:11:01 +03003340 case IORING_OP_TIMEOUT_REMOVE:
3341 case IORING_OP_ASYNC_CANCEL:
3342 case IORING_OP_LINK_TIMEOUT:
Jens Axboe9e3aa612019-12-11 15:55:43 -07003343 return 0;
Jens Axboe09bb8392019-03-13 12:39:28 -06003344 default:
Jens Axboed625c6e2019-12-17 19:53:05 -07003345 if (io_req_op_valid(req->opcode))
Jens Axboe9e3aa612019-12-11 15:55:43 -07003346 return 1;
3347 return -EINVAL;
Jens Axboe09bb8392019-03-13 12:39:28 -06003348 }
3349}
3350
Jens Axboe65e19f52019-10-26 07:20:21 -06003351static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
3352 int index)
Jens Axboe09bb8392019-03-13 12:39:28 -06003353{
Jens Axboe65e19f52019-10-26 07:20:21 -06003354 struct fixed_file_table *table;
3355
3356 table = &ctx->file_table[index >> IORING_FILE_TABLE_SHIFT];
3357 return table->files[index & IORING_FILE_TABLE_MASK];
3358}
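
/*
 * The registered-file table is a two level array: the upper bits of a
 * fixed file index select a table page, the lower bits select the slot
 * within it. Assuming IORING_FILE_TABLE_SHIFT is 9 (512 entries per page,
 * as in this version of the code), fixed file index 1000 resolves to page
 * 1, slot 488. Userspace selects a registered file by setting
 * IOSQE_FIXED_FILE and putting the index in sqe->fd (a sketch, ring and
 * file registration assumed):
 *
 *	sqe->flags |= IOSQE_FIXED_FILE;
 *	sqe->fd = 1000;		/* index into the registered file set */
 */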
3359
Jackie Liua197f662019-11-08 08:09:12 -07003360static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req)
Jens Axboe09bb8392019-03-13 12:39:28 -06003361{
Jackie Liua197f662019-11-08 08:09:12 -07003362 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe09bb8392019-03-13 12:39:28 -06003363 unsigned flags;
Jens Axboe9e3aa612019-12-11 15:55:43 -07003364 int fd, ret;
Jens Axboe09bb8392019-03-13 12:39:28 -06003365
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03003366 flags = READ_ONCE(req->sqe->flags);
3367 fd = READ_ONCE(req->sqe->fd);
Jens Axboe09bb8392019-03-13 12:39:28 -06003368
Jackie Liu4fe2c962019-09-09 20:50:40 +08003369 if (flags & IOSQE_IO_DRAIN)
Jens Axboede0617e2019-04-06 21:51:27 -06003370 req->flags |= REQ_F_IO_DRAIN;
Jens Axboede0617e2019-04-06 21:51:27 -06003371
Jens Axboed625c6e2019-12-17 19:53:05 -07003372 ret = io_req_needs_file(req);
Jens Axboe9e3aa612019-12-11 15:55:43 -07003373 if (ret <= 0)
3374 return ret;
Jens Axboe09bb8392019-03-13 12:39:28 -06003375
3376 if (flags & IOSQE_FIXED_FILE) {
Jens Axboe65e19f52019-10-26 07:20:21 -06003377 if (unlikely(!ctx->file_table ||
Jens Axboe09bb8392019-03-13 12:39:28 -06003378 (unsigned) fd >= ctx->nr_user_files))
3379 return -EBADF;
Jens Axboeb7620122019-10-26 07:22:55 -06003380 fd = array_index_nospec(fd, ctx->nr_user_files);
Jens Axboe65e19f52019-10-26 07:20:21 -06003381 req->file = io_file_from_index(ctx, fd);
3382 if (!req->file)
Jens Axboe08a45172019-10-03 08:11:03 -06003383 return -EBADF;
Jens Axboe09bb8392019-03-13 12:39:28 -06003384 req->flags |= REQ_F_FIXED_FILE;
3385 } else {
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03003386 if (req->needs_fixed_file)
Jens Axboe09bb8392019-03-13 12:39:28 -06003387 return -EBADF;
Dmitrii Dolgovc826bd72019-10-15 19:02:01 +02003388 trace_io_uring_file_get(ctx, fd);
Jens Axboe09bb8392019-03-13 12:39:28 -06003389 req->file = io_file_get(state, fd);
3390 if (unlikely(!req->file))
3391 return -EBADF;
3392 }
3393
3394 return 0;
3395}
3396
Jackie Liua197f662019-11-08 08:09:12 -07003397static int io_grab_files(struct io_kiocb *req)
Jens Axboe2b188cc2019-01-07 10:46:33 -07003398{
Jens Axboefcb323c2019-10-24 12:39:47 -06003399 int ret = -EBADF;
Jackie Liua197f662019-11-08 08:09:12 -07003400 struct io_ring_ctx *ctx = req->ctx;
Jens Axboefcb323c2019-10-24 12:39:47 -06003401
3402 rcu_read_lock();
3403 spin_lock_irq(&ctx->inflight_lock);
3404 /*
3405 * We use the f_ops->flush() handler to ensure that we can flush
3406 * out work accessing these files if the fd is closed. Check if
3407 * the fd has changed since we started down this path, and disallow
3408 * this operation if it has.
3409 */
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03003410 if (fcheck(req->ring_fd) == req->ring_file) {
Jens Axboefcb323c2019-10-24 12:39:47 -06003411 list_add(&req->inflight_entry, &ctx->inflight_list);
3412 req->flags |= REQ_F_INFLIGHT;
3413 req->work.files = current->files;
3414 ret = 0;
3415 }
3416 spin_unlock_irq(&ctx->inflight_lock);
3417 rcu_read_unlock();
3418
3419 return ret;
3420}
3421
Jens Axboe2665abf2019-11-05 12:40:47 -07003422static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
3423{
Jens Axboead8a48a2019-11-15 08:49:11 -07003424 struct io_timeout_data *data = container_of(timer,
3425 struct io_timeout_data, timer);
3426 struct io_kiocb *req = data->req;
Jens Axboe2665abf2019-11-05 12:40:47 -07003427 struct io_ring_ctx *ctx = req->ctx;
3428 struct io_kiocb *prev = NULL;
3429 unsigned long flags;
Jens Axboe2665abf2019-11-05 12:40:47 -07003430
3431 spin_lock_irqsave(&ctx->completion_lock, flags);
3432
3433 /*
3434 * We don't expect the list to be empty, that will only happen if we
3435 * race with the completion of the linked work.
3436 */
Pavel Begunkov44932332019-12-05 16:16:35 +03003437 if (!list_empty(&req->link_list)) {
3438 prev = list_entry(req->link_list.prev, struct io_kiocb,
3439 link_list);
Jens Axboe5d960722019-11-19 15:31:28 -07003440 if (refcount_inc_not_zero(&prev->refs)) {
Pavel Begunkov44932332019-12-05 16:16:35 +03003441 list_del_init(&req->link_list);
Jens Axboe5d960722019-11-19 15:31:28 -07003442 prev->flags &= ~REQ_F_LINK_TIMEOUT;
3443 } else
Jens Axboe76a46e02019-11-10 23:34:16 -07003444 prev = NULL;
Jens Axboe2665abf2019-11-05 12:40:47 -07003445 }
3446
3447 spin_unlock_irqrestore(&ctx->completion_lock, flags);
3448
3449 if (prev) {
Jens Axboe4e88d6e2019-12-07 20:59:47 -07003450 req_set_fail_links(prev);
Jens Axboeb0dd8a42019-11-18 12:14:54 -07003451 io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
3452 -ETIME);
Jens Axboe76a46e02019-11-10 23:34:16 -07003453 io_put_req(prev);
Jens Axboe47f46762019-11-09 17:43:02 -07003454 } else {
3455 io_cqring_add_event(req, -ETIME);
3456 io_put_req(req);
Jens Axboe2665abf2019-11-05 12:40:47 -07003457 }
Jens Axboe2665abf2019-11-05 12:40:47 -07003458 return HRTIMER_NORESTART;
3459}
3460
Jens Axboead8a48a2019-11-15 08:49:11 -07003461static void io_queue_linked_timeout(struct io_kiocb *req)
Jens Axboe2665abf2019-11-05 12:40:47 -07003462{
Jens Axboe76a46e02019-11-10 23:34:16 -07003463 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe2665abf2019-11-05 12:40:47 -07003464
Jens Axboe76a46e02019-11-10 23:34:16 -07003465 /*
3466 * If the list is now empty, then our linked request finished before
3467	 * we got a chance to set up the timer.
3468 */
3469 spin_lock_irq(&ctx->completion_lock);
Pavel Begunkov44932332019-12-05 16:16:35 +03003470 if (!list_empty(&req->link_list)) {
Jens Axboe2d283902019-12-04 11:08:05 -07003471 struct io_timeout_data *data = &req->io->timeout;
Jens Axboe94ae5e72019-11-14 19:39:52 -07003472
Jens Axboead8a48a2019-11-15 08:49:11 -07003473 data->timer.function = io_link_timeout_fn;
3474 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
3475 data->mode);
Jens Axboe2665abf2019-11-05 12:40:47 -07003476 }
Jens Axboe76a46e02019-11-10 23:34:16 -07003477 spin_unlock_irq(&ctx->completion_lock);
Jens Axboe2665abf2019-11-05 12:40:47 -07003478
Jens Axboe2665abf2019-11-05 12:40:47 -07003479 /* drop submission reference */
Jens Axboe76a46e02019-11-10 23:34:16 -07003480 io_put_req(req);
Jens Axboe2665abf2019-11-05 12:40:47 -07003481}
3482
Jens Axboead8a48a2019-11-15 08:49:11 -07003483static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
Jens Axboe2665abf2019-11-05 12:40:47 -07003484{
3485 struct io_kiocb *nxt;
Jens Axboe2b188cc2019-01-07 10:46:33 -07003486
Jens Axboe2665abf2019-11-05 12:40:47 -07003487 if (!(req->flags & REQ_F_LINK))
3488 return NULL;
3489
Pavel Begunkov44932332019-12-05 16:16:35 +03003490 nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
3491 link_list);
Jens Axboed625c6e2019-12-17 19:53:05 -07003492 if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
Jens Axboe76a46e02019-11-10 23:34:16 -07003493 return NULL;
Jens Axboe2665abf2019-11-05 12:40:47 -07003494
Jens Axboe76a46e02019-11-10 23:34:16 -07003495 req->flags |= REQ_F_LINK_TIMEOUT;
Jens Axboe76a46e02019-11-10 23:34:16 -07003496 return nxt;
Jens Axboe2665abf2019-11-05 12:40:47 -07003497}
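
/*
 * A linked timeout must be the sqe immediately following the request it
 * guards, inside the same link chain. A userspace sketch (ring setup
 * assumed; fd and iovecs come from the application; off must be 0 for the
 * link timeout, see io_timeout_prep() with is_timeout_link set):
 *
 *	struct __kernel_timespec ts = { .tv_sec = 0, .tv_nsec = 500000000 };
 *
 *	memset(read_sqe, 0, sizeof(*read_sqe));
 *	read_sqe->opcode = IORING_OP_READV;
 *	read_sqe->fd = fd;
 *	read_sqe->addr = (unsigned long) iovecs;
 *	read_sqe->len = 1;
 *	read_sqe->flags = IOSQE_IO_LINK;
 *	read_sqe->user_data = 1;
 *
 *	memset(timeout_sqe, 0, sizeof(*timeout_sqe));
 *	timeout_sqe->opcode = IORING_OP_LINK_TIMEOUT;
 *	timeout_sqe->addr = (unsigned long) &ts;
 *	timeout_sqe->len = 1;
 *	timeout_sqe->user_data = 2;
 *
 * If the timer fires first, the read is cancelled and completes with
 * -ECANCELED while the link timeout completes with -ETIME; if the read
 * finishes in time, the link timeout is cancelled and completes with
 * -ECANCELED instead.
 */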
3498
Jens Axboe0e0702d2019-11-14 21:42:10 -07003499static void __io_queue_sqe(struct io_kiocb *req)
Jens Axboe2b188cc2019-01-07 10:46:33 -07003500{
Jens Axboe4a0a7a12019-12-09 20:01:01 -07003501 struct io_kiocb *linked_timeout;
Pavel Begunkovf9bd67f2019-11-21 23:21:03 +03003502 struct io_kiocb *nxt = NULL;
Jens Axboe2b188cc2019-01-07 10:46:33 -07003503 int ret;
3504
Jens Axboe4a0a7a12019-12-09 20:01:01 -07003505again:
3506 linked_timeout = io_prep_linked_timeout(req);
3507
Pavel Begunkovf9bd67f2019-11-21 23:21:03 +03003508 ret = io_issue_sqe(req, &nxt, true);
Jens Axboe491381ce2019-10-17 09:20:46 -06003509
3510 /*
3511 * We async punt it if the file wasn't marked NOWAIT, or if the file
3512 * doesn't support non-blocking read/write attempts
3513 */
3514 if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
3515 (req->flags & REQ_F_MUST_PUNT))) {
Pavel Begunkovbbad27b2019-11-19 23:32:47 +03003516 if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
3517 ret = io_grab_files(req);
3518 if (ret)
3519 goto err;
Jens Axboe2b188cc2019-01-07 10:46:33 -07003520 }
Pavel Begunkovbbad27b2019-11-19 23:32:47 +03003521
3522 /*
3523 * Queued up for async execution, worker will release
3524 * submit reference when the iocb is actually submitted.
3525 */
3526 io_queue_async_work(req);
Jens Axboe4a0a7a12019-12-09 20:01:01 -07003527 goto done_req;
Jens Axboe2b188cc2019-01-07 10:46:33 -07003528 }
Jens Axboee65ef562019-03-12 10:16:44 -06003529
Jens Axboefcb323c2019-10-24 12:39:47 -06003530err:
Jens Axboee65ef562019-03-12 10:16:44 -06003531 /* drop submission reference */
3532 io_put_req(req);
3533
Pavel Begunkovf9bd67f2019-11-21 23:21:03 +03003534 if (linked_timeout) {
Jens Axboe76a46e02019-11-10 23:34:16 -07003535 if (!ret)
Pavel Begunkovf9bd67f2019-11-21 23:21:03 +03003536 io_queue_linked_timeout(linked_timeout);
Jens Axboe76a46e02019-11-10 23:34:16 -07003537 else
Pavel Begunkovf9bd67f2019-11-21 23:21:03 +03003538 io_put_req(linked_timeout);
Jens Axboe76a46e02019-11-10 23:34:16 -07003539 }
3540
Jens Axboee65ef562019-03-12 10:16:44 -06003541 /* and drop final reference, if we failed */
Jens Axboe9e645e112019-05-10 16:07:28 -06003542 if (ret) {
Jens Axboe78e19bb2019-11-06 15:21:34 -07003543 io_cqring_add_event(req, ret);
Jens Axboe4e88d6e2019-12-07 20:59:47 -07003544 req_set_fail_links(req);
Jens Axboee65ef562019-03-12 10:16:44 -06003545 io_put_req(req);
Jens Axboe9e645e112019-05-10 16:07:28 -06003546 }
Jens Axboe4a0a7a12019-12-09 20:01:01 -07003547done_req:
3548 if (nxt) {
3549 req = nxt;
3550 nxt = NULL;
3551 goto again;
3552 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07003553}
3554
Jens Axboe0e0702d2019-11-14 21:42:10 -07003555static void io_queue_sqe(struct io_kiocb *req)
Jackie Liu4fe2c962019-09-09 20:50:40 +08003556{
3557 int ret;
3558
Pavel Begunkov1b4a51b2019-11-21 11:54:28 +03003559 if (unlikely(req->ctx->drain_next)) {
3560 req->flags |= REQ_F_IO_DRAIN;
3561 req->ctx->drain_next = false;
3562 }
3563 req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK);
3564
Jackie Liua197f662019-11-08 08:09:12 -07003565 ret = io_req_defer(req);
Jackie Liu4fe2c962019-09-09 20:50:40 +08003566 if (ret) {
3567 if (ret != -EIOCBQUEUED) {
Jens Axboe78e19bb2019-11-06 15:21:34 -07003568 io_cqring_add_event(req, ret);
Jens Axboe4e88d6e2019-12-07 20:59:47 -07003569 req_set_fail_links(req);
Jens Axboe78e19bb2019-11-06 15:21:34 -07003570 io_double_put_req(req);
Jackie Liu4fe2c962019-09-09 20:50:40 +08003571 }
Jens Axboe0e0702d2019-11-14 21:42:10 -07003572 } else
3573 __io_queue_sqe(req);
Jackie Liu4fe2c962019-09-09 20:50:40 +08003574}
3575
Pavel Begunkov1b4a51b2019-11-21 11:54:28 +03003576static inline void io_queue_link_head(struct io_kiocb *req)
Jackie Liu4fe2c962019-09-09 20:50:40 +08003577{
Jens Axboe94ae5e72019-11-14 19:39:52 -07003578 if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
Pavel Begunkov1b4a51b2019-11-21 11:54:28 +03003579 io_cqring_add_event(req, -ECANCELED);
3580 io_double_put_req(req);
3581 } else
Jens Axboe0e0702d2019-11-14 21:42:10 -07003582 io_queue_sqe(req);
Jackie Liu4fe2c962019-09-09 20:50:40 +08003583}
3584
Jens Axboe4e88d6e2019-12-07 20:59:47 -07003585#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
3586 IOSQE_IO_HARDLINK)
Jens Axboe9e645e112019-05-10 16:07:28 -06003587
Pavel Begunkov2e6e1fd2019-12-05 16:15:45 +03003588static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
Jackie Liua197f662019-11-08 08:09:12 -07003589 struct io_kiocb **link)
Jens Axboe9e645e112019-05-10 16:07:28 -06003590{
Jackie Liua197f662019-11-08 08:09:12 -07003591 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe9e645e112019-05-10 16:07:28 -06003592 int ret;
3593
3594 /* enforce forwards compatibility on users */
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03003595 if (unlikely(req->sqe->flags & ~SQE_VALID_FLAGS)) {
Jens Axboe9e645e112019-05-10 16:07:28 -06003596 ret = -EINVAL;
Pavel Begunkov196be952019-11-07 01:41:06 +03003597 goto err_req;
Jens Axboe9e645e112019-05-10 16:07:28 -06003598 }
3599
Jackie Liua197f662019-11-08 08:09:12 -07003600 ret = io_req_set_file(state, req);
Jens Axboe9e645e112019-05-10 16:07:28 -06003601 if (unlikely(ret)) {
3602err_req:
Jens Axboe78e19bb2019-11-06 15:21:34 -07003603 io_cqring_add_event(req, ret);
3604 io_double_put_req(req);
Pavel Begunkov2e6e1fd2019-12-05 16:15:45 +03003605 return false;
Jens Axboe9e645e112019-05-10 16:07:28 -06003606 }
3607
Jens Axboe9e645e112019-05-10 16:07:28 -06003608 /*
3609 * If we already have a head request, queue this one for async
3610 * submittal once the head completes. If we don't have a head but
3611 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
3612 * submitted sync once the chain is complete. If none of those
3613 * conditions are true (normal request), then just queue it.
3614 */
3615 if (*link) {
3616 struct io_kiocb *prev = *link;
3617
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03003618 if (req->sqe->flags & IOSQE_IO_DRAIN)
Pavel Begunkov1b4a51b2019-11-21 11:54:28 +03003619 (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
3620
Jens Axboe4e88d6e2019-12-07 20:59:47 -07003621 if (req->sqe->flags & IOSQE_IO_HARDLINK)
3622 req->flags |= REQ_F_HARDLINK;
3623
Jens Axboeb7bb4f72019-12-15 22:13:43 -07003624 if (io_alloc_async_ctx(req)) {
Jens Axboe9e645e112019-05-10 16:07:28 -06003625 ret = -EAGAIN;
3626 goto err_req;
3627 }
3628
Jens Axboeb7bb4f72019-12-15 22:13:43 -07003629 ret = io_req_defer_prep(req);
Jens Axboe2d283902019-12-04 11:08:05 -07003630 if (ret) {
Jens Axboe4e88d6e2019-12-07 20:59:47 -07003631 /* fail even hard links since we don't submit */
Jens Axboe2d283902019-12-04 11:08:05 -07003632 prev->flags |= REQ_F_FAIL_LINK;
Jens Axboef67676d2019-12-02 11:03:47 -07003633 goto err_req;
Jens Axboe2d283902019-12-04 11:08:05 -07003634 }
Dmitrii Dolgovc826bd72019-10-15 19:02:01 +02003635 trace_io_uring_link(ctx, req, prev);
Pavel Begunkov44932332019-12-05 16:16:35 +03003636 list_add_tail(&req->link_list, &prev->link_list);
Jens Axboe4e88d6e2019-12-07 20:59:47 -07003637 } else if (req->sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
Jens Axboe9e645e112019-05-10 16:07:28 -06003638 req->flags |= REQ_F_LINK;
Jens Axboe4e88d6e2019-12-07 20:59:47 -07003639 if (req->sqe->flags & IOSQE_IO_HARDLINK)
3640 req->flags |= REQ_F_HARDLINK;
Jens Axboe9e645e112019-05-10 16:07:28 -06003641
Jens Axboe9e645e112019-05-10 16:07:28 -06003642 INIT_LIST_HEAD(&req->link_list);
3643 *link = req;
3644 } else {
Jackie Liua197f662019-11-08 08:09:12 -07003645 io_queue_sqe(req);
Jens Axboe9e645e112019-05-10 16:07:28 -06003646 }
Pavel Begunkov2e6e1fd2019-12-05 16:15:45 +03003647
3648 return true;
Jens Axboe9e645e112019-05-10 16:07:28 -06003649}
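
/*
 * A sketch of how a chain is expressed from userspace (ring setup
 * assumed): setting IOSQE_IO_LINK on an sqe makes the following sqe wait
 * for it, e.g. a write followed by an fsync:
 *
 *	write_sqe->flags |= IOSQE_IO_LINK;
 *	fsync_sqe->flags = 0;		/* last request ends the chain */
 *
 * If the write fails, a plain link is severed and the fsync completes
 * with -ECANCELED (see io_queue_link_head()); IOSQE_IO_HARDLINK is meant
 * to keep the chain intact so the fsync is still issued.
 */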
3650
Jens Axboe9a56a232019-01-09 09:06:50 -07003651/*
3652 * Batched submission is done, ensure local IO is flushed out.
3653 */
3654static void io_submit_state_end(struct io_submit_state *state)
3655{
3656 blk_finish_plug(&state->plug);
Jens Axboe3d6770f2019-04-13 11:50:54 -06003657 io_file_put(state);
Jens Axboe2579f912019-01-09 09:10:43 -07003658 if (state->free_reqs)
3659 kmem_cache_free_bulk(req_cachep, state->free_reqs,
3660 &state->reqs[state->cur_req]);
Jens Axboe9a56a232019-01-09 09:06:50 -07003661}
3662
3663/*
3664 * Start submission side cache.
3665 */
3666static void io_submit_state_start(struct io_submit_state *state,
Jackie Liu22efde52019-12-02 17:14:52 +08003667 unsigned int max_ios)
Jens Axboe9a56a232019-01-09 09:06:50 -07003668{
3669 blk_start_plug(&state->plug);
Jens Axboe2579f912019-01-09 09:10:43 -07003670 state->free_reqs = 0;
Jens Axboe9a56a232019-01-09 09:06:50 -07003671 state->file = NULL;
3672 state->ios_left = max_ios;
3673}
3674
Jens Axboe2b188cc2019-01-07 10:46:33 -07003675static void io_commit_sqring(struct io_ring_ctx *ctx)
3676{
Hristo Venev75b28af2019-08-26 17:23:46 +00003677 struct io_rings *rings = ctx->rings;
Jens Axboe2b188cc2019-01-07 10:46:33 -07003678
Hristo Venev75b28af2019-08-26 17:23:46 +00003679 if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) {
Jens Axboe2b188cc2019-01-07 10:46:33 -07003680 /*
3681 * Ensure any loads from the SQEs are done at this point,
3682 * since once we write the new head, the application could
3683 * write new data to them.
3684 */
Hristo Venev75b28af2019-08-26 17:23:46 +00003685 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
Jens Axboe2b188cc2019-01-07 10:46:33 -07003686 }
3687}
3688
3689/*
Brian Gianforcarod195a662019-12-13 03:09:50 -08003690 * Fetch an sqe, if one is available. Note that req->sqe will point to memory
Jens Axboe2b188cc2019-01-07 10:46:33 -07003691 * that is mapped by userspace. This means that care needs to be taken to
3692 * ensure that reads are stable, as we cannot rely on userspace always
3693 * being a good citizen. If members of the sqe are validated and then later
3694 * used, it's important that those reads are done through READ_ONCE() to
3695 * prevent a re-load down the line.
3696 */
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03003697static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req)
Jens Axboe2b188cc2019-01-07 10:46:33 -07003698{
Hristo Venev75b28af2019-08-26 17:23:46 +00003699 struct io_rings *rings = ctx->rings;
3700 u32 *sq_array = ctx->sq_array;
Jens Axboe2b188cc2019-01-07 10:46:33 -07003701 unsigned head;
3702
3703 /*
3704 * The cached sq head (or cq tail) serves two purposes:
3705 *
3706	 * 1) allows us to batch the cost of updating the user visible
3707	 *    head.
3708 * 2) allows the kernel side to track the head on its own, even
3709 * though the application is the one updating it.
3710 */
3711 head = ctx->cached_sq_head;
Stefan Bühlere523a292019-04-19 11:57:44 +02003712 /* make sure SQ entry isn't read before tail */
Pavel Begunkov9835d6f2019-11-21 21:24:56 +03003713 if (unlikely(head == smp_load_acquire(&rings->sq.tail)))
Jens Axboe2b188cc2019-01-07 10:46:33 -07003714 return false;
3715
Hristo Venev75b28af2019-08-26 17:23:46 +00003716 head = READ_ONCE(sq_array[head & ctx->sq_mask]);
Pavel Begunkov9835d6f2019-11-21 21:24:56 +03003717 if (likely(head < ctx->sq_entries)) {
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03003718 /*
3719	 * All IO needs to record the previous position; when handling
3720	 * LINK vs DRAIN, it is used to mark the position of the first
3721	 * IO in the link list.
3722 */
3723 req->sequence = ctx->cached_sq_head;
3724 req->sqe = &ctx->sq_sqes[head];
Jens Axboed625c6e2019-12-17 19:53:05 -07003725 req->opcode = READ_ONCE(req->sqe->opcode);
3726 req->user_data = READ_ONCE(req->sqe->user_data);
Jens Axboe2b188cc2019-01-07 10:46:33 -07003727 ctx->cached_sq_head++;
3728 return true;
3729 }
3730
3731 /* drop invalid entries */
3732 ctx->cached_sq_head++;
Jens Axboe498ccd92019-10-25 10:04:25 -06003733 ctx->cached_sq_dropped++;
3734 WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped);
Jens Axboe2b188cc2019-01-07 10:46:33 -07003735 return false;
3736}
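
/*
 * The userspace side of this is an array of indices: the application
 * fills an entry in the sqe array, stores that entry's index into
 * sq_array at the current tail, then publishes the new tail with a
 * release store (pairing with the acquire load above). A sketch with
 * hypothetical mmap'ed pointers (sqes, sq_array, sq_mask, sq_tail):
 *
 *	unsigned tail = *sq_tail;
 *	unsigned index = tail & sq_mask;
 *
 *	fill_sqe(&sqes[index]);		/* application-provided */
 *	sq_array[index] = index;	/* identity mapping is common */
 *	smp_store_release(sq_tail, tail + 1);
 *
 * The extra indirection lets the application prepare sqes in any order
 * while the kernel consumes sq_array sequentially.
 */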
3737
Pavel Begunkovfb5ccc92019-10-25 12:31:30 +03003738static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
Pavel Begunkovae9428c2019-11-06 00:22:14 +03003739 struct file *ring_file, int ring_fd,
3740 struct mm_struct **mm, bool async)
Jens Axboe6c271ce2019-01-10 11:22:30 -07003741{
3742 struct io_submit_state state, *statep = NULL;
Jens Axboe9e645e112019-05-10 16:07:28 -06003743 struct io_kiocb *link = NULL;
Jens Axboe9e645e112019-05-10 16:07:28 -06003744 int i, submitted = 0;
Pavel Begunkov95a1b3ff2019-10-27 23:15:41 +03003745 bool mm_fault = false;
Jens Axboe6c271ce2019-01-10 11:22:30 -07003746
Jens Axboec4a2ed72019-11-21 21:01:26 -07003747 /* if we have a backlog and couldn't flush it all, return BUSY */
3748 if (!list_empty(&ctx->cq_overflow_list) &&
3749 !io_cqring_overflow_flush(ctx, false))
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07003750 return -EBUSY;
Jens Axboe6c271ce2019-01-10 11:22:30 -07003751
3752 if (nr > IO_PLUG_THRESHOLD) {
Jackie Liu22efde52019-12-02 17:14:52 +08003753 io_submit_state_start(&state, nr);
Jens Axboe6c271ce2019-01-10 11:22:30 -07003754 statep = &state;
3755 }
3756
3757 for (i = 0; i < nr; i++) {
Pavel Begunkov196be952019-11-07 01:41:06 +03003758 struct io_kiocb *req;
Pavel Begunkov50585b92019-11-07 01:41:07 +03003759 unsigned int sqe_flags;
Pavel Begunkovfb5ccc92019-10-25 12:31:30 +03003760
Pavel Begunkov196be952019-11-07 01:41:06 +03003761 req = io_get_req(ctx, statep);
3762 if (unlikely(!req)) {
3763 if (!submitted)
3764 submitted = -EAGAIN;
Pavel Begunkovfb5ccc92019-10-25 12:31:30 +03003765 break;
Jens Axboe9e645e112019-05-10 16:07:28 -06003766 }
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03003767 if (!io_get_sqring(ctx, req)) {
Pavel Begunkov196be952019-11-07 01:41:06 +03003768 __io_free_req(req);
3769 break;
3770 }
Jens Axboe9e645e112019-05-10 16:07:28 -06003771
Jens Axboed625c6e2019-12-17 19:53:05 -07003772 if (io_req_needs_user(req) && !*mm) {
Pavel Begunkov95a1b3ff2019-10-27 23:15:41 +03003773 mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
3774 if (!mm_fault) {
3775 use_mm(ctx->sqo_mm);
3776 *mm = ctx->sqo_mm;
3777 }
3778 }
3779
Pavel Begunkov2e6e1fd2019-12-05 16:15:45 +03003780 submitted++;
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03003781 sqe_flags = req->sqe->flags;
Pavel Begunkov50585b92019-11-07 01:41:07 +03003782
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03003783 req->ring_file = ring_file;
3784 req->ring_fd = ring_fd;
3785 req->has_user = *mm != NULL;
3786 req->in_async = async;
3787 req->needs_fixed_file = async;
Jens Axboed625c6e2019-12-17 19:53:05 -07003788 trace_io_uring_submit_sqe(ctx, req->user_data, true, async);
Pavel Begunkov2e6e1fd2019-12-05 16:15:45 +03003789 if (!io_submit_sqe(req, statep, &link))
3790 break;
Pavel Begunkove5eb6362019-11-06 00:22:15 +03003791 /*
3792	 * If this request isn't flagged as a link and we have a pending
3793	 * chain, this request ends the chain. Submit the link head.
3794 */
Pavel Begunkovffbb8d62019-12-17 20:57:05 +03003795 if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) && link) {
Pavel Begunkov1b4a51b2019-11-21 11:54:28 +03003796 io_queue_link_head(link);
Pavel Begunkove5eb6362019-11-06 00:22:15 +03003797 link = NULL;
Jens Axboe6c271ce2019-01-10 11:22:30 -07003798 }
Jens Axboe6c271ce2019-01-10 11:22:30 -07003799 }
3800
Jens Axboe9e645e112019-05-10 16:07:28 -06003801 if (link)
Pavel Begunkov1b4a51b2019-11-21 11:54:28 +03003802 io_queue_link_head(link);
Jens Axboe6c271ce2019-01-10 11:22:30 -07003803 if (statep)
3804 io_submit_state_end(&state);
3805
Pavel Begunkovae9428c2019-11-06 00:22:14 +03003806 /* Commit SQ ring head once we've consumed and submitted all SQEs */
3807 io_commit_sqring(ctx);
3808
Jens Axboe6c271ce2019-01-10 11:22:30 -07003809 return submitted;
3810}
3811
3812static int io_sq_thread(void *data)
3813{
Jens Axboe6c271ce2019-01-10 11:22:30 -07003814 struct io_ring_ctx *ctx = data;
3815 struct mm_struct *cur_mm = NULL;
Jens Axboe181e4482019-11-25 08:52:30 -07003816 const struct cred *old_cred;
Jens Axboe6c271ce2019-01-10 11:22:30 -07003817 mm_segment_t old_fs;
3818 DEFINE_WAIT(wait);
3819 unsigned inflight;
3820 unsigned long timeout;
Jens Axboec1edbf52019-11-10 16:56:04 -07003821 int ret;
Jens Axboe6c271ce2019-01-10 11:22:30 -07003822
Jens Axboe206aefd2019-11-07 18:27:42 -07003823 complete(&ctx->completions[1]);
Jackie Liua4c0b3d2019-07-08 13:41:12 +08003824
Jens Axboe6c271ce2019-01-10 11:22:30 -07003825 old_fs = get_fs();
3826 set_fs(USER_DS);
Jens Axboe181e4482019-11-25 08:52:30 -07003827 old_cred = override_creds(ctx->creds);
Jens Axboe6c271ce2019-01-10 11:22:30 -07003828
Jens Axboec1edbf52019-11-10 16:56:04 -07003829 ret = timeout = inflight = 0;
Roman Penyaev2bbcd6d2019-05-16 10:53:57 +02003830 while (!kthread_should_park()) {
Pavel Begunkovfb5ccc92019-10-25 12:31:30 +03003831 unsigned int to_submit;
Jens Axboe6c271ce2019-01-10 11:22:30 -07003832
3833 if (inflight) {
3834 unsigned nr_events = 0;
3835
3836 if (ctx->flags & IORING_SETUP_IOPOLL) {
Jens Axboe2b2ed972019-10-25 10:06:15 -06003837 /*
3838 * inflight is an upper bound on the number of entries we
3839 * submitted; the real count can be smaller if some of
3840 * them were dropped. If no poll entries remain on the
3841 * list, then we know there is nothing left to poll for,
3842 * so reset the inflight count to zero in
3843 * that case.
3844 */
3845 mutex_lock(&ctx->uring_lock);
3846 if (!list_empty(&ctx->poll_list))
3847 __io_iopoll_check(ctx, &nr_events, 0);
3848 else
3849 inflight = 0;
3850 mutex_unlock(&ctx->uring_lock);
Jens Axboe6c271ce2019-01-10 11:22:30 -07003851 } else {
3852 /*
3853 * Normal IO, just pretend everything completed.
3854 * We don't have to poll completions for that.
3855 */
3856 nr_events = inflight;
3857 }
3858
3859 inflight -= nr_events;
3860 if (!inflight)
3861 timeout = jiffies + ctx->sq_thread_idle;
3862 }
3863
Pavel Begunkovfb5ccc92019-10-25 12:31:30 +03003864 to_submit = io_sqring_entries(ctx);
Jens Axboec1edbf52019-11-10 16:56:04 -07003865
3866 /*
3867 * If submit got -EBUSY, flag us as needing the application
3868 * to enter the kernel to reap and flush events.
3869 */
3870 if (!to_submit || ret == -EBUSY) {
Jens Axboe6c271ce2019-01-10 11:22:30 -07003871 /*
3872 * We're polling. If we're within the defined idle
3873 * period, then let us spin without work before going
Jens Axboec1edbf52019-11-10 16:56:04 -07003874 * to sleep. The exception is if the last submit returned
3875 * -EBUSY; in that case, wait for the application to reap
3876 * events and wake us up.
Jens Axboe6c271ce2019-01-10 11:22:30 -07003877 */
Jens Axboec1edbf52019-11-10 16:56:04 -07003878 if (inflight ||
3879 (!time_after(jiffies, timeout) && ret != -EBUSY)) {
Jens Axboe9831a902019-09-19 09:48:55 -06003880 cond_resched();
Jens Axboe6c271ce2019-01-10 11:22:30 -07003881 continue;
3882 }
3883
3884 /*
3885 * Drop cur_mm before scheduling; we can't hold it for
3886 * long periods (or across schedule()). Do this before
3887 * adding ourselves to the waitqueue, as the unuse/drop
3888 * may sleep.
3889 */
3890 if (cur_mm) {
3891 unuse_mm(cur_mm);
3892 mmput(cur_mm);
3893 cur_mm = NULL;
3894 }
3895
3896 prepare_to_wait(&ctx->sqo_wait, &wait,
3897 TASK_INTERRUPTIBLE);
3898
3899 /* Tell userspace we may need a wakeup call */
Hristo Venev75b28af2019-08-26 17:23:46 +00003900 ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
Stefan Bühler0d7bae62019-04-19 11:57:45 +02003901 /* make sure to read SQ tail after writing flags */
3902 smp_mb();
Jens Axboe6c271ce2019-01-10 11:22:30 -07003903
Pavel Begunkovfb5ccc92019-10-25 12:31:30 +03003904 to_submit = io_sqring_entries(ctx);
Jens Axboec1edbf52019-11-10 16:56:04 -07003905 if (!to_submit || ret == -EBUSY) {
Roman Penyaev2bbcd6d2019-05-16 10:53:57 +02003906 if (kthread_should_park()) {
Jens Axboe6c271ce2019-01-10 11:22:30 -07003907 finish_wait(&ctx->sqo_wait, &wait);
3908 break;
3909 }
3910 if (signal_pending(current))
3911 flush_signals(current);
3912 schedule();
3913 finish_wait(&ctx->sqo_wait, &wait);
3914
Hristo Venev75b28af2019-08-26 17:23:46 +00003915 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
Jens Axboe6c271ce2019-01-10 11:22:30 -07003916 continue;
3917 }
3918 finish_wait(&ctx->sqo_wait, &wait);
3919
Hristo Venev75b28af2019-08-26 17:23:46 +00003920 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
Jens Axboe6c271ce2019-01-10 11:22:30 -07003921 }
3922
Pavel Begunkovfb5ccc92019-10-25 12:31:30 +03003923 to_submit = min(to_submit, ctx->sq_entries);
Jens Axboe8a4955f2019-12-09 14:52:35 -07003924 mutex_lock(&ctx->uring_lock);
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07003925 ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
Jens Axboe8a4955f2019-12-09 14:52:35 -07003926 mutex_unlock(&ctx->uring_lock);
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07003927 if (ret > 0)
3928 inflight += ret;
Jens Axboe6c271ce2019-01-10 11:22:30 -07003929 }
3930
3931 set_fs(old_fs);
3932 if (cur_mm) {
3933 unuse_mm(cur_mm);
3934 mmput(cur_mm);
3935 }
Jens Axboe181e4482019-11-25 08:52:30 -07003936 revert_creds(old_cred);
Jens Axboe06058632019-04-13 09:26:03 -06003937
Roman Penyaev2bbcd6d2019-05-16 10:53:57 +02003938 kthread_parkme();
Jens Axboe06058632019-04-13 09:26:03 -06003939
Jens Axboe6c271ce2019-01-10 11:22:30 -07003940 return 0;
3941}
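
/*
 * Illustrative userspace-side sketch (not kernel code; io_uring_enter()
 * stands for the raw syscall and sq_flags points at the mapped SQ ring
 * flags word): once this thread has gone idle and set
 * IORING_SQ_NEED_WAKEUP above, the application kicks it roughly like so:
 *
 *	if (*sq_flags & IORING_SQ_NEED_WAKEUP)
 *		io_uring_enter(ring_fd, to_submit, 0,
 *			       IORING_ENTER_SQ_WAKEUP, NULL, 0);
 *
 * io_uring_enter() below wakes sqo_wait when it sees that enter flag.
 */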
3942
Jens Axboebda52162019-09-24 13:47:15 -06003943struct io_wait_queue {
3944 struct wait_queue_entry wq;
3945 struct io_ring_ctx *ctx;
3946 unsigned to_wait;
3947 unsigned nr_timeouts;
3948};
3949
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07003950static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
Jens Axboebda52162019-09-24 13:47:15 -06003951{
3952 struct io_ring_ctx *ctx = iowq->ctx;
3953
3954 /*
Brian Gianforcarod195a662019-12-13 03:09:50 -08003955 * Wake up if we have enough events, or if a timeout occurred since we
Jens Axboebda52162019-09-24 13:47:15 -06003956 * started waiting. For timeouts, we always want to return to userspace,
3957 * regardless of event count.
3958 */
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07003959 return io_cqring_events(ctx, noflush) >= iowq->to_wait ||
Jens Axboebda52162019-09-24 13:47:15 -06003960 atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
3961}
3962
3963static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
3964 int wake_flags, void *key)
3965{
3966 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
3967 wq);
3968
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07003969 /* use noflush == true, as we can't safely rely on locking context */
3970 if (!io_should_wake(iowq, true))
Jens Axboebda52162019-09-24 13:47:15 -06003971 return -1;
3972
3973 return autoremove_wake_function(curr, mode, wake_flags, key);
3974}
3975
Jens Axboe2b188cc2019-01-07 10:46:33 -07003976/*
3977 * Wait until events become available, if we don't already have some. The
3978 * application must reap them itself, as they reside on the shared cq ring.
3979 */
3980static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
3981 const sigset_t __user *sig, size_t sigsz)
3982{
Jens Axboebda52162019-09-24 13:47:15 -06003983 struct io_wait_queue iowq = {
3984 .wq = {
3985 .private = current,
3986 .func = io_wake_function,
3987 .entry = LIST_HEAD_INIT(iowq.wq.entry),
3988 },
3989 .ctx = ctx,
3990 .to_wait = min_events,
3991 };
Hristo Venev75b28af2019-08-26 17:23:46 +00003992 struct io_rings *rings = ctx->rings;
Jackie Liue9ffa5c2019-10-29 11:16:42 +08003993 int ret = 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07003994
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07003995 if (io_cqring_events(ctx, false) >= min_events)
Jens Axboe2b188cc2019-01-07 10:46:33 -07003996 return 0;
3997
3998 if (sig) {
Arnd Bergmann9e75ad52019-03-25 15:34:53 +01003999#ifdef CONFIG_COMPAT
4000 if (in_compat_syscall())
4001 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
Oleg Nesterovb7724342019-07-16 16:29:53 -07004002 sigsz);
Arnd Bergmann9e75ad52019-03-25 15:34:53 +01004003 else
4004#endif
Oleg Nesterovb7724342019-07-16 16:29:53 -07004005 ret = set_user_sigmask(sig, sigsz);
Arnd Bergmann9e75ad52019-03-25 15:34:53 +01004006
Jens Axboe2b188cc2019-01-07 10:46:33 -07004007 if (ret)
4008 return ret;
4009 }
4010
Jens Axboebda52162019-09-24 13:47:15 -06004011 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
Dmitrii Dolgovc826bd72019-10-15 19:02:01 +02004012 trace_io_uring_cqring_wait(ctx, min_events);
Jens Axboebda52162019-09-24 13:47:15 -06004013 do {
4014 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
4015 TASK_INTERRUPTIBLE);
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07004016 if (io_should_wake(&iowq, false))
Jens Axboebda52162019-09-24 13:47:15 -06004017 break;
4018 schedule();
4019 if (signal_pending(current)) {
Jackie Liue9ffa5c2019-10-29 11:16:42 +08004020 ret = -EINTR;
Jens Axboebda52162019-09-24 13:47:15 -06004021 break;
4022 }
4023 } while (1);
4024 finish_wait(&ctx->wait, &iowq.wq);
4025
Jackie Liue9ffa5c2019-10-29 11:16:42 +08004026 restore_saved_sigmask_unless(ret == -EINTR);
Jens Axboe2b188cc2019-01-07 10:46:33 -07004027
Hristo Venev75b28af2019-08-26 17:23:46 +00004028 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07004029}
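
/*
 * Note on the return above: if any completions are pending when we stop
 * waiting, 0 is returned even when a signal arrived, so -EINTR is only
 * reported while the CQ ring is still empty. A fired timeout bumps
 * ctx->cq_timeouts, which makes io_should_wake() return true regardless
 * of how many events are available.
 */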
4030
Jens Axboe6b063142019-01-10 22:13:58 -07004031static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
4032{
4033#if defined(CONFIG_UNIX)
4034 if (ctx->ring_sock) {
4035 struct sock *sock = ctx->ring_sock->sk;
4036 struct sk_buff *skb;
4037
4038 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
4039 kfree_skb(skb);
4040 }
4041#else
4042 int i;
4043
Jens Axboe65e19f52019-10-26 07:20:21 -06004044 for (i = 0; i < ctx->nr_user_files; i++) {
4045 struct file *file;
4046
4047 file = io_file_from_index(ctx, i);
4048 if (file)
4049 fput(file);
4050 }
Jens Axboe6b063142019-01-10 22:13:58 -07004051#endif
4052}
4053
4054static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
4055{
Jens Axboe65e19f52019-10-26 07:20:21 -06004056 unsigned nr_tables, i;
4057
4058 if (!ctx->file_table)
Jens Axboe6b063142019-01-10 22:13:58 -07004059 return -ENXIO;
4060
4061 __io_sqe_files_unregister(ctx);
Jens Axboe65e19f52019-10-26 07:20:21 -06004062 nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
4063 for (i = 0; i < nr_tables; i++)
4064 kfree(ctx->file_table[i].files);
4065 kfree(ctx->file_table);
4066 ctx->file_table = NULL;
Jens Axboe6b063142019-01-10 22:13:58 -07004067 ctx->nr_user_files = 0;
4068 return 0;
4069}
4070
Jens Axboe6c271ce2019-01-10 11:22:30 -07004071static void io_sq_thread_stop(struct io_ring_ctx *ctx)
4072{
4073 if (ctx->sqo_thread) {
Jens Axboe206aefd2019-11-07 18:27:42 -07004074 wait_for_completion(&ctx->completions[1]);
Roman Penyaev2bbcd6d2019-05-16 10:53:57 +02004075 /*
4076 * The park is a bit of a workaround; without it we get a spew
4077 * of warnings on shutdown when SQPOLL is set and affinity is
4078 * pinned to a single CPU.
4079 */
Jens Axboe06058632019-04-13 09:26:03 -06004080 kthread_park(ctx->sqo_thread);
Jens Axboe6c271ce2019-01-10 11:22:30 -07004081 kthread_stop(ctx->sqo_thread);
4082 ctx->sqo_thread = NULL;
4083 }
4084}
4085
Jens Axboe6b063142019-01-10 22:13:58 -07004086static void io_finish_async(struct io_ring_ctx *ctx)
4087{
Jens Axboe6c271ce2019-01-10 11:22:30 -07004088 io_sq_thread_stop(ctx);
4089
Jens Axboe561fb042019-10-24 07:25:42 -06004090 if (ctx->io_wq) {
4091 io_wq_destroy(ctx->io_wq);
4092 ctx->io_wq = NULL;
Jens Axboe6b063142019-01-10 22:13:58 -07004093 }
4094}
4095
4096#if defined(CONFIG_UNIX)
4097static void io_destruct_skb(struct sk_buff *skb)
4098{
4099 struct io_ring_ctx *ctx = skb->sk->sk_user_data;
4100
Jens Axboe561fb042019-10-24 07:25:42 -06004101 if (ctx->io_wq)
4102 io_wq_flush(ctx->io_wq);
Jens Axboe8a997342019-10-09 14:40:13 -06004103
Jens Axboe6b063142019-01-10 22:13:58 -07004104 unix_destruct_scm(skb);
4105}
4106
4107/*
4108 * Ensure the UNIX gc is aware of our file set, so we are certain that
4109 * the io_uring can be safely unregistered on process exit, even if we have
4110 * reference cycles among the files.
4111 */
4112static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
4113{
4114 struct sock *sk = ctx->ring_sock->sk;
4115 struct scm_fp_list *fpl;
4116 struct sk_buff *skb;
Jens Axboe08a45172019-10-03 08:11:03 -06004117 int i, nr_files;
Jens Axboe6b063142019-01-10 22:13:58 -07004118
4119 if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
4120 unsigned long inflight = ctx->user->unix_inflight + nr;
4121
4122 if (inflight > task_rlimit(current, RLIMIT_NOFILE))
4123 return -EMFILE;
4124 }
4125
4126 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
4127 if (!fpl)
4128 return -ENOMEM;
4129
4130 skb = alloc_skb(0, GFP_KERNEL);
4131 if (!skb) {
4132 kfree(fpl);
4133 return -ENOMEM;
4134 }
4135
4136 skb->sk = sk;
Jens Axboe6b063142019-01-10 22:13:58 -07004137
Jens Axboe08a45172019-10-03 08:11:03 -06004138 nr_files = 0;
Jens Axboe6b063142019-01-10 22:13:58 -07004139 fpl->user = get_uid(ctx->user);
4140 for (i = 0; i < nr; i++) {
Jens Axboe65e19f52019-10-26 07:20:21 -06004141 struct file *file = io_file_from_index(ctx, i + offset);
4142
4143 if (!file)
Jens Axboe08a45172019-10-03 08:11:03 -06004144 continue;
Jens Axboe65e19f52019-10-26 07:20:21 -06004145 fpl->fp[nr_files] = get_file(file);
Jens Axboe08a45172019-10-03 08:11:03 -06004146 unix_inflight(fpl->user, fpl->fp[nr_files]);
4147 nr_files++;
Jens Axboe6b063142019-01-10 22:13:58 -07004148 }
4149
Jens Axboe08a45172019-10-03 08:11:03 -06004150 if (nr_files) {
4151 fpl->max = SCM_MAX_FD;
4152 fpl->count = nr_files;
4153 UNIXCB(skb).fp = fpl;
4154 skb->destructor = io_destruct_skb;
4155 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
4156 skb_queue_head(&sk->sk_receive_queue, skb);
Jens Axboe6b063142019-01-10 22:13:58 -07004157
Jens Axboe08a45172019-10-03 08:11:03 -06004158 for (i = 0; i < nr_files; i++)
4159 fput(fpl->fp[i]);
4160 } else {
4161 kfree_skb(skb);
4162 kfree(fpl);
4163 }
Jens Axboe6b063142019-01-10 22:13:58 -07004164
4165 return 0;
4166}
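
/*
 * A note on the get_file()/fput() dance above: each registered file
 * already holds the reference taken by fget() at registration time.
 * get_file() adds a temporary reference so unix_inflight() can account
 * the file, and the fput() loop drops it once the skb is queued. The
 * long-lived reference effectively moves into the skb: when the skb is
 * freed at teardown, unix_destruct_scm() drops it, which is why the
 * CONFIG_UNIX flavour of __io_sqe_files_unregister() only frees skbs
 * instead of calling fput() on every file.
 */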
4167
4168/*
4169 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
4170 * causes regular reference counting to break down. We rely on the UNIX
4171 * garbage collection to take care of this problem for us.
4172 */
4173static int io_sqe_files_scm(struct io_ring_ctx *ctx)
4174{
4175 unsigned left, total;
4176 int ret = 0;
4177
4178 total = 0;
4179 left = ctx->nr_user_files;
4180 while (left) {
4181 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
Jens Axboe6b063142019-01-10 22:13:58 -07004182
4183 ret = __io_sqe_files_scm(ctx, this_files, total);
4184 if (ret)
4185 break;
4186 left -= this_files;
4187 total += this_files;
4188 }
4189
4190 if (!ret)
4191 return 0;
4192
4193 while (total < ctx->nr_user_files) {
Jens Axboe65e19f52019-10-26 07:20:21 -06004194 struct file *file = io_file_from_index(ctx, total);
4195
4196 if (file)
4197 fput(file);
Jens Axboe6b063142019-01-10 22:13:58 -07004198 total++;
4199 }
4200
4201 return ret;
4202}
4203#else
4204static int io_sqe_files_scm(struct io_ring_ctx *ctx)
4205{
4206 return 0;
4207}
4208#endif
4209
Jens Axboe65e19f52019-10-26 07:20:21 -06004210static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
4211 unsigned nr_files)
4212{
4213 int i;
4214
4215 for (i = 0; i < nr_tables; i++) {
4216 struct fixed_file_table *table = &ctx->file_table[i];
4217 unsigned this_files;
4218
4219 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
4220 table->files = kcalloc(this_files, sizeof(struct file *),
4221 GFP_KERNEL);
4222 if (!table->files)
4223 break;
4224 nr_files -= this_files;
4225 }
4226
4227 if (i == nr_tables)
4228 return 0;
4229
4230 for (i = 0; i < nr_tables; i++) {
4231 struct fixed_file_table *table = &ctx->file_table[i];
4232 kfree(table->files);
4233 }
4234 return 1;
4235}
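
/*
 * Lookup sketch for the two-level table allocated above: fixed file
 * index i resolves as
 *
 *	table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT];
 *	file  = table->files[i & IORING_FILE_TABLE_MASK];
 *
 * which is the pattern io_sqe_files_register() and io_sqe_files_update()
 * use below, with each second-level array holding at most
 * IORING_MAX_FILES_TABLE entries.
 */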
4236
Jens Axboe6b063142019-01-10 22:13:58 -07004237static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
4238 unsigned nr_args)
4239{
4240 __s32 __user *fds = (__s32 __user *) arg;
Jens Axboe65e19f52019-10-26 07:20:21 -06004241 unsigned nr_tables;
Jens Axboe6b063142019-01-10 22:13:58 -07004242 int fd, ret = 0;
4243 unsigned i;
4244
Jens Axboe65e19f52019-10-26 07:20:21 -06004245 if (ctx->file_table)
Jens Axboe6b063142019-01-10 22:13:58 -07004246 return -EBUSY;
4247 if (!nr_args)
4248 return -EINVAL;
4249 if (nr_args > IORING_MAX_FIXED_FILES)
4250 return -EMFILE;
4251
Jens Axboe65e19f52019-10-26 07:20:21 -06004252 nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
4253 ctx->file_table = kcalloc(nr_tables, sizeof(struct fixed_file_table),
4254 GFP_KERNEL);
4255 if (!ctx->file_table)
Jens Axboe6b063142019-01-10 22:13:58 -07004256 return -ENOMEM;
4257
Jens Axboe65e19f52019-10-26 07:20:21 -06004258 if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
4259 kfree(ctx->file_table);
Jens Axboe46568e92019-11-10 08:40:53 -07004260 ctx->file_table = NULL;
Jens Axboe65e19f52019-10-26 07:20:21 -06004261 return -ENOMEM;
4262 }
4263
Jens Axboe08a45172019-10-03 08:11:03 -06004264 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
Jens Axboe65e19f52019-10-26 07:20:21 -06004265 struct fixed_file_table *table;
4266 unsigned index;
4267
Jens Axboe6b063142019-01-10 22:13:58 -07004268 ret = -EFAULT;
4269 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
4270 break;
Jens Axboe08a45172019-10-03 08:11:03 -06004271 /* allow sparse sets */
4272 if (fd == -1) {
4273 ret = 0;
4274 continue;
4275 }
Jens Axboe6b063142019-01-10 22:13:58 -07004276
Jens Axboe65e19f52019-10-26 07:20:21 -06004277 table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT];
4278 index = i & IORING_FILE_TABLE_MASK;
4279 table->files[index] = fget(fd);
Jens Axboe6b063142019-01-10 22:13:58 -07004280
4281 ret = -EBADF;
Jens Axboe65e19f52019-10-26 07:20:21 -06004282 if (!table->files[index])
Jens Axboe6b063142019-01-10 22:13:58 -07004283 break;
4284 /*
4285 * Don't allow io_uring instances to be registered. If UNIX
4286 * isn't enabled, then this causes a reference cycle and this
4287 * instance can never get freed. If UNIX is enabled we'll
4288 * handle it just fine, but there's still no point in allowing
4289 * a ring fd as it doesn't support regular read/write anyway.
4290 */
Jens Axboe65e19f52019-10-26 07:20:21 -06004291 if (table->files[index]->f_op == &io_uring_fops) {
4292 fput(table->files[index]);
Jens Axboe6b063142019-01-10 22:13:58 -07004293 break;
4294 }
Jens Axboe6b063142019-01-10 22:13:58 -07004295 ret = 0;
4296 }
4297
4298 if (ret) {
Jens Axboe65e19f52019-10-26 07:20:21 -06004299 for (i = 0; i < ctx->nr_user_files; i++) {
4300 struct file *file;
Jens Axboe6b063142019-01-10 22:13:58 -07004301
Jens Axboe65e19f52019-10-26 07:20:21 -06004302 file = io_file_from_index(ctx, i);
4303 if (file)
4304 fput(file);
4305 }
4306 for (i = 0; i < nr_tables; i++)
4307 kfree(ctx->file_table[i].files);
4308
4309 kfree(ctx->file_table);
4310 ctx->file_table = NULL;
Jens Axboe6b063142019-01-10 22:13:58 -07004311 ctx->nr_user_files = 0;
4312 return ret;
4313 }
4314
4315 ret = io_sqe_files_scm(ctx);
4316 if (ret)
4317 io_sqe_files_unregister(ctx);
4318
4319 return ret;
4320}
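
/*
 * Hedged userspace sketch (illustrative only, not part of this file):
 * registering a mostly-sparse fixed file set so slots can be filled in
 * later through IORING_REGISTER_FILES_UPDATE. An fd of -1 leaves the
 * slot empty, matching the sparse-set handling above.
 *
 *	__s32 fds[8] = { some_fd, -1, -1, -1, -1, -1, -1, -1 };
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_FILES, fds, 8);
 */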
4321
Jens Axboec3a31e62019-10-03 13:59:56 -06004322static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index)
4323{
4324#if defined(CONFIG_UNIX)
Jens Axboe65e19f52019-10-26 07:20:21 -06004325 struct file *file = io_file_from_index(ctx, index);
Jens Axboec3a31e62019-10-03 13:59:56 -06004326 struct sock *sock = ctx->ring_sock->sk;
4327 struct sk_buff_head list, *head = &sock->sk_receive_queue;
4328 struct sk_buff *skb;
4329 int i;
4330
4331 __skb_queue_head_init(&list);
4332
4333 /*
4334 * Find the skb that holds this file in its SCM_RIGHTS. When found,
4335 * remove this entry and rearrange the file array.
4336 */
4337 skb = skb_dequeue(head);
4338 while (skb) {
4339 struct scm_fp_list *fp;
4340
4341 fp = UNIXCB(skb).fp;
4342 for (i = 0; i < fp->count; i++) {
4343 int left;
4344
4345 if (fp->fp[i] != file)
4346 continue;
4347
4348 unix_notinflight(fp->user, fp->fp[i]);
4349 left = fp->count - 1 - i;
4350 if (left) {
4351 memmove(&fp->fp[i], &fp->fp[i + 1],
4352 left * sizeof(struct file *));
4353 }
4354 fp->count--;
4355 if (!fp->count) {
4356 kfree_skb(skb);
4357 skb = NULL;
4358 } else {
4359 __skb_queue_tail(&list, skb);
4360 }
4361 fput(file);
4362 file = NULL;
4363 break;
4364 }
4365
4366 if (!file)
4367 break;
4368
4369 __skb_queue_tail(&list, skb);
4370
4371 skb = skb_dequeue(head);
4372 }
4373
4374 if (skb_peek(&list)) {
4375 spin_lock_irq(&head->lock);
4376 while ((skb = __skb_dequeue(&list)) != NULL)
4377 __skb_queue_tail(head, skb);
4378 spin_unlock_irq(&head->lock);
4379 }
4380#else
Jens Axboe65e19f52019-10-26 07:20:21 -06004381 fput(io_file_from_index(ctx, index));
Jens Axboec3a31e62019-10-03 13:59:56 -06004382#endif
4383}
4384
4385static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
4386 int index)
4387{
4388#if defined(CONFIG_UNIX)
4389 struct sock *sock = ctx->ring_sock->sk;
4390 struct sk_buff_head *head = &sock->sk_receive_queue;
4391 struct sk_buff *skb;
4392
4393 /*
4394 * See if we can merge this file into an existing skb SCM_RIGHTS
4395 * file set. If there's no room, fall back to allocating a new skb
4396 * and filling it in.
4397 */
4398 spin_lock_irq(&head->lock);
4399 skb = skb_peek(head);
4400 if (skb) {
4401 struct scm_fp_list *fpl = UNIXCB(skb).fp;
4402
4403 if (fpl->count < SCM_MAX_FD) {
4404 __skb_unlink(skb, head);
4405 spin_unlock_irq(&head->lock);
4406 fpl->fp[fpl->count] = get_file(file);
4407 unix_inflight(fpl->user, fpl->fp[fpl->count]);
4408 fpl->count++;
4409 spin_lock_irq(&head->lock);
4410 __skb_queue_head(head, skb);
4411 } else {
4412 skb = NULL;
4413 }
4414 }
4415 spin_unlock_irq(&head->lock);
4416
4417 if (skb) {
4418 fput(file);
4419 return 0;
4420 }
4421
4422 return __io_sqe_files_scm(ctx, 1, index);
4423#else
4424 return 0;
4425#endif
4426}
4427
4428static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
4429 unsigned nr_args)
4430{
4431 struct io_uring_files_update up;
4432 __s32 __user *fds;
4433 int fd, i, err;
4434 __u32 done;
4435
Jens Axboe65e19f52019-10-26 07:20:21 -06004436 if (!ctx->file_table)
Jens Axboec3a31e62019-10-03 13:59:56 -06004437 return -ENXIO;
4438 if (!nr_args)
4439 return -EINVAL;
4440 if (copy_from_user(&up, arg, sizeof(up)))
4441 return -EFAULT;
4442 if (check_add_overflow(up.offset, nr_args, &done))
4443 return -EOVERFLOW;
4444 if (done > ctx->nr_user_files)
4445 return -EINVAL;
4446
4447 done = 0;
4448 fds = (__s32 __user *) up.fds;
4449 while (nr_args) {
Jens Axboe65e19f52019-10-26 07:20:21 -06004450 struct fixed_file_table *table;
4451 unsigned index;
4452
Jens Axboec3a31e62019-10-03 13:59:56 -06004453 err = 0;
4454 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
4455 err = -EFAULT;
4456 break;
4457 }
4458 i = array_index_nospec(up.offset, ctx->nr_user_files);
Jens Axboe65e19f52019-10-26 07:20:21 -06004459 table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT];
4460 index = i & IORING_FILE_TABLE_MASK;
4461 if (table->files[index]) {
Jens Axboec3a31e62019-10-03 13:59:56 -06004462 io_sqe_file_unregister(ctx, i);
Jens Axboe65e19f52019-10-26 07:20:21 -06004463 table->files[index] = NULL;
Jens Axboec3a31e62019-10-03 13:59:56 -06004464 }
4465 if (fd != -1) {
4466 struct file *file;
4467
4468 file = fget(fd);
4469 if (!file) {
4470 err = -EBADF;
4471 break;
4472 }
4473 /*
4474 * Don't allow io_uring instances to be registered. If
4475 * UNIX isn't enabled, then this causes a reference
4476 * cycle and this instance can never get freed. If UNIX
4477 * is enabled we'll handle it just fine, but there's
4478 * still no point in allowing a ring fd as it doesn't
4479 * support regular read/write anyway.
4480 */
4481 if (file->f_op == &io_uring_fops) {
4482 fput(file);
4483 err = -EBADF;
4484 break;
4485 }
Jens Axboe65e19f52019-10-26 07:20:21 -06004486 table->files[index] = file;
Jens Axboec3a31e62019-10-03 13:59:56 -06004487 err = io_sqe_file_register(ctx, file, i);
4488 if (err)
4489 break;
4490 }
4491 nr_args--;
4492 done++;
4493 up.offset++;
4494 }
4495
4496 return done ? done : err;
4497}
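
/*
 * Hedged usage sketch (userspace, illustrative; see the io_uring UAPI
 * header for the exact struct io_uring_files_update layout): to replace
 * fixed-file slot 3 and clear slot 4, set up.offset = 3, point up.fds at
 * the array { new_fd, -1 } and issue
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_FILES_UPDATE, &up, 2);
 *
 * An fd of -1 unregisters a slot, and the return value is the number of
 * slots actually processed, or an error if none were.
 */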
4498
Jens Axboe7d723062019-11-12 22:31:31 -07004499static void io_put_work(struct io_wq_work *work)
4500{
4501 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
4502
4503 io_put_req(req);
4504}
4505
4506static void io_get_work(struct io_wq_work *work)
4507{
4508 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
4509
4510 refcount_inc(&req->refs);
4511}
4512
Jens Axboe6c271ce2019-01-10 11:22:30 -07004513static int io_sq_offload_start(struct io_ring_ctx *ctx,
4514 struct io_uring_params *p)
Jens Axboe2b188cc2019-01-07 10:46:33 -07004515{
Jens Axboe576a3472019-11-25 08:49:20 -07004516 struct io_wq_data data;
Jens Axboe561fb042019-10-24 07:25:42 -06004517 unsigned concurrency;
Jens Axboe2b188cc2019-01-07 10:46:33 -07004518 int ret;
4519
Jens Axboe6c271ce2019-01-10 11:22:30 -07004520 init_waitqueue_head(&ctx->sqo_wait);
Jens Axboe2b188cc2019-01-07 10:46:33 -07004521 mmgrab(current->mm);
4522 ctx->sqo_mm = current->mm;
4523
Jens Axboe6c271ce2019-01-10 11:22:30 -07004524 if (ctx->flags & IORING_SETUP_SQPOLL) {
Jens Axboe3ec482d2019-04-08 10:51:01 -06004525 ret = -EPERM;
4526 if (!capable(CAP_SYS_ADMIN))
4527 goto err;
4528
Jens Axboe917257d2019-04-13 09:28:55 -06004529 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
4530 if (!ctx->sq_thread_idle)
4531 ctx->sq_thread_idle = HZ;
4532
Jens Axboe6c271ce2019-01-10 11:22:30 -07004533 if (p->flags & IORING_SETUP_SQ_AFF) {
Jens Axboe44a9bd12019-05-14 20:00:30 -06004534 int cpu = p->sq_thread_cpu;
Jens Axboe6c271ce2019-01-10 11:22:30 -07004535
Jens Axboe917257d2019-04-13 09:28:55 -06004536 ret = -EINVAL;
Jens Axboe44a9bd12019-05-14 20:00:30 -06004537 if (cpu >= nr_cpu_ids)
4538 goto err;
Shenghui Wang7889f442019-05-07 16:03:19 +08004539 if (!cpu_online(cpu))
Jens Axboe917257d2019-04-13 09:28:55 -06004540 goto err;
4541
Jens Axboe6c271ce2019-01-10 11:22:30 -07004542 ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
4543 ctx, cpu,
4544 "io_uring-sq");
4545 } else {
4546 ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
4547 "io_uring-sq");
4548 }
4549 if (IS_ERR(ctx->sqo_thread)) {
4550 ret = PTR_ERR(ctx->sqo_thread);
4551 ctx->sqo_thread = NULL;
4552 goto err;
4553 }
4554 wake_up_process(ctx->sqo_thread);
4555 } else if (p->flags & IORING_SETUP_SQ_AFF) {
4556 /* Can't have SQ_AFF without SQPOLL */
4557 ret = -EINVAL;
4558 goto err;
4559 }
4560
Jens Axboe576a3472019-11-25 08:49:20 -07004561 data.mm = ctx->sqo_mm;
4562 data.user = ctx->user;
Jens Axboe181e4482019-11-25 08:52:30 -07004563 data.creds = ctx->creds;
Jens Axboe576a3472019-11-25 08:49:20 -07004564 data.get_work = io_get_work;
4565 data.put_work = io_put_work;
4566
Jens Axboe561fb042019-10-24 07:25:42 -06004567 /* Use QD, or 4 * CPUs, whichever is smaller */
4568 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
Jens Axboe576a3472019-11-25 08:49:20 -07004569 ctx->io_wq = io_wq_create(concurrency, &data);
Jens Axboe975c99a52019-10-30 08:42:56 -06004570 if (IS_ERR(ctx->io_wq)) {
4571 ret = PTR_ERR(ctx->io_wq);
4572 ctx->io_wq = NULL;
Jens Axboe2b188cc2019-01-07 10:46:33 -07004573 goto err;
4574 }
4575
4576 return 0;
4577err:
Jens Axboe54a91f32019-09-10 09:15:04 -06004578 io_finish_async(ctx);
Jens Axboe2b188cc2019-01-07 10:46:33 -07004579 mmdrop(ctx->sqo_mm);
4580 ctx->sqo_mm = NULL;
4581 return ret;
4582}
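
/*
 * Hedged setup sketch (userspace, illustrative only): asking for an SQ
 * poll thread pinned to CPU 2 that idles after 2000 msec without work.
 * As enforced above, IORING_SETUP_SQPOLL requires CAP_SYS_ADMIN, and
 * IORING_SETUP_SQ_AFF is only valid together with SQPOLL.
 *
 *	struct io_uring_params p = { 0 };
 *
 *	p.flags = IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF;
 *	p.sq_thread_cpu = 2;
 *	p.sq_thread_idle = 2000;
 *	ring_fd = io_uring_setup(QUEUE_DEPTH, &p);
 */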
4583
4584static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
4585{
4586 atomic_long_sub(nr_pages, &user->locked_vm);
4587}
4588
4589static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
4590{
4591 unsigned long page_limit, cur_pages, new_pages;
4592
4593 /* Don't allow more pages than we can safely lock */
4594 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
4595
4596 do {
4597 cur_pages = atomic_long_read(&user->locked_vm);
4598 new_pages = cur_pages + nr_pages;
4599 if (new_pages > page_limit)
4600 return -ENOMEM;
4601 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
4602 new_pages) != cur_pages);
4603
4604 return 0;
4605}
4606
4607static void io_mem_free(void *ptr)
4608{
Mark Rutland52e04ef2019-04-30 17:30:21 +01004609 struct page *page;
Jens Axboe2b188cc2019-01-07 10:46:33 -07004610
Mark Rutland52e04ef2019-04-30 17:30:21 +01004611 if (!ptr)
4612 return;
4613
4614 page = virt_to_head_page(ptr);
Jens Axboe2b188cc2019-01-07 10:46:33 -07004615 if (put_page_testzero(page))
4616 free_compound_page(page);
4617}
4618
4619static void *io_mem_alloc(size_t size)
4620{
4621 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
4622 __GFP_NORETRY;
4623
4624 return (void *) __get_free_pages(gfp_flags, get_order(size));
4625}
4626
Hristo Venev75b28af2019-08-26 17:23:46 +00004627static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
4628 size_t *sq_offset)
4629{
4630 struct io_rings *rings;
4631 size_t off, sq_array_size;
4632
4633 off = struct_size(rings, cqes, cq_entries);
4634 if (off == SIZE_MAX)
4635 return SIZE_MAX;
4636
4637#ifdef CONFIG_SMP
4638 off = ALIGN(off, SMP_CACHE_BYTES);
4639 if (off == 0)
4640 return SIZE_MAX;
4641#endif
4642
4643 sq_array_size = array_size(sizeof(u32), sq_entries);
4644 if (sq_array_size == SIZE_MAX)
4645 return SIZE_MAX;
4646
4647 if (check_add_overflow(off, sq_array_size, &off))
4648 return SIZE_MAX;
4649
4650 if (sq_offset)
4651 *sq_offset = off;
4652
4653 return off;
4654}
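
/*
 * Resulting layout of the rings allocation, per the arithmetic above:
 * one contiguous region holding struct io_rings (whose flexible cqes[]
 * array covers cq_entries CQEs), aligned to SMP_CACHE_BYTES on SMP,
 * followed by sq_entries u32 SQ index slots starting at *sq_offset.
 * The SQE array itself lives in a separate io_mem_alloc() region,
 * sized in io_allocate_scq_urings() below.
 */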
4655
Jens Axboe2b188cc2019-01-07 10:46:33 -07004656static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
4657{
Hristo Venev75b28af2019-08-26 17:23:46 +00004658 size_t pages;
Jens Axboe2b188cc2019-01-07 10:46:33 -07004659
Hristo Venev75b28af2019-08-26 17:23:46 +00004660 pages = (size_t)1 << get_order(
4661 rings_size(sq_entries, cq_entries, NULL));
4662 pages += (size_t)1 << get_order(
4663 array_size(sizeof(struct io_uring_sqe), sq_entries));
Jens Axboe2b188cc2019-01-07 10:46:33 -07004664
Hristo Venev75b28af2019-08-26 17:23:46 +00004665 return pages;
Jens Axboe2b188cc2019-01-07 10:46:33 -07004666}
4667
Jens Axboeedafcce2019-01-09 09:16:05 -07004668static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
4669{
4670 int i, j;
4671
4672 if (!ctx->user_bufs)
4673 return -ENXIO;
4674
4675 for (i = 0; i < ctx->nr_user_bufs; i++) {
4676 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
4677
4678 for (j = 0; j < imu->nr_bvecs; j++)
John Hubbard27c4d3a2019-08-04 19:32:06 -07004679 put_user_page(imu->bvec[j].bv_page);
Jens Axboeedafcce2019-01-09 09:16:05 -07004680
4681 if (ctx->account_mem)
4682 io_unaccount_mem(ctx->user, imu->nr_bvecs);
Mark Rutlandd4ef6472019-05-01 16:59:16 +01004683 kvfree(imu->bvec);
Jens Axboeedafcce2019-01-09 09:16:05 -07004684 imu->nr_bvecs = 0;
4685 }
4686
4687 kfree(ctx->user_bufs);
4688 ctx->user_bufs = NULL;
4689 ctx->nr_user_bufs = 0;
4690 return 0;
4691}
4692
4693static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
4694 void __user *arg, unsigned index)
4695{
4696 struct iovec __user *src;
4697
4698#ifdef CONFIG_COMPAT
4699 if (ctx->compat) {
4700 struct compat_iovec __user *ciovs;
4701 struct compat_iovec ciov;
4702
4703 ciovs = (struct compat_iovec __user *) arg;
4704 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
4705 return -EFAULT;
4706
Jens Axboed55e5f52019-12-11 16:12:15 -07004707 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
Jens Axboeedafcce2019-01-09 09:16:05 -07004708 dst->iov_len = ciov.iov_len;
4709 return 0;
4710 }
4711#endif
4712 src = (struct iovec __user *) arg;
4713 if (copy_from_user(dst, &src[index], sizeof(*dst)))
4714 return -EFAULT;
4715 return 0;
4716}
4717
4718static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
4719 unsigned nr_args)
4720{
4721 struct vm_area_struct **vmas = NULL;
4722 struct page **pages = NULL;
4723 int i, j, got_pages = 0;
4724 int ret = -EINVAL;
4725
4726 if (ctx->user_bufs)
4727 return -EBUSY;
4728 if (!nr_args || nr_args > UIO_MAXIOV)
4729 return -EINVAL;
4730
4731 ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
4732 GFP_KERNEL);
4733 if (!ctx->user_bufs)
4734 return -ENOMEM;
4735
4736 for (i = 0; i < nr_args; i++) {
4737 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
4738 unsigned long off, start, end, ubuf;
4739 int pret, nr_pages;
4740 struct iovec iov;
4741 size_t size;
4742
4743 ret = io_copy_iov(ctx, &iov, arg, i);
4744 if (ret)
Pavel Begunkova2786822019-05-26 12:35:47 +03004745 goto err;
Jens Axboeedafcce2019-01-09 09:16:05 -07004746
4747 /*
4748 * Don't impose further limits on the size or address of the
4749 * buffer here; we'll return -EINVAL at IO submission time if
4750 * they turn out to be wrong.
4751 */
4752 ret = -EFAULT;
4753 if (!iov.iov_base || !iov.iov_len)
4754 goto err;
4755
4756 /* arbitrary limit, but we need something */
4757 if (iov.iov_len > SZ_1G)
4758 goto err;
4759
4760 ubuf = (unsigned long) iov.iov_base;
4761 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
4762 start = ubuf >> PAGE_SHIFT;
4763 nr_pages = end - start;
4764
4765 if (ctx->account_mem) {
4766 ret = io_account_mem(ctx->user, nr_pages);
4767 if (ret)
4768 goto err;
4769 }
4770
4771 ret = 0;
4772 if (!pages || nr_pages > got_pages) {
4773 kfree(vmas);
4774 kfree(pages);
Mark Rutlandd4ef6472019-05-01 16:59:16 +01004775 pages = kvmalloc_array(nr_pages, sizeof(struct page *),
Jens Axboeedafcce2019-01-09 09:16:05 -07004776 GFP_KERNEL);
Mark Rutlandd4ef6472019-05-01 16:59:16 +01004777 vmas = kvmalloc_array(nr_pages,
Jens Axboeedafcce2019-01-09 09:16:05 -07004778 sizeof(struct vm_area_struct *),
4779 GFP_KERNEL);
4780 if (!pages || !vmas) {
4781 ret = -ENOMEM;
4782 if (ctx->account_mem)
4783 io_unaccount_mem(ctx->user, nr_pages);
4784 goto err;
4785 }
4786 got_pages = nr_pages;
4787 }
4788
Mark Rutlandd4ef6472019-05-01 16:59:16 +01004789 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
Jens Axboeedafcce2019-01-09 09:16:05 -07004790 GFP_KERNEL);
4791 ret = -ENOMEM;
4792 if (!imu->bvec) {
4793 if (ctx->account_mem)
4794 io_unaccount_mem(ctx->user, nr_pages);
4795 goto err;
4796 }
4797
4798 ret = 0;
4799 down_read(&current->mm->mmap_sem);
Ira Weiny932f4a62019-05-13 17:17:03 -07004800 pret = get_user_pages(ubuf, nr_pages,
4801 FOLL_WRITE | FOLL_LONGTERM,
4802 pages, vmas);
Jens Axboeedafcce2019-01-09 09:16:05 -07004803 if (pret == nr_pages) {
4804 /* don't support file backed memory */
4805 for (j = 0; j < nr_pages; j++) {
4806 struct vm_area_struct *vma = vmas[j];
4807
4808 if (vma->vm_file &&
4809 !is_file_hugepages(vma->vm_file)) {
4810 ret = -EOPNOTSUPP;
4811 break;
4812 }
4813 }
4814 } else {
4815 ret = pret < 0 ? pret : -EFAULT;
4816 }
4817 up_read(&current->mm->mmap_sem);
4818 if (ret) {
4819 /*
4820 * if we did partial map, or found file backed vmas,
4821 * release any pages we did get
4822 */
John Hubbard27c4d3a2019-08-04 19:32:06 -07004823 if (pret > 0)
4824 put_user_pages(pages, pret);
Jens Axboeedafcce2019-01-09 09:16:05 -07004825 if (ctx->account_mem)
4826 io_unaccount_mem(ctx->user, nr_pages);
Mark Rutlandd4ef6472019-05-01 16:59:16 +01004827 kvfree(imu->bvec);
Jens Axboeedafcce2019-01-09 09:16:05 -07004828 goto err;
4829 }
4830
4831 off = ubuf & ~PAGE_MASK;
4832 size = iov.iov_len;
4833 for (j = 0; j < nr_pages; j++) {
4834 size_t vec_len;
4835
4836 vec_len = min_t(size_t, size, PAGE_SIZE - off);
4837 imu->bvec[j].bv_page = pages[j];
4838 imu->bvec[j].bv_len = vec_len;
4839 imu->bvec[j].bv_offset = off;
4840 off = 0;
4841 size -= vec_len;
4842 }
4843 /* store original address for later verification */
4844 imu->ubuf = ubuf;
4845 imu->len = iov.iov_len;
4846 imu->nr_bvecs = nr_pages;
4847
4848 ctx->nr_user_bufs++;
4849 }
Mark Rutlandd4ef6472019-05-01 16:59:16 +01004850 kvfree(pages);
4851 kvfree(vmas);
Jens Axboeedafcce2019-01-09 09:16:05 -07004852 return 0;
4853err:
Mark Rutlandd4ef6472019-05-01 16:59:16 +01004854 kvfree(pages);
4855 kvfree(vmas);
Jens Axboeedafcce2019-01-09 09:16:05 -07004856 io_sqe_buffer_unregister(ctx);
4857 return ret;
4858}
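
/*
 * Hedged userspace sketch (illustrative only): registering two fixed
 * buffers. Each iovec is capped at 1GiB by the SZ_1G check above, and
 * the pages stay pinned (FOLL_LONGTERM) until the buffers are
 * unregistered or the ring is torn down.
 *
 *	struct iovec iovs[2] = {
 *		{ .iov_base = buf0, .iov_len = 4096 },
 *		{ .iov_base = buf1, .iov_len = 8192 },
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_BUFFERS, iovs, 2);
 */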
4859
Jens Axboe9b402842019-04-11 11:45:41 -06004860static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
4861{
4862 __s32 __user *fds = arg;
4863 int fd;
4864
4865 if (ctx->cq_ev_fd)
4866 return -EBUSY;
4867
4868 if (copy_from_user(&fd, fds, sizeof(*fds)))
4869 return -EFAULT;
4870
4871 ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
4872 if (IS_ERR(ctx->cq_ev_fd)) {
4873 int ret = PTR_ERR(ctx->cq_ev_fd);
4874 ctx->cq_ev_fd = NULL;
4875 return ret;
4876 }
4877
4878 return 0;
4879}
4880
4881static int io_eventfd_unregister(struct io_ring_ctx *ctx)
4882{
4883 if (ctx->cq_ev_fd) {
4884 eventfd_ctx_put(ctx->cq_ev_fd);
4885 ctx->cq_ev_fd = NULL;
4886 return 0;
4887 }
4888
4889 return -ENXIO;
4890}
4891
Jens Axboe2b188cc2019-01-07 10:46:33 -07004892static void io_ring_ctx_free(struct io_ring_ctx *ctx)
4893{
Jens Axboe6b063142019-01-10 22:13:58 -07004894 io_finish_async(ctx);
Jens Axboe2b188cc2019-01-07 10:46:33 -07004895 if (ctx->sqo_mm)
4896 mmdrop(ctx->sqo_mm);
Jens Axboedef596e2019-01-09 08:59:42 -07004897
4898 io_iopoll_reap_events(ctx);
Jens Axboeedafcce2019-01-09 09:16:05 -07004899 io_sqe_buffer_unregister(ctx);
Jens Axboe6b063142019-01-10 22:13:58 -07004900 io_sqe_files_unregister(ctx);
Jens Axboe9b402842019-04-11 11:45:41 -06004901 io_eventfd_unregister(ctx);
Jens Axboedef596e2019-01-09 08:59:42 -07004902
Jens Axboe2b188cc2019-01-07 10:46:33 -07004903#if defined(CONFIG_UNIX)
Eric Biggers355e8d22019-06-12 14:58:43 -07004904 if (ctx->ring_sock) {
4905 ctx->ring_sock->file = NULL; /* so that iput() is called */
Jens Axboe2b188cc2019-01-07 10:46:33 -07004906 sock_release(ctx->ring_sock);
Eric Biggers355e8d22019-06-12 14:58:43 -07004907 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07004908#endif
4909
Hristo Venev75b28af2019-08-26 17:23:46 +00004910 io_mem_free(ctx->rings);
Jens Axboe2b188cc2019-01-07 10:46:33 -07004911 io_mem_free(ctx->sq_sqes);
Jens Axboe2b188cc2019-01-07 10:46:33 -07004912
4913 percpu_ref_exit(&ctx->refs);
4914 if (ctx->account_mem)
4915 io_unaccount_mem(ctx->user,
4916 ring_pages(ctx->sq_entries, ctx->cq_entries));
4917 free_uid(ctx->user);
Jens Axboe181e4482019-11-25 08:52:30 -07004918 put_cred(ctx->creds);
Jens Axboe206aefd2019-11-07 18:27:42 -07004919 kfree(ctx->completions);
Jens Axboe78076bb2019-12-04 19:56:40 -07004920 kfree(ctx->cancel_hash);
Jens Axboe0ddf92e2019-11-08 08:52:53 -07004921 kmem_cache_free(req_cachep, ctx->fallback_req);
Jens Axboe2b188cc2019-01-07 10:46:33 -07004922 kfree(ctx);
4923}
4924
4925static __poll_t io_uring_poll(struct file *file, poll_table *wait)
4926{
4927 struct io_ring_ctx *ctx = file->private_data;
4928 __poll_t mask = 0;
4929
4930 poll_wait(file, &ctx->cq_wait, wait);
Stefan Bühler4f7067c2019-04-24 23:54:17 +02004931 /*
4932 * synchronizes with barrier from wq_has_sleeper call in
4933 * io_commit_cqring
4934 */
Jens Axboe2b188cc2019-01-07 10:46:33 -07004935 smp_rmb();
Hristo Venev75b28af2019-08-26 17:23:46 +00004936 if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
4937 ctx->rings->sq_ring_entries)
Jens Axboe2b188cc2019-01-07 10:46:33 -07004938 mask |= EPOLLOUT | EPOLLWRNORM;
yangerkundaa5de52019-09-24 20:53:34 +08004939 if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail)
Jens Axboe2b188cc2019-01-07 10:46:33 -07004940 mask |= EPOLLIN | EPOLLRDNORM;
4941
4942 return mask;
4943}
4944
4945static int io_uring_fasync(int fd, struct file *file, int on)
4946{
4947 struct io_ring_ctx *ctx = file->private_data;
4948
4949 return fasync_helper(fd, file, on, &ctx->cq_fasync);
4950}
4951
4952static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
4953{
4954 mutex_lock(&ctx->uring_lock);
4955 percpu_ref_kill(&ctx->refs);
4956 mutex_unlock(&ctx->uring_lock);
4957
Jens Axboe5262f562019-09-17 12:26:57 -06004958 io_kill_timeouts(ctx);
Jens Axboe221c5eb2019-01-17 09:41:58 -07004959 io_poll_remove_all(ctx);
Jens Axboe561fb042019-10-24 07:25:42 -06004960
4961 if (ctx->io_wq)
4962 io_wq_cancel_all(ctx->io_wq);
4963
Jens Axboedef596e2019-01-09 08:59:42 -07004964 io_iopoll_reap_events(ctx);
Jens Axboe15dff282019-11-13 09:09:23 -07004965 /* if we failed setting up the ctx, we might not have any rings */
4966 if (ctx->rings)
4967 io_cqring_overflow_flush(ctx, true);
Jens Axboe206aefd2019-11-07 18:27:42 -07004968 wait_for_completion(&ctx->completions[0]);
Jens Axboe2b188cc2019-01-07 10:46:33 -07004969 io_ring_ctx_free(ctx);
4970}
4971
4972static int io_uring_release(struct inode *inode, struct file *file)
4973{
4974 struct io_ring_ctx *ctx = file->private_data;
4975
4976 file->private_data = NULL;
4977 io_ring_ctx_wait_and_kill(ctx);
4978 return 0;
4979}
4980
Jens Axboefcb323c2019-10-24 12:39:47 -06004981static void io_uring_cancel_files(struct io_ring_ctx *ctx,
4982 struct files_struct *files)
4983{
4984 struct io_kiocb *req;
4985 DEFINE_WAIT(wait);
4986
4987 while (!list_empty_careful(&ctx->inflight_list)) {
Jens Axboe768134d2019-11-10 20:30:53 -07004988 struct io_kiocb *cancel_req = NULL;
Jens Axboefcb323c2019-10-24 12:39:47 -06004989
4990 spin_lock_irq(&ctx->inflight_lock);
4991 list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
Jens Axboe768134d2019-11-10 20:30:53 -07004992 if (req->work.files != files)
4993 continue;
4994 /* req is being completed, ignore */
4995 if (!refcount_inc_not_zero(&req->refs))
4996 continue;
4997 cancel_req = req;
4998 break;
Jens Axboefcb323c2019-10-24 12:39:47 -06004999 }
Jens Axboe768134d2019-11-10 20:30:53 -07005000 if (cancel_req)
Jens Axboefcb323c2019-10-24 12:39:47 -06005001 prepare_to_wait(&ctx->inflight_wait, &wait,
Jens Axboe768134d2019-11-10 20:30:53 -07005002 TASK_UNINTERRUPTIBLE);
Jens Axboefcb323c2019-10-24 12:39:47 -06005003 spin_unlock_irq(&ctx->inflight_lock);
5004
Jens Axboe768134d2019-11-10 20:30:53 -07005005 /* We need to keep going until we don't find a matching req */
5006 if (!cancel_req)
Jens Axboefcb323c2019-10-24 12:39:47 -06005007 break;
Bob Liu2f6d9b92019-11-13 18:06:24 +08005008
5009 io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
5010 io_put_req(cancel_req);
Jens Axboefcb323c2019-10-24 12:39:47 -06005011 schedule();
5012 }
Jens Axboe768134d2019-11-10 20:30:53 -07005013 finish_wait(&ctx->inflight_wait, &wait);
Jens Axboefcb323c2019-10-24 12:39:47 -06005014}
5015
5016static int io_uring_flush(struct file *file, void *data)
5017{
5018 struct io_ring_ctx *ctx = file->private_data;
5019
5020 io_uring_cancel_files(ctx, data);
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07005021 if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
5022 io_cqring_overflow_flush(ctx, true);
Jens Axboefcb323c2019-10-24 12:39:47 -06005023 io_wq_cancel_all(ctx->io_wq);
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07005024 }
Jens Axboefcb323c2019-10-24 12:39:47 -06005025 return 0;
5026}
5027
Roman Penyaev6c5c2402019-11-28 12:53:22 +01005028static void *io_uring_validate_mmap_request(struct file *file,
5029 loff_t pgoff, size_t sz)
Jens Axboe2b188cc2019-01-07 10:46:33 -07005030{
Jens Axboe2b188cc2019-01-07 10:46:33 -07005031 struct io_ring_ctx *ctx = file->private_data;
Roman Penyaev6c5c2402019-11-28 12:53:22 +01005032 loff_t offset = pgoff << PAGE_SHIFT;
Jens Axboe2b188cc2019-01-07 10:46:33 -07005033 struct page *page;
5034 void *ptr;
5035
5036 switch (offset) {
5037 case IORING_OFF_SQ_RING:
Hristo Venev75b28af2019-08-26 17:23:46 +00005038 case IORING_OFF_CQ_RING:
5039 ptr = ctx->rings;
Jens Axboe2b188cc2019-01-07 10:46:33 -07005040 break;
5041 case IORING_OFF_SQES:
5042 ptr = ctx->sq_sqes;
5043 break;
Jens Axboe2b188cc2019-01-07 10:46:33 -07005044 default:
Roman Penyaev6c5c2402019-11-28 12:53:22 +01005045 return ERR_PTR(-EINVAL);
Jens Axboe2b188cc2019-01-07 10:46:33 -07005046 }
5047
5048 page = virt_to_head_page(ptr);
Matthew Wilcox (Oracle)a50b8542019-09-23 15:34:25 -07005049 if (sz > page_size(page))
Roman Penyaev6c5c2402019-11-28 12:53:22 +01005050 return ERR_PTR(-EINVAL);
5051
5052 return ptr;
5053}
5054
5055#ifdef CONFIG_MMU
5056
5057static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
5058{
5059 size_t sz = vma->vm_end - vma->vm_start;
5060 unsigned long pfn;
5061 void *ptr;
5062
5063 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
5064 if (IS_ERR(ptr))
5065 return PTR_ERR(ptr);
Jens Axboe2b188cc2019-01-07 10:46:33 -07005066
5067 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
5068 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
5069}
5070
Roman Penyaev6c5c2402019-11-28 12:53:22 +01005071#else /* !CONFIG_MMU */
5072
5073static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
5074{
5075 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
5076}
5077
5078static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
5079{
5080 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
5081}
5082
5083static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
5084 unsigned long addr, unsigned long len,
5085 unsigned long pgoff, unsigned long flags)
5086{
5087 void *ptr;
5088
5089 ptr = io_uring_validate_mmap_request(file, pgoff, len);
5090 if (IS_ERR(ptr))
5091 return PTR_ERR(ptr);
5092
5093 return (unsigned long) ptr;
5094}
5095
5096#endif /* !CONFIG_MMU */
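
/*
 * Hedged userspace sketch (illustrative only; MAP_POPULATE mirrors what
 * liburing does): mapping the rings through the fixed offsets validated
 * above. Since IORING_OFF_SQ_RING and IORING_OFF_CQ_RING both resolve to
 * ctx->rings here, a single mapping can serve both rings, which is what
 * the IORING_FEAT_SINGLE_MMAP flag advertised below means.
 *
 *	sq = mmap(NULL, p.sq_off.array + p.sq_entries * sizeof(__u32),
 *		  PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *		  ring_fd, IORING_OFF_SQ_RING);
 *	sqes = mmap(NULL, p.sq_entries * sizeof(struct io_uring_sqe),
 *		    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *		    ring_fd, IORING_OFF_SQES);
 */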
5097
Jens Axboe2b188cc2019-01-07 10:46:33 -07005098SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
5099 u32, min_complete, u32, flags, const sigset_t __user *, sig,
5100 size_t, sigsz)
5101{
5102 struct io_ring_ctx *ctx;
5103 long ret = -EBADF;
5104 int submitted = 0;
5105 struct fd f;
5106
Jens Axboe6c271ce2019-01-10 11:22:30 -07005107 if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
Jens Axboe2b188cc2019-01-07 10:46:33 -07005108 return -EINVAL;
5109
5110 f = fdget(fd);
5111 if (!f.file)
5112 return -EBADF;
5113
5114 ret = -EOPNOTSUPP;
5115 if (f.file->f_op != &io_uring_fops)
5116 goto out_fput;
5117
5118 ret = -ENXIO;
5119 ctx = f.file->private_data;
5120 if (!percpu_ref_tryget(&ctx->refs))
5121 goto out_fput;
5122
Jens Axboe6c271ce2019-01-10 11:22:30 -07005123 /*
5124 * For SQ polling, the thread will do all submissions and completions.
5125 * Just return the requested submit count, and wake the thread if
5126 * we were asked to.
5127 */
Jens Axboeb2a9ead2019-09-12 14:19:16 -06005128 ret = 0;
Jens Axboe6c271ce2019-01-10 11:22:30 -07005129 if (ctx->flags & IORING_SETUP_SQPOLL) {
Jens Axboec1edbf52019-11-10 16:56:04 -07005130 if (!list_empty_careful(&ctx->cq_overflow_list))
5131 io_cqring_overflow_flush(ctx, false);
Jens Axboe6c271ce2019-01-10 11:22:30 -07005132 if (flags & IORING_ENTER_SQ_WAKEUP)
5133 wake_up(&ctx->sqo_wait);
5134 submitted = to_submit;
Jens Axboeb2a9ead2019-09-12 14:19:16 -06005135 } else if (to_submit) {
Pavel Begunkovae9428c2019-11-06 00:22:14 +03005136 struct mm_struct *cur_mm;
Jens Axboe2b188cc2019-01-07 10:46:33 -07005137
Pavel Begunkovae9428c2019-11-06 00:22:14 +03005138 to_submit = min(to_submit, ctx->sq_entries);
Jens Axboe2b188cc2019-01-07 10:46:33 -07005139 mutex_lock(&ctx->uring_lock);
Pavel Begunkovae9428c2019-11-06 00:22:14 +03005140 /* already have mm, so io_submit_sqes() won't try to grab it */
5141 cur_mm = ctx->sqo_mm;
5142 submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
5143 &cur_mm, false);
Jens Axboe2b188cc2019-01-07 10:46:33 -07005144 mutex_unlock(&ctx->uring_lock);
Pavel Begunkov7c504e652019-12-18 19:53:45 +03005145
5146 if (submitted != to_submit)
5147 goto out;
Jens Axboe2b188cc2019-01-07 10:46:33 -07005148 }
5149 if (flags & IORING_ENTER_GETEVENTS) {
Jens Axboedef596e2019-01-09 08:59:42 -07005150 unsigned nr_events = 0;
5151
Jens Axboe2b188cc2019-01-07 10:46:33 -07005152 min_complete = min(min_complete, ctx->cq_entries);
5153
Jens Axboedef596e2019-01-09 08:59:42 -07005154 if (ctx->flags & IORING_SETUP_IOPOLL) {
Jens Axboedef596e2019-01-09 08:59:42 -07005155 ret = io_iopoll_check(ctx, &nr_events, min_complete);
Jens Axboedef596e2019-01-09 08:59:42 -07005156 } else {
5157 ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
5158 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07005159 }
5160
Pavel Begunkov7c504e652019-12-18 19:53:45 +03005161out:
Pavel Begunkov6805b322019-10-08 02:18:42 +03005162 percpu_ref_put(&ctx->refs);
Jens Axboe2b188cc2019-01-07 10:46:33 -07005163out_fput:
5164 fdput(f);
5165 return submitted ? submitted : ret;
5166}
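
/*
 * Return convention above: if any SQEs were consumed, the submit count is
 * returned even when the subsequent wait failed; otherwise the error is
 * returned. With SQPOLL the requested count is simply echoed back, since
 * the poll thread performs the actual submission.
 */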
5167
5168static const struct file_operations io_uring_fops = {
5169 .release = io_uring_release,
Jens Axboefcb323c2019-10-24 12:39:47 -06005170 .flush = io_uring_flush,
Jens Axboe2b188cc2019-01-07 10:46:33 -07005171 .mmap = io_uring_mmap,
Roman Penyaev6c5c2402019-11-28 12:53:22 +01005172#ifndef CONFIG_MMU
5173 .get_unmapped_area = io_uring_nommu_get_unmapped_area,
5174 .mmap_capabilities = io_uring_nommu_mmap_capabilities,
5175#endif
Jens Axboe2b188cc2019-01-07 10:46:33 -07005176 .poll = io_uring_poll,
5177 .fasync = io_uring_fasync,
5178};
5179
5180static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
5181 struct io_uring_params *p)
5182{
Hristo Venev75b28af2019-08-26 17:23:46 +00005183 struct io_rings *rings;
5184 size_t size, sq_array_offset;
Jens Axboe2b188cc2019-01-07 10:46:33 -07005185
Hristo Venev75b28af2019-08-26 17:23:46 +00005186 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
5187 if (size == SIZE_MAX)
5188 return -EOVERFLOW;
5189
5190 rings = io_mem_alloc(size);
5191 if (!rings)
Jens Axboe2b188cc2019-01-07 10:46:33 -07005192 return -ENOMEM;
5193
Hristo Venev75b28af2019-08-26 17:23:46 +00005194 ctx->rings = rings;
5195 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
5196 rings->sq_ring_mask = p->sq_entries - 1;
5197 rings->cq_ring_mask = p->cq_entries - 1;
5198 rings->sq_ring_entries = p->sq_entries;
5199 rings->cq_ring_entries = p->cq_entries;
5200 ctx->sq_mask = rings->sq_ring_mask;
5201 ctx->cq_mask = rings->cq_ring_mask;
5202 ctx->sq_entries = rings->sq_ring_entries;
5203 ctx->cq_entries = rings->cq_ring_entries;
Jens Axboe2b188cc2019-01-07 10:46:33 -07005204
5205 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
Jens Axboeeb065d32019-11-20 09:26:29 -07005206 if (size == SIZE_MAX) {
5207 io_mem_free(ctx->rings);
5208 ctx->rings = NULL;
Jens Axboe2b188cc2019-01-07 10:46:33 -07005209 return -EOVERFLOW;
Jens Axboeeb065d32019-11-20 09:26:29 -07005210 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07005211
5212 ctx->sq_sqes = io_mem_alloc(size);
Jens Axboeeb065d32019-11-20 09:26:29 -07005213 if (!ctx->sq_sqes) {
5214 io_mem_free(ctx->rings);
5215 ctx->rings = NULL;
Jens Axboe2b188cc2019-01-07 10:46:33 -07005216 return -ENOMEM;
Jens Axboeeb065d32019-11-20 09:26:29 -07005217 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07005218
Jens Axboe2b188cc2019-01-07 10:46:33 -07005219 return 0;
5220}
5221
5222/*
5223 * Allocate an anonymous fd, this is what constitutes the application
5224 * visible backing of an io_uring instance. The application mmaps this
5225 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
5226 * we have to tie this fd to a socket for file garbage collection purposes.
5227 */
5228static int io_uring_get_fd(struct io_ring_ctx *ctx)
5229{
5230 struct file *file;
5231 int ret;
5232
5233#if defined(CONFIG_UNIX)
5234 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
5235 &ctx->ring_sock);
5236 if (ret)
5237 return ret;
5238#endif
5239
5240 ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
5241 if (ret < 0)
5242 goto err;
5243
5244 file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
5245 O_RDWR | O_CLOEXEC);
5246 if (IS_ERR(file)) {
5247 put_unused_fd(ret);
5248 ret = PTR_ERR(file);
5249 goto err;
5250 }
5251
5252#if defined(CONFIG_UNIX)
5253 ctx->ring_sock->file = file;
Jens Axboe6b063142019-01-10 22:13:58 -07005254 ctx->ring_sock->sk->sk_user_data = ctx;
Jens Axboe2b188cc2019-01-07 10:46:33 -07005255#endif
5256 fd_install(ret, file);
5257 return ret;
5258err:
5259#if defined(CONFIG_UNIX)
5260 sock_release(ctx->ring_sock);
5261 ctx->ring_sock = NULL;
5262#endif
5263 return ret;
5264}
5265
5266static int io_uring_create(unsigned entries, struct io_uring_params *p)
5267{
5268 struct user_struct *user = NULL;
5269 struct io_ring_ctx *ctx;
5270 bool account_mem;
5271 int ret;
5272
5273 if (!entries || entries > IORING_MAX_ENTRIES)
5274 return -EINVAL;
5275
5276 /*
5277 * Use twice as many entries for the CQ ring. It's possible for the
5278 * application to drive a higher depth than the size of the SQ ring,
5279 * since the sqes are only used at submission time. This allows for
Jens Axboe33a107f2019-10-04 12:10:03 -06005280 * some flexibility in overcommitting a bit. If the application has
5281 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
5282 * of CQ ring entries manually.
Jens Axboe2b188cc2019-01-07 10:46:33 -07005283 */
5284 p->sq_entries = roundup_pow_of_two(entries);
Jens Axboe33a107f2019-10-04 12:10:03 -06005285 if (p->flags & IORING_SETUP_CQSIZE) {
5286 /*
5287 * If IORING_SETUP_CQSIZE is set, we do the same roundup
5288 * to a power-of-two, if it isn't already. We do NOT impose
5289 * any cq vs sq ring sizing.
5290 */
5291 if (p->cq_entries < p->sq_entries || p->cq_entries > IORING_MAX_CQ_ENTRIES)
5292 return -EINVAL;
5293 p->cq_entries = roundup_pow_of_two(p->cq_entries);
5294 } else {
5295 p->cq_entries = 2 * p->sq_entries;
5296 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07005297
5298 user = get_uid(current_user());
5299 account_mem = !capable(CAP_IPC_LOCK);
5300
5301 if (account_mem) {
5302 ret = io_account_mem(user,
5303 ring_pages(p->sq_entries, p->cq_entries));
5304 if (ret) {
5305 free_uid(user);
5306 return ret;
5307 }
5308 }
5309
5310 ctx = io_ring_ctx_alloc(p);
5311 if (!ctx) {
5312 if (account_mem)
5313 io_unaccount_mem(user, ring_pages(p->sq_entries,
5314 p->cq_entries));
5315 free_uid(user);
5316 return -ENOMEM;
5317 }
5318 ctx->compat = in_compat_syscall();
5319 ctx->account_mem = account_mem;
5320 ctx->user = user;
Jens Axboe0b8c0ec2019-12-02 08:50:00 -07005321 ctx->creds = get_current_cred();
Jens Axboe2b188cc2019-01-07 10:46:33 -07005322
5323 ret = io_allocate_scq_urings(ctx, p);
5324 if (ret)
5325 goto err;
5326
Jens Axboe6c271ce2019-01-10 11:22:30 -07005327 ret = io_sq_offload_start(ctx, p);
Jens Axboe2b188cc2019-01-07 10:46:33 -07005328 if (ret)
5329 goto err;
5330
Jens Axboe2b188cc2019-01-07 10:46:33 -07005331 memset(&p->sq_off, 0, sizeof(p->sq_off));
Hristo Venev75b28af2019-08-26 17:23:46 +00005332 p->sq_off.head = offsetof(struct io_rings, sq.head);
5333 p->sq_off.tail = offsetof(struct io_rings, sq.tail);
5334 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
5335 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
5336 p->sq_off.flags = offsetof(struct io_rings, sq_flags);
5337 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
5338 p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
Jens Axboe2b188cc2019-01-07 10:46:33 -07005339
5340 memset(&p->cq_off, 0, sizeof(p->cq_off));
Hristo Venev75b28af2019-08-26 17:23:46 +00005341 p->cq_off.head = offsetof(struct io_rings, cq.head);
5342 p->cq_off.tail = offsetof(struct io_rings, cq.tail);
5343 p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
5344 p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
5345 p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
5346 p->cq_off.cqes = offsetof(struct io_rings, cqes);
Jens Axboeac90f242019-09-06 10:26:21 -06005347
Jens Axboe044c1ab2019-10-28 09:15:33 -06005348 /*
5349 * Install ring fd as the very last thing, so we don't risk someone
5350 * having closed it before we finish setup
5351 */
5352 ret = io_uring_get_fd(ctx);
5353 if (ret < 0)
5354 goto err;
5355
Jens Axboeda8c9692019-12-02 18:51:26 -07005356 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
5357 IORING_FEAT_SUBMIT_STABLE;
Dmitrii Dolgovc826bd72019-10-15 19:02:01 +02005358 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
Jens Axboe2b188cc2019-01-07 10:46:33 -07005359 return ret;
5360err:
5361 io_ring_ctx_wait_and_kill(ctx);
5362 return ret;
5363}
5364
5365/*
5366 * Sets up an io_uring context and returns the fd. The application asks for a
5367 * ring size; we return the actual sq/cq ring sizes (among other things) in the
5368 * params structure passed in.
5369 */
5370static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
5371{
5372 struct io_uring_params p;
5373 long ret;
5374 int i;
5375
5376 if (copy_from_user(&p, params, sizeof(p)))
5377 return -EFAULT;
5378 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
5379 if (p.resv[i])
5380 return -EINVAL;
5381 }
5382
Jens Axboe6c271ce2019-01-10 11:22:30 -07005383 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
Jens Axboe33a107f2019-10-04 12:10:03 -06005384 IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE))
Jens Axboe2b188cc2019-01-07 10:46:33 -07005385 return -EINVAL;
5386
5387 ret = io_uring_create(entries, &p);
5388 if (ret < 0)
5389 return ret;
5390
5391 if (copy_to_user(params, &p, sizeof(p)))
5392 return -EFAULT;
5393
5394 return ret;
5395}
5396
5397SYSCALL_DEFINE2(io_uring_setup, u32, entries,
5398 struct io_uring_params __user *, params)
5399{
5400 return io_uring_setup(entries, params);
5401}
5402
Jens Axboeedafcce2019-01-09 09:16:05 -07005403static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
5404 void __user *arg, unsigned nr_args)
Jens Axboeb19062a2019-04-15 10:49:38 -06005405 __releases(ctx->uring_lock)
5406 __acquires(ctx->uring_lock)
Jens Axboeedafcce2019-01-09 09:16:05 -07005407{
5408 int ret;
5409
Jens Axboe35fa71a2019-04-22 10:23:23 -06005410 /*
5411 * We're inside the ring mutex; if the ref is already dying, then
5412 * someone else killed the ctx or is already going through
5413 * io_uring_register().
5414 */
5415 if (percpu_ref_is_dying(&ctx->refs))
5416 return -ENXIO;
5417
Jens Axboeedafcce2019-01-09 09:16:05 -07005418 percpu_ref_kill(&ctx->refs);
Jens Axboeb19062a2019-04-15 10:49:38 -06005419
5420 /*
5421 * Drop uring mutex before waiting for references to exit. If another
5422 * thread is currently inside io_uring_enter() it might need to grab
5423 * the uring_lock to make progress. If we hold it here across the drain
5424 * wait, then we can deadlock. It's safe to drop the mutex here, since
5425 * no new references will come in after we've killed the percpu ref.
5426 */
5427 mutex_unlock(&ctx->uring_lock);
Jens Axboe206aefd2019-11-07 18:27:42 -07005428 wait_for_completion(&ctx->completions[0]);
Jens Axboeb19062a2019-04-15 10:49:38 -06005429 mutex_lock(&ctx->uring_lock);
Jens Axboeedafcce2019-01-09 09:16:05 -07005430
5431 switch (opcode) {
5432 case IORING_REGISTER_BUFFERS:
5433 ret = io_sqe_buffer_register(ctx, arg, nr_args);
5434 break;
5435 case IORING_UNREGISTER_BUFFERS:
5436 ret = -EINVAL;
5437 if (arg || nr_args)
5438 break;
5439 ret = io_sqe_buffer_unregister(ctx);
5440 break;
Jens Axboe6b063142019-01-10 22:13:58 -07005441 case IORING_REGISTER_FILES:
5442 ret = io_sqe_files_register(ctx, arg, nr_args);
5443 break;
5444 case IORING_UNREGISTER_FILES:
5445 ret = -EINVAL;
5446 if (arg || nr_args)
5447 break;
5448 ret = io_sqe_files_unregister(ctx);
5449 break;
Jens Axboec3a31e62019-10-03 13:59:56 -06005450 case IORING_REGISTER_FILES_UPDATE:
5451 ret = io_sqe_files_update(ctx, arg, nr_args);
5452 break;
Jens Axboe9b402842019-04-11 11:45:41 -06005453 case IORING_REGISTER_EVENTFD:
5454 ret = -EINVAL;
5455 if (nr_args != 1)
5456 break;
5457 ret = io_eventfd_register(ctx, arg);
5458 break;
5459 case IORING_UNREGISTER_EVENTFD:
5460 ret = -EINVAL;
5461 if (arg || nr_args)
5462 break;
5463 ret = io_eventfd_unregister(ctx);
5464 break;
Jens Axboeedafcce2019-01-09 09:16:05 -07005465 default:
5466 ret = -EINVAL;
5467 break;
5468 }
5469
5470 /* bring the ctx back to life */
Jens Axboe206aefd2019-11-07 18:27:42 -07005471 reinit_completion(&ctx->completions[0]);
Jens Axboeedafcce2019-01-09 09:16:05 -07005472 percpu_ref_reinit(&ctx->refs);
5473 return ret;
5474}
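
/*
 * All opcodes above run with the ctx fully quiesced: the percpu ref has
 * been killed and all outstanding references drained before the switch,
 * and the ref is re-inited afterwards. That is what guarantees that no
 * io_uring_enter() caller can race with buffer or file table updates.
 */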
5475
5476SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
5477 void __user *, arg, unsigned int, nr_args)
5478{
5479 struct io_ring_ctx *ctx;
5480 long ret = -EBADF;
5481 struct fd f;
5482
5483 f = fdget(fd);
5484 if (!f.file)
5485 return -EBADF;
5486
5487 ret = -EOPNOTSUPP;
5488 if (f.file->f_op != &io_uring_fops)
5489 goto out_fput;
5490
5491 ctx = f.file->private_data;
5492
5493 mutex_lock(&ctx->uring_lock);
5494 ret = __io_uring_register(ctx, opcode, arg, nr_args);
5495 mutex_unlock(&ctx->uring_lock);
Dmitrii Dolgovc826bd72019-10-15 19:02:01 +02005496 trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
5497 ctx->cq_ev_fd != NULL, ret);
Jens Axboeedafcce2019-01-09 09:16:05 -07005498out_fput:
5499 fdput(f);
5500 return ret;
5501}
5502
Jens Axboe2b188cc2019-01-07 10:46:33 -07005503static int __init io_uring_init(void)
5504{
5505 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
5506 return 0;
5507}
5508__initcall(io_uring_init);