// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2009 Red Hat, Inc.
 * Copyright (C) 2006 Rusty Russell IBM Corporation
 *
 * Author: Michael S. Tsirkin <mst@redhat.com>
 *
 * Inspiration, some code, and most witty comments come from
 * Documentation/virtual/lguest/lguest.c, by Rusty Russell
 *
 * Generic code for virtio server in host kernel.
 */

#include <linux/eventfd.h>
#include <linux/vhost.h>
#include <linux/uio.h>
#include <linux/mm.h>
#include <linux/mmu_context.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/kthread.h>
#include <linux/cgroup.h>
#include <linux/module.h>
#include <linux/sort.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/interval_tree_generic.h>
#include <linux/nospec.h>
#include <linux/kcov.h>

#include "vhost.h"

static ushort max_mem_regions = 64;
module_param(max_mem_regions, ushort, 0444);
MODULE_PARM_DESC(max_mem_regions,
	"Maximum number of memory regions in memory map. (default: 64)");
static int max_iotlb_entries = 2048;
module_param(max_iotlb_entries, int, 0444);
MODULE_PARM_DESC(max_iotlb_entries,
	"Maximum number of iotlb entries. (default: 2048)");

enum {
	VHOST_MEMORY_F_LOG = 0x1,
};

#define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
#define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])

INTERVAL_TREE_DEFINE(struct vhost_umem_node,
		     rb, __u64, __subtree_last,
		     START, LAST, static inline, vhost_umem_interval_tree);
#ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
{
	vq->user_be = !virtio_legacy_is_little_endian();
}

static void vhost_enable_cross_endian_big(struct vhost_virtqueue *vq)
{
	vq->user_be = true;
}

static void vhost_enable_cross_endian_little(struct vhost_virtqueue *vq)
{
	vq->user_be = false;
}

static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
{
	struct vhost_vring_state s;

	if (vq->private_data)
		return -EBUSY;

	if (copy_from_user(&s, argp, sizeof(s)))
		return -EFAULT;

	if (s.num != VHOST_VRING_LITTLE_ENDIAN &&
	    s.num != VHOST_VRING_BIG_ENDIAN)
		return -EINVAL;

	if (s.num == VHOST_VRING_BIG_ENDIAN)
		vhost_enable_cross_endian_big(vq);
	else
		vhost_enable_cross_endian_little(vq);

	return 0;
}

static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,
				   int __user *argp)
{
	struct vhost_vring_state s = {
		.index = idx,
		.num = vq->user_be
	};

	if (copy_to_user(argp, &s, sizeof(s)))
		return -EFAULT;

	return 0;
}

static void vhost_init_is_le(struct vhost_virtqueue *vq)
{
	/* Note for legacy virtio: user_be is initialized at reset time
	 * according to the host endianness. If userspace does not set an
	 * explicit endianness, the default behavior is native endian, as
	 * expected by legacy virtio.
	 */
	vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1) || !vq->user_be;
}
#else
static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
{
}

static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
{
	return -ENOIOCTLCMD;
}

static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,
				   int __user *argp)
{
	return -ENOIOCTLCMD;
}

static void vhost_init_is_le(struct vhost_virtqueue *vq)
{
	vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1)
		|| virtio_legacy_is_little_endian();
}
#endif /* CONFIG_VHOST_CROSS_ENDIAN_LEGACY */

static void vhost_reset_is_le(struct vhost_virtqueue *vq)
{
	vhost_init_is_le(vq);
}

struct vhost_flush_struct {
	struct vhost_work work;
	struct completion wait_event;
};

static void vhost_flush_work(struct vhost_work *work)
{
	struct vhost_flush_struct *s;

	s = container_of(work, struct vhost_flush_struct, work);
	complete(&s->wait_event);
}

static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
			    poll_table *pt)
{
	struct vhost_poll *poll;

	poll = container_of(pt, struct vhost_poll, table);
	poll->wqh = wqh;
	add_wait_queue(wqh, &poll->wait);
}

static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync,
			     void *key)
{
	struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);

	if (!(key_to_poll(key) & poll->mask))
		return 0;

	vhost_poll_queue(poll);
	return 0;
}

void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
{
	clear_bit(VHOST_WORK_QUEUED, &work->flags);
	work->fn = fn;
}
EXPORT_SYMBOL_GPL(vhost_work_init);

/* Init poll structure */
void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
		     __poll_t mask, struct vhost_dev *dev)
{
	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
	init_poll_funcptr(&poll->table, vhost_poll_func);
	poll->mask = mask;
	poll->dev = dev;
	poll->wqh = NULL;

	vhost_work_init(&poll->work, fn);
}
EXPORT_SYMBOL_GPL(vhost_poll_init);

/* Start polling a file. We add ourselves to file's wait queue. The caller must
 * keep a reference to the file until after vhost_poll_stop is called. */
int vhost_poll_start(struct vhost_poll *poll, struct file *file)
{
	__poll_t mask;

	if (poll->wqh)
		return 0;

	mask = vfs_poll(file, &poll->table);
	if (mask)
		vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
	if (mask & EPOLLERR) {
		vhost_poll_stop(poll);
		return -EINVAL;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(vhost_poll_start);

/* Stop polling a file. After this function returns, it becomes safe to drop the
 * file reference. You must also flush afterwards. */
void vhost_poll_stop(struct vhost_poll *poll)
{
	if (poll->wqh) {
		remove_wait_queue(poll->wqh, &poll->wait);
		poll->wqh = NULL;
	}
}
EXPORT_SYMBOL_GPL(vhost_poll_stop);

void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
{
	struct vhost_flush_struct flush;

	if (dev->worker) {
		init_completion(&flush.wait_event);
		vhost_work_init(&flush.work, vhost_flush_work);

		vhost_work_queue(dev, &flush.work);
		wait_for_completion(&flush.wait_event);
	}
}
EXPORT_SYMBOL_GPL(vhost_work_flush);

/* Flush any work that has been scheduled. When calling this, don't hold any
 * locks that are also used by the callback. */
void vhost_poll_flush(struct vhost_poll *poll)
{
	vhost_work_flush(poll->dev, &poll->work);
}
EXPORT_SYMBOL_GPL(vhost_poll_flush);

void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
{
	if (!dev->worker)
		return;

	if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
		/* We can only add the work to the list after we're
		 * sure it was not in the list.
		 * test_and_set_bit() implies a memory barrier.
		 */
		llist_add(&work->node, &dev->work_list);
		wake_up_process(dev->worker);
	}
}
EXPORT_SYMBOL_GPL(vhost_work_queue);

/* A lockless hint for busy polling code to exit the loop */
bool vhost_has_work(struct vhost_dev *dev)
{
	return !llist_empty(&dev->work_list);
}
EXPORT_SYMBOL_GPL(vhost_has_work);

void vhost_poll_queue(struct vhost_poll *poll)
{
	vhost_work_queue(poll->dev, &poll->work);
}
EXPORT_SYMBOL_GPL(vhost_poll_queue);
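
/*
 * Illustrative usage sketch, not part of this file: a backend such as
 * vhost-net pairs the poll helpers above roughly as follows, where
 * "handle_rx_kick" and "sock" are hypothetical names for the backend's
 * work function and the file being watched:
 *
 *	vhost_poll_init(&poll, handle_rx_kick, EPOLLIN, dev);
 *	err = vhost_poll_start(&poll, sock->file);
 *	...
 *	vhost_poll_stop(&poll);
 *	vhost_poll_flush(&poll);   (waits for already-queued work to finish)
 */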

static void __vhost_vq_meta_reset(struct vhost_virtqueue *vq)
{
	int j;

	for (j = 0; j < VHOST_NUM_ADDRS; j++)
		vq->meta_iotlb[j] = NULL;
}

static void vhost_vq_meta_reset(struct vhost_dev *d)
{
	int i;

	for (i = 0; i < d->nvqs; ++i)
		__vhost_vq_meta_reset(d->vqs[i]);
}

static void vhost_vq_reset(struct vhost_dev *dev,
			   struct vhost_virtqueue *vq)
{
	vq->num = 1;
	vq->desc = NULL;
	vq->avail = NULL;
	vq->used = NULL;
	vq->last_avail_idx = 0;
	vq->avail_idx = 0;
	vq->last_used_idx = 0;
	vq->signalled_used = 0;
	vq->signalled_used_valid = false;
	vq->used_flags = 0;
	vq->log_used = false;
	vq->log_addr = -1ull;
	vq->private_data = NULL;
	vq->acked_features = 0;
	vq->acked_backend_features = 0;
	vq->log_base = NULL;
	vq->error_ctx = NULL;
	vq->kick = NULL;
	vq->call_ctx = NULL;
	vq->log_ctx = NULL;
	vhost_reset_is_le(vq);
	vhost_disable_cross_endian(vq);
	vq->busyloop_timeout = 0;
	vq->umem = NULL;
	vq->iotlb = NULL;
	__vhost_vq_meta_reset(vq);
}

static int vhost_worker(void *data)
{
	struct vhost_dev *dev = data;
	struct vhost_work *work, *work_next;
	struct llist_node *node;
	mm_segment_t oldfs = get_fs();

	set_fs(USER_DS);
	use_mm(dev->mm);

	for (;;) {
		/* mb paired w/ kthread_stop */
		set_current_state(TASK_INTERRUPTIBLE);

		if (kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			break;
		}

		node = llist_del_all(&dev->work_list);
		if (!node)
			schedule();

		node = llist_reverse_order(node);
		/* make sure flag is seen after deletion */
		smp_wmb();
		llist_for_each_entry_safe(work, work_next, node, node) {
			clear_bit(VHOST_WORK_QUEUED, &work->flags);
			__set_current_state(TASK_RUNNING);
			kcov_remote_start_common(dev->kcov_handle);
			work->fn(work);
			kcov_remote_stop();
			if (need_resched())
				schedule();
		}
	}
	unuse_mm(dev->mm);
	set_fs(oldfs);
	return 0;
}

static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
{
	kfree(vq->indirect);
	vq->indirect = NULL;
	kfree(vq->log);
	vq->log = NULL;
	kfree(vq->heads);
	vq->heads = NULL;
}

/* Helper to allocate iovec buffers for all vqs. */
static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
{
	struct vhost_virtqueue *vq;
	int i;

	for (i = 0; i < dev->nvqs; ++i) {
		vq = dev->vqs[i];
		vq->indirect = kmalloc_array(UIO_MAXIOV,
					     sizeof(*vq->indirect),
					     GFP_KERNEL);
		vq->log = kmalloc_array(dev->iov_limit, sizeof(*vq->log),
					GFP_KERNEL);
		vq->heads = kmalloc_array(dev->iov_limit, sizeof(*vq->heads),
					  GFP_KERNEL);
		if (!vq->indirect || !vq->log || !vq->heads)
			goto err_nomem;
	}
	return 0;

err_nomem:
	for (; i >= 0; --i)
		vhost_vq_free_iovecs(dev->vqs[i]);
	return -ENOMEM;
}

static void vhost_dev_free_iovecs(struct vhost_dev *dev)
{
	int i;

	for (i = 0; i < dev->nvqs; ++i)
		vhost_vq_free_iovecs(dev->vqs[i]);
}

bool vhost_exceeds_weight(struct vhost_virtqueue *vq,
			  int pkts, int total_len)
{
	struct vhost_dev *dev = vq->dev;

	if ((dev->byte_weight && total_len >= dev->byte_weight) ||
	    pkts >= dev->weight) {
		vhost_poll_queue(&vq->poll);
		return true;
	}

	return false;
}
EXPORT_SYMBOL_GPL(vhost_exceeds_weight);

static size_t vhost_get_avail_size(struct vhost_virtqueue *vq,
				   unsigned int num)
{
	size_t event __maybe_unused =
	       vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;

	return sizeof(*vq->avail) +
	       sizeof(*vq->avail->ring) * num + event;
}

static size_t vhost_get_used_size(struct vhost_virtqueue *vq,
				  unsigned int num)
{
	size_t event __maybe_unused =
	       vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;

	return sizeof(*vq->used) +
	       sizeof(*vq->used->ring) * num + event;
}

static size_t vhost_get_desc_size(struct vhost_virtqueue *vq,
				  unsigned int num)
{
	return sizeof(*vq->desc) * num;
}

void vhost_dev_init(struct vhost_dev *dev,
		    struct vhost_virtqueue **vqs, int nvqs,
		    int iov_limit, int weight, int byte_weight,
		    int (*msg_handler)(struct vhost_dev *dev,
				       struct vhost_iotlb_msg *msg))
{
	struct vhost_virtqueue *vq;
	int i;

	dev->vqs = vqs;
	dev->nvqs = nvqs;
	mutex_init(&dev->mutex);
	dev->log_ctx = NULL;
	dev->umem = NULL;
	dev->iotlb = NULL;
	dev->mm = NULL;
	dev->worker = NULL;
	dev->iov_limit = iov_limit;
	dev->weight = weight;
	dev->byte_weight = byte_weight;
	dev->msg_handler = msg_handler;
	init_llist_head(&dev->work_list);
	init_waitqueue_head(&dev->wait);
	INIT_LIST_HEAD(&dev->read_list);
	INIT_LIST_HEAD(&dev->pending_list);
	spin_lock_init(&dev->iotlb_lock);

	for (i = 0; i < dev->nvqs; ++i) {
		vq = dev->vqs[i];
		vq->log = NULL;
		vq->indirect = NULL;
		vq->heads = NULL;
		vq->dev = dev;
		mutex_init(&vq->mutex);
		vhost_vq_reset(dev, vq);
		if (vq->handle_kick)
			vhost_poll_init(&vq->poll, vq->handle_kick,
					EPOLLIN, dev);
	}
}
EXPORT_SYMBOL_GPL(vhost_dev_init);
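
/*
 * Illustrative sketch of a hypothetical single-queue backend: callers pass
 * an array of virtqueue pointers plus the limits that bound per-iteration
 * work; "my_dev", "my_vq" and the MY_* constants are assumed names:
 *
 *	static struct vhost_virtqueue *vqs[] = { &my_vq };
 *
 *	vhost_dev_init(&my_dev, vqs, ARRAY_SIZE(vqs), UIO_MAXIOV,
 *		       MY_WEIGHT, MY_BYTE_WEIGHT, NULL);
 */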

/* Caller should have device mutex */
long vhost_dev_check_owner(struct vhost_dev *dev)
{
	/* Are you the owner? If not, I don't think you mean to do that */
	return dev->mm == current->mm ? 0 : -EPERM;
}
EXPORT_SYMBOL_GPL(vhost_dev_check_owner);

struct vhost_attach_cgroups_struct {
	struct vhost_work work;
	struct task_struct *owner;
	int ret;
};

static void vhost_attach_cgroups_work(struct vhost_work *work)
{
	struct vhost_attach_cgroups_struct *s;

	s = container_of(work, struct vhost_attach_cgroups_struct, work);
	s->ret = cgroup_attach_task_all(s->owner, current);
}

static int vhost_attach_cgroups(struct vhost_dev *dev)
{
	struct vhost_attach_cgroups_struct attach;

	attach.owner = current;
	vhost_work_init(&attach.work, vhost_attach_cgroups_work);
	vhost_work_queue(dev, &attach.work);
	vhost_work_flush(dev, &attach.work);
	return attach.ret;
}

/* Caller should have device mutex */
bool vhost_dev_has_owner(struct vhost_dev *dev)
{
	return dev->mm;
}
EXPORT_SYMBOL_GPL(vhost_dev_has_owner);

/* Caller should have device mutex */
long vhost_dev_set_owner(struct vhost_dev *dev)
{
	struct task_struct *worker;
	int err;

	/* Is there an owner already? */
	if (vhost_dev_has_owner(dev)) {
		err = -EBUSY;
		goto err_mm;
	}

	/* No owner, become one */
	dev->mm = get_task_mm(current);
	dev->kcov_handle = kcov_common_handle();
	worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
	if (IS_ERR(worker)) {
		err = PTR_ERR(worker);
		goto err_worker;
	}

	dev->worker = worker;
	wake_up_process(worker);	/* avoid contributing to loadavg */

	err = vhost_attach_cgroups(dev);
	if (err)
		goto err_cgroup;

	err = vhost_dev_alloc_iovecs(dev);
	if (err)
		goto err_cgroup;

	return 0;
err_cgroup:
	kthread_stop(worker);
	dev->worker = NULL;
err_worker:
	if (dev->mm)
		mmput(dev->mm);
	dev->mm = NULL;
	dev->kcov_handle = 0;
err_mm:
	return err;
}
EXPORT_SYMBOL_GPL(vhost_dev_set_owner);

struct vhost_umem *vhost_dev_reset_owner_prepare(void)
{
	return kvzalloc(sizeof(struct vhost_umem), GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare);

/* Caller should have device mutex */
void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_umem *umem)
{
	int i;

	vhost_dev_cleanup(dev);

	/* Restore memory to default empty mapping. */
	INIT_LIST_HEAD(&umem->umem_list);
	dev->umem = umem;
	/* We don't need VQ locks below since vhost_dev_cleanup makes sure
	 * VQs aren't running.
	 */
	for (i = 0; i < dev->nvqs; ++i)
		dev->vqs[i]->umem = umem;
}
EXPORT_SYMBOL_GPL(vhost_dev_reset_owner);

void vhost_dev_stop(struct vhost_dev *dev)
{
	int i;

	for (i = 0; i < dev->nvqs; ++i) {
		if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick) {
			vhost_poll_stop(&dev->vqs[i]->poll);
			vhost_poll_flush(&dev->vqs[i]->poll);
		}
	}
}
EXPORT_SYMBOL_GPL(vhost_dev_stop);

static void vhost_umem_free(struct vhost_umem *umem,
			    struct vhost_umem_node *node)
{
	vhost_umem_interval_tree_remove(node, &umem->umem_tree);
	list_del(&node->link);
	kfree(node);
	umem->numem--;
}

static void vhost_umem_clean(struct vhost_umem *umem)
{
	struct vhost_umem_node *node, *tmp;

	if (!umem)
		return;

	list_for_each_entry_safe(node, tmp, &umem->umem_list, link)
		vhost_umem_free(umem, node);

	kvfree(umem);
}

static void vhost_clear_msg(struct vhost_dev *dev)
{
	struct vhost_msg_node *node, *n;

	spin_lock(&dev->iotlb_lock);

	list_for_each_entry_safe(node, n, &dev->read_list, node) {
		list_del(&node->node);
		kfree(node);
	}

	list_for_each_entry_safe(node, n, &dev->pending_list, node) {
		list_del(&node->node);
		kfree(node);
	}

	spin_unlock(&dev->iotlb_lock);
}

void vhost_dev_cleanup(struct vhost_dev *dev)
{
	int i;

	for (i = 0; i < dev->nvqs; ++i) {
		if (dev->vqs[i]->error_ctx)
			eventfd_ctx_put(dev->vqs[i]->error_ctx);
		if (dev->vqs[i]->kick)
			fput(dev->vqs[i]->kick);
		if (dev->vqs[i]->call_ctx)
			eventfd_ctx_put(dev->vqs[i]->call_ctx);
		vhost_vq_reset(dev, dev->vqs[i]);
	}
	vhost_dev_free_iovecs(dev);
	if (dev->log_ctx)
		eventfd_ctx_put(dev->log_ctx);
	dev->log_ctx = NULL;
	/* No one will access memory at this point */
	vhost_umem_clean(dev->umem);
	dev->umem = NULL;
	vhost_umem_clean(dev->iotlb);
	dev->iotlb = NULL;
	vhost_clear_msg(dev);
	wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
	WARN_ON(!llist_empty(&dev->work_list));
	if (dev->worker) {
		kthread_stop(dev->worker);
		dev->worker = NULL;
		dev->kcov_handle = 0;
	}
	if (dev->mm)
		mmput(dev->mm);
	dev->mm = NULL;
}
EXPORT_SYMBOL_GPL(vhost_dev_cleanup);

static bool log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
{
	u64 a = addr / VHOST_PAGE_SIZE / 8;

	/* Make sure 64 bit math will not overflow. */
	if (a > ULONG_MAX - (unsigned long)log_base ||
	    a + (unsigned long)log_base > ULONG_MAX)
		return false;

	return access_ok(log_base + a,
			 (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8);
}

static bool vhost_overflow(u64 uaddr, u64 size)
{
	/* Make sure 64 bit math will not overflow. */
	return uaddr > ULONG_MAX || size > ULONG_MAX || uaddr > ULONG_MAX - size;
}

/* Caller should have vq mutex and device mutex. */
static bool vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem,
				int log_all)
{
	struct vhost_umem_node *node;

	if (!umem)
		return false;

	list_for_each_entry(node, &umem->umem_list, link) {
		unsigned long a = node->userspace_addr;

		if (vhost_overflow(node->userspace_addr, node->size))
			return false;

		if (!access_ok((void __user *)a,
			       node->size))
			return false;
		else if (log_all && !log_access_ok(log_base,
						   node->start,
						   node->size))
			return false;
	}
	return true;
}

static inline void __user *vhost_vq_meta_fetch(struct vhost_virtqueue *vq,
					       u64 addr, unsigned int size,
					       int type)
{
	const struct vhost_umem_node *node = vq->meta_iotlb[type];

	if (!node)
		return NULL;

	return (void *)(uintptr_t)(node->userspace_addr + addr - node->start);
}

/* Can we switch to this memory table? */
/* Caller should have device mutex but not vq mutex */
static bool memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem,
			     int log_all)
{
	int i;

	for (i = 0; i < d->nvqs; ++i) {
		bool ok;
		bool log;

		mutex_lock(&d->vqs[i]->mutex);
		log = log_all || vhost_has_feature(d->vqs[i], VHOST_F_LOG_ALL);
		/* If ring is inactive, will check when it's enabled. */
		if (d->vqs[i]->private_data)
			ok = vq_memory_access_ok(d->vqs[i]->log_base,
						 umem, log);
		else
			ok = true;
		mutex_unlock(&d->vqs[i]->mutex);
		if (!ok)
			return false;
	}
	return true;
}
static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
			  struct iovec iov[], int iov_size, int access);

static int vhost_copy_to_user(struct vhost_virtqueue *vq, void __user *to,
			      const void *from, unsigned size)
{
	int ret;

	if (!vq->iotlb)
		return __copy_to_user(to, from, size);
	else {
		/* This function should be called after iotlb
		 * prefetch, which means we're sure that all of the vq
		 * memory can be accessed through the iotlb. So -EAGAIN
		 * should not happen in this case.
		 */
		struct iov_iter t;
		void __user *uaddr = vhost_vq_meta_fetch(vq,
				     (u64)(uintptr_t)to, size,
				     VHOST_ADDR_USED);

		if (uaddr)
			return __copy_to_user(uaddr, from, size);

		ret = translate_desc(vq, (u64)(uintptr_t)to, size, vq->iotlb_iov,
				     ARRAY_SIZE(vq->iotlb_iov),
				     VHOST_ACCESS_WO);
		if (ret < 0)
			goto out;
		iov_iter_init(&t, WRITE, vq->iotlb_iov, ret, size);
		ret = copy_to_iter(from, size, &t);
		if (ret == size)
			ret = 0;
	}
out:
	return ret;
}

static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to,
				void __user *from, unsigned size)
{
	int ret;

	if (!vq->iotlb)
		return __copy_from_user(to, from, size);
	else {
		/* This function should be called after iotlb
		 * prefetch, which means we're sure that the vq
		 * memory can be accessed through the iotlb. So -EAGAIN
		 * should not happen in this case.
		 */
		void __user *uaddr = vhost_vq_meta_fetch(vq,
				     (u64)(uintptr_t)from, size,
				     VHOST_ADDR_DESC);
		struct iov_iter f;

		if (uaddr)
			return __copy_from_user(to, uaddr, size);

		ret = translate_desc(vq, (u64)(uintptr_t)from, size, vq->iotlb_iov,
				     ARRAY_SIZE(vq->iotlb_iov),
				     VHOST_ACCESS_RO);
		if (ret < 0) {
			vq_err(vq, "IOTLB translation failure: uaddr "
			       "%p size 0x%llx\n", from,
			       (unsigned long long) size);
			goto out;
		}
		iov_iter_init(&f, READ, vq->iotlb_iov, ret, size);
		ret = copy_from_iter(to, size, &f);
		if (ret == size)
			ret = 0;
	}

out:
	return ret;
}

static void __user *__vhost_get_user_slow(struct vhost_virtqueue *vq,
					  void __user *addr, unsigned int size,
					  int type)
{
	int ret;

	ret = translate_desc(vq, (u64)(uintptr_t)addr, size, vq->iotlb_iov,
			     ARRAY_SIZE(vq->iotlb_iov),
			     VHOST_ACCESS_RO);
	if (ret < 0) {
		vq_err(vq, "IOTLB translation failure: uaddr "
			"%p size 0x%llx\n", addr,
			(unsigned long long) size);
		return NULL;
	}

	if (ret != 1 || vq->iotlb_iov[0].iov_len != size) {
		vq_err(vq, "Non atomic userspace memory access: uaddr "
			"%p size 0x%llx\n", addr,
			(unsigned long long) size);
		return NULL;
	}

	return vq->iotlb_iov[0].iov_base;
}

/* This function should be called after iotlb
 * prefetch, which means we're sure that the vq
 * memory can be accessed through the iotlb. So -EAGAIN
 * should not happen in this case.
 */
static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
					    void *addr, unsigned int size,
					    int type)
{
	void __user *uaddr = vhost_vq_meta_fetch(vq,
			     (u64)(uintptr_t)addr, size, type);
	if (uaddr)
		return uaddr;

	return __vhost_get_user_slow(vq, addr, size, type);
}

#define vhost_put_user(vq, x, ptr)		\
({ \
	int ret = -EFAULT; \
	if (!vq->iotlb) { \
		ret = __put_user(x, ptr); \
	} else { \
		__typeof__(ptr) to = \
			(__typeof__(ptr)) __vhost_get_user(vq, ptr,	\
					  sizeof(*ptr), VHOST_ADDR_USED); \
		if (to != NULL) \
			ret = __put_user(x, to); \
		else \
			ret = -EFAULT;	\
	} \
	ret; \
})

static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
{
	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
			      vhost_avail_event(vq));
}

static inline int vhost_put_used(struct vhost_virtqueue *vq,
				 struct vring_used_elem *head, int idx,
				 int count)
{
	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
				  count * sizeof(*head));
}

static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
{
	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
			      &vq->used->flags);
}

static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
{
	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
			      &vq->used->idx);
}

#define vhost_get_user(vq, x, ptr, type)		\
({ \
	int ret; \
	if (!vq->iotlb) { \
		ret = __get_user(x, ptr); \
	} else { \
		__typeof__(ptr) from = \
			(__typeof__(ptr)) __vhost_get_user(vq, ptr, \
							   sizeof(*ptr), \
							   type); \
		if (from != NULL) \
			ret = __get_user(x, from); \
		else \
			ret = -EFAULT; \
	} \
	ret; \
})

#define vhost_get_avail(vq, x, ptr) \
	vhost_get_user(vq, x, ptr, VHOST_ADDR_AVAIL)

#define vhost_get_used(vq, x, ptr) \
	vhost_get_user(vq, x, ptr, VHOST_ADDR_USED)

static void vhost_dev_lock_vqs(struct vhost_dev *d)
{
	int i = 0;

	for (i = 0; i < d->nvqs; ++i)
		mutex_lock_nested(&d->vqs[i]->mutex, i);
}

static void vhost_dev_unlock_vqs(struct vhost_dev *d)
{
	int i = 0;

	for (i = 0; i < d->nvqs; ++i)
		mutex_unlock(&d->vqs[i]->mutex);
}

static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
				      __virtio16 *idx)
{
	return vhost_get_avail(vq, *idx, &vq->avail->idx);
}

static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
				       __virtio16 *head, int idx)
{
	return vhost_get_avail(vq, *head,
			       &vq->avail->ring[idx & (vq->num - 1)]);
}

static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
					__virtio16 *flags)
{
	return vhost_get_avail(vq, *flags, &vq->avail->flags);
}

static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
				       __virtio16 *event)
{
	return vhost_get_avail(vq, *event, vhost_used_event(vq));
}

static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
				     __virtio16 *idx)
{
	return vhost_get_used(vq, *idx, &vq->used->idx);
}

static inline int vhost_get_desc(struct vhost_virtqueue *vq,
				 struct vring_desc *desc, int idx)
{
	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
}

static int vhost_new_umem_range(struct vhost_umem *umem,
				u64 start, u64 size, u64 end,
				u64 userspace_addr, int perm)
{
	struct vhost_umem_node *tmp, *node;

	if (!size)
		return -EFAULT;

	node = kmalloc(sizeof(*node), GFP_ATOMIC);
	if (!node)
		return -ENOMEM;

	if (umem->numem == max_iotlb_entries) {
		tmp = list_first_entry(&umem->umem_list, typeof(*tmp), link);
		vhost_umem_free(umem, tmp);
	}

	node->start = start;
	node->size = size;
	node->last = end;
	node->userspace_addr = userspace_addr;
	node->perm = perm;
	INIT_LIST_HEAD(&node->link);
	list_add_tail(&node->link, &umem->umem_list);
	vhost_umem_interval_tree_insert(node, &umem->umem_tree);
	umem->numem++;

	return 0;
}

static void vhost_del_umem_range(struct vhost_umem *umem,
				 u64 start, u64 end)
{
	struct vhost_umem_node *node;

	while ((node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
							   start, end)))
		vhost_umem_free(umem, node);
}

static void vhost_iotlb_notify_vq(struct vhost_dev *d,
				  struct vhost_iotlb_msg *msg)
{
	struct vhost_msg_node *node, *n;

	spin_lock(&d->iotlb_lock);

	list_for_each_entry_safe(node, n, &d->pending_list, node) {
		struct vhost_iotlb_msg *vq_msg = &node->msg.iotlb;

		if (msg->iova <= vq_msg->iova &&
		    msg->iova + msg->size - 1 >= vq_msg->iova &&
		    vq_msg->type == VHOST_IOTLB_MISS) {
			vhost_poll_queue(&node->vq->poll);
			list_del(&node->node);
			kfree(node);
		}
	}

	spin_unlock(&d->iotlb_lock);
}

static bool umem_access_ok(u64 uaddr, u64 size, int access)
{
	unsigned long a = uaddr;

	/* Make sure 64 bit math will not overflow. */
	if (vhost_overflow(uaddr, size))
		return false;

	if ((access & VHOST_ACCESS_RO) &&
	    !access_ok((void __user *)a, size))
		return false;
	if ((access & VHOST_ACCESS_WO) &&
	    !access_ok((void __user *)a, size))
		return false;
	return true;
}

static int vhost_process_iotlb_msg(struct vhost_dev *dev,
				   struct vhost_iotlb_msg *msg)
{
	int ret = 0;

	mutex_lock(&dev->mutex);
	vhost_dev_lock_vqs(dev);
	switch (msg->type) {
	case VHOST_IOTLB_UPDATE:
		if (!dev->iotlb) {
			ret = -EFAULT;
			break;
		}
		if (!umem_access_ok(msg->uaddr, msg->size, msg->perm)) {
			ret = -EFAULT;
			break;
		}
		vhost_vq_meta_reset(dev);
		if (vhost_new_umem_range(dev->iotlb, msg->iova, msg->size,
					 msg->iova + msg->size - 1,
					 msg->uaddr, msg->perm)) {
			ret = -ENOMEM;
			break;
		}
		vhost_iotlb_notify_vq(dev, msg);
		break;
	case VHOST_IOTLB_INVALIDATE:
		if (!dev->iotlb) {
			ret = -EFAULT;
			break;
		}
		vhost_vq_meta_reset(dev);
		vhost_del_umem_range(dev->iotlb, msg->iova,
				     msg->iova + msg->size - 1);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	vhost_dev_unlock_vqs(dev);
	mutex_unlock(&dev->mutex);

	return ret;
}

ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
			     struct iov_iter *from)
{
	struct vhost_iotlb_msg msg;
	size_t offset;
	int type, ret;

	ret = copy_from_iter(&type, sizeof(type), from);
	if (ret != sizeof(type)) {
		ret = -EINVAL;
		goto done;
	}

	switch (type) {
	case VHOST_IOTLB_MSG:
		/* There may be a hole after the type field for the V1
		 * message format, so skip it here.
		 */
		offset = offsetof(struct vhost_msg, iotlb) - sizeof(int);
		break;
	case VHOST_IOTLB_MSG_V2:
		offset = sizeof(__u32);
		break;
	default:
		ret = -EINVAL;
		goto done;
	}

	iov_iter_advance(from, offset);
	ret = copy_from_iter(&msg, sizeof(msg), from);
	if (ret != sizeof(msg)) {
		ret = -EINVAL;
		goto done;
	}

	if (dev->msg_handler)
		ret = dev->msg_handler(dev, &msg);
	else
		ret = vhost_process_iotlb_msg(dev, &msg);
	if (ret) {
		ret = -EFAULT;
		goto done;
	}

	ret = (type == VHOST_IOTLB_MSG) ? sizeof(struct vhost_msg) :
	      sizeof(struct vhost_msg_v2);
done:
	return ret;
}
EXPORT_SYMBOL(vhost_chr_write_iter);
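
/*
 * Illustrative sketch of the userspace side, assuming the V2 message
 * format negotiated via VHOST_BACKEND_F_IOTLB_MSG_V2; "miss_iova",
 * "region_size", "host_va" and "vhost_fd" are placeholders. A device
 * process answers an IOTLB miss by writing an update message to the
 * vhost fd, which lands in vhost_chr_write_iter() above:
 *
 *	struct vhost_msg_v2 reply = {
 *		.type = VHOST_IOTLB_MSG_V2,
 *		.iotlb = {
 *			.iova  = miss_iova,
 *			.size  = region_size,
 *			.uaddr = (__u64)(uintptr_t)host_va,
 *			.perm  = VHOST_ACCESS_RW,
 *			.type  = VHOST_IOTLB_UPDATE,
 *		},
 *	};
 *	write(vhost_fd, &reply, sizeof(reply));
 */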

__poll_t vhost_chr_poll(struct file *file, struct vhost_dev *dev,
			poll_table *wait)
{
	__poll_t mask = 0;

	poll_wait(file, &dev->wait, wait);

	if (!list_empty(&dev->read_list))
		mask |= EPOLLIN | EPOLLRDNORM;

	return mask;
}
EXPORT_SYMBOL(vhost_chr_poll);

ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
			    int noblock)
{
	DEFINE_WAIT(wait);
	struct vhost_msg_node *node;
	ssize_t ret = 0;
	unsigned size = sizeof(struct vhost_msg);

	if (iov_iter_count(to) < size)
		return 0;

	while (1) {
		if (!noblock)
			prepare_to_wait(&dev->wait, &wait,
					TASK_INTERRUPTIBLE);

		node = vhost_dequeue_msg(dev, &dev->read_list);
		if (node)
			break;
		if (noblock) {
			ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (!dev->iotlb) {
			ret = -EBADFD;
			break;
		}

		schedule();
	}

	if (!noblock)
		finish_wait(&dev->wait, &wait);

	if (node) {
		struct vhost_iotlb_msg *msg;
		void *start = &node->msg;

		switch (node->msg.type) {
		case VHOST_IOTLB_MSG:
			size = sizeof(node->msg);
			msg = &node->msg.iotlb;
			break;
		case VHOST_IOTLB_MSG_V2:
			size = sizeof(node->msg_v2);
			msg = &node->msg_v2.iotlb;
			break;
		default:
			BUG();
			break;
		}

		ret = copy_to_iter(start, size, to);
		if (ret != size || msg->type != VHOST_IOTLB_MISS) {
			kfree(node);
			return ret;
		}
		vhost_enqueue_msg(dev, &dev->pending_list, node);
	}

	return ret;
}
EXPORT_SYMBOL_GPL(vhost_chr_read_iter);
1282
1283static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access)
1284{
1285 struct vhost_dev *dev = vq->dev;
1286 struct vhost_msg_node *node;
1287 struct vhost_iotlb_msg *msg;
Jason Wang429711a2018-08-06 11:17:47 +08001288 bool v2 = vhost_backend_has_feature(vq, VHOST_BACKEND_F_IOTLB_MSG_V2);
Jason Wang6b1e6cc2016-06-23 02:04:32 -04001289
Jason Wang429711a2018-08-06 11:17:47 +08001290 node = vhost_new_msg(vq, v2 ? VHOST_IOTLB_MSG_V2 : VHOST_IOTLB_MSG);
Jason Wang6b1e6cc2016-06-23 02:04:32 -04001291 if (!node)
1292 return -ENOMEM;
1293
Jason Wang429711a2018-08-06 11:17:47 +08001294 if (v2) {
1295 node->msg_v2.type = VHOST_IOTLB_MSG_V2;
1296 msg = &node->msg_v2.iotlb;
1297 } else {
1298 msg = &node->msg.iotlb;
1299 }
1300
Jason Wang6b1e6cc2016-06-23 02:04:32 -04001301 msg->type = VHOST_IOTLB_MISS;
1302 msg->iova = iova;
1303 msg->perm = access;
1304
1305 vhost_enqueue_msg(dev, &dev->read_list, node);
1306
1307 return 0;
Jason Wangbfe2bc52016-06-23 02:04:30 -04001308}
1309
Stefan Hajnocziddd3d402018-04-11 10:35:41 +08001310static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
1311 struct vring_desc __user *desc,
1312 struct vring_avail __user *avail,
1313 struct vring_used __user *used)
Jason Wang6b1e6cc2016-06-23 02:04:32 -04001314
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001315{
Jason Wang4942e822019-05-24 04:12:16 -04001316 return access_ok(desc, vhost_get_desc_size(vq, num)) &&
1317 access_ok(avail, vhost_get_avail_size(vq, num)) &&
1318 access_ok(used, vhost_get_used_size(vq, num));
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001319}
1320
Jason Wangf8894912017-02-28 17:56:02 +08001321static void vhost_vq_meta_update(struct vhost_virtqueue *vq,
1322 const struct vhost_umem_node *node,
1323 int type)
1324{
1325 int access = (type == VHOST_ADDR_USED) ?
1326 VHOST_ACCESS_WO : VHOST_ACCESS_RO;
1327
1328 if (likely(node->perm & access))
1329 vq->meta_iotlb[type] = node;
1330}
1331
Stefan Hajnocziddd3d402018-04-11 10:35:41 +08001332static bool iotlb_access_ok(struct vhost_virtqueue *vq,
1333 int access, u64 addr, u64 len, int type)
Jason Wang6b1e6cc2016-06-23 02:04:32 -04001334{
1335 const struct vhost_umem_node *node;
1336 struct vhost_umem *umem = vq->iotlb;
Michael S. Tsirkinca2c5b32017-08-21 22:33:33 +03001337 u64 s = 0, size, orig_addr = addr, last = addr + len - 1;
Jason Wangf8894912017-02-28 17:56:02 +08001338
1339 if (vhost_vq_meta_fetch(vq, addr, len, type))
1340 return true;
Jason Wang6b1e6cc2016-06-23 02:04:32 -04001341
1342 while (len > s) {
1343 node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
1344 addr,
Michael S. Tsirkinca2c5b32017-08-21 22:33:33 +03001345 last);
Jason Wang6b1e6cc2016-06-23 02:04:32 -04001346 if (node == NULL || node->start > addr) {
1347 vhost_iotlb_miss(vq, addr, access);
1348 return false;
1349 } else if (!(node->perm & access)) {
1350 /* Report the possible access violation by
1351 * request another translation from userspace.
1352 */
1353 return false;
1354 }
1355
1356 size = node->size - addr + node->start;
Jason Wangf8894912017-02-28 17:56:02 +08001357
1358 if (orig_addr == addr && size >= len)
1359 vhost_vq_meta_update(vq, node, type);
1360
Jason Wang6b1e6cc2016-06-23 02:04:32 -04001361 s += size;
1362 addr += size;
1363 }
1364
1365 return true;
1366}
1367
Jason Wang9b5e8302019-05-24 04:12:15 -04001368int vq_meta_prefetch(struct vhost_virtqueue *vq)
Jason Wang6b1e6cc2016-06-23 02:04:32 -04001369{
Jason Wang6b1e6cc2016-06-23 02:04:32 -04001370 unsigned int num = vq->num;
1371
Michael S. Tsirkin3d2c7d32019-08-10 13:53:21 -04001372 if (!vq->iotlb)
Jason Wang6b1e6cc2016-06-23 02:04:32 -04001373 return 1;
1374
1375 return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
Jason Wang4942e822019-05-24 04:12:16 -04001376 vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) &&
Jason Wang6b1e6cc2016-06-23 02:04:32 -04001377 iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail,
Jason Wang4942e822019-05-24 04:12:16 -04001378 vhost_get_avail_size(vq, num),
Jason Wangf8894912017-02-28 17:56:02 +08001379 VHOST_ADDR_AVAIL) &&
Jason Wang6b1e6cc2016-06-23 02:04:32 -04001380 iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->used,
Jason Wang4942e822019-05-24 04:12:16 -04001381 vhost_get_used_size(vq, num), VHOST_ADDR_USED);
Jason Wang6b1e6cc2016-06-23 02:04:32 -04001382}
Jason Wang9b5e8302019-05-24 04:12:15 -04001383EXPORT_SYMBOL_GPL(vq_meta_prefetch);
Jason Wang6b1e6cc2016-06-23 02:04:32 -04001384
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001385/* Can we log writes? */
1386/* Caller should have device mutex but not vq mutex */
Stefan Hajnocziddd3d402018-04-11 10:35:41 +08001387bool vhost_log_access_ok(struct vhost_dev *dev)
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001388{
Jason Wanga9709d62016-06-23 02:04:31 -04001389 return memory_access_ok(dev, dev->umem, 1);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001390}
Asias He6ac1afb2013-05-06 16:38:21 +08001391EXPORT_SYMBOL_GPL(vhost_log_access_ok);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001392
1393/* Verify access for write logging. */
1394/* Caller should have vq mutex and device mutex */
Stefan Hajnocziddd3d402018-04-11 10:35:41 +08001395static bool vq_log_access_ok(struct vhost_virtqueue *vq,
1396 void __user *log_base)
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001397{
Jason Wanga9709d62016-06-23 02:04:31 -04001398 return vq_memory_access_ok(log_base, vq->umem,
Michael S. Tsirkinea16c512014-06-05 15:20:23 +03001399 vhost_has_feature(vq, VHOST_F_LOG_ALL)) &&
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001400 (!vq->log_used || log_access_ok(log_base, vq->log_addr,
Jason Wang4942e822019-05-24 04:12:16 -04001401 vhost_get_used_size(vq, vq->num)));
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001402}
1403
1404/* Can we start vq? */
1405/* Caller should have vq mutex and device mutex */
Stefan Hajnocziddd3d402018-04-11 10:35:41 +08001406bool vhost_vq_access_ok(struct vhost_virtqueue *vq)
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001407{
Stefan Hajnoczid14d2b72018-04-11 10:35:40 +08001408 if (!vq_log_access_ok(vq, vq->log_base))
Stefan Hajnocziddd3d402018-04-11 10:35:41 +08001409 return false;
Jason Wangd65026c2018-03-29 16:00:04 +08001410
Stefan Hajnoczid14d2b72018-04-11 10:35:40 +08001411 /* Access validation occurs at prefetch time with IOTLB */
1412 if (vq->iotlb)
Stefan Hajnocziddd3d402018-04-11 10:35:41 +08001413 return true;
Jason Wangd65026c2018-03-29 16:00:04 +08001414
1415 return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001416}
Asias He6ac1afb2013-05-06 16:38:21 +08001417EXPORT_SYMBOL_GPL(vhost_vq_access_ok);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001418
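/* Allocate an empty umem (cached rbtree plus region list); used for both
 * the memory table and the device IOTLB.
 */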
Jason Wang6b1e6cc2016-06-23 02:04:32 -04001419static struct vhost_umem *vhost_umem_alloc(void)
1420{
Michal Hocko6c5ab652017-05-08 15:57:15 -07001421 struct vhost_umem *umem = kvzalloc(sizeof(*umem), GFP_KERNEL);
Jason Wang6b1e6cc2016-06-23 02:04:32 -04001422
1423 if (!umem)
1424 return NULL;
1425
Davidlohr Buesof808c132017-09-08 16:15:08 -07001426 umem->umem_tree = RB_ROOT_CACHED;
Jason Wang6b1e6cc2016-06-23 02:04:32 -04001427 umem->numem = 0;
1428 INIT_LIST_HEAD(&umem->umem_list);
1429
1430 return umem;
1431}
1432
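/* VHOST_SET_MEM_TABLE: copy the memory map from userspace into a new umem,
 * validate it, then swap it in under each vq mutex before freeing the old one.
 */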
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001433static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
1434{
Jason Wanga9709d62016-06-23 02:04:31 -04001435 struct vhost_memory mem, *newmem;
1436 struct vhost_memory_region *region;
Jason Wanga9709d62016-06-23 02:04:31 -04001437 struct vhost_umem *newumem, *oldumem;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001438 unsigned long size = offsetof(struct vhost_memory, regions);
Michael S. Tsirkin98f9ca02014-05-28 17:07:02 +03001439 int i;
Krishna Kumard47effe2011-03-01 17:06:37 +05301440
Takuya Yoshikawa7ad9c9d2010-05-27 18:58:03 +09001441 if (copy_from_user(&mem, m, size))
1442 return -EFAULT;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001443 if (mem.padding)
1444 return -EOPNOTSUPP;
Igor Mammedovc9ce42f2015-07-02 15:08:11 +02001445 if (mem.nregions > max_mem_regions)
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001446 return -E2BIG;
Matthew Wilcoxb2303d72018-06-07 07:57:18 -07001447 newmem = kvzalloc(struct_size(newmem, regions, mem.nregions),
1448 GFP_KERNEL);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001449 if (!newmem)
1450 return -ENOMEM;
1451
1452 memcpy(newmem, &mem, size);
Takuya Yoshikawa7ad9c9d2010-05-27 18:58:03 +09001453 if (copy_from_user(newmem->regions, m->regions,
1454 mem.nregions * sizeof *m->regions)) {
Igor Mammedovbcfeaca2015-06-16 18:33:35 +02001455 kvfree(newmem);
Takuya Yoshikawa7ad9c9d2010-05-27 18:58:03 +09001456 return -EFAULT;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001457 }
1458
Jason Wang6b1e6cc2016-06-23 02:04:32 -04001459 newumem = vhost_umem_alloc();
Jason Wanga9709d62016-06-23 02:04:31 -04001460 if (!newumem) {
Igor Mammedov4de72552015-07-01 11:07:09 +02001461 kvfree(newmem);
Jason Wanga9709d62016-06-23 02:04:31 -04001462 return -ENOMEM;
Takuya Yoshikawaa02c3782010-05-27 19:03:56 +09001463 }
Jason Wanga9709d62016-06-23 02:04:31 -04001464
Jason Wanga9709d62016-06-23 02:04:31 -04001465 for (region = newmem->regions;
1466 region < newmem->regions + mem.nregions;
1467 region++) {
Jason Wang6b1e6cc2016-06-23 02:04:32 -04001468 if (vhost_new_umem_range(newumem,
1469 region->guest_phys_addr,
1470 region->memory_size,
1471 region->guest_phys_addr +
1472 region->memory_size - 1,
1473 region->userspace_addr,
1474 VHOST_ACCESS_RW))
Jason Wanga9709d62016-06-23 02:04:31 -04001475 goto err;
Jason Wanga9709d62016-06-23 02:04:31 -04001476 }
1477
1478 if (!memory_access_ok(d, newumem, 0))
1479 goto err;
1480
1481 oldumem = d->umem;
1482 d->umem = newumem;
Michael S. Tsirkin98f9ca02014-05-28 17:07:02 +03001483
Michael S. Tsirkin47283be2014-06-05 15:20:27 +03001484 /* All memory accesses are done under some VQ mutex. */
Michael S. Tsirkin98f9ca02014-05-28 17:07:02 +03001485 for (i = 0; i < d->nvqs; ++i) {
1486 mutex_lock(&d->vqs[i]->mutex);
Jason Wanga9709d62016-06-23 02:04:31 -04001487 d->vqs[i]->umem = newumem;
Michael S. Tsirkin98f9ca02014-05-28 17:07:02 +03001488 mutex_unlock(&d->vqs[i]->mutex);
1489 }
Jason Wanga9709d62016-06-23 02:04:31 -04001490
1491 kvfree(newmem);
1492 vhost_umem_clean(oldumem);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001493 return 0;
Jason Wanga9709d62016-06-23 02:04:31 -04001494
1495err:
1496 vhost_umem_clean(newumem);
1497 kvfree(newmem);
1498 return -EFAULT;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001499}
1500
Jason Wangfeebcae2019-05-24 04:12:17 -04001501static long vhost_vring_set_num(struct vhost_dev *d,
1502 struct vhost_virtqueue *vq,
1503 void __user *argp)
1504{
1505 struct vhost_vring_state s;
1506
1507 /* Resizing ring with an active backend?
1508 * You don't want to do that. */
1509 if (vq->private_data)
1510 return -EBUSY;
1511
1512 if (copy_from_user(&s, argp, sizeof s))
1513 return -EFAULT;
1514
1515 if (!s.num || s.num > 0xffff || (s.num & (s.num - 1)))
1516 return -EINVAL;
1517 vq->num = s.num;
1518
1519 return 0;
1520}
1521
1522static long vhost_vring_set_addr(struct vhost_dev *d,
1523 struct vhost_virtqueue *vq,
1524 void __user *argp)
1525{
1526 struct vhost_vring_addr a;
1527
1528 if (copy_from_user(&a, argp, sizeof a))
1529 return -EFAULT;
1530 if (a.flags & ~(0x1 << VHOST_VRING_F_LOG))
1531 return -EOPNOTSUPP;
1532
1533 /* For 32bit, verify that the top 32bits of the user
1534 data are set to zero. */
1535 if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr ||
1536 (u64)(unsigned long)a.used_user_addr != a.used_user_addr ||
1537 (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr)
1538 return -EFAULT;
1539
1540 /* Make sure it's safe to cast pointers to vring types. */
1541 BUILD_BUG_ON(__alignof__ *vq->avail > VRING_AVAIL_ALIGN_SIZE);
1542 BUILD_BUG_ON(__alignof__ *vq->used > VRING_USED_ALIGN_SIZE);
1543 if ((a.avail_user_addr & (VRING_AVAIL_ALIGN_SIZE - 1)) ||
1544 (a.used_user_addr & (VRING_USED_ALIGN_SIZE - 1)) ||
1545 (a.log_guest_addr & (VRING_USED_ALIGN_SIZE - 1)))
1546 return -EINVAL;
1547
 1548	 /* We only verify access here if a backend is configured.
 1549	  * If it is not, we don't verify, as the size might not have been set up yet.
 1550	  * We will verify when the backend is configured. */
1551 if (vq->private_data) {
1552 if (!vq_access_ok(vq, vq->num,
1553 (void __user *)(unsigned long)a.desc_user_addr,
1554 (void __user *)(unsigned long)a.avail_user_addr,
1555 (void __user *)(unsigned long)a.used_user_addr))
1556 return -EINVAL;
1557
1558 /* Also validate log access for used ring if enabled. */
1559 if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) &&
1560 !log_access_ok(vq->log_base, a.log_guest_addr,
1561 sizeof *vq->used +
1562 vq->num * sizeof *vq->used->ring))
1563 return -EINVAL;
1564 }
1565
1566 vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG));
1567 vq->desc = (void __user *)(unsigned long)a.desc_user_addr;
1568 vq->avail = (void __user *)(unsigned long)a.avail_user_addr;
1569 vq->log_addr = a.log_guest_addr;
1570 vq->used = (void __user *)(unsigned long)a.used_user_addr;
1571
1572 return 0;
1573}
1574
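/* Handle VHOST_SET_VRING_NUM and VHOST_SET_VRING_ADDR under the vq mutex. */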
1575static long vhost_vring_set_num_addr(struct vhost_dev *d,
1576 struct vhost_virtqueue *vq,
1577 unsigned int ioctl,
1578 void __user *argp)
1579{
1580 long r;
1581
1582 mutex_lock(&vq->mutex);
1583
1584 switch (ioctl) {
1585 case VHOST_SET_VRING_NUM:
1586 r = vhost_vring_set_num(d, vq, argp);
1587 break;
1588 case VHOST_SET_VRING_ADDR:
1589 r = vhost_vring_set_addr(d, vq, argp);
1590 break;
1591 default:
1592 BUG();
1593 }
1594
1595 mutex_unlock(&vq->mutex);
1596
1597 return r;
1598}
Sonny Rao26b36602018-03-14 10:05:06 -07001599long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001600{
Al Virocecb46f2012-08-27 14:21:39 -04001601 struct file *eventfp, *filep = NULL;
1602 bool pollstart = false, pollstop = false;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001603 struct eventfd_ctx *ctx = NULL;
1604 u32 __user *idxp = argp;
1605 struct vhost_virtqueue *vq;
1606 struct vhost_vring_state s;
1607 struct vhost_vring_file f;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001608 u32 idx;
1609 long r;
1610
1611 r = get_user(idx, idxp);
1612 if (r < 0)
1613 return r;
Krishna Kumar0f3d9a12010-05-25 11:10:36 +05301614 if (idx >= d->nvqs)
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001615 return -ENOBUFS;
1616
Jason Wangff002262018-10-30 14:10:49 +08001617 idx = array_index_nospec(idx, d->nvqs);
Asias He3ab2e422013-04-27 11:16:48 +08001618 vq = d->vqs[idx];
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001619
Jason Wangfeebcae2019-05-24 04:12:17 -04001620 if (ioctl == VHOST_SET_VRING_NUM ||
1621 ioctl == VHOST_SET_VRING_ADDR) {
1622 return vhost_vring_set_num_addr(d, vq, ioctl, argp);
1623 }
1624
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001625 mutex_lock(&vq->mutex);
1626
1627 switch (ioctl) {
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001628 case VHOST_SET_VRING_BASE:
1629 /* Moving base with an active backend?
1630 * You don't want to do that. */
1631 if (vq->private_data) {
1632 r = -EBUSY;
1633 break;
1634 }
Takuya Yoshikawa7ad9c9d2010-05-27 18:58:03 +09001635 if (copy_from_user(&s, argp, sizeof s)) {
1636 r = -EFAULT;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001637 break;
Takuya Yoshikawa7ad9c9d2010-05-27 18:58:03 +09001638 }
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001639 if (s.num > 0xffff) {
1640 r = -EINVAL;
1641 break;
1642 }
Jason Wang8d658432017-07-27 11:22:05 +08001643 vq->last_avail_idx = s.num;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001644 /* Forget the cached index value. */
1645 vq->avail_idx = vq->last_avail_idx;
1646 break;
1647 case VHOST_GET_VRING_BASE:
1648 s.index = idx;
1649 s.num = vq->last_avail_idx;
Takuya Yoshikawa7ad9c9d2010-05-27 18:58:03 +09001650 if (copy_to_user(argp, &s, sizeof s))
1651 r = -EFAULT;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001652 break;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001653 case VHOST_SET_VRING_KICK:
Takuya Yoshikawa7ad9c9d2010-05-27 18:58:03 +09001654 if (copy_from_user(&f, argp, sizeof f)) {
1655 r = -EFAULT;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001656 break;
Takuya Yoshikawa7ad9c9d2010-05-27 18:58:03 +09001657 }
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001658 eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
Michael S. Tsirkin535297a2010-03-17 16:06:11 +02001659 if (IS_ERR(eventfp)) {
1660 r = PTR_ERR(eventfp);
1661 break;
1662 }
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001663 if (eventfp != vq->kick) {
Al Virocecb46f2012-08-27 14:21:39 -04001664 pollstop = (filep = vq->kick) != NULL;
1665 pollstart = (vq->kick = eventfp) != NULL;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001666 } else
1667 filep = eventfp;
1668 break;
1669 case VHOST_SET_VRING_CALL:
Takuya Yoshikawa7ad9c9d2010-05-27 18:58:03 +09001670 if (copy_from_user(&f, argp, sizeof f)) {
1671 r = -EFAULT;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001672 break;
Takuya Yoshikawa7ad9c9d2010-05-27 18:58:03 +09001673 }
Eric Biggerse050c7d2018-01-06 14:52:19 -08001674 ctx = f.fd == -1 ? NULL : eventfd_ctx_fdget(f.fd);
1675 if (IS_ERR(ctx)) {
1676 r = PTR_ERR(ctx);
Michael S. Tsirkin535297a2010-03-17 16:06:11 +02001677 break;
1678 }
Eric Biggerse050c7d2018-01-06 14:52:19 -08001679 swap(ctx, vq->call_ctx);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001680 break;
1681 case VHOST_SET_VRING_ERR:
Takuya Yoshikawa7ad9c9d2010-05-27 18:58:03 +09001682 if (copy_from_user(&f, argp, sizeof f)) {
1683 r = -EFAULT;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001684 break;
Takuya Yoshikawa7ad9c9d2010-05-27 18:58:03 +09001685 }
Eric Biggers09f332a2018-01-06 14:52:20 -08001686 ctx = f.fd == -1 ? NULL : eventfd_ctx_fdget(f.fd);
1687 if (IS_ERR(ctx)) {
1688 r = PTR_ERR(ctx);
Michael S. Tsirkin535297a2010-03-17 16:06:11 +02001689 break;
1690 }
Eric Biggers09f332a2018-01-06 14:52:20 -08001691 swap(ctx, vq->error_ctx);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001692 break;
Greg Kurz2751c982015-04-24 14:27:24 +02001693 case VHOST_SET_VRING_ENDIAN:
1694 r = vhost_set_vring_endian(vq, argp);
1695 break;
1696 case VHOST_GET_VRING_ENDIAN:
1697 r = vhost_get_vring_endian(vq, idx, argp);
1698 break;
Jason Wang03088132016-03-04 06:24:53 -05001699 case VHOST_SET_VRING_BUSYLOOP_TIMEOUT:
1700 if (copy_from_user(&s, argp, sizeof(s))) {
1701 r = -EFAULT;
1702 break;
1703 }
1704 vq->busyloop_timeout = s.num;
1705 break;
1706 case VHOST_GET_VRING_BUSYLOOP_TIMEOUT:
1707 s.index = idx;
1708 s.num = vq->busyloop_timeout;
1709 if (copy_to_user(argp, &s, sizeof(s)))
1710 r = -EFAULT;
1711 break;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001712 default:
1713 r = -ENOIOCTLCMD;
1714 }
1715
1716 if (pollstop && vq->handle_kick)
1717 vhost_poll_stop(&vq->poll);
1718
Eric Biggerse050c7d2018-01-06 14:52:19 -08001719 if (!IS_ERR_OR_NULL(ctx))
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001720 eventfd_ctx_put(ctx);
1721 if (filep)
1722 fput(filep);
1723
1724 if (pollstart && vq->handle_kick)
Jason Wang2b8b3282013-01-28 01:05:18 +00001725 r = vhost_poll_start(&vq->poll, vq->kick);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001726
1727 mutex_unlock(&vq->mutex);
1728
1729 if (pollstop && vq->handle_kick)
1730 vhost_poll_flush(&vq->poll);
1731 return r;
1732}
Asias He6ac1afb2013-05-06 16:38:21 +08001733EXPORT_SYMBOL_GPL(vhost_vring_ioctl);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001734
Jason Wang6b1e6cc2016-06-23 02:04:32 -04001735int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled)
1736{
1737 struct vhost_umem *niotlb, *oiotlb;
1738 int i;
1739
1740 niotlb = vhost_umem_alloc();
1741 if (!niotlb)
1742 return -ENOMEM;
1743
1744 oiotlb = d->iotlb;
1745 d->iotlb = niotlb;
1746
1747 for (i = 0; i < d->nvqs; ++i) {
Jason Wangb13f9c62018-08-08 11:43:04 +08001748 struct vhost_virtqueue *vq = d->vqs[i];
1749
1750 mutex_lock(&vq->mutex);
1751 vq->iotlb = niotlb;
1752 __vhost_vq_meta_reset(vq);
1753 mutex_unlock(&vq->mutex);
Jason Wang6b1e6cc2016-06-23 02:04:32 -04001754 }
1755
1756 vhost_umem_clean(oiotlb);
1757
1758 return 0;
1759}
1760EXPORT_SYMBOL_GPL(vhost_init_device_iotlb);
1761
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001762/* Caller must have device mutex */
Michael S. Tsirkin935cdee2012-12-06 14:03:34 +02001763long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001764{
Eric Biggersd25cc432018-01-06 14:52:21 -08001765 struct eventfd_ctx *ctx;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001766 u64 p;
1767 long r;
1768 int i, fd;
1769
1770 /* If you are not the owner, you can become one */
1771 if (ioctl == VHOST_SET_OWNER) {
1772 r = vhost_dev_set_owner(d);
1773 goto done;
1774 }
1775
1776 /* You must be the owner to do anything else */
1777 r = vhost_dev_check_owner(d);
1778 if (r)
1779 goto done;
1780
1781 switch (ioctl) {
1782 case VHOST_SET_MEM_TABLE:
1783 r = vhost_set_memory(d, argp);
1784 break;
1785 case VHOST_SET_LOG_BASE:
Takuya Yoshikawa7ad9c9d2010-05-27 18:58:03 +09001786 if (copy_from_user(&p, argp, sizeof p)) {
1787 r = -EFAULT;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001788 break;
Takuya Yoshikawa7ad9c9d2010-05-27 18:58:03 +09001789 }
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001790 if ((u64)(unsigned long)p != p) {
1791 r = -EFAULT;
1792 break;
1793 }
1794 for (i = 0; i < d->nvqs; ++i) {
1795 struct vhost_virtqueue *vq;
1796 void __user *base = (void __user *)(unsigned long)p;
Asias He3ab2e422013-04-27 11:16:48 +08001797 vq = d->vqs[i];
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001798 mutex_lock(&vq->mutex);
1799 /* If ring is inactive, will check when it's enabled. */
Michael S. Tsirkinea16c512014-06-05 15:20:23 +03001800 if (vq->private_data && !vq_log_access_ok(vq, base))
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001801 r = -EFAULT;
1802 else
1803 vq->log_base = base;
1804 mutex_unlock(&vq->mutex);
1805 }
1806 break;
1807 case VHOST_SET_LOG_FD:
1808 r = get_user(fd, (int __user *)argp);
1809 if (r < 0)
1810 break;
Eric Biggersd25cc432018-01-06 14:52:21 -08001811 ctx = fd == -1 ? NULL : eventfd_ctx_fdget(fd);
1812 if (IS_ERR(ctx)) {
1813 r = PTR_ERR(ctx);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001814 break;
1815 }
Eric Biggersd25cc432018-01-06 14:52:21 -08001816 swap(ctx, d->log_ctx);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001817 for (i = 0; i < d->nvqs; ++i) {
Asias He3ab2e422013-04-27 11:16:48 +08001818 mutex_lock(&d->vqs[i]->mutex);
1819 d->vqs[i]->log_ctx = d->log_ctx;
1820 mutex_unlock(&d->vqs[i]->mutex);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001821 }
1822 if (ctx)
1823 eventfd_ctx_put(ctx);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001824 break;
1825 default:
Michael S. Tsirkin935cdee2012-12-06 14:03:34 +02001826 r = -ENOIOCTLCMD;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001827 break;
1828 }
1829done:
1830 return r;
1831}
Asias He6ac1afb2013-05-06 16:38:21 +08001832EXPORT_SYMBOL_GPL(vhost_dev_ioctl);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001833
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001834/* TODO: This is really inefficient. We need something like get_user()
1835 * (instruction directly accesses the data, with an exception table entry
Mauro Carvalho Chehabcb1aaeb2019-06-07 15:54:32 -03001836 * returning -EFAULT). See Documentation/x86/exception-tables.rst.
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001837 */
1838static int set_bit_to_user(int nr, void __user *addr)
1839{
1840 unsigned long log = (unsigned long)addr;
1841 struct page *page;
1842 void *base;
1843 int bit = nr + (log % PAGE_SIZE) * 8;
1844 int r;
Krishna Kumard47effe2011-03-01 17:06:37 +05301845
Ira Weiny73b01402019-05-13 17:17:11 -07001846 r = get_user_pages_fast(log, 1, FOLL_WRITE, &page);
Michael S. Tsirkind6db3f52010-02-23 11:25:23 +02001847 if (r < 0)
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001848 return r;
Michael S. Tsirkind6db3f52010-02-23 11:25:23 +02001849 BUG_ON(r != 1);
Cong Wangc6daa7f2011-11-25 23:14:26 +08001850 base = kmap_atomic(page);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001851 set_bit(bit, base);
Cong Wangc6daa7f2011-11-25 23:14:26 +08001852 kunmap_atomic(base);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001853 set_page_dirty_lock(page);
1854 put_page(page);
1855 return 0;
1856}
1857
1858static int log_write(void __user *log_base,
1859 u64 write_address, u64 write_length)
1860{
Michael S. Tsirkin28831ee2010-11-29 10:22:10 +02001861 u64 write_page = write_address / VHOST_PAGE_SIZE;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001862 int r;
Krishna Kumard47effe2011-03-01 17:06:37 +05301863
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001864 if (!write_length)
1865 return 0;
Michael S. Tsirkin3bf9be42010-11-29 10:19:07 +02001866 write_length += write_address % VHOST_PAGE_SIZE;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001867 for (;;) {
1868 u64 base = (u64)(unsigned long)log_base;
Michael S. Tsirkin28831ee2010-11-29 10:22:10 +02001869 u64 log = base + write_page / 8;
1870 int bit = write_page % 8;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001871 if ((u64)(unsigned long)log != log)
1872 return -EFAULT;
1873 r = set_bit_to_user(bit, (void __user *)(unsigned long)log);
1874 if (r < 0)
1875 return r;
1876 if (write_length <= VHOST_PAGE_SIZE)
1877 break;
1878 write_length -= VHOST_PAGE_SIZE;
Michael S. Tsirkin28831ee2010-11-29 10:22:10 +02001879 write_page += 1;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001880 }
1881 return r;
1882}
1883
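/* Log a write that covers an HVA range. Since more than one GPA may map to
 * the same HVA, every umem region overlapping the range is logged.
 */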
Jason Wangcc5e7102019-01-16 16:54:42 +08001884static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len)
1885{
1886 struct vhost_umem *umem = vq->umem;
1887 struct vhost_umem_node *u;
1888 u64 start, end, l, min;
1889 int r;
1890 bool hit = false;
1891
1892 while (len) {
1893 min = len;
 1894		/* More than one GPA can be mapped into a single HVA, so
 1895		 * iterate over all possible umems here to be safe.
1896 */
1897 list_for_each_entry(u, &umem->umem_list, link) {
1898 if (u->userspace_addr > hva - 1 + len ||
1899 u->userspace_addr - 1 + u->size < hva)
1900 continue;
1901 start = max(u->userspace_addr, hva);
1902 end = min(u->userspace_addr - 1 + u->size,
1903 hva - 1 + len);
1904 l = end - start + 1;
1905 r = log_write(vq->log_base,
1906 u->start + start - u->userspace_addr,
1907 l);
1908 if (r < 0)
1909 return r;
1910 hit = true;
1911 min = min(l, min);
1912 }
1913
1914 if (!hit)
1915 return -EFAULT;
1916
1917 len -= min;
1918 hva += min;
1919 }
1920
1921 return 0;
1922}
1923
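/* Log a write to the used ring. Without an IOTLB the offset maps directly
 * onto log_addr; with an IOTLB the used ring address is translated first and
 * the resulting HVAs are logged.
 */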
1924static int log_used(struct vhost_virtqueue *vq, u64 used_offset, u64 len)
1925{
1926 struct iovec iov[64];
1927 int i, ret;
1928
1929 if (!vq->iotlb)
1930 return log_write(vq->log_base, vq->log_addr + used_offset, len);
1931
1932 ret = translate_desc(vq, (uintptr_t)vq->used + used_offset,
1933 len, iov, 64, VHOST_ACCESS_WO);
Jason Wang816db762019-02-19 14:53:44 +08001934 if (ret < 0)
Jason Wangcc5e7102019-01-16 16:54:42 +08001935 return ret;
1936
1937 for (i = 0; i < ret; i++) {
1938 ret = log_write_hva(vq, (uintptr_t)iov[i].iov_base,
1939 iov[i].iov_len);
1940 if (ret)
1941 return ret;
1942 }
1943
1944 return 0;
1945}
1946
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001947int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
Jason Wangcc5e7102019-01-16 16:54:42 +08001948 unsigned int log_num, u64 len, struct iovec *iov, int count)
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001949{
1950 int i, r;
1951
1952 /* Make sure data written is seen before log. */
Michael S. Tsirkin56593382010-02-01 07:21:02 +00001953 smp_wmb();
Jason Wangcc5e7102019-01-16 16:54:42 +08001954
1955 if (vq->iotlb) {
1956 for (i = 0; i < count; i++) {
1957 r = log_write_hva(vq, (uintptr_t)iov[i].iov_base,
1958 iov[i].iov_len);
1959 if (r < 0)
1960 return r;
1961 }
1962 return 0;
1963 }
1964
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001965 for (i = 0; i < log_num; ++i) {
1966 u64 l = min(log[i].len, len);
1967 r = log_write(vq->log_base, log[i].addr, l);
1968 if (r < 0)
1969 return r;
1970 len -= l;
Michael S. Tsirkin5786aee2010-09-22 12:31:53 +02001971 if (!len) {
1972 if (vq->log_ctx)
1973 eventfd_signal(vq->log_ctx, 1);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001974 return 0;
Michael S. Tsirkin5786aee2010-09-22 12:31:53 +02001975 }
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001976 }
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001977 /* Length written exceeds what we have stored. This is a bug. */
1978 BUG();
1979 return 0;
1980}
Asias He6ac1afb2013-05-06 16:38:21 +08001981EXPORT_SYMBOL_GPL(vhost_log_write);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00001982
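/* Push vq->used_flags out to the used ring, logging the write when dirty
 * logging is enabled.
 */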
Jason Wang2723fea2011-06-21 18:04:38 +08001983static int vhost_update_used_flags(struct vhost_virtqueue *vq)
1984{
1985 void __user *used;
Jason Wang7b5d7532019-05-24 04:12:14 -04001986 if (vhost_put_used_flags(vq))
Jason Wang2723fea2011-06-21 18:04:38 +08001987 return -EFAULT;
1988 if (unlikely(vq->log_used)) {
1989 /* Make sure the flag is seen before log. */
1990 smp_wmb();
1991 /* Log used flag write. */
1992 used = &vq->used->flags;
Jason Wangcc5e7102019-01-16 16:54:42 +08001993 log_used(vq, (used - (void __user *)vq->used),
1994 sizeof vq->used->flags);
Jason Wang2723fea2011-06-21 18:04:38 +08001995 if (vq->log_ctx)
1996 eventfd_signal(vq->log_ctx, 1);
1997 }
1998 return 0;
1999}
2000
2001static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
2002{
Jason Wang7b5d7532019-05-24 04:12:14 -04002003 if (vhost_put_avail_event(vq))
Jason Wang2723fea2011-06-21 18:04:38 +08002004 return -EFAULT;
2005 if (unlikely(vq->log_used)) {
2006 void __user *used;
2007 /* Make sure the event is seen before log. */
2008 smp_wmb();
2009 /* Log avail event write */
2010 used = vhost_avail_event(vq);
Jason Wangcc5e7102019-01-16 16:54:42 +08002011 log_used(vq, (used - (void __user *)vq->used),
2012 sizeof *vhost_avail_event(vq));
Jason Wang2723fea2011-06-21 18:04:38 +08002013 if (vq->log_ctx)
2014 eventfd_signal(vq->log_ctx, 1);
2015 }
2016 return 0;
2017}
2018
Greg Kurz80f7d032016-02-16 15:59:44 +01002019int vhost_vq_init_access(struct vhost_virtqueue *vq)
Jason Wang2723fea2011-06-21 18:04:38 +08002020{
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002021 __virtio16 last_used_idx;
Jason Wang2723fea2011-06-21 18:04:38 +08002022 int r;
Greg Kurze1f33be2016-02-16 15:54:28 +01002023 bool is_le = vq->is_le;
2024
Halil Pasiccda8bba2017-01-30 11:09:36 +01002025 if (!vq->private_data)
Jason Wang2723fea2011-06-21 18:04:38 +08002026 return 0;
Greg Kurz2751c982015-04-24 14:27:24 +02002027
2028 vhost_init_is_le(vq);
Jason Wang2723fea2011-06-21 18:04:38 +08002029
2030 r = vhost_update_used_flags(vq);
2031 if (r)
Greg Kurze1f33be2016-02-16 15:54:28 +01002032 goto err;
Jason Wang2723fea2011-06-21 18:04:38 +08002033 vq->signalled_used_valid = false;
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002034 if (!vq->iotlb &&
Linus Torvalds96d4f262019-01-03 18:57:57 -08002035 !access_ok(&vq->used->idx, sizeof vq->used->idx)) {
Greg Kurze1f33be2016-02-16 15:54:28 +01002036 r = -EFAULT;
2037 goto err;
2038 }
Jason Wang7b5d7532019-05-24 04:12:14 -04002039 r = vhost_get_used_idx(vq, &last_used_idx);
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002040 if (r) {
2041 vq_err(vq, "Can't access used idx at %p\n",
2042 &vq->used->idx);
Greg Kurze1f33be2016-02-16 15:54:28 +01002043 goto err;
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002044 }
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002045 vq->last_used_idx = vhost16_to_cpu(vq, last_used_idx);
Michael S. Tsirkin64f7f052014-12-01 17:39:39 +02002046 return 0;
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002047
Greg Kurze1f33be2016-02-16 15:54:28 +01002048err:
2049 vq->is_le = is_le;
2050 return r;
Jason Wang2723fea2011-06-21 18:04:38 +08002051}
Greg Kurz80f7d032016-02-16 15:59:44 +01002052EXPORT_SYMBOL_GPL(vhost_vq_init_access);
Jason Wang2723fea2011-06-21 18:04:38 +08002053
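/* Translate a guest (or IOTLB) address range into host-userspace iovecs,
 * splitting at region boundaries. Returns the number of iovecs filled,
 * -EAGAIN after queueing an IOTLB miss, or another negative error code.
 */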
Michael S. Tsirkin47283be2014-06-05 15:20:27 +03002054static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002055 struct iovec iov[], int iov_size, int access)
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002056{
Jason Wanga9709d62016-06-23 02:04:31 -04002057 const struct vhost_umem_node *node;
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002058 struct vhost_dev *dev = vq->dev;
2059 struct vhost_umem *umem = dev->iotlb ? dev->iotlb : dev->umem;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002060 struct iovec *_iov;
2061 u64 s = 0;
2062 int ret = 0;
2063
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002064 while ((u64)len > s) {
2065 u64 size;
Michael S. Tsirkin7b3384f2010-07-01 18:40:12 +03002066 if (unlikely(ret >= iov_size)) {
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002067 ret = -ENOBUFS;
2068 break;
2069 }
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002070
Jason Wanga9709d62016-06-23 02:04:31 -04002071 node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
2072 addr, addr + len - 1);
2073 if (node == NULL || node->start > addr) {
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002074 if (umem != dev->iotlb) {
2075 ret = -EFAULT;
2076 break;
2077 }
2078 ret = -EAGAIN;
2079 break;
2080 } else if (!(node->perm & access)) {
2081 ret = -EPERM;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002082 break;
2083 }
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002084
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002085 _iov = iov + ret;
Jason Wanga9709d62016-06-23 02:04:31 -04002086 size = node->size - addr + node->start;
Michael S. Tsirkinbd971202012-11-26 05:57:27 +00002087 _iov->iov_len = min((u64)len - s, size);
Michael S. Tsirkin0d4a3f22019-09-14 15:21:51 -04002088 _iov->iov_base = (void __user *)(unsigned long)
2089 (node->userspace_addr + addr - node->start);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002090 s += size;
2091 addr += size;
2092 ++ret;
2093 }
2094
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002095 if (ret == -EAGAIN)
2096 vhost_iotlb_miss(vq, addr, access);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002097 return ret;
2098}
2099
2100/* Each buffer in the virtqueues is actually a chain of descriptors. This
2101 * function returns the next descriptor in the chain,
2102 * or -1U if we're at the end. */
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002103static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002104{
2105 unsigned int next;
2106
2107 /* If this descriptor says it doesn't chain, we're done. */
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002108 if (!(desc->flags & cpu_to_vhost16(vq, VRING_DESC_F_NEXT)))
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002109 return -1U;
2110
2111 /* Check they're not leading us off end of descriptors. */
Paul E. McKenney3a5db0b2017-11-27 09:45:10 -08002112 next = vhost16_to_cpu(vq, READ_ONCE(desc->next));
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002113 return next;
2114}
2115
Michael S. Tsirkin47283be2014-06-05 15:20:27 +03002116static int get_indirect(struct vhost_virtqueue *vq,
Michael S. Tsirkin7b3384f2010-07-01 18:40:12 +03002117 struct iovec iov[], unsigned int iov_size,
2118 unsigned int *out_num, unsigned int *in_num,
2119 struct vhost_log *log, unsigned int *log_num,
2120 struct vring_desc *indirect)
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002121{
2122 struct vring_desc desc;
2123 unsigned int i = 0, count, found = 0;
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002124 u32 len = vhost32_to_cpu(vq, indirect->len);
Al Viroaad9a1c2014-12-10 14:49:01 -05002125 struct iov_iter from;
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002126 int ret, access;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002127
2128 /* Sanity check */
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002129 if (unlikely(len % sizeof desc)) {
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002130 vq_err(vq, "Invalid length in indirect descriptor: "
2131 "len 0x%llx not multiple of 0x%zx\n",
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002132 (unsigned long long)len,
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002133 sizeof desc);
2134 return -EINVAL;
2135 }
2136
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002137 ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002138 UIO_MAXIOV, VHOST_ACCESS_RO);
Michael S. Tsirkin7b3384f2010-07-01 18:40:12 +03002139 if (unlikely(ret < 0)) {
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002140 if (ret != -EAGAIN)
2141 vq_err(vq, "Translation failure %d in indirect.\n", ret);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002142 return ret;
2143 }
Al Viroaad9a1c2014-12-10 14:49:01 -05002144 iov_iter_init(&from, READ, vq->indirect, ret, len);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002145
2146 /* We will use the result as an address to read from, so most
2147 * architectures only need a compiler barrier here. */
2148 read_barrier_depends();
2149
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002150 count = len / sizeof desc;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002151 /* Buffers are chained via a 16 bit next field, so
2152 * we can have at most 2^16 of these. */
Michael S. Tsirkin7b3384f2010-07-01 18:40:12 +03002153 if (unlikely(count > USHRT_MAX + 1)) {
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002154 vq_err(vq, "Indirect buffer length too big: %d\n",
2155 indirect->len);
2156 return -E2BIG;
2157 }
2158
2159 do {
2160 unsigned iov_count = *in_num + *out_num;
Michael S. Tsirkin7b3384f2010-07-01 18:40:12 +03002161 if (unlikely(++found > count)) {
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002162 vq_err(vq, "Loop detected: last one at %u "
2163 "indirect size %u\n",
2164 i, count);
2165 return -EINVAL;
2166 }
Al Virocbbd26b2016-11-01 22:09:04 -04002167 if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) {
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002168 vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002169 i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002170 return -EINVAL;
2171 }
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002172 if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002173 vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002174 i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002175 return -EINVAL;
2176 }
2177
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002178 if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
2179 access = VHOST_ACCESS_WO;
2180 else
2181 access = VHOST_ACCESS_RO;
2182
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002183 ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
2184 vhost32_to_cpu(vq, desc.len), iov + iov_count,
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002185 iov_size - iov_count, access);
Michael S. Tsirkin7b3384f2010-07-01 18:40:12 +03002186 if (unlikely(ret < 0)) {
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002187 if (ret != -EAGAIN)
2188 vq_err(vq, "Translation failure %d indirect idx %d\n",
2189 ret, i);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002190 return ret;
2191 }
2192 /* If this is an input descriptor, increment that count. */
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002193 if (access == VHOST_ACCESS_WO) {
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002194 *in_num += ret;
yongduan060423b2019-09-11 17:44:24 +08002195 if (unlikely(log && ret)) {
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002196 log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
2197 log[*log_num].len = vhost32_to_cpu(vq, desc.len);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002198 ++*log_num;
2199 }
2200 } else {
2201 /* If it's an output descriptor, they're all supposed
2202 * to come before any input descriptors. */
Michael S. Tsirkin7b3384f2010-07-01 18:40:12 +03002203 if (unlikely(*in_num)) {
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002204 vq_err(vq, "Indirect descriptor "
2205 "has out after in: idx %d\n", i);
2206 return -EINVAL;
2207 }
2208 *out_num += ret;
2209 }
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002210 } while ((i = next_desc(vq, &desc)) != -1);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002211 return 0;
2212}
2213
 2214/* This looks in the virtqueue for the first available buffer, and converts
2215 * it to an iovec for convenient access. Since descriptors consist of some
2216 * number of output then some number of input descriptors, it's actually two
2217 * iovecs, but we pack them into one and note how many of each there were.
2218 *
Michael S. Tsirkind5675bd2010-06-24 16:59:59 +03002219 * This function returns the descriptor number found, or vq->num (which is
2220 * never a valid descriptor number) if none was found. A negative code is
2221 * returned on error. */
Michael S. Tsirkin47283be2014-06-05 15:20:27 +03002222int vhost_get_vq_desc(struct vhost_virtqueue *vq,
Michael S. Tsirkind5675bd2010-06-24 16:59:59 +03002223 struct iovec iov[], unsigned int iov_size,
2224 unsigned int *out_num, unsigned int *in_num,
2225 struct vhost_log *log, unsigned int *log_num)
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002226{
2227 struct vring_desc desc;
2228 unsigned int i, head, found = 0;
2229 u16 last_avail_idx;
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002230 __virtio16 avail_idx;
2231 __virtio16 ring_head;
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002232 int ret, access;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002233
2234 /* Check it isn't doing very strange things with descriptor numbers. */
2235 last_avail_idx = vq->last_avail_idx;
Jason Wange3b56cd2017-02-07 15:49:50 +08002236
2237 if (vq->avail_idx == vq->last_avail_idx) {
Jason Wang7b5d7532019-05-24 04:12:14 -04002238 if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
Jason Wange3b56cd2017-02-07 15:49:50 +08002239 vq_err(vq, "Failed to access avail idx at %p\n",
2240 &vq->avail->idx);
2241 return -EFAULT;
2242 }
2243 vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
2244
2245 if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
2246 vq_err(vq, "Guest moved used index from %u to %u",
2247 last_avail_idx, vq->avail_idx);
2248 return -EFAULT;
2249 }
2250
2251 /* If there's nothing new since last we looked, return
2252 * invalid.
2253 */
2254 if (vq->avail_idx == last_avail_idx)
2255 return vq->num;
2256
2257 /* Only get avail ring entries after they have been
2258 * exposed by guest.
2259 */
2260 smp_rmb();
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002261 }
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002262
2263 /* Grab the next descriptor number they're advertising, and increment
2264 * the index we've seen. */
Jason Wang7b5d7532019-05-24 04:12:14 -04002265 if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002266 vq_err(vq, "Failed to read head: idx %d address %p\n",
2267 last_avail_idx,
2268 &vq->avail->ring[last_avail_idx % vq->num]);
Michael S. Tsirkind5675bd2010-06-24 16:59:59 +03002269 return -EFAULT;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002270 }
2271
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002272 head = vhost16_to_cpu(vq, ring_head);
2273
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002274 /* If their number is silly, that's an error. */
Michael S. Tsirkin7b3384f2010-07-01 18:40:12 +03002275 if (unlikely(head >= vq->num)) {
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002276 vq_err(vq, "Guest says index %u > %u is available",
2277 head, vq->num);
Michael S. Tsirkind5675bd2010-06-24 16:59:59 +03002278 return -EINVAL;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002279 }
2280
2281 /* When we start there are none of either input nor output. */
2282 *out_num = *in_num = 0;
2283 if (unlikely(log))
2284 *log_num = 0;
2285
2286 i = head;
2287 do {
2288 unsigned iov_count = *in_num + *out_num;
Michael S. Tsirkin7b3384f2010-07-01 18:40:12 +03002289 if (unlikely(i >= vq->num)) {
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002290 vq_err(vq, "Desc index is %u > %u, head = %u",
2291 i, vq->num, head);
Michael S. Tsirkind5675bd2010-06-24 16:59:59 +03002292 return -EINVAL;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002293 }
Michael S. Tsirkin7b3384f2010-07-01 18:40:12 +03002294 if (unlikely(++found > vq->num)) {
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002295 vq_err(vq, "Loop detected: last one at %u "
2296 "vq size %u head %u\n",
2297 i, vq->num, head);
Michael S. Tsirkind5675bd2010-06-24 16:59:59 +03002298 return -EINVAL;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002299 }
Jason Wang7b5d7532019-05-24 04:12:14 -04002300 ret = vhost_get_desc(vq, &desc, i);
Michael S. Tsirkin7b3384f2010-07-01 18:40:12 +03002301 if (unlikely(ret)) {
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002302 vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
2303 i, vq->desc + i);
Michael S. Tsirkind5675bd2010-06-24 16:59:59 +03002304 return -EFAULT;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002305 }
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002306 if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
Michael S. Tsirkin47283be2014-06-05 15:20:27 +03002307 ret = get_indirect(vq, iov, iov_size,
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002308 out_num, in_num,
2309 log, log_num, &desc);
Michael S. Tsirkin7b3384f2010-07-01 18:40:12 +03002310 if (unlikely(ret < 0)) {
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002311 if (ret != -EAGAIN)
2312 vq_err(vq, "Failure detected "
2313 "in indirect descriptor at idx %d\n", i);
Michael S. Tsirkind5675bd2010-06-24 16:59:59 +03002314 return ret;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002315 }
2316 continue;
2317 }
2318
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002319 if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
2320 access = VHOST_ACCESS_WO;
2321 else
2322 access = VHOST_ACCESS_RO;
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002323 ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
2324 vhost32_to_cpu(vq, desc.len), iov + iov_count,
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002325 iov_size - iov_count, access);
Michael S. Tsirkin7b3384f2010-07-01 18:40:12 +03002326 if (unlikely(ret < 0)) {
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002327 if (ret != -EAGAIN)
2328 vq_err(vq, "Translation failure %d descriptor idx %d\n",
2329 ret, i);
Michael S. Tsirkind5675bd2010-06-24 16:59:59 +03002330 return ret;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002331 }
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002332 if (access == VHOST_ACCESS_WO) {
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002333 /* If this is an input descriptor,
2334 * increment that count. */
2335 *in_num += ret;
yongduan060423b2019-09-11 17:44:24 +08002336 if (unlikely(log && ret)) {
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002337 log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
2338 log[*log_num].len = vhost32_to_cpu(vq, desc.len);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002339 ++*log_num;
2340 }
2341 } else {
2342 /* If it's an output descriptor, they're all supposed
2343 * to come before any input descriptors. */
Michael S. Tsirkin7b3384f2010-07-01 18:40:12 +03002344 if (unlikely(*in_num)) {
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002345 vq_err(vq, "Descriptor has out after in: "
2346 "idx %d\n", i);
Michael S. Tsirkind5675bd2010-06-24 16:59:59 +03002347 return -EINVAL;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002348 }
2349 *out_num += ret;
2350 }
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002351 } while ((i = next_desc(vq, &desc)) != -1);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002352
2353 /* On success, increment avail index. */
2354 vq->last_avail_idx++;
Michael S. Tsirkin8ea8cf82011-05-20 02:10:54 +03002355
 2356	/* Assume notifications from the guest are disabled at this point;
 2357	 * if they aren't, we would need to update the avail_event index. */
2358 BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002359 return head;
2360}
Asias He6ac1afb2013-05-06 16:38:21 +08002361EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
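/* A minimal usage sketch for a backend worker (assumptions: caller-local
 * head/out/in/len variables, the vq->iov scratch array, no dirty logging,
 * and error handling elided):
 *
 *	head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
 *				 &out, &in, NULL, NULL);
 *	if (head == vq->num)
 *		return;
 *	... consume or fill vq->iov ...
 *	vhost_add_used_and_signal(vq->dev, vq, head, len);
 */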
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002362
2363/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
David Stevens8dd014a2010-07-27 18:52:21 +03002364void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002365{
David Stevens8dd014a2010-07-27 18:52:21 +03002366 vq->last_avail_idx -= n;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002367}
Asias He6ac1afb2013-05-06 16:38:21 +08002368EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002369
2370/* After we've used one of their buffers, we tell them about it. We'll then
2371 * want to notify the guest, using eventfd. */
2372int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
2373{
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002374 struct vring_used_elem heads = {
2375 cpu_to_vhost32(vq, head),
2376 cpu_to_vhost32(vq, len)
2377 };
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002378
Jason Wangc49e4e52013-09-02 16:40:58 +08002379 return vhost_add_used_n(vq, &heads, 1);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002380}
Asias He6ac1afb2013-05-06 16:38:21 +08002381EXPORT_SYMBOL_GPL(vhost_add_used);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002382
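/* Write "count" used elements at the current used ring offset, log them and
 * advance last_used_idx; the used index itself is published by the caller.
 */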
David Stevens8dd014a2010-07-27 18:52:21 +03002383static int __vhost_add_used_n(struct vhost_virtqueue *vq,
2384 struct vring_used_elem *heads,
2385 unsigned count)
2386{
2387 struct vring_used_elem __user *used;
Michael S. Tsirkin8ea8cf82011-05-20 02:10:54 +03002388 u16 old, new;
David Stevens8dd014a2010-07-27 18:52:21 +03002389 int start;
2390
Michael S. Tsirkin5fba13b2015-11-29 13:34:44 +02002391 start = vq->last_used_idx & (vq->num - 1);
David Stevens8dd014a2010-07-27 18:52:21 +03002392 used = vq->used->ring + start;
Jason Wang7b5d7532019-05-24 04:12:14 -04002393 if (vhost_put_used(vq, heads, start, count)) {
David Stevens8dd014a2010-07-27 18:52:21 +03002394 vq_err(vq, "Failed to write used");
2395 return -EFAULT;
2396 }
2397 if (unlikely(vq->log_used)) {
2398 /* Make sure data is seen before log. */
2399 smp_wmb();
2400 /* Log used ring entry write. */
Jason Wangcc5e7102019-01-16 16:54:42 +08002401 log_used(vq, ((void __user *)used - (void __user *)vq->used),
2402 count * sizeof *used);
David Stevens8dd014a2010-07-27 18:52:21 +03002403 }
Michael S. Tsirkin8ea8cf82011-05-20 02:10:54 +03002404 old = vq->last_used_idx;
2405 new = (vq->last_used_idx += count);
2406 /* If the driver never bothers to signal in a very long while,
 2407	 * the used index might wrap around. If that happens, invalidate
 2408	 * the signalled_used index we stored. TODO: make sure the driver
 2409	 * signals at least once every 2^16 entries and remove this. */
2410 if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
2411 vq->signalled_used_valid = false;
David Stevens8dd014a2010-07-27 18:52:21 +03002412 return 0;
2413}
2414
2415/* After we've used one of their buffers, we tell them about it. We'll then
2416 * want to notify the guest, using eventfd. */
2417int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
2418 unsigned count)
2419{
2420 int start, n, r;
2421
Michael S. Tsirkin5fba13b2015-11-29 13:34:44 +02002422 start = vq->last_used_idx & (vq->num - 1);
David Stevens8dd014a2010-07-27 18:52:21 +03002423 n = vq->num - start;
2424 if (n < count) {
2425 r = __vhost_add_used_n(vq, heads, n);
2426 if (r < 0)
2427 return r;
2428 heads += n;
2429 count -= n;
2430 }
2431 r = __vhost_add_used_n(vq, heads, count);
2432
2433 /* Make sure buffer is written before we update index. */
2434 smp_wmb();
Jason Wang7b5d7532019-05-24 04:12:14 -04002435 if (vhost_put_used_idx(vq)) {
David Stevens8dd014a2010-07-27 18:52:21 +03002436 vq_err(vq, "Failed to increment used idx");
2437 return -EFAULT;
2438 }
2439 if (unlikely(vq->log_used)) {
Jason Wang841df922018-12-13 10:53:37 +08002440 /* Make sure used idx is seen before log. */
2441 smp_wmb();
David Stevens8dd014a2010-07-27 18:52:21 +03002442 /* Log used index update. */
Jason Wangcc5e7102019-01-16 16:54:42 +08002443 log_used(vq, offsetof(struct vring_used, idx),
2444 sizeof vq->used->idx);
David Stevens8dd014a2010-07-27 18:52:21 +03002445 if (vq->log_ctx)
2446 eventfd_signal(vq->log_ctx, 1);
2447 }
2448 return r;
2449}
Asias He6ac1afb2013-05-06 16:38:21 +08002450EXPORT_SYMBOL_GPL(vhost_add_used_n);
David Stevens8dd014a2010-07-27 18:52:21 +03002451
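/* Decide whether the guest needs to be signalled, honoring
 * VIRTIO_F_NOTIFY_ON_EMPTY, the NO_INTERRUPT flag or the event index,
 * depending on the features that were negotiated.
 */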
Michael S. Tsirkin8ea8cf82011-05-20 02:10:54 +03002452static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002453{
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002454 __u16 old, new;
2455 __virtio16 event;
Michael S. Tsirkin8ea8cf82011-05-20 02:10:54 +03002456 bool v;
Jason Wang8d658432017-07-27 11:22:05 +08002457 /* Flush out used index updates. This is paired
2458 * with the barrier that the Guest executes when enabling
2459 * interrupts. */
2460 smp_mb();
Michael S. Tsirkin0d499352010-05-11 19:44:17 +03002461
Michael S. Tsirkinea16c512014-06-05 15:20:23 +03002462 if (vhost_has_feature(vq, VIRTIO_F_NOTIFY_ON_EMPTY) &&
Michael S. Tsirkin8ea8cf82011-05-20 02:10:54 +03002463 unlikely(vq->avail_idx == vq->last_avail_idx))
2464 return true;
2465
Michael S. Tsirkinea16c512014-06-05 15:20:23 +03002466 if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002467 __virtio16 flags;
Jason Wang7b5d7532019-05-24 04:12:14 -04002468 if (vhost_get_avail_flags(vq, &flags)) {
Michael S. Tsirkin8ea8cf82011-05-20 02:10:54 +03002469 vq_err(vq, "Failed to get flags");
2470 return true;
2471 }
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002472 return !(flags & cpu_to_vhost16(vq, VRING_AVAIL_F_NO_INTERRUPT));
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002473 }
Michael S. Tsirkin8ea8cf82011-05-20 02:10:54 +03002474 old = vq->signalled_used;
2475 v = vq->signalled_used_valid;
2476 new = vq->signalled_used = vq->last_used_idx;
2477 vq->signalled_used_valid = true;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002478
Michael S. Tsirkin8ea8cf82011-05-20 02:10:54 +03002479 if (unlikely(!v))
2480 return true;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002481
Jason Wang7b5d7532019-05-24 04:12:14 -04002482 if (vhost_get_used_event(vq, &event)) {
Michael S. Tsirkin8ea8cf82011-05-20 02:10:54 +03002483 vq_err(vq, "Failed to get used event idx");
2484 return true;
2485 }
Jason Wang8d658432017-07-27 11:22:05 +08002486 return vring_need_event(vhost16_to_cpu(vq, event), new, old);
Michael S. Tsirkin8ea8cf82011-05-20 02:10:54 +03002487}
2488
2489/* This actually signals the guest, using eventfd. */
2490void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
2491{
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002492	/* Signal the Guest to tell them we used something up. */
Michael S. Tsirkin8ea8cf82011-05-20 02:10:54 +03002493 if (vq->call_ctx && vhost_notify(dev, vq))
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002494 eventfd_signal(vq->call_ctx, 1);
2495}
Asias He6ac1afb2013-05-06 16:38:21 +08002496EXPORT_SYMBOL_GPL(vhost_signal);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002497
2498/* And here's the combo meal deal. Supersize me! */
2499void vhost_add_used_and_signal(struct vhost_dev *dev,
2500 struct vhost_virtqueue *vq,
2501 unsigned int head, int len)
2502{
2503 vhost_add_used(vq, head, len);
2504 vhost_signal(dev, vq);
2505}
Asias He6ac1afb2013-05-06 16:38:21 +08002506EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002507
David Stevens8dd014a2010-07-27 18:52:21 +03002508/* multi-buffer version of vhost_add_used_and_signal */
2509void vhost_add_used_and_signal_n(struct vhost_dev *dev,
2510 struct vhost_virtqueue *vq,
2511 struct vring_used_elem *heads, unsigned count)
2512{
2513 vhost_add_used_n(vq, heads, count);
2514 vhost_signal(dev, vq);
2515}
Asias He6ac1afb2013-05-06 16:38:21 +08002516EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
David Stevens8dd014a2010-07-27 18:52:21 +03002517
Jason Wangd4a60602016-03-04 06:24:52 -05002518/* Return true if we're sure that the available ring is empty */
2519bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
2520{
2521 __virtio16 avail_idx;
2522 int r;
2523
Jason Wang275bf962017-01-18 15:02:01 +08002524 if (vq->avail_idx != vq->last_avail_idx)
Jason Wangd4a60602016-03-04 06:24:52 -05002525 return false;
2526
Jason Wang7b5d7532019-05-24 04:12:14 -04002527 r = vhost_get_avail_idx(vq, &avail_idx);
Jason Wang275bf962017-01-18 15:02:01 +08002528 if (unlikely(r))
2529 return false;
2530 vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
2531
2532 return vq->avail_idx == vq->last_avail_idx;
Jason Wangd4a60602016-03-04 06:24:52 -05002533}
2534EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
2535
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002536/* OK, now we need to know about added descriptors. */
Michael S. Tsirkin8ea8cf82011-05-20 02:10:54 +03002537bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002538{
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002539 __virtio16 avail_idx;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002540 int r;
Krishna Kumard47effe2011-03-01 17:06:37 +05302541
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002542 if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
2543 return false;
2544 vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
Michael S. Tsirkinea16c512014-06-05 15:20:23 +03002545 if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
Jason Wang2723fea2011-06-21 18:04:38 +08002546 r = vhost_update_used_flags(vq);
Michael S. Tsirkin8ea8cf82011-05-20 02:10:54 +03002547 if (r) {
2548 vq_err(vq, "Failed to enable notification at %p: %d\n",
2549 &vq->used->flags, r);
2550 return false;
2551 }
2552 } else {
Jason Wang2723fea2011-06-21 18:04:38 +08002553 r = vhost_update_avail_event(vq, vq->avail_idx);
Michael S. Tsirkin8ea8cf82011-05-20 02:10:54 +03002554 if (r) {
2555 vq_err(vq, "Failed to update avail event index at %p: %d\n",
2556 vhost_avail_event(vq), r);
2557 return false;
2558 }
2559 }
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002560 /* They could have slipped one in as we were doing that: make
2561 * sure it's written, then check again. */
Michael S. Tsirkin56593382010-02-01 07:21:02 +00002562 smp_mb();
Jason Wang7b5d7532019-05-24 04:12:14 -04002563 r = vhost_get_avail_idx(vq, &avail_idx);
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002564 if (r) {
2565 vq_err(vq, "Failed to check avail idx at %p: %d\n",
2566 &vq->avail->idx, r);
2567 return false;
2568 }
2569
Michael S. Tsirkin3b1bbe82014-10-24 14:04:47 +03002570 return vhost16_to_cpu(vq, avail_idx) != vq->avail_idx;
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002571}
Asias He6ac1afb2013-05-06 16:38:21 +08002572EXPORT_SYMBOL_GPL(vhost_enable_notify);
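/* A sketch of the re-check pattern existing backends use when the ring runs
 * dry (assumption: dev is the owning vhost_dev): re-enable notifications,
 * then look again to close the race with the guest adding a buffer:
 *
 *	if (head == vq->num) {
 *		if (unlikely(vhost_enable_notify(dev, vq))) {
 *			vhost_disable_notify(dev, vq);
 *			continue;
 *		}
 *		break;
 *	}
 */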
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002573
2574/* We don't need to be notified again. */
Michael S. Tsirkin8ea8cf82011-05-20 02:10:54 +03002575void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002576{
2577 int r;
Krishna Kumard47effe2011-03-01 17:06:37 +05302578
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002579 if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
2580 return;
2581 vq->used_flags |= VRING_USED_F_NO_NOTIFY;
Michael S. Tsirkinea16c512014-06-05 15:20:23 +03002582 if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
Jason Wang2723fea2011-06-21 18:04:38 +08002583 r = vhost_update_used_flags(vq);
Michael S. Tsirkin8ea8cf82011-05-20 02:10:54 +03002584 if (r)
2585 vq_err(vq, "Failed to enable notification at %p: %d\n",
2586 &vq->used->flags, r);
2587 }
Michael S. Tsirkin3a4d5c92010-01-14 06:17:27 +00002588}
Asias He6ac1afb2013-05-06 16:38:21 +08002589EXPORT_SYMBOL_GPL(vhost_disable_notify);
2590
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002591/* Create a new message. */
2592struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type)
2593{
2594 struct vhost_msg_node *node = kmalloc(sizeof *node, GFP_KERNEL);
2595 if (!node)
2596 return NULL;
Michael S. Tsirkin670ae9c2018-05-12 00:33:10 +03002597
2598 /* Make sure all padding within the structure is initialized. */
2599 memset(&node->msg, 0, sizeof node->msg);
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002600 node->vq = vq;
2601 node->msg.type = type;
2602 return node;
2603}
2604EXPORT_SYMBOL_GPL(vhost_new_msg);
2605
2606void vhost_enqueue_msg(struct vhost_dev *dev, struct list_head *head,
2607 struct vhost_msg_node *node)
2608{
2609 spin_lock(&dev->iotlb_lock);
2610 list_add_tail(&node->node, head);
2611 spin_unlock(&dev->iotlb_lock);
2612
Linus Torvaldsa9a08842018-02-11 14:34:03 -08002613 wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
Jason Wang6b1e6cc2016-06-23 02:04:32 -04002614}
2615EXPORT_SYMBOL_GPL(vhost_enqueue_msg);
2616
2617struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
2618 struct list_head *head)
2619{
2620 struct vhost_msg_node *node = NULL;
2621
2622 spin_lock(&dev->iotlb_lock);
2623 if (!list_empty(head)) {
2624 node = list_first_entry(head, struct vhost_msg_node,
2625 node);
2626 list_del(&node->node);
2627 }
2628 spin_unlock(&dev->iotlb_lock);
2629
2630 return node;
2631}
2632EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
2633
2634
Asias He6ac1afb2013-05-06 16:38:21 +08002635static int __init vhost_init(void)
2636{
2637 return 0;
2638}
2639
2640static void __exit vhost_exit(void)
2641{
2642}
2643
2644module_init(vhost_init);
2645module_exit(vhost_exit);
2646
2647MODULE_VERSION("0.0.1");
2648MODULE_LICENSE("GPL v2");
2649MODULE_AUTHOR("Michael S. Tsirkin");
2650MODULE_DESCRIPTION("Host kernel accelerator for virtio");