// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allow a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *	      Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock_drv.h>
#include <net/busy_poll.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"
#include "xsk.h"

#define TX_BATCH_SIZE 32

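/* Per-CPU list of sockets that received packets through an XDP_REDIRECT to an
 * XSKMAP during the current NAPI poll. __xsk_map_flush() walks it at the end
 * of the poll so that Rx descriptors are published to userspace in one batch.
 */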
static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);

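/* The xsk_{set,clear}_{rx,tx}_need_wakeup() helpers below are used by
 * zero-copy drivers to tell userspace whether it has to kick the kernel
 * (via poll()/sendmsg()/recvmsg()) before more descriptors are processed.
 * The cached copy in the pool mirrors the flag exposed in the user-visible
 * fill and Tx rings.
 */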
void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
		return;

	pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
	pool->cached_need_wakeup |= XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_set_rx_need_wakeup);

void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
{
	struct xdp_sock *xs;

	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	pool->cached_need_wakeup |= XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_set_tx_need_wakeup);

void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
{
	if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
		return;

	pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
	pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);

void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
{
	struct xdp_sock *xs;

	if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);

bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
{
	return pool->uses_need_wakeup;
}
EXPORT_SYMBOL(xsk_uses_need_wakeup);

struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
					    u16 queue_id)
{
	if (queue_id < dev->real_num_rx_queues)
		return dev->_rx[queue_id].pool;
	if (queue_id < dev->real_num_tx_queues)
		return dev->_tx[queue_id].pool;

	return NULL;
}
EXPORT_SYMBOL(xsk_get_pool_from_qid);

void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
{
	if (queue_id < dev->num_rx_queues)
		dev->_rx[queue_id].pool = NULL;
	if (queue_id < dev->num_tx_queues)
		dev->_tx[queue_id].pool = NULL;
}

/* The buffer pool is stored both in the _rx struct and the _tx struct as we do
 * not know if the device has more tx queues than rx, or the opposite.
 * This might also change during run time.
 */
int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
			u16 queue_id)
{
	if (queue_id >= max_t(unsigned int,
			      dev->real_num_rx_queues,
			      dev->real_num_tx_queues))
		return -EINVAL;

	if (queue_id < dev->real_num_rx_queues)
		dev->_rx[queue_id].pool = pool;
	if (queue_id < dev->real_num_tx_queues)
		dev->_tx[queue_id].pool = pool;

	return 0;
}

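/* Zero-copy Rx: the xdp_buff already lives in umem, so receiving it only
 * means publishing its address and length on the socket's Rx ring; from that
 * point the buffer is owned by userspace until it is posted back on the fill
 * ring.
 */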
static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
	u64 addr;
	int err;

	addr = xp_get_handle(xskb);
	err = xskq_prod_reserve_desc(xs->rx, addr, len);
	if (err) {
		xs->rx_queue_full++;
		return err;
	}

	xp_release(xskb);
	return 0;
}

static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
{
	void *from_buf, *to_buf;
	u32 metalen;

	if (unlikely(xdp_data_meta_unsupported(from))) {
		from_buf = from->data;
		to_buf = to->data;
		metalen = 0;
	} else {
		from_buf = from->data_meta;
		metalen = from->data - from->data_meta;
		to_buf = to->data - metalen;
	}

	memcpy(to_buf, from_buf, len + metalen);
}

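/* Copy-mode Rx: allocate a frame from the pool, copy the packet (together
 * with any metadata in front of xdp->data) into it and place it on the Rx
 * ring via the zero-copy helper above.
 */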
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	struct xdp_buff *xsk_xdp;
	int err;
	u32 len;

	len = xdp->data_end - xdp->data;
	if (len > xsk_pool_get_rx_frame_size(xs->pool)) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	xsk_xdp = xsk_buff_alloc(xs->pool);
	if (!xsk_xdp) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	xsk_copy_xdp(xsk_xdp, xdp, len);
	err = __xsk_rcv_zc(xs, xsk_xdp, len);
	if (err) {
		xsk_buff_free(xsk_xdp);
		return err;
	}
	return 0;
}

static bool xsk_tx_writeable(struct xdp_sock *xs)
{
	if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
		return false;

	return true;
}

static bool xsk_is_bound(struct xdp_sock *xs)
{
	if (READ_ONCE(xs->state) == XSK_BOUND) {
		/* Matches smp_wmb() in bind(). */
		smp_rmb();
		return true;
	}
	return false;
}

static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	if (!xsk_is_bound(xs))
		return -EINVAL;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	sk_mark_napi_id_once_xdp(&xs->sk, xdp);
	return 0;
}

static void xsk_flush(struct xdp_sock *xs)
{
	xskq_prod_submit(xs->rx);
	__xskq_cons_release(xs->pool->fq);
	sock_def_readable(&xs->sk);
}

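/* Receive entry point for the generic (skb-based) XDP path. Unlike the native
 * path below, there is no flush-list batching here: the descriptor is
 * published and the socket woken up immediately, under rx_lock since this can
 * run concurrently for the same socket.
 */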
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	int err;

	spin_lock_bh(&xs->rx_lock);
	err = xsk_rcv_check(xs, xdp);
	if (!err) {
		err = __xsk_rcv(xs, xdp);
		xsk_flush(xs);
	}
	spin_unlock_bh(&xs->rx_lock);
	return err;
}

static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	int err;
	u32 len;

	err = xsk_rcv_check(xs, xdp);
	if (err)
		return err;

	if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
		len = xdp->data_end - xdp->data;
		return __xsk_rcv_zc(xs, xdp, len);
	}

	err = __xsk_rcv(xs, xdp);
	if (!err)
		xdp_return_buff(xdp);
	return err;
}

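/* Native XDP_REDIRECT path: receive one packet and queue the socket on the
 * per-CPU flush list so that xsk_flush() runs once per NAPI poll in
 * __xsk_map_flush() instead of once per packet.
 */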
int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
	int err;

	err = xsk_rcv(xs, xdp);
	if (err)
		return err;

	if (!xs->flush_node.prev)
		list_add(&xs->flush_node, flush_list);

	return 0;
}

void __xsk_map_flush(void)
{
	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
	struct xdp_sock *xs, *tmp;

	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
		xsk_flush(xs);
		__list_del_clearprev(&xs->flush_node);
	}
}

void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
{
	xskq_prod_submit_n(pool->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_tx_completed);

void xsk_tx_release(struct xsk_buff_pool *pool)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		__xskq_cons_release(xs->tx);
		if (xsk_tx_writeable(xs))
			xs->sk.sk_write_space(&xs->sk);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_tx_release);

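/* Called by zero-copy drivers to fetch the next Tx descriptor to transmit.
 * A completion queue slot is reserved up front so that posting the completion
 * cannot fail once the packet has been handed to the hardware.
 */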
bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
			xs->tx->queue_empty_descs++;
			continue;
		}

		/* This is the backpressure mechanism for the Tx path.
		 * Reserve space in the completion queue and only proceed
		 * if there is space in it. This avoids having to implement
		 * any buffering in the Tx path.
		 */
		if (xskq_prod_reserve_addr(pool->cq, desc->addr))
			goto out;

		xskq_cons_release(xs->tx);
		rcu_read_unlock();
		return true;
	}

out:
	rcu_read_unlock();
	return false;
}
EXPORT_SYMBOL(xsk_tx_peek_desc);

static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, struct xdp_desc *descs,
					u32 max_entries)
{
	u32 nb_pkts = 0;

	while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
		nb_pkts++;

	xsk_tx_release(pool);
	return nb_pkts;
}

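/* Batched variant of xsk_tx_peek_desc(). Only usable when a single socket is
 * transmitting on the pool; with multiple sockets sharing the pool it falls
 * back to peeking and releasing one descriptor at a time.
 */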
u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *descs,
				   u32 max_entries)
{
	struct xdp_sock *xs;
	u32 nb_pkts;

	rcu_read_lock();
	if (!list_is_singular(&pool->xsk_tx_list)) {
		/* Fallback to the non-batched version */
		rcu_read_unlock();
		return xsk_tx_peek_release_fallback(pool, descs, max_entries);
	}

	xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
	if (!xs) {
		nb_pkts = 0;
		goto out;
	}

	nb_pkts = xskq_cons_peek_desc_batch(xs->tx, descs, pool, max_entries);
	if (!nb_pkts) {
		xs->tx->queue_empty_descs++;
		goto out;
	}

	/* This is the backpressure mechanism for the Tx path. Try to
	 * reserve space in the completion queue for all packets, but
	 * if there are fewer slots available, just process that many
	 * packets. This avoids having to implement any buffering in
	 * the Tx path.
	 */
	nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, descs, nb_pkts);
	if (!nb_pkts)
		goto out;

	xskq_cons_release_n(xs->tx, nb_pkts);
	__xskq_cons_release(xs->tx);
	xs->sk.sk_write_space(&xs->sk);

out:
	rcu_read_unlock();
	return nb_pkts;
}
EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);

static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
{
	struct net_device *dev = xs->dev;
	int err;

	rcu_read_lock();
	err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
	rcu_read_unlock();

	return err;
}

static int xsk_zc_xmit(struct xdp_sock *xs)
{
	return xsk_wakeup(xs, XDP_WAKEUP_TX);
}

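/* skb destructor for copy-mode Tx: once the stack has consumed the skb, post
 * the buffer's address on the completion queue so that userspace can reuse
 * the umem frame.
 */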
static void xsk_destruct_skb(struct sk_buff *skb)
{
	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
	struct xdp_sock *xs = xdp_sk(skb->sk);
	unsigned long flags;

	spin_lock_irqsave(&xs->pool->cq_lock, flags);
	xskq_prod_submit_addr(xs->pool->cq, addr);
	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);

	sock_wfree(skb);
}

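/* For devices with IFF_TX_SKB_NO_LINEAR, build an skb whose fragments point
 * straight into the umem pages, avoiding the copy that xsk_build_skb()
 * otherwise has to do.
 */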
static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
					      struct xdp_desc *desc)
{
	struct xsk_buff_pool *pool = xs->pool;
	u32 hr, len, ts, offset, copy, copied;
	struct sk_buff *skb;
	struct page *page;
	void *buffer;
	int err, i;
	u64 addr;

	hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));

	skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
	if (unlikely(!skb))
		return ERR_PTR(err);

	skb_reserve(skb, hr);

	addr = desc->addr;
	len = desc->len;
	ts = pool->unaligned ? len : pool->chunk_size;

	buffer = xsk_buff_raw_get_data(pool, addr);
	offset = offset_in_page(buffer);
	addr = buffer - pool->addrs;

	for (copied = 0, i = 0; copied < len; i++) {
		page = pool->umem->pgs[addr >> PAGE_SHIFT];
		get_page(page);

		copy = min_t(u32, PAGE_SIZE - offset, len - copied);
		skb_fill_page_desc(skb, i, page, offset, copy);

		copied += copy;
		addr += copy;
		offset = 0;
	}

	skb->len += len;
	skb->data_len += len;
	skb->truesize += ts;

	refcount_add(ts, &xs->sk.sk_wmem_alloc);

	return skb;
}

static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
				     struct xdp_desc *desc)
{
	struct net_device *dev = xs->dev;
	struct sk_buff *skb;

	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
		skb = xsk_build_skb_zerocopy(xs, desc);
		if (IS_ERR(skb))
			return skb;
	} else {
		u32 hr, tr, len;
		void *buffer;
		int err;

		hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
		tr = dev->needed_tailroom;
		len = desc->len;

		skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
		if (unlikely(!skb))
			return ERR_PTR(err);

		skb_reserve(skb, hr);
		skb_put(skb, len);

		buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
		err = skb_store_bits(skb, 0, buffer, len);
		if (unlikely(err)) {
			kfree_skb(skb);
			return ERR_PTR(err);
		}
	}

	skb->dev = dev;
	skb->priority = xs->sk.sk_priority;
	skb->mark = xs->sk.sk_mark;
	skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
	skb->destructor = xsk_destruct_skb;

	return skb;
}

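/* Copy-mode Tx: peel descriptors off the Tx ring, build an skb for each one
 * and push it out with __dev_direct_xmit(). A completion queue slot is
 * reserved before every transmission; xsk_destruct_skb() fills it in when the
 * skb is freed, and it is cancelled again if the queue was busy.
 */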
static int xsk_generic_xmit(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);
	u32 max_batch = TX_BATCH_SIZE;
	bool sent_frame = false;
	struct xdp_desc desc;
	struct sk_buff *skb;
	unsigned long flags;
	int err = 0;

	mutex_lock(&xs->mutex);

	if (xs->queue_id >= xs->dev->real_num_tx_queues)
		goto out;

	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
		if (max_batch-- == 0) {
			err = -EAGAIN;
			goto out;
		}

		skb = xsk_build_skb(xs, &desc);
		if (IS_ERR(skb)) {
			err = PTR_ERR(skb);
			goto out;
		}

		/* This is the backpressure mechanism for the Tx path.
		 * Reserve space in the completion queue and only proceed
		 * if there is space in it. This avoids having to implement
		 * any buffering in the Tx path.
		 */
		spin_lock_irqsave(&xs->pool->cq_lock, flags);
		if (xskq_prod_reserve(xs->pool->cq)) {
			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
			kfree_skb(skb);
			goto out;
		}
		spin_unlock_irqrestore(&xs->pool->cq_lock, flags);

		err = __dev_direct_xmit(skb, xs->queue_id);
		if (err == NETDEV_TX_BUSY) {
			/* Tell user-space to retry the send */
			skb->destructor = sock_wfree;
			spin_lock_irqsave(&xs->pool->cq_lock, flags);
			xskq_prod_cancel(xs->pool->cq);
			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
			/* Free skb without triggering the perf drop trace */
			consume_skb(skb);
			err = -EAGAIN;
			goto out;
		}

		xskq_cons_release(xs->tx);
		/* Ignore NET_XMIT_CN as packet might have been sent */
		if (err == NET_XMIT_DROP) {
			/* SKB completed but not sent */
			err = -EBUSY;
			goto out;
		}

		sent_frame = true;
	}

	xs->tx->queue_empty_descs++;

out:
	if (sent_frame)
		if (xsk_tx_writeable(xs))
			sk->sk_write_space(sk);

	mutex_unlock(&xs->mutex);
	return err;
}

static int __xsk_sendmsg(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;
	if (unlikely(!xs->tx))
		return -ENOBUFS;

	return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
}

static bool xsk_no_wakeup(struct sock *sk)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
	/* Prefer busy-polling, skip the wakeup. */
	return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
	       READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID;
#else
	return false;
#endif
}

static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct xsk_buff_pool *pool;

	if (unlikely(!xsk_is_bound(xs)))
		return -ENXIO;
	if (unlikely(need_wait))
		return -EOPNOTSUPP;

	if (sk_can_busy_loop(sk))
		sk_busy_loop(sk, 1); /* only support non-blocking sockets */

	if (xsk_no_wakeup(sk))
		return 0;

	pool = xs->pool;
	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
		return __xsk_sendmsg(sk);
	return 0;
}

static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
{
	bool need_wait = !(flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!xsk_is_bound(xs)))
		return -ENXIO;
	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;
	if (unlikely(!xs->rx))
		return -ENOBUFS;
	if (unlikely(need_wait))
		return -EOPNOTSUPP;

	if (sk_can_busy_loop(sk))
		sk_busy_loop(sk, 1); /* only support non-blocking sockets */

	if (xsk_no_wakeup(sk))
		return 0;

	if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
		return xsk_wakeup(xs, XDP_WAKEUP_RX);
	return 0;
}

static __poll_t xsk_poll(struct file *file, struct socket *sock,
			 struct poll_table_struct *wait)
{
	__poll_t mask = 0;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct xsk_buff_pool *pool;

	sock_poll_wait(file, sock, wait);

	if (unlikely(!xsk_is_bound(xs)))
		return mask;

	pool = xs->pool;

	if (pool->cached_need_wakeup) {
		if (xs->zc)
			xsk_wakeup(xs, pool->cached_need_wakeup);
		else
			/* Poll needs to drive Tx also in copy mode */
			__xsk_sendmsg(sk);
	}

	if (xs->rx && !xskq_prod_is_empty(xs->rx))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (xs->tx && xsk_tx_writeable(xs))
		mask |= EPOLLOUT | EPOLLWRNORM;

	return mask;
}

static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
			  bool umem_queue)
{
	struct xsk_queue *q;

	if (entries == 0 || *queue || !is_power_of_2(entries))
		return -EINVAL;

	q = xskq_create(entries, umem_queue);
	if (!q)
		return -ENOMEM;

	/* Make sure queue is ready before it can be seen by others */
	smp_wmb();
	WRITE_ONCE(*queue, q);
	return 0;
}

static void xsk_unbind_dev(struct xdp_sock *xs)
{
	struct net_device *dev = xs->dev;

	if (xs->state != XSK_BOUND)
		return;
	WRITE_ONCE(xs->state, XSK_UNBOUND);

	/* Wait for driver to stop using the xdp socket. */
	xp_del_xsk(xs->pool, xs);
	xs->dev = NULL;
	synchronize_net();
	dev_put(dev);
}

static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
					      struct xdp_sock __rcu ***map_entry)
{
	struct xsk_map *map = NULL;
	struct xsk_map_node *node;

	*map_entry = NULL;

	spin_lock_bh(&xs->map_list_lock);
	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
					node);
	if (node) {
		bpf_map_inc(&node->map->map);
		map = node->map;
		*map_entry = node->map_entry;
	}
	spin_unlock_bh(&xs->map_list_lock);
	return map;
}

static void xsk_delete_from_maps(struct xdp_sock *xs)
{
	/* This function removes the current XDP socket from all the
	 * maps it resides in. We need to take extra care here, due to
	 * the two locks involved. Each map has a lock synchronizing
	 * updates to the entries, and each socket has a lock that
	 * synchronizes access to the list of maps (map_list). For
	 * deadlock avoidance the locks need to be taken in the order
	 * "map lock"->"socket map list lock". We start off by
	 * accessing the socket map list, and take a reference to the
	 * map to guarantee existence between the
	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
	 * calls. Then we ask the map to remove the socket, which
	 * tries to remove the socket from the map. Note that there
	 * might be updates to the map between
	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
	 */
	struct xdp_sock __rcu **map_entry = NULL;
	struct xsk_map *map;

	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
		xsk_map_try_sock_delete(map, xs, map_entry);
		bpf_map_put(&map->map);
	}
}

static int xsk_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net *net;

	if (!sk)
		return 0;

	net = sock_net(sk);

	mutex_lock(&net->xdp.lock);
	sk_del_node_init_rcu(sk);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	local_bh_enable();

	xsk_delete_from_maps(xs);
	mutex_lock(&xs->mutex);
	xsk_unbind_dev(xs);
	mutex_unlock(&xs->mutex);

	xskq_destroy(xs->rx);
	xskq_destroy(xs->tx);
	xskq_destroy(xs->fq_tmp);
	xskq_destroy(xs->cq_tmp);

	sock_orphan(sk);
	sock->sk = NULL;

	sk_refcnt_debug_release(sk);
	sock_put(sk);

	return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
	struct socket *sock;
	int err;

	sock = sockfd_lookup(fd, &err);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	if (sock->sk->sk_family != PF_XDP) {
		sockfd_put(sock);
		return ERR_PTR(-ENOPROTOOPT);
	}

	return sock;
}

static bool xsk_validate_queues(struct xdp_sock *xs)
{
	return xs->fq_tmp && xs->cq_tmp;
}

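/* Bind the socket to a device and queue id. Depending on XDP_SHARED_UMEM this
 * either creates a buffer pool for the socket's own umem, creates a new pool
 * on top of another socket's umem (different device/queue), or shares the
 * other socket's pool outright (same device and queue).
 */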
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev;
	u32 flags, qid;
	int err = 0;

	if (addr_len < sizeof(struct sockaddr_xdp))
		return -EINVAL;
	if (sxdp->sxdp_family != AF_XDP)
		return -EINVAL;

	flags = sxdp->sxdp_flags;
	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
		      XDP_USE_NEED_WAKEUP))
		return -EINVAL;

	rtnl_lock();
	mutex_lock(&xs->mutex);
	if (xs->state != XSK_READY) {
		err = -EBUSY;
		goto out_release;
	}

	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
	if (!dev) {
		err = -ENODEV;
		goto out_release;
	}

	if (!xs->rx && !xs->tx) {
		err = -EINVAL;
		goto out_unlock;
	}

	qid = sxdp->sxdp_queue_id;

	if (flags & XDP_SHARED_UMEM) {
		struct xdp_sock *umem_xs;
		struct socket *sock;

		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
		    (flags & XDP_USE_NEED_WAKEUP)) {
			/* Cannot specify flags for shared sockets. */
			err = -EINVAL;
			goto out_unlock;
		}

		if (xs->umem) {
			/* We already have our own. */
			err = -EINVAL;
			goto out_unlock;
		}

		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
		if (IS_ERR(sock)) {
			err = PTR_ERR(sock);
			goto out_unlock;
		}

		umem_xs = xdp_sk(sock->sk);
		if (!xsk_is_bound(umem_xs)) {
			err = -EBADF;
			sockfd_put(sock);
			goto out_unlock;
		}

		if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
			/* Share the umem with another socket on another qid
			 * and/or device.
			 */
			xs->pool = xp_create_and_assign_umem(xs,
							     umem_xs->umem);
			if (!xs->pool) {
				err = -ENOMEM;
				sockfd_put(sock);
				goto out_unlock;
			}

			err = xp_assign_dev_shared(xs->pool, umem_xs->umem,
						   dev, qid);
			if (err) {
				xp_destroy(xs->pool);
				xs->pool = NULL;
				sockfd_put(sock);
				goto out_unlock;
			}
		} else {
			/* Share the buffer pool with the other socket. */
			if (xs->fq_tmp || xs->cq_tmp) {
				/* Do not allow setting your own fq or cq. */
				err = -EINVAL;
				sockfd_put(sock);
				goto out_unlock;
			}

			xp_get_pool(umem_xs->pool);
			xs->pool = umem_xs->pool;
		}

		xdp_get_umem(umem_xs->umem);
		WRITE_ONCE(xs->umem, umem_xs->umem);
		sockfd_put(sock);
	} else if (!xs->umem || !xsk_validate_queues(xs)) {
		err = -EINVAL;
		goto out_unlock;
	} else {
		/* This xsk has its own umem. */
		xs->pool = xp_create_and_assign_umem(xs, xs->umem);
		if (!xs->pool) {
			err = -ENOMEM;
			goto out_unlock;
		}

		err = xp_assign_dev(xs->pool, dev, qid, flags);
		if (err) {
			xp_destroy(xs->pool);
			xs->pool = NULL;
			goto out_unlock;
		}
	}

	/* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
	xs->fq_tmp = NULL;
	xs->cq_tmp = NULL;

	xs->dev = dev;
	xs->zc = xs->umem->zc;
	xs->queue_id = qid;
	xp_add_xsk(xs->pool, xs);

out_unlock:
	if (err) {
		dev_put(dev);
	} else {
		/* Matches smp_rmb() in bind() for shared umem
		 * sockets, and xsk_is_bound().
		 */
		smp_wmb();
		WRITE_ONCE(xs->state, XSK_BOUND);
	}
out_release:
	mutex_unlock(&xs->mutex);
	rtnl_unlock();
	return err;
}

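/* Layout of the XDP_UMEM_REG argument before the flags member was added.
 * Kept so that setsockopt() keeps accepting the shorter structure from older
 * applications.
 */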
struct xdp_umem_reg_v1 {
	__u64 addr; /* Start of packet data area */
	__u64 len; /* Length of packet data area */
	__u32 chunk_size;
	__u32 headroom;
};

static int xsk_setsockopt(struct socket *sock, int level, int optname,
			  sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int err;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	switch (optname) {
	case XDP_RX_RING:
	case XDP_TX_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}
		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
		err = xsk_init_queue(entries, q, false);
		if (!err && optname == XDP_TX_RING)
			/* Tx needs to be explicitly woken up the first time */
			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
		mutex_unlock(&xs->mutex);
		return err;
	}
	case XDP_UMEM_REG:
	{
		size_t mr_size = sizeof(struct xdp_umem_reg);
		struct xdp_umem_reg mr = {};
		struct xdp_umem *umem;

		if (optlen < sizeof(struct xdp_umem_reg_v1))
			return -EINVAL;
		else if (optlen < sizeof(mr))
			mr_size = sizeof(struct xdp_umem_reg_v1);

		if (copy_from_sockptr(&mr, optval, mr_size))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY || xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		umem = xdp_umem_create(&mr);
		if (IS_ERR(umem)) {
			mutex_unlock(&xs->mutex);
			return PTR_ERR(umem);
		}

		/* Make sure umem is ready before it can be seen by others */
		smp_wmb();
		WRITE_ONCE(xs->umem, umem);
		mutex_unlock(&xs->mutex);
		return 0;
	}
	case XDP_UMEM_FILL_RING:
	case XDP_UMEM_COMPLETION_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
			&xs->cq_tmp;
		err = xsk_init_queue(entries, q, true);
		mutex_unlock(&xs->mutex);
		return err;
	}
	default:
		break;
	}

	return -ENOPROTOOPT;
}

static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
{
	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
}

static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
{
	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
	ring->desc = offsetof(struct xdp_umem_ring, desc);
}

struct xdp_statistics_v1 {
	__u64 rx_dropped;
	__u64 rx_invalid_descs;
	__u64 tx_invalid_descs;
};

static int xsk_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int len;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case XDP_STATISTICS:
	{
		struct xdp_statistics stats = {};
		bool extra_stats = true;
		size_t stats_size;

		if (len < sizeof(struct xdp_statistics_v1)) {
			return -EINVAL;
		} else if (len < sizeof(stats)) {
			extra_stats = false;
			stats_size = sizeof(struct xdp_statistics_v1);
		} else {
			stats_size = sizeof(stats);
		}

		mutex_lock(&xs->mutex);
		stats.rx_dropped = xs->rx_dropped;
		if (extra_stats) {
			stats.rx_ring_full = xs->rx_queue_full;
			stats.rx_fill_ring_empty_descs =
				xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
			stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
		} else {
			stats.rx_dropped += xs->rx_queue_full;
		}
		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
		mutex_unlock(&xs->mutex);

		if (copy_to_user(optval, &stats, stats_size))
			return -EFAULT;
		if (put_user(stats_size, optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_MMAP_OFFSETS:
	{
		struct xdp_mmap_offsets off;
		struct xdp_mmap_offsets_v1 off_v1;
		bool flags_supported = true;
		void *to_copy;

		if (len < sizeof(off_v1))
			return -EINVAL;
		else if (len < sizeof(off))
			flags_supported = false;

		if (flags_supported) {
			/* xdp_ring_offset is identical to xdp_ring_offset_v1
			 * except for the flags field added to the end.
			 */
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.rx);
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.tx);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.fr);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.cr);
			off.rx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.tx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.fr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);
			off.cr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);

			len = sizeof(off);
			to_copy = &off;
		} else {
			xsk_enter_rxtx_offsets(&off_v1.rx);
			xsk_enter_rxtx_offsets(&off_v1.tx);
			xsk_enter_umem_offsets(&off_v1.fr);
			xsk_enter_umem_offsets(&off_v1.cr);

			len = sizeof(off_v1);
			to_copy = &off_v1;
		}

		if (copy_to_user(optval, to_copy, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_OPTIONS:
	{
		struct xdp_options opts = {};

		if (len < sizeof(opts))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		if (xs->zc)
			opts.flags |= XDP_OPTIONS_ZEROCOPY;
		mutex_unlock(&xs->mutex);

		len = sizeof(opts);
		if (copy_to_user(optval, &opts, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	default:
		break;
	}

	return -EOPNOTSUPP;
}

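/* Map one of the four rings into userspace. The pgoff cookie selects which
 * ring (Rx, Tx, fill or completion), and the mapping must not exceed the
 * pages backing that ring.
 */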
static int xsk_mmap(struct file *file, struct socket *sock,
		    struct vm_area_struct *vma)
{
	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct xdp_sock *xs = xdp_sk(sock->sk);
	struct xsk_queue *q = NULL;
	unsigned long pfn;
	struct page *qpg;

	if (READ_ONCE(xs->state) != XSK_READY)
		return -EBUSY;

	if (offset == XDP_PGOFF_RX_RING) {
		q = READ_ONCE(xs->rx);
	} else if (offset == XDP_PGOFF_TX_RING) {
		q = READ_ONCE(xs->tx);
	} else {
		/* Matches the smp_wmb() in XDP_UMEM_REG */
		smp_rmb();
		if (offset == XDP_UMEM_PGOFF_FILL_RING)
			q = READ_ONCE(xs->fq_tmp);
		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
			q = READ_ONCE(xs->cq_tmp);
	}

	if (!q)
		return -EINVAL;

	/* Matches the smp_wmb() in xsk_init_queue */
	smp_rmb();
	qpg = virt_to_head_page(q->ring);
	if (size > page_size(qpg))
		return -EINVAL;

	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn,
			       size, vma->vm_page_prot);
}

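/* Netdevice notifier: when a device is unregistered, unbind every socket
 * bound to it and report ENETDOWN to their owners.
 */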
static int xsk_notifier(struct notifier_block *this,
			unsigned long msg, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);
	struct sock *sk;

	switch (msg) {
	case NETDEV_UNREGISTER:
		mutex_lock(&net->xdp.lock);
		sk_for_each(sk, &net->xdp.list) {
			struct xdp_sock *xs = xdp_sk(sk);

			mutex_lock(&xs->mutex);
			if (xs->dev == dev) {
				sk->sk_err = ENETDOWN;
				if (!sock_flag(sk, SOCK_DEAD))
					sk_error_report(sk);

				xsk_unbind_dev(xs);

				/* Clear device references. */
				xp_clear_dev(xs->pool);
			}
			mutex_unlock(&xs->mutex);
		}
		mutex_unlock(&net->xdp.lock);
		break;
	}
	return NOTIFY_DONE;
}

static struct proto xsk_proto = {
	.name =		"XDP",
	.owner =	THIS_MODULE,
	.obj_size =	sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
	.family		= PF_XDP,
	.owner		= THIS_MODULE,
	.release	= xsk_release,
	.bind		= xsk_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= xsk_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= xsk_setsockopt,
	.getsockopt	= xsk_getsockopt,
	.sendmsg	= xsk_sendmsg,
	.recvmsg	= xsk_recvmsg,
	.mmap		= xsk_mmap,
	.sendpage	= sock_no_sendpage,
};

static void xsk_destruct(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (!sock_flag(sk, SOCK_DEAD))
		return;

	if (!xp_put_pool(xs->pool))
		xdp_put_umem(xs->umem, !xs->pool);

	sk_refcnt_debug_dec(sk);
}

static int xsk_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct xdp_sock *xs;
	struct sock *sk;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_RAW)
		return -ESOCKTNOSUPPORT;

	if (protocol)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock->ops = &xsk_proto_ops;

	sock_init_data(sock, sk);

	sk->sk_family = PF_XDP;

	sk->sk_destruct = xsk_destruct;
	sk_refcnt_debug_inc(sk);

	sock_set_flag(sk, SOCK_RCU_FREE);

	xs = xdp_sk(sk);
	xs->state = XSK_READY;
	mutex_init(&xs->mutex);
	spin_lock_init(&xs->rx_lock);

	INIT_LIST_HEAD(&xs->map_list);
	spin_lock_init(&xs->map_list_lock);

	mutex_lock(&net->xdp.lock);
	sk_add_node_rcu(sk, &net->xdp.list);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, &xsk_proto, 1);
	local_bh_enable();

	return 0;
}

static const struct net_proto_family xsk_family_ops = {
	.family = PF_XDP,
	.create = xsk_create,
	.owner	= THIS_MODULE,
};

static struct notifier_block xsk_netdev_notifier = {
	.notifier_call	= xsk_notifier,
};

static int __net_init xsk_net_init(struct net *net)
{
	mutex_init(&net->xdp.lock);
	INIT_HLIST_HEAD(&net->xdp.list);
	return 0;
}

static void __net_exit xsk_net_exit(struct net *net)
{
	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
}

static struct pernet_operations xsk_net_ops = {
	.init = xsk_net_init,
	.exit = xsk_net_exit,
};

static int __init xsk_init(void)
{
	int err, cpu;

	err = proto_register(&xsk_proto, 0 /* no slab */);
	if (err)
		goto out;

	err = sock_register(&xsk_family_ops);
	if (err)
		goto out_proto;

	err = register_pernet_subsys(&xsk_net_ops);
	if (err)
		goto out_sk;

	err = register_netdevice_notifier(&xsk_netdev_notifier);
	if (err)
		goto out_pernet;

	for_each_possible_cpu(cpu)
		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
	return 0;

out_pernet:
	unregister_pernet_subsys(&xsk_net_ops);
out_sk:
	sock_unregister(PF_XDP);
out_proto:
	proto_unregister(&xsk_proto);
out:
	return err;
}

fs_initcall(xsk_init);