// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allows a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *	      Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock_drv.h>
#include <net/busy_poll.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"
#include "xsk.h"

#define TX_BATCH_SIZE 32

static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);

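/* Driver-facing helpers for the XDP_USE_NEED_WAKEUP protocol: set/clear the
 * NEED_WAKEUP flag on the fill and Tx rings so user space knows whether it
 * must kick the kernel (e.g. via poll() or sendto()). The state is cached in
 * the pool so the ring flag word is only rewritten when it actually changes.
 */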
void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
		return;

	pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
	pool->cached_need_wakeup |= XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_set_rx_need_wakeup);

void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
{
	struct xdp_sock *xs;

	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	pool->cached_need_wakeup |= XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_set_tx_need_wakeup);

void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
{
	if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
		return;

	pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
	pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);

void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
{
	struct xdp_sock *xs;

	if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);

bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
{
	return pool->uses_need_wakeup;
}
EXPORT_SYMBOL(xsk_uses_need_wakeup);

struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
					    u16 queue_id)
{
	if (queue_id < dev->real_num_rx_queues)
		return dev->_rx[queue_id].pool;
	if (queue_id < dev->real_num_tx_queues)
		return dev->_tx[queue_id].pool;

	return NULL;
}
EXPORT_SYMBOL(xsk_get_pool_from_qid);

void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
{
	if (queue_id < dev->num_rx_queues)
		dev->_rx[queue_id].pool = NULL;
	if (queue_id < dev->num_tx_queues)
		dev->_tx[queue_id].pool = NULL;
}

/* The buffer pool is stored both in the _rx struct and the _tx struct as we do
 * not know if the device has more tx queues than rx, or the opposite.
 * This might also change during run time.
 */
int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
			u16 queue_id)
{
	if (queue_id >= max_t(unsigned int,
			      dev->real_num_rx_queues,
			      dev->real_num_tx_queues))
		return -EINVAL;

	if (queue_id < dev->real_num_rx_queues)
		dev->_rx[queue_id].pool = pool;
	if (queue_id < dev->real_num_tx_queues)
		dev->_tx[queue_id].pool = pool;

	return 0;
}

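/* Zero-copy Rx: publish the umem address and length of a pool-owned buffer
 * directly on the socket's Rx ring; rx_queue_full is bumped when the ring
 * has no free slot.
 */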
static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
	u64 addr;
	int err;

	addr = xp_get_handle(xskb);
	err = xskq_prod_reserve_desc(xs->rx, addr, len);
	if (err) {
		xs->rx_queue_full++;
		return err;
	}

	xp_release(xskb);
	return 0;
}

static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
{
	void *from_buf, *to_buf;
	u32 metalen;

	if (unlikely(xdp_data_meta_unsupported(from))) {
		from_buf = from->data;
		to_buf = to->data;
		metalen = 0;
	} else {
		from_buf = from->data_meta;
		metalen = from->data - from->data_meta;
		to_buf = to->data - metalen;
	}

	memcpy(to_buf, from_buf, len + metalen);
}

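/* Copy-mode Rx: the frame does not live in the socket's umem, so allocate a
 * buffer from the pool, copy the packet (plus any metadata) into it and hand
 * that copy to the Rx ring.
 */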
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	struct xdp_buff *xsk_xdp;
	int err;
	u32 len;

	len = xdp->data_end - xdp->data;
	if (len > xsk_pool_get_rx_frame_size(xs->pool)) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	xsk_xdp = xsk_buff_alloc(xs->pool);
	if (!xsk_xdp) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	xsk_copy_xdp(xsk_xdp, xdp, len);
	err = __xsk_rcv_zc(xs, xsk_xdp, len);
	if (err) {
		xsk_buff_free(xsk_xdp);
		return err;
	}
	return 0;
}

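/* The Tx ring is only reported as writeable while it is less than half full,
 * which gives user space some headroom before poll() signals EPOLLOUT again.
 */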
static bool xsk_tx_writeable(struct xdp_sock *xs)
{
	if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
		return false;

	return true;
}

static bool xsk_is_bound(struct xdp_sock *xs)
{
	if (READ_ONCE(xs->state) == XSK_BOUND) {
		/* Matches smp_wmb() in bind(). */
		smp_rmb();
		return true;
	}
	return false;
}

static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	if (!xsk_is_bound(xs))
		return -EINVAL;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	sk_mark_napi_id_once_xdp(&xs->sk, xdp);
	return 0;
}

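/* Make freshly reserved Rx entries visible to user space, move the fill ring
 * consumer pointer forward and wake up any process sleeping on the socket.
 */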
static void xsk_flush(struct xdp_sock *xs)
{
	xskq_prod_submit(xs->rx);
	__xskq_cons_release(xs->pool->fq);
	sock_def_readable(&xs->sk);
}

int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	int err;

	spin_lock_bh(&xs->rx_lock);
	err = xsk_rcv_check(xs, xdp);
	if (!err) {
		err = __xsk_rcv(xs, xdp);
		xsk_flush(xs);
	}
	spin_unlock_bh(&xs->rx_lock);
	return err;
}

static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	int err;
	u32 len;

	err = xsk_rcv_check(xs, xdp);
	if (err)
		return err;

	if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
		len = xdp->data_end - xdp->data;
		return __xsk_rcv_zc(xs, xdp, len);
	}

	err = __xsk_rcv(xs, xdp);
	if (!err)
		xdp_return_buff(xdp);
	return err;
}

int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
	int err;

	err = xsk_rcv(xs, xdp);
	if (err)
		return err;

	if (!xs->flush_node.prev)
		list_add(&xs->flush_node, flush_list);

	return 0;
}

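/* Drain this CPU's flush list: every socket that received packets through the
 * XDP redirect path since the last flush gets its Rx ring published and its
 * readers woken in one go.
 */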
void __xsk_map_flush(void)
{
	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
	struct xdp_sock *xs, *tmp;

	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
		xsk_flush(xs);
		__list_del_clearprev(&xs->flush_node);
	}
}

void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
{
	xskq_prod_submit_n(pool->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_tx_completed);

void xsk_tx_release(struct xsk_buff_pool *pool)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		__xskq_cons_release(xs->tx);
		if (xsk_tx_writeable(xs))
			xs->sk.sk_write_space(&xs->sk);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_tx_release);

bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
			xs->tx->queue_empty_descs++;
			continue;
		}

		/* This is the backpressure mechanism for the Tx path.
		 * Reserve space in the completion queue and only proceed
		 * if there is space in it. This avoids having to implement
		 * any buffering in the Tx path.
		 */
		if (xskq_prod_reserve_addr(pool->cq, desc->addr))
			goto out;

		xskq_cons_release(xs->tx);
		rcu_read_unlock();
		return true;
	}

out:
	rcu_read_unlock();
	return false;
}
EXPORT_SYMBOL(xsk_tx_peek_desc);

static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, struct xdp_desc *descs,
					u32 max_entries)
{
	u32 nb_pkts = 0;

	while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
		nb_pkts++;

	xsk_tx_release(pool);
	return nb_pkts;
}

u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *descs,
				   u32 max_entries)
{
	struct xdp_sock *xs;
	u32 nb_pkts;

	rcu_read_lock();
	if (!list_is_singular(&pool->xsk_tx_list)) {
		/* Fallback to the non-batched version */
		rcu_read_unlock();
		return xsk_tx_peek_release_fallback(pool, descs, max_entries);
	}

	xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
	if (!xs) {
		nb_pkts = 0;
		goto out;
	}

	nb_pkts = xskq_cons_peek_desc_batch(xs->tx, descs, pool, max_entries);
	if (!nb_pkts) {
		xs->tx->queue_empty_descs++;
		goto out;
	}

	/* This is the backpressure mechanism for the Tx path. Try to
	 * reserve space in the completion queue for all packets, but
	 * if there are fewer slots available, just process that many
	 * packets. This avoids having to implement any buffering in
	 * the Tx path.
	 */
	nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, descs, nb_pkts);
	if (!nb_pkts)
		goto out;

	xskq_cons_release_n(xs->tx, nb_pkts);
	__xskq_cons_release(xs->tx);
	xs->sk.sk_write_space(&xs->sk);

out:
	rcu_read_unlock();
	return nb_pkts;
}
EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);

static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
{
	struct net_device *dev = xs->dev;
	int err;

	rcu_read_lock();
	err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
	rcu_read_unlock();

	return err;
}

static int xsk_zc_xmit(struct xdp_sock *xs)
{
	return xsk_wakeup(xs, XDP_WAKEUP_TX);
}

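/* skb destructor for copy mode: once the skb is freed by the stack, post the
 * descriptor address on the completion ring so user space can reuse the umem
 * frame, then release the socket's write memory.
 */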
static void xsk_destruct_skb(struct sk_buff *skb)
{
	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
	struct xdp_sock *xs = xdp_sk(skb->sk);
	unsigned long flags;

	spin_lock_irqsave(&xs->pool->cq_lock, flags);
	xskq_prod_submit_addr(xs->pool->cq, addr);
	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);

	sock_wfree(skb);
}

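/* Build an skb whose page frags point straight at the umem pages, so copy-mode
 * Tx can avoid the data copy on devices that advertise IFF_TX_SKB_NO_LINEAR.
 */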
static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
					      struct xdp_desc *desc)
{
	struct xsk_buff_pool *pool = xs->pool;
	u32 hr, len, ts, offset, copy, copied;
	struct sk_buff *skb;
	struct page *page;
	void *buffer;
	int err, i;
	u64 addr;

	hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));

	skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
	if (unlikely(!skb))
		return ERR_PTR(err);

	skb_reserve(skb, hr);

	addr = desc->addr;
	len = desc->len;
	ts = pool->unaligned ? len : pool->chunk_size;

	buffer = xsk_buff_raw_get_data(pool, addr);
	offset = offset_in_page(buffer);
	addr = buffer - pool->addrs;

	for (copied = 0, i = 0; copied < len; i++) {
		page = pool->umem->pgs[addr >> PAGE_SHIFT];
		get_page(page);

		copy = min_t(u32, PAGE_SIZE - offset, len - copied);
		skb_fill_page_desc(skb, i, page, offset, copy);

		copied += copy;
		addr += copy;
		offset = 0;
	}

	skb->len += len;
	skb->data_len += len;
	skb->truesize += ts;

	refcount_add(ts, &xs->sk.sk_wmem_alloc);

	return skb;
}

static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
				     struct xdp_desc *desc)
{
	struct net_device *dev = xs->dev;
	struct sk_buff *skb;

	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
		skb = xsk_build_skb_zerocopy(xs, desc);
		if (IS_ERR(skb))
			return skb;
	} else {
		u32 hr, tr, len;
		void *buffer;
		int err;

		hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
		tr = dev->needed_tailroom;
		len = desc->len;

		skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
		if (unlikely(!skb))
			return ERR_PTR(err);

		skb_reserve(skb, hr);
		skb_put(skb, len);

		buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
		err = skb_store_bits(skb, 0, buffer, len);
		if (unlikely(err)) {
			kfree_skb(skb);
			return ERR_PTR(err);
		}
	}

	skb->dev = dev;
	skb->priority = xs->sk.sk_priority;
	skb->mark = xs->sk.sk_mark;
	skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
	skb->destructor = xsk_destruct_skb;

	return skb;
}

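/* Copy-mode Tx, driven from sendmsg() and poll(): pull up to TX_BATCH_SIZE
 * descriptors off the Tx ring, turn each one into an skb, reserve a
 * completion ring slot as backpressure and hand the skb straight to the
 * device queue with __dev_direct_xmit().
 */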
static int xsk_generic_xmit(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);
	u32 max_batch = TX_BATCH_SIZE;
	bool sent_frame = false;
	struct xdp_desc desc;
	struct sk_buff *skb;
	unsigned long flags;
	int err = 0;

	mutex_lock(&xs->mutex);

	if (xs->queue_id >= xs->dev->real_num_tx_queues)
		goto out;

	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
		if (max_batch-- == 0) {
			err = -EAGAIN;
			goto out;
		}

		skb = xsk_build_skb(xs, &desc);
		if (IS_ERR(skb)) {
			err = PTR_ERR(skb);
			goto out;
		}

		/* This is the backpressure mechanism for the Tx path.
		 * Reserve space in the completion queue and only proceed
		 * if there is space in it. This avoids having to implement
		 * any buffering in the Tx path.
		 */
		spin_lock_irqsave(&xs->pool->cq_lock, flags);
		if (xskq_prod_reserve(xs->pool->cq)) {
			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
			kfree_skb(skb);
			goto out;
		}
		spin_unlock_irqrestore(&xs->pool->cq_lock, flags);

		err = __dev_direct_xmit(skb, xs->queue_id);
		if (err == NETDEV_TX_BUSY) {
			/* Tell user-space to retry the send */
			skb->destructor = sock_wfree;
			spin_lock_irqsave(&xs->pool->cq_lock, flags);
			xskq_prod_cancel(xs->pool->cq);
			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
			/* Free skb without triggering the perf drop trace */
			consume_skb(skb);
			err = -EAGAIN;
			goto out;
		}

		xskq_cons_release(xs->tx);
		/* Ignore NET_XMIT_CN as packet might have been sent */
		if (err == NET_XMIT_DROP) {
			/* SKB completed but not sent */
			err = -EBUSY;
			goto out;
		}

		sent_frame = true;
	}

	xs->tx->queue_empty_descs++;

out:
	if (sent_frame)
		if (xsk_tx_writeable(xs))
			sk->sk_write_space(sk);

	mutex_unlock(&xs->mutex);
	return err;
}

static int __xsk_sendmsg(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;
	if (unlikely(!xs->tx))
		return -ENOBUFS;

	return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
}

static bool xsk_no_wakeup(struct sock *sk)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
	/* Prefer busy-polling, skip the wakeup. */
	return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
		READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID;
#else
	return false;
#endif
}

static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct xsk_buff_pool *pool;

	if (unlikely(!xsk_is_bound(xs)))
		return -ENXIO;
	if (unlikely(need_wait))
		return -EOPNOTSUPP;

	if (sk_can_busy_loop(sk))
		sk_busy_loop(sk, 1); /* only support non-blocking sockets */

	if (xsk_no_wakeup(sk))
		return 0;

	pool = xs->pool;
	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
		return __xsk_sendmsg(sk);
	return 0;
}

static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
{
	bool need_wait = !(flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!xsk_is_bound(xs)))
		return -ENXIO;
	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;
	if (unlikely(!xs->rx))
		return -ENOBUFS;
	if (unlikely(need_wait))
		return -EOPNOTSUPP;

	if (sk_can_busy_loop(sk))
		sk_busy_loop(sk, 1); /* only support non-blocking sockets */

	if (xsk_no_wakeup(sk))
		return 0;

	if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
		return xsk_wakeup(xs, XDP_WAKEUP_RX);
	return 0;
}

static __poll_t xsk_poll(struct file *file, struct socket *sock,
			 struct poll_table_struct *wait)
{
	__poll_t mask = 0;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct xsk_buff_pool *pool;

	sock_poll_wait(file, sock, wait);

	if (unlikely(!xsk_is_bound(xs)))
		return mask;

	pool = xs->pool;

	if (pool->cached_need_wakeup) {
		if (xs->zc)
			xsk_wakeup(xs, pool->cached_need_wakeup);
		else
			/* Poll needs to drive Tx also in copy mode */
			__xsk_sendmsg(sk);
	}

	if (xs->rx && !xskq_prod_is_empty(xs->rx))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (xs->tx && xsk_tx_writeable(xs))
		mask |= EPOLLOUT | EPOLLWRNORM;

	return mask;
}

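/* Allocate one of the descriptor rings. The size must be a power of two, and
 * the write barrier makes sure the ring is fully initialized before the
 * pointer is published to mmap() and the datapath.
 */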
static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
			  bool umem_queue)
{
	struct xsk_queue *q;

	if (entries == 0 || *queue || !is_power_of_2(entries))
		return -EINVAL;

	q = xskq_create(entries, umem_queue);
	if (!q)
		return -ENOMEM;

	/* Make sure queue is ready before it can be seen by others */
	smp_wmb();
	WRITE_ONCE(*queue, q);
	return 0;
}

static void xsk_unbind_dev(struct xdp_sock *xs)
{
	struct net_device *dev = xs->dev;

	if (xs->state != XSK_BOUND)
		return;
	WRITE_ONCE(xs->state, XSK_UNBOUND);

	/* Wait for driver to stop using the xdp socket. */
	xp_del_xsk(xs->pool, xs);
	xs->dev = NULL;
	synchronize_net();
	dev_put(dev);
}

static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
					      struct xdp_sock __rcu ***map_entry)
{
	struct xsk_map *map = NULL;
	struct xsk_map_node *node;

	*map_entry = NULL;

	spin_lock_bh(&xs->map_list_lock);
	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
					node);
	if (node) {
		bpf_map_inc(&node->map->map);
		map = node->map;
		*map_entry = node->map_entry;
	}
	spin_unlock_bh(&xs->map_list_lock);
	return map;
}

static void xsk_delete_from_maps(struct xdp_sock *xs)
{
	/* This function removes the current XDP socket from all the
	 * maps it resides in. We need to take extra care here, due to
	 * the two locks involved. Each map has a lock synchronizing
	 * updates to the entries, and each socket has a lock that
	 * synchronizes access to the list of maps (map_list). For
	 * deadlock avoidance the locks need to be taken in the order
	 * "map lock"->"socket map list lock". We start off by
	 * accessing the socket map list, and take a reference to the
	 * map to guarantee existence between the
	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
	 * calls. Then we ask the map to remove the socket, which
	 * tries to remove the socket from the map. Note that there
	 * might be updates to the map between
	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
	 */
	struct xdp_sock __rcu **map_entry = NULL;
	struct xsk_map *map;

	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
		xsk_map_try_sock_delete(map, xs, map_entry);
		bpf_map_put(&map->map);
	}
}

static int xsk_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net *net;

	if (!sk)
		return 0;

	net = sock_net(sk);

	mutex_lock(&net->xdp.lock);
	sk_del_node_init_rcu(sk);
	mutex_unlock(&net->xdp.lock);

	sock_prot_inuse_add(net, sk->sk_prot, -1);

	xsk_delete_from_maps(xs);
	mutex_lock(&xs->mutex);
	xsk_unbind_dev(xs);
	mutex_unlock(&xs->mutex);

	xskq_destroy(xs->rx);
	xskq_destroy(xs->tx);
	xskq_destroy(xs->fq_tmp);
	xskq_destroy(xs->cq_tmp);

	sock_orphan(sk);
	sock->sk = NULL;

	sk_refcnt_debug_release(sk);
	sock_put(sk);

	return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
	struct socket *sock;
	int err;

	sock = sockfd_lookup(fd, &err);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	if (sock->sk->sk_family != PF_XDP) {
		sockfd_put(sock);
		return ERR_PTR(-ENOPROTOOPT);
	}

	return sock;
}

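/* A socket that brings its own umem must have registered both a fill ring and
 * a completion ring before bind() can succeed.
 */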
static bool xsk_validate_queues(struct xdp_sock *xs)
{
	return xs->fq_tmp && xs->cq_tmp;
}

static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev;
	u32 flags, qid;
	int err = 0;

	if (addr_len < sizeof(struct sockaddr_xdp))
		return -EINVAL;
	if (sxdp->sxdp_family != AF_XDP)
		return -EINVAL;

	flags = sxdp->sxdp_flags;
	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
		      XDP_USE_NEED_WAKEUP))
		return -EINVAL;

	rtnl_lock();
	mutex_lock(&xs->mutex);
	if (xs->state != XSK_READY) {
		err = -EBUSY;
		goto out_release;
	}

	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
	if (!dev) {
		err = -ENODEV;
		goto out_release;
	}

	if (!xs->rx && !xs->tx) {
		err = -EINVAL;
		goto out_unlock;
	}

	qid = sxdp->sxdp_queue_id;

	if (flags & XDP_SHARED_UMEM) {
		struct xdp_sock *umem_xs;
		struct socket *sock;

		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
		    (flags & XDP_USE_NEED_WAKEUP)) {
			/* Cannot specify flags for shared sockets. */
			err = -EINVAL;
			goto out_unlock;
		}

		if (xs->umem) {
			/* We have already our own. */
			err = -EINVAL;
			goto out_unlock;
		}

		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
		if (IS_ERR(sock)) {
			err = PTR_ERR(sock);
			goto out_unlock;
		}

		umem_xs = xdp_sk(sock->sk);
		if (!xsk_is_bound(umem_xs)) {
			err = -EBADF;
			sockfd_put(sock);
			goto out_unlock;
		}

		if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
			/* Share the umem with another socket on another qid
			 * and/or device.
			 */
			xs->pool = xp_create_and_assign_umem(xs,
							     umem_xs->umem);
			if (!xs->pool) {
				err = -ENOMEM;
				sockfd_put(sock);
				goto out_unlock;
			}

			err = xp_assign_dev_shared(xs->pool, umem_xs->umem,
						   dev, qid);
			if (err) {
				xp_destroy(xs->pool);
				xs->pool = NULL;
				sockfd_put(sock);
				goto out_unlock;
			}
		} else {
			/* Share the buffer pool with the other socket. */
			if (xs->fq_tmp || xs->cq_tmp) {
				/* Do not allow setting your own fq or cq. */
				err = -EINVAL;
				sockfd_put(sock);
				goto out_unlock;
			}

			xp_get_pool(umem_xs->pool);
			xs->pool = umem_xs->pool;
		}

		xdp_get_umem(umem_xs->umem);
		WRITE_ONCE(xs->umem, umem_xs->umem);
		sockfd_put(sock);
	} else if (!xs->umem || !xsk_validate_queues(xs)) {
		err = -EINVAL;
		goto out_unlock;
	} else {
		/* This xsk has its own umem. */
		xs->pool = xp_create_and_assign_umem(xs, xs->umem);
		if (!xs->pool) {
			err = -ENOMEM;
			goto out_unlock;
		}

		err = xp_assign_dev(xs->pool, dev, qid, flags);
		if (err) {
			xp_destroy(xs->pool);
			xs->pool = NULL;
			goto out_unlock;
		}
	}

	/* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
	xs->fq_tmp = NULL;
	xs->cq_tmp = NULL;

	xs->dev = dev;
	xs->zc = xs->umem->zc;
	xs->queue_id = qid;
	xp_add_xsk(xs->pool, xs);

out_unlock:
	if (err) {
		dev_put(dev);
	} else {
		/* Matches smp_rmb() in bind() for shared umem
		 * sockets, and xsk_is_bound().
		 */
		smp_wmb();
		WRITE_ONCE(xs->state, XSK_BOUND);
	}
out_release:
	mutex_unlock(&xs->mutex);
	rtnl_unlock();
	return err;
}

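/* Original, shorter layout of the XDP_UMEM_REG argument; xsk_setsockopt()
 * accepts both this and the current struct xdp_umem_reg, selected by optlen.
 */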
struct xdp_umem_reg_v1 {
	__u64 addr; /* Start of packet data area */
	__u64 len; /* Length of packet data area */
	__u32 chunk_size;
	__u32 headroom;
};

static int xsk_setsockopt(struct socket *sock, int level, int optname,
			  sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int err;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	switch (optname) {
	case XDP_RX_RING:
	case XDP_TX_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}
		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
		err = xsk_init_queue(entries, q, false);
		if (!err && optname == XDP_TX_RING)
			/* Tx needs to be explicitly woken up the first time */
			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
		mutex_unlock(&xs->mutex);
		return err;
	}
	case XDP_UMEM_REG:
	{
		size_t mr_size = sizeof(struct xdp_umem_reg);
		struct xdp_umem_reg mr = {};
		struct xdp_umem *umem;

		if (optlen < sizeof(struct xdp_umem_reg_v1))
			return -EINVAL;
		else if (optlen < sizeof(mr))
			mr_size = sizeof(struct xdp_umem_reg_v1);

		if (copy_from_sockptr(&mr, optval, mr_size))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY || xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		umem = xdp_umem_create(&mr);
		if (IS_ERR(umem)) {
			mutex_unlock(&xs->mutex);
			return PTR_ERR(umem);
		}

		/* Make sure umem is ready before it can be seen by others */
		smp_wmb();
		WRITE_ONCE(xs->umem, umem);
		mutex_unlock(&xs->mutex);
		return 0;
	}
	case XDP_UMEM_FILL_RING:
	case XDP_UMEM_COMPLETION_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
			&xs->cq_tmp;
		err = xsk_init_queue(entries, q, true);
		mutex_unlock(&xs->mutex);
		return err;
	}
	default:
		break;
	}

	return -ENOPROTOOPT;
}

static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
{
	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
}

static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
{
	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
	ring->desc = offsetof(struct xdp_umem_ring, desc);
}

struct xdp_statistics_v1 {
	__u64 rx_dropped;
	__u64 rx_invalid_descs;
	__u64 tx_invalid_descs;
};

static int xsk_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int len;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case XDP_STATISTICS:
	{
		struct xdp_statistics stats = {};
		bool extra_stats = true;
		size_t stats_size;

		if (len < sizeof(struct xdp_statistics_v1)) {
			return -EINVAL;
		} else if (len < sizeof(stats)) {
			extra_stats = false;
			stats_size = sizeof(struct xdp_statistics_v1);
		} else {
			stats_size = sizeof(stats);
		}

		mutex_lock(&xs->mutex);
		stats.rx_dropped = xs->rx_dropped;
		if (extra_stats) {
			stats.rx_ring_full = xs->rx_queue_full;
			stats.rx_fill_ring_empty_descs =
				xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
			stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
		} else {
			stats.rx_dropped += xs->rx_queue_full;
		}
		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
		mutex_unlock(&xs->mutex);

		if (copy_to_user(optval, &stats, stats_size))
			return -EFAULT;
		if (put_user(stats_size, optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_MMAP_OFFSETS:
	{
		struct xdp_mmap_offsets off;
		struct xdp_mmap_offsets_v1 off_v1;
		bool flags_supported = true;
		void *to_copy;

		if (len < sizeof(off_v1))
			return -EINVAL;
		else if (len < sizeof(off))
			flags_supported = false;

		if (flags_supported) {
			/* xdp_ring_offset is identical to xdp_ring_offset_v1
			 * except for the flags field added to the end.
			 */
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.rx);
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.tx);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.fr);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.cr);
			off.rx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.tx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.fr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);
			off.cr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);

			len = sizeof(off);
			to_copy = &off;
		} else {
			xsk_enter_rxtx_offsets(&off_v1.rx);
			xsk_enter_rxtx_offsets(&off_v1.tx);
			xsk_enter_umem_offsets(&off_v1.fr);
			xsk_enter_umem_offsets(&off_v1.cr);

			len = sizeof(off_v1);
			to_copy = &off_v1;
		}

		if (copy_to_user(optval, to_copy, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_OPTIONS:
	{
		struct xdp_options opts = {};

		if (len < sizeof(opts))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		if (xs->zc)
			opts.flags |= XDP_OPTIONS_ZEROCOPY;
		mutex_unlock(&xs->mutex);

		len = sizeof(opts);
		if (copy_to_user(optval, &opts, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	default:
		break;
	}

	return -EOPNOTSUPP;
}

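/* Map the ring selected by the page offset (Rx, Tx, fill or completion) into
 * the caller's address space. Only allowed while the socket is still in the
 * XSK_READY state, i.e. before bind().
 */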
static int xsk_mmap(struct file *file, struct socket *sock,
		    struct vm_area_struct *vma)
{
	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct xdp_sock *xs = xdp_sk(sock->sk);
	struct xsk_queue *q = NULL;
	unsigned long pfn;
	struct page *qpg;

	if (READ_ONCE(xs->state) != XSK_READY)
		return -EBUSY;

	if (offset == XDP_PGOFF_RX_RING) {
		q = READ_ONCE(xs->rx);
	} else if (offset == XDP_PGOFF_TX_RING) {
		q = READ_ONCE(xs->tx);
	} else {
		/* Matches the smp_wmb() in XDP_UMEM_REG */
		smp_rmb();
		if (offset == XDP_UMEM_PGOFF_FILL_RING)
			q = READ_ONCE(xs->fq_tmp);
		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
			q = READ_ONCE(xs->cq_tmp);
	}

	if (!q)
		return -EINVAL;

	/* Matches the smp_wmb() in xsk_init_queue */
	smp_rmb();
	qpg = virt_to_head_page(q->ring);
	if (size > page_size(qpg))
		return -EINVAL;

	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn,
			       size, vma->vm_page_prot);
}

static int xsk_notifier(struct notifier_block *this,
			unsigned long msg, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);
	struct sock *sk;

	switch (msg) {
	case NETDEV_UNREGISTER:
		mutex_lock(&net->xdp.lock);
		sk_for_each(sk, &net->xdp.list) {
			struct xdp_sock *xs = xdp_sk(sk);

			mutex_lock(&xs->mutex);
			if (xs->dev == dev) {
				sk->sk_err = ENETDOWN;
				if (!sock_flag(sk, SOCK_DEAD))
					sk_error_report(sk);

				xsk_unbind_dev(xs);

				/* Clear device references. */
				xp_clear_dev(xs->pool);
			}
			mutex_unlock(&xs->mutex);
		}
		mutex_unlock(&net->xdp.lock);
		break;
	}
	return NOTIFY_DONE;
}

static struct proto xsk_proto = {
	.name =		"XDP",
	.owner =	THIS_MODULE,
	.obj_size =	sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
	.family		= PF_XDP,
	.owner		= THIS_MODULE,
	.release	= xsk_release,
	.bind		= xsk_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= xsk_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= xsk_setsockopt,
	.getsockopt	= xsk_getsockopt,
	.sendmsg	= xsk_sendmsg,
	.recvmsg	= xsk_recvmsg,
	.mmap		= xsk_mmap,
	.sendpage	= sock_no_sendpage,
};

static void xsk_destruct(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (!sock_flag(sk, SOCK_DEAD))
		return;

	if (!xp_put_pool(xs->pool))
		xdp_put_umem(xs->umem, !xs->pool);

	sk_refcnt_debug_dec(sk);
}

static int xsk_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct xdp_sock *xs;
	struct sock *sk;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_RAW)
		return -ESOCKTNOSUPPORT;

	if (protocol)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock->ops = &xsk_proto_ops;

	sock_init_data(sock, sk);

	sk->sk_family = PF_XDP;

	sk->sk_destruct = xsk_destruct;
	sk_refcnt_debug_inc(sk);

	sock_set_flag(sk, SOCK_RCU_FREE);

	xs = xdp_sk(sk);
	xs->state = XSK_READY;
	mutex_init(&xs->mutex);
	spin_lock_init(&xs->rx_lock);

	INIT_LIST_HEAD(&xs->map_list);
	spin_lock_init(&xs->map_list_lock);

	mutex_lock(&net->xdp.lock);
	sk_add_node_rcu(sk, &net->xdp.list);
	mutex_unlock(&net->xdp.lock);

	sock_prot_inuse_add(net, &xsk_proto, 1);

	return 0;
}

static const struct net_proto_family xsk_family_ops = {
	.family = PF_XDP,
	.create = xsk_create,
	.owner	= THIS_MODULE,
};

static struct notifier_block xsk_netdev_notifier = {
	.notifier_call = xsk_notifier,
};

static int __net_init xsk_net_init(struct net *net)
{
	mutex_init(&net->xdp.lock);
	INIT_HLIST_HEAD(&net->xdp.list);
	return 0;
}

static void __net_exit xsk_net_exit(struct net *net)
{
	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
}

static struct pernet_operations xsk_net_ops = {
	.init = xsk_net_init,
	.exit = xsk_net_exit,
};

static int __init xsk_init(void)
{
	int err, cpu;

	err = proto_register(&xsk_proto, 0 /* no slab */);
	if (err)
		goto out;

	err = sock_register(&xsk_family_ops);
	if (err)
		goto out_proto;

	err = register_pernet_subsys(&xsk_net_ops);
	if (err)
		goto out_sk;

	err = register_netdevice_notifier(&xsk_netdev_notifier);
	if (err)
		goto out_pernet;

	for_each_possible_cpu(cpu)
		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
	return 0;

out_pernet:
	unregister_pernet_subsys(&xsk_net_ops);
out_sk:
	sock_unregister(PF_XDP);
out_proto:
	proto_unregister(&xsk_proto);
out:
	return err;
}

fs_initcall(xsk_init);