// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allow a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *	      Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <net/xdp_sock.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"

#define TX_BATCH_SIZE 16

static struct xdp_sock *xdp_sk(struct sock *sk)
{
	return (struct xdp_sock *)sk;
}

bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
	return !!xs->rx;
}

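/* Rx copy path: grab a free frame id from the umem fill queue, copy the
 * received XDP buffer into that frame (after the configured headroom) and
 * post a descriptor on the socket's Rx ring. The fill-queue entry is only
 * consumed once the Rx descriptor has been produced successfully.
 */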
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 *id, len = xdp->data_end - xdp->data;
	void *buffer;
	int err = 0;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	id = xskq_peek_id(xs->umem->fq);
	if (!id)
		return -ENOSPC;

	buffer = xdp_umem_get_data_with_headroom(xs->umem, *id);
	memcpy(buffer, xdp->data, len);
	err = xskq_produce_batch_desc(xs->rx, *id, len,
				      xs->umem->frame_headroom);
	if (!err)
		xskq_discard_id(xs->umem->fq);

	return err;
}

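/* Rx entry point used by the XDP redirect path in native/driver mode: copy
 * the frame into the umem via __xsk_rcv() and hand the xdp_buff back to the
 * driver on success, otherwise account the packet as dropped.
 */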
int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	int err;

	err = __xsk_rcv(xs, xdp);
	if (likely(!err))
		xdp_return_buff(xdp);
	else
		xs->rx_dropped++;

	return err;
}

void xsk_flush(struct xdp_sock *xs)
{
	xskq_produce_flush_desc(xs->rx);
	xs->sk.sk_data_ready(&xs->sk);
}

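/* Rx entry point for the generic (skb) XDP path: the frame is copied and the
 * Rx ring is flushed immediately, since there is no driver-side flush point.
 */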
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	int err;

	err = __xsk_rcv(xs, xdp);
	if (!err)
		xsk_flush(xs);
	else
		xs->rx_dropped++;

	return err;
}

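/* skb destructor for copy-mode Tx: once the skb has been consumed by the
 * device, post the frame id (stashed in destructor_arg) on the completion
 * ring so userspace can reuse the umem frame.
 */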
static void xsk_destruct_skb(struct sk_buff *skb)
{
	u32 id = (u32)(long)skb_shinfo(skb)->destructor_arg;
	struct xdp_sock *xs = xdp_sk(skb->sk);

	WARN_ON_ONCE(xskq_produce_id(xs->umem->cq, id));

	sock_wfree(skb);
}

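/* Copy-mode Tx: drain up to TX_BATCH_SIZE descriptors from the Tx ring,
 * copy each frame from the umem into a freshly allocated skb and send it
 * with dev_direct_xmit() on the bound queue. A completion ring slot is
 * reserved up front so the skb destructor can always report completion.
 */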
static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
			    size_t total_len)
{
	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
	u32 max_batch = TX_BATCH_SIZE;
	struct xdp_sock *xs = xdp_sk(sk);
	bool sent_frame = false;
	struct xdp_desc desc;
	struct sk_buff *skb;
	int err = 0;

	if (unlikely(!xs->tx))
		return -ENOBUFS;
	if (need_wait)
		return -EOPNOTSUPP;

	mutex_lock(&xs->mutex);

	while (xskq_peek_desc(xs->tx, &desc)) {
		char *buffer;
		u32 id, len;

		if (max_batch-- == 0) {
			err = -EAGAIN;
			goto out;
		}

		if (xskq_reserve_id(xs->umem->cq)) {
			err = -EAGAIN;
			goto out;
		}

		len = desc.len;
		if (unlikely(len > xs->dev->mtu)) {
			err = -EMSGSIZE;
			goto out;
		}

		skb = sock_alloc_send_skb(sk, len, !need_wait, &err);
		if (unlikely(!skb)) {
			err = -EAGAIN;
			goto out;
		}

		skb_put(skb, len);
		id = desc.idx;
		buffer = xdp_umem_get_data(xs->umem, id) + desc.offset;
		err = skb_store_bits(skb, 0, buffer, len);
		if (unlikely(err)) {
			kfree_skb(skb);
			goto out;
		}

		skb->dev = xs->dev;
		skb->priority = sk->sk_priority;
		skb->mark = sk->sk_mark;
		skb_shinfo(skb)->destructor_arg = (void *)(long)id;
		skb->destructor = xsk_destruct_skb;

		err = dev_direct_xmit(skb, xs->queue_id);
		/* Ignore NET_XMIT_CN as packet might have been sent */
		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
			err = -EAGAIN;
			/* SKB consumed by dev_direct_xmit() */
			goto out;
		}

		sent_frame = true;
		xskq_discard_desc(xs->tx);
	}

out:
	if (sent_frame)
		sk->sk_write_space(sk);

	mutex_unlock(&xs->mutex);
	return err;
}

static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!xs->dev))
		return -ENXIO;
	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;

	return xsk_generic_xmit(sk, m, total_len);
}

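/* poll() support: report POLLIN while the Rx ring has descriptors to read
 * and POLLOUT while the Tx ring still has room for more descriptors.
 */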
static unsigned int xsk_poll(struct file *file, struct socket *sock,
			     struct poll_table_struct *wait)
{
	unsigned int mask = datagram_poll(file, sock, wait);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (xs->rx && !xskq_empty_desc(xs->rx))
		mask |= POLLIN | POLLRDNORM;
	if (xs->tx && !xskq_full_desc(xs->tx))
		mask |= POLLOUT | POLLWRNORM;

	return mask;
}

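/* Allocate one ring (Rx, Tx, fill or completion). The requested size must
 * be a non-zero power of two, and the queue must not already exist.
 */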
static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
			  bool umem_queue)
{
	struct xsk_queue *q;

	if (entries == 0 || *queue || !is_power_of_2(entries))
		return -EINVAL;

	q = xskq_create(entries, umem_queue);
	if (!q)
		return -ENOMEM;

	*queue = q;
	return 0;
}

static int xsk_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net *net;

	if (!sk)
		return 0;

	net = sock_net(sk);

	local_bh_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	local_bh_enable();

	if (xs->dev) {
		/* Wait for driver to stop using the xdp socket. */
		synchronize_net();
		dev_put(xs->dev);
		xs->dev = NULL;
	}

	sock_orphan(sk);
	sock->sk = NULL;

	sk_refcnt_debug_release(sk);
	sock_put(sk);

	return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
	struct socket *sock;
	int err;

	sock = sockfd_lookup(fd, &err);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	if (sock->sk->sk_family != PF_XDP) {
		sockfd_put(sock);
		return ERR_PTR(-ENOPROTOOPT);
	}

	return sock;
}

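/* Bind the socket to a device and queue id. With XDP_SHARED_UMEM, the umem
 * (and its fill/completion rings) is borrowed from an already bound socket
 * on the same device and queue; otherwise the socket must have registered
 * its own umem with both fill and completion rings created.
 */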
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev;
	int err = 0;

	if (addr_len < sizeof(struct sockaddr_xdp))
		return -EINVAL;
	if (sxdp->sxdp_family != AF_XDP)
		return -EINVAL;

	mutex_lock(&xs->mutex);
	if (xs->dev) {
		err = -EBUSY;
		goto out_release;
	}

	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
	if (!dev) {
		err = -ENODEV;
		goto out_release;
	}

	if (!xs->rx && !xs->tx) {
		err = -EINVAL;
		goto out_unlock;
	}

	if (sxdp->sxdp_queue_id >= dev->num_rx_queues) {
		err = -EINVAL;
		goto out_unlock;
	}

	if (sxdp->sxdp_flags & XDP_SHARED_UMEM) {
		struct xdp_sock *umem_xs;
		struct socket *sock;

		if (xs->umem) {
			/* We already have our own. */
			err = -EINVAL;
			goto out_unlock;
		}

		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
		if (IS_ERR(sock)) {
			err = PTR_ERR(sock);
			goto out_unlock;
		}

		umem_xs = xdp_sk(sock->sk);
		if (!umem_xs->umem) {
			/* No umem to inherit. */
			err = -EBADF;
			sockfd_put(sock);
			goto out_unlock;
		} else if (umem_xs->dev != dev ||
			   umem_xs->queue_id != sxdp->sxdp_queue_id) {
			err = -EINVAL;
			sockfd_put(sock);
			goto out_unlock;
		}

		xdp_get_umem(umem_xs->umem);
		xs->umem = umem_xs->umem;
		sockfd_put(sock);
	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
		err = -EINVAL;
		goto out_unlock;
	} else {
		/* This xsk has its own umem. */
		xskq_set_umem(xs->umem->fq, &xs->umem->props);
		xskq_set_umem(xs->umem->cq, &xs->umem->props);
	}

	xs->dev = dev;
	xs->queue_id = sxdp->sxdp_queue_id;

	xskq_set_umem(xs->rx, &xs->umem->props);
	xskq_set_umem(xs->tx, &xs->umem->props);

out_unlock:
	if (err)
		dev_put(dev);
out_release:
	mutex_unlock(&xs->mutex);
	return err;
}

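/* setsockopt(SOL_XDP): create the Rx/Tx descriptor rings, register a umem
 * and create its fill/completion rings. Ring sizes are passed as an int
 * number of entries; the umem is described by struct xdp_umem_reg.
 */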
static int xsk_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int err;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	switch (optname) {
	case XDP_RX_RING:
	case XDP_TX_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
		err = xsk_init_queue(entries, q, false);
		mutex_unlock(&xs->mutex);
		return err;
	}
	case XDP_UMEM_REG:
	{
		struct xdp_umem_reg mr;
		struct xdp_umem *umem;

		if (xs->umem)
			return -EBUSY;

		if (copy_from_user(&mr, optval, sizeof(mr)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		err = xdp_umem_create(&umem);
		if (err) {
			mutex_unlock(&xs->mutex);
			return err;
		}

		err = xdp_umem_reg(umem, &mr);
		if (err) {
			kfree(umem);
			mutex_unlock(&xs->mutex);
			return err;
		}

		/* Make sure umem is ready before it can be seen by others */
		smp_wmb();

		xs->umem = umem;
		mutex_unlock(&xs->mutex);
		return 0;
	}
	case XDP_UMEM_FILL_RING:
	case XDP_UMEM_COMPLETION_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (!xs->umem)
			return -EINVAL;

		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
			&xs->umem->cq;
		err = xsk_init_queue(entries, q, true);
		mutex_unlock(&xs->mutex);
		return err;
	}
	default:
		break;
	}

	return -ENOPROTOOPT;
}

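/* getsockopt(SOL_XDP): currently only XDP_STATISTICS is supported, which
 * reports dropped packets and the number of invalid Rx/Tx descriptors.
 */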
static int xsk_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int len;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case XDP_STATISTICS:
	{
		struct xdp_statistics stats;

		if (len < sizeof(stats))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		stats.rx_dropped = xs->rx_dropped;
		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
		mutex_unlock(&xs->mutex);

		if (copy_to_user(optval, &stats, sizeof(stats)))
			return -EFAULT;
		if (put_user(sizeof(stats), optlen))
			return -EFAULT;

		return 0;
	}
	default:
		break;
	}

	return -EOPNOTSUPP;
}

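/* mmap() support: map one of the four rings into userspace. The ring is
 * selected by the page offset (XDP_PGOFF_RX_RING, XDP_PGOFF_TX_RING,
 * XDP_UMEM_PGOFF_FILL_RING or XDP_UMEM_PGOFF_COMPLETION_RING).
 */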
static int xsk_mmap(struct file *file, struct socket *sock,
		    struct vm_area_struct *vma)
{
	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct xdp_sock *xs = xdp_sk(sock->sk);
	struct xsk_queue *q = NULL;
	unsigned long pfn;
	struct page *qpg;

	if (offset == XDP_PGOFF_RX_RING) {
		q = xs->rx;
	} else if (offset == XDP_PGOFF_TX_RING) {
		q = xs->tx;
	} else {
		if (!xs->umem)
			return -EINVAL;

		if (offset == XDP_UMEM_PGOFF_FILL_RING)
			q = xs->umem->fq;
		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
			q = xs->umem->cq;
	}

	if (!q)
		return -EINVAL;

	qpg = virt_to_head_page(q->ring);
	if (size > (PAGE_SIZE << compound_order(qpg)))
		return -EINVAL;

	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn,
			       size, vma->vm_page_prot);
}

static struct proto xsk_proto = {
	.name =		"XDP",
	.owner =	THIS_MODULE,
	.obj_size =	sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
	.family		= PF_XDP,
	.owner		= THIS_MODULE,
	.release	= xsk_release,
	.bind		= xsk_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= xsk_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= xsk_setsockopt,
	.getsockopt	= xsk_getsockopt,
	.sendmsg	= xsk_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= xsk_mmap,
	.sendpage	= sock_no_sendpage,
};

static void xsk_destruct(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (!sock_flag(sk, SOCK_DEAD))
		return;

	xskq_destroy(xs->rx);
	xskq_destroy(xs->tx);
	xdp_put_umem(xs->umem);

	sk_refcnt_debug_dec(sk);
}

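/* Create a new AF_XDP socket. Requires CAP_NET_RAW, type SOCK_RAW and a
 * zero protocol; the actual Rx/Tx setup happens later via setsockopt(),
 * mmap() and bind().
 */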
static int xsk_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct sock *sk;
	struct xdp_sock *xs;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_RAW)
		return -ESOCKTNOSUPPORT;

	if (protocol)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock->ops = &xsk_proto_ops;

	sock_init_data(sock, sk);

	sk->sk_family = PF_XDP;

	sk->sk_destruct = xsk_destruct;
	sk_refcnt_debug_inc(sk);

	xs = xdp_sk(sk);
	mutex_init(&xs->mutex);

	local_bh_disable();
	sock_prot_inuse_add(net, &xsk_proto, 1);
	local_bh_enable();

	return 0;
}

static const struct net_proto_family xsk_family_ops = {
	.family = PF_XDP,
	.create = xsk_create,
	.owner	= THIS_MODULE,
};

static int __init xsk_init(void)
{
	int err;

	err = proto_register(&xsk_proto, 0 /* no slab */);
	if (err)
		goto out;

	err = sock_register(&xsk_family_ops);
	if (err)
		goto out_proto;

	return 0;

out_proto:
	proto_unregister(&xsk_proto);
out:
	return err;
}

fs_initcall(xsk_init);