Thomas Gleixner2874c5f2019-05-27 08:55:01 +02001// SPDX-License-Identifier: GPL-2.0-or-later
Linus Torvalds1da177e2005-04-16 15:20:36 -07002/*
3 * Routines having to do with the 'struct sk_buff' memory handlers.
4 *
Alan Cox113aa832008-10-13 19:01:08 -07005 * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>
Linus Torvalds1da177e2005-04-16 15:20:36 -07006 * Florian La Roche <rzsfl@rz.uni-sb.de>
7 *
Linus Torvalds1da177e2005-04-16 15:20:36 -07008 * Fixes:
9 * Alan Cox : Fixed the worst of the load
10 * balancer bugs.
11 * Dave Platt : Interrupt stacking fix.
12 * Richard Kooijman : Timestamp fixes.
13 * Alan Cox : Changed buffer format.
14 * Alan Cox : destructor hook for AF_UNIX etc.
15 * Linus Torvalds : Better skb_clone.
16 * Alan Cox : Added skb_copy.
17 * Alan Cox : Added all the changed routines Linus
18 * only put in the headers
19 * Ray VanTassle : Fixed --skb->lock in free
20 * Alan Cox : skb_copy copy arp field
21 * Andi Kleen : slabified it.
22 * Robert Olsson : Removed skb_head_pool
23 *
24 * NOTE:
25 * The __skb_ routines should be called with interrupts
26 * disabled, or you better be *real* sure that the operation is atomic
27 * with respect to whatever list is being frobbed (e.g. via lock_sock()
28 * or via disabling bottom half handlers, etc).
Linus Torvalds1da177e2005-04-16 15:20:36 -070029 */
30
31/*
32 * The functions in this file will not compile correctly with gcc 2.4.x
33 */
34
Joe Perchese005d192012-05-16 19:58:40 +000035#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
36
Linus Torvalds1da177e2005-04-16 15:20:36 -070037#include <linux/module.h>
38#include <linux/types.h>
39#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070040#include <linux/mm.h>
41#include <linux/interrupt.h>
42#include <linux/in.h>
43#include <linux/inet.h>
44#include <linux/slab.h>
Florian Westphalde960aa2014-01-26 10:58:16 +010045#include <linux/tcp.h>
46#include <linux/udp.h>
Marcelo Ricardo Leitner90017ac2016-06-02 15:05:43 -030047#include <linux/sctp.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070048#include <linux/netdevice.h>
49#ifdef CONFIG_NET_CLS_ACT
50#include <net/pkt_sched.h>
51#endif
52#include <linux/string.h>
53#include <linux/skbuff.h>
Jens Axboe9c55e012007-11-06 23:30:13 -080054#include <linux/splice.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070055#include <linux/cache.h>
56#include <linux/rtnetlink.h>
57#include <linux/init.h>
David Howells716ea3a2007-04-02 20:19:53 -070058#include <linux/scatterlist.h>
Patrick Ohlyac45f602009-02-12 05:03:37 +000059#include <linux/errqueue.h>
Linus Torvalds268bb0c2011-05-20 12:50:29 -070060#include <linux/prefetch.h>
Vlad Yasevich0d5501c2014-08-08 14:42:13 -040061#include <linux/if_vlan.h>
John Hurley2a2ea502019-07-07 15:01:57 +010062#include <linux/mpls.h>
Sebastian Andrzej Siewior183f47f2021-02-18 18:31:24 +010063#include <linux/kcov.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070064
65#include <net/protocol.h>
66#include <net/dst.h>
67#include <net/sock.h>
68#include <net/checksum.h>
Paul Durranted1f50c2014-01-09 10:02:46 +000069#include <net/ip6_checksum.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070070#include <net/xfrm.h>
John Hurley8822e272019-07-07 15:01:54 +010071#include <net/mpls.h>
Mat Martineau3ee17bc2020-01-09 07:59:19 -080072#include <net/mptcp.h>
Ilias Apalodimas6a5bcd82021-06-07 21:02:38 +020073#include <net/page_pool.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070074
Linus Torvalds7c0f6ba2016-12-24 11:46:01 -080075#include <linux/uaccess.h>
Steven Rostedtad8d75f2009-04-14 19:39:12 -040076#include <trace/events/skb.h>
Eric Dumazet51c56b02012-04-05 11:35:15 +020077#include <linux/highmem.h>
Willem de Bruijnb245be12015-01-30 13:29:32 -050078#include <linux/capability.h>
79#include <linux/user_namespace.h>
Matteo Croce2544af02019-05-29 17:13:48 +020080#include <linux/indirect_call_wrapper.h>
Al Viroa1f8e7f72006-10-19 16:08:53 -040081
Bart Van Assche7b7ed882019-03-25 09:17:23 -070082#include "datagram.h"
83
Alexey Dobriyan08009a72018-02-24 21:20:33 +030084struct kmem_cache *skbuff_head_cache __ro_after_init;
85static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
Florian Westphaldf5042f2018-12-18 17:15:16 +010086#ifdef CONFIG_SKB_EXTENSIONS
87static struct kmem_cache *skbuff_ext_cache __ro_after_init;
88#endif
Hans Westgaard Ry5f74f82e2016-02-03 09:26:57 +010089int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
90EXPORT_SYMBOL(sysctl_max_skb_frags);
Linus Torvalds1da177e2005-04-16 15:20:36 -070091
Linus Torvalds1da177e2005-04-16 15:20:36 -070092/**
Jean Sacrenf05de732013-02-11 13:30:38 +000093 * skb_panic - private function for out-of-line support
94 * @skb: buffer
95 * @sz: size
96 * @addr: address
James Hogan99d58512013-02-13 11:20:27 +000097 * @msg: skb_over_panic or skb_under_panic
Linus Torvalds1da177e2005-04-16 15:20:36 -070098 *
Jean Sacrenf05de732013-02-11 13:30:38 +000099 * Out-of-line support for skb_put() and skb_push().
100 * Called via the wrapper skb_over_panic() or skb_under_panic().
101 * Keep out of line to prevent kernel bloat.
102 * __builtin_return_address is not used because it is not always reliable.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700103 */
Jean Sacrenf05de732013-02-11 13:30:38 +0000104static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
James Hogan99d58512013-02-13 11:20:27 +0000105 const char msg[])
Linus Torvalds1da177e2005-04-16 15:20:36 -0700106{
Jesper Dangaard Brouer41a46912020-04-27 18:37:43 +0200107 pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
James Hogan99d58512013-02-13 11:20:27 +0000108 msg, addr, skb->len, sz, skb->head, skb->data,
Joe Perchese005d192012-05-16 19:58:40 +0000109 (unsigned long)skb->tail, (unsigned long)skb->end,
110 skb->dev ? skb->dev->name : "<NULL>");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700111 BUG();
112}
113
Jean Sacrenf05de732013-02-11 13:30:38 +0000114static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700115{
Jean Sacrenf05de732013-02-11 13:30:38 +0000116 skb_panic(skb, sz, addr, __func__);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700117}
118
Jean Sacrenf05de732013-02-11 13:30:38 +0000119static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
120{
121 skb_panic(skb, sz, addr, __func__);
122}
Mel Gormanc93bdd02012-07-31 16:44:19 -0700123
Alexander Lobakin50fad4b52021-02-13 14:12:13 +0000124#define NAPI_SKB_CACHE_SIZE 64
Alexander Lobakinf450d532021-02-13 14:12:25 +0000125#define NAPI_SKB_CACHE_BULK 16
126#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2)
Alexander Lobakin50fad4b52021-02-13 14:12:13 +0000127
128struct napi_alloc_cache {
129 struct page_frag_cache page;
130 unsigned int skb_count;
131 void *skb_cache[NAPI_SKB_CACHE_SIZE];
132};
133
134static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
135static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
136
137static void *__alloc_frag_align(unsigned int fragsz, gfp_t gfp_mask,
138 unsigned int align_mask)
139{
140 struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
141
142 return page_frag_alloc_align(&nc->page, fragsz, gfp_mask, align_mask);
143}
144
145void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
146{
147 fragsz = SKB_DATA_ALIGN(fragsz);
148
149 return __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask);
150}
151EXPORT_SYMBOL(__napi_alloc_frag_align);
152
153void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
154{
155 struct page_frag_cache *nc;
156 void *data;
157
158 fragsz = SKB_DATA_ALIGN(fragsz);
159 if (in_irq() || irqs_disabled()) {
160 nc = this_cpu_ptr(&netdev_alloc_cache);
161 data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
162 } else {
163 local_bh_disable();
164 data = __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask);
165 local_bh_enable();
166 }
167 return data;
168}
169EXPORT_SYMBOL(__netdev_alloc_frag_align);
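/*
 * Usage sketch (illustrative only): callers normally go through the
 * netdev_alloc_frag()/napi_alloc_frag() wrappers and size the fragment so
 * that build_skb() can later append struct skb_shared_info. buf_len below
 * is a hypothetical driver parameter; the helpers themselves are standard.
 *
 *	unsigned int truesize = SKB_DATA_ALIGN(NET_SKB_PAD + buf_len) +
 *				SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 *	void *data = netdev_alloc_frag(truesize);
 *
 *	if (!data)
 *		return -ENOMEM;
 *	// DMA-map data + NET_SKB_PAD, receive a frame into it, then wrap it:
 *	// skb = build_skb(data, truesize);
 */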
170
Alexander Lobakinf450d532021-02-13 14:12:25 +0000171static struct sk_buff *napi_skb_cache_get(void)
172{
173 struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
174 struct sk_buff *skb;
175
176 if (unlikely(!nc->skb_count))
177 nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
178 GFP_ATOMIC,
179 NAPI_SKB_CACHE_BULK,
180 nc->skb_cache);
181 if (unlikely(!nc->skb_count))
182 return NULL;
183
184 skb = nc->skb_cache[--nc->skb_count];
185 kasan_unpoison_object_data(skbuff_head_cache, skb);
186
187 return skb;
188}
189
Jesper Dangaard Brouerba0509b2019-04-12 17:07:37 +0200190/* Caller must provide SKB that is memset cleared */
Alexander Lobakin483126b2021-02-13 14:11:26 +0000191static void __build_skb_around(struct sk_buff *skb, void *data,
192 unsigned int frag_size)
Jesper Dangaard Brouerba0509b2019-04-12 17:07:37 +0200193{
194 struct skb_shared_info *shinfo;
195 unsigned int size = frag_size ? : ksize(data);
196
197 size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
198
199 /* Assumes caller memset cleared SKB */
200 skb->truesize = SKB_TRUESIZE(size);
201 refcount_set(&skb->users, 1);
202 skb->head = data;
203 skb->data = data;
204 skb_reset_tail_pointer(skb);
205 skb->end = skb->tail + size;
206 skb->mac_header = (typeof(skb->mac_header))~0U;
207 skb->transport_header = (typeof(skb->transport_header))~0U;
208
209 /* make sure we initialize shinfo sequentially */
210 shinfo = skb_shinfo(skb);
211 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
212 atomic_set(&shinfo->dataref, 1);
213
Aleksandr Nogikh6370cc32020-10-29 17:36:19 +0000214 skb_set_kcov_handle(skb, kcov_common_handle());
Jesper Dangaard Brouerba0509b2019-04-12 17:07:37 +0200215}
216
Linus Torvalds1da177e2005-04-16 15:20:36 -0700217/**
Eric Dumazet2ea2f622015-04-24 16:05:01 -0700218 * __build_skb - build a network buffer
Eric Dumazetb2b5ce92011-11-14 06:03:34 +0000219 * @data: data buffer provided by caller
Eric Dumazet2ea2f622015-04-24 16:05:01 -0700220 * @frag_size: size of data, or 0 if head was kmalloced
Eric Dumazetb2b5ce92011-11-14 06:03:34 +0000221 *
222 * Allocate a new &sk_buff. Caller provides space holding head and
Florian Fainellideceb4c2013-07-23 20:22:39 +0100223 * skb_shared_info. @data must have been allocated by kmalloc() only if
Eric Dumazet2ea2f622015-04-24 16:05:01 -0700224 * @frag_size is 0, otherwise data should come from the page allocator
225 * or vmalloc().
Eric Dumazetb2b5ce92011-11-14 06:03:34 +0000226 * The return is the new skb buffer.
227 * On a failure the return is %NULL, and @data is not freed.
228 * Notes :
229 * Before IO, the driver allocates only the data buffer where the NIC puts the incoming frame
230 * Driver should add room at head (NET_SKB_PAD) and
231 * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
232 * After IO, driver calls build_skb(), to allocate sk_buff and populate it
233 * before giving packet to stack.
234 * RX rings only contain data buffers, not full skbs.
235 */
Eric Dumazet2ea2f622015-04-24 16:05:01 -0700236struct sk_buff *__build_skb(void *data, unsigned int frag_size)
Eric Dumazetb2b5ce92011-11-14 06:03:34 +0000237{
Eric Dumazetb2b5ce92011-11-14 06:03:34 +0000238 struct sk_buff *skb;
Eric Dumazetb2b5ce92011-11-14 06:03:34 +0000239
240 skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
Jesper Dangaard Brouerba0509b2019-04-12 17:07:37 +0200241 if (unlikely(!skb))
Eric Dumazetb2b5ce92011-11-14 06:03:34 +0000242 return NULL;
243
Eric Dumazetb2b5ce92011-11-14 06:03:34 +0000244 memset(skb, 0, offsetof(struct sk_buff, tail));
Alexander Lobakin483126b2021-02-13 14:11:26 +0000245 __build_skb_around(skb, data, frag_size);
Eric Dumazetb2b5ce92011-11-14 06:03:34 +0000246
Alexander Lobakin483126b2021-02-13 14:11:26 +0000247 return skb;
Eric Dumazetb2b5ce92011-11-14 06:03:34 +0000248}
Eric Dumazet2ea2f622015-04-24 16:05:01 -0700249
250/* build_skb() is a wrapper over __build_skb() that specifically
251 * takes care of skb->head_frag and skb->pfmemalloc.
252 * This means that if @frag_size is not zero, then @data must be backed
253 * by a page fragment, not kmalloc() or vmalloc()
254 */
255struct sk_buff *build_skb(void *data, unsigned int frag_size)
256{
257 struct sk_buff *skb = __build_skb(data, frag_size);
258
259 if (skb && frag_size) {
260 skb->head_frag = 1;
Michal Hocko2f064f32015-08-21 14:11:51 -0700261 if (page_is_pfmemalloc(virt_to_head_page(data)))
Eric Dumazet2ea2f622015-04-24 16:05:01 -0700262 skb->pfmemalloc = 1;
263 }
264 return skb;
265}
Eric Dumazetb2b5ce92011-11-14 06:03:34 +0000266EXPORT_SYMBOL(build_skb);
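/*
 * Usage sketch (illustrative only): wrapping a driver-owned page fragment
 * without copying, as described above. rx->data, rx->truesize, frame_len
 * and netdev are hypothetical driver state; the helpers are standard.
 *
 *	struct sk_buff *skb = build_skb(rx->data, rx->truesize);
 *
 *	if (unlikely(!skb)) {
 *		skb_free_frag(rx->data);
 *		return NULL;
 *	}
 *	skb_reserve(skb, NET_SKB_PAD);		// headroom left by the driver
 *	skb_put(skb, frame_len);		// bytes written by the NIC
 *	skb->protocol = eth_type_trans(skb, netdev);
 */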
267
Jesper Dangaard Brouerba0509b2019-04-12 17:07:37 +0200268/**
269 * build_skb_around - build a network buffer around provided skb
270 * @skb: sk_buff provided by caller, must be memset cleared
271 * @data: data buffer provided by caller
272 * @frag_size: size of data, or 0 if head was kmalloced
273 */
274struct sk_buff *build_skb_around(struct sk_buff *skb,
275 void *data, unsigned int frag_size)
276{
277 if (unlikely(!skb))
278 return NULL;
279
Alexander Lobakin483126b2021-02-13 14:11:26 +0000280 __build_skb_around(skb, data, frag_size);
Jesper Dangaard Brouerba0509b2019-04-12 17:07:37 +0200281
Alexander Lobakin483126b2021-02-13 14:11:26 +0000282 if (frag_size) {
Jesper Dangaard Brouerba0509b2019-04-12 17:07:37 +0200283 skb->head_frag = 1;
284 if (page_is_pfmemalloc(virt_to_head_page(data)))
285 skb->pfmemalloc = 1;
286 }
287 return skb;
288}
289EXPORT_SYMBOL(build_skb_around);
290
Alexander Lobakinf450d532021-02-13 14:12:25 +0000291/**
292 * __napi_build_skb - build a network buffer
293 * @data: data buffer provided by caller
294 * @frag_size: size of data, or 0 if head was kmalloced
295 *
296 * Version of __build_skb() that uses NAPI percpu caches to obtain
297 * skbuff_head instead of a direct kmem_cache allocation.
298 *
299 * Returns a new &sk_buff on success, %NULL on allocation failure.
300 */
301static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
302{
303 struct sk_buff *skb;
304
305 skb = napi_skb_cache_get();
306 if (unlikely(!skb))
307 return NULL;
308
309 memset(skb, 0, offsetof(struct sk_buff, tail));
310 __build_skb_around(skb, data, frag_size);
311
312 return skb;
313}
314
315/**
316 * napi_build_skb - build a network buffer
317 * @data: data buffer provided by caller
318 * @frag_size: size of data, or 0 if head was kmalloced
319 *
320 * Version of __napi_build_skb() that takes care of skb->head_frag
321 * and skb->pfmemalloc when the data is a page or page fragment.
322 *
323 * Returns a new &sk_buff on success, %NULL on allocation failure.
324 */
325struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
326{
327 struct sk_buff *skb = __napi_build_skb(data, frag_size);
328
329 if (likely(skb) && frag_size) {
330 skb->head_frag = 1;
331 skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
332 }
333
334 return skb;
335}
336EXPORT_SYMBOL(napi_build_skb);
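/*
 * Usage sketch (illustrative only): a driver already running in NAPI
 * context (e.g. after an XDP_PASS verdict) can use the percpu skbuff_head
 * cache directly. The buf->{addr,truesize,headroom,len} fields are
 * hypothetical.
 *
 *	skb = napi_build_skb(buf->addr, buf->truesize);
 *	if (unlikely(!skb))
 *		return NULL;
 *	skb_reserve(skb, buf->headroom);
 *	skb_put(skb, buf->len);
 *	napi_gro_receive(napi, skb);
 */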
337
Alexander Lobakin5381b232021-02-13 14:11:00 +0000338/*
339 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
340 * the caller if emergency pfmemalloc reserves are being used. If it is and
341 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
342 * may be used. Otherwise, the packet data may be discarded until enough
343 * memory is free.
344 */
Alexander Lobakinef280952021-02-13 14:11:11 +0000345static void *kmalloc_reserve(size_t size, gfp_t flags, int node,
346 bool *pfmemalloc)
Alexander Lobakin5381b232021-02-13 14:11:00 +0000347{
348 void *obj;
349 bool ret_pfmemalloc = false;
350
351 /*
352 * Try a regular allocation, when that fails and we're not entitled
353 * to the reserves, fail.
354 */
355 obj = kmalloc_node_track_caller(size,
356 flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
357 node);
358 if (obj || !(gfp_pfmemalloc_allowed(flags)))
359 goto out;
360
361 /* Try again but now we are using pfmemalloc reserves */
362 ret_pfmemalloc = true;
363 obj = kmalloc_node_track_caller(size, flags, node);
364
365out:
366 if (pfmemalloc)
367 *pfmemalloc = ret_pfmemalloc;
368
369 return obj;
370}
371
372/* Allocate a new skbuff. We do this ourselves so we can fill in a few
373 * 'private' fields and also do memory statistics to find all the
374 * [BEEP] leaks.
375 *
376 */
377
378/**
379 * __alloc_skb - allocate a network buffer
380 * @size: size to allocate
381 * @gfp_mask: allocation mask
382 * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
383 * instead of head cache and allocate a cloned (child) skb.
384 * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
385 * allocations in case the data is required for writeback
386 * @node: numa node to allocate memory on
387 *
388 * Allocate a new &sk_buff. The returned buffer has no headroom and a
389 * tail room of at least size bytes. The object has a reference count
390 * of one. The return is the buffer. On a failure the return is %NULL.
391 *
392 * Buffers may only be allocated from interrupts using a @gfp_mask of
393 * %GFP_ATOMIC.
394 */
395struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
396 int flags, int node)
397{
398 struct kmem_cache *cache;
Alexander Lobakin5381b232021-02-13 14:11:00 +0000399 struct sk_buff *skb;
400 u8 *data;
401 bool pfmemalloc;
402
403 cache = (flags & SKB_ALLOC_FCLONE)
404 ? skbuff_fclone_cache : skbuff_head_cache;
405
406 if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
407 gfp_mask |= __GFP_MEMALLOC;
408
409 /* Get the HEAD */
Alexander Lobakind13612b2021-02-13 14:12:38 +0000410 if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
411 likely(node == NUMA_NO_NODE || node == numa_mem_id()))
412 skb = napi_skb_cache_get();
413 else
414 skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
Alexander Lobakindf1ae022021-02-13 14:11:39 +0000415 if (unlikely(!skb))
416 return NULL;
Alexander Lobakin5381b232021-02-13 14:11:00 +0000417 prefetchw(skb);
418
419 /* We do our best to align skb_shared_info on a separate cache
420 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
421 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
422 * Both skb->head and skb_shared_info are cache line aligned.
423 */
424 size = SKB_DATA_ALIGN(size);
425 size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
426 data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
Alexander Lobakindf1ae022021-02-13 14:11:39 +0000427 if (unlikely(!data))
Alexander Lobakin5381b232021-02-13 14:11:00 +0000428 goto nodata;
429 /* kmalloc(size) might give us more room than requested.
430 * Put skb_shared_info exactly at the end of allocated zone,
431 * to allow max possible filling before reallocation.
432 */
433 size = SKB_WITH_OVERHEAD(ksize(data));
434 prefetchw(data + size);
435
436 /*
437 * Only clear those fields we need to clear, not those that we will
438 * actually initialise below. Hence, don't put any more fields after
439 * the tail pointer in struct sk_buff!
440 */
441 memset(skb, 0, offsetof(struct sk_buff, tail));
Alexander Lobakinf9d67252021-02-13 14:11:50 +0000442 __build_skb_around(skb, data, 0);
Alexander Lobakin5381b232021-02-13 14:11:00 +0000443 skb->pfmemalloc = pfmemalloc;
Alexander Lobakin5381b232021-02-13 14:11:00 +0000444
445 if (flags & SKB_ALLOC_FCLONE) {
446 struct sk_buff_fclones *fclones;
447
448 fclones = container_of(skb, struct sk_buff_fclones, skb1);
449
450 skb->fclone = SKB_FCLONE_ORIG;
451 refcount_set(&fclones->fclone_ref, 1);
452
453 fclones->skb2.fclone = SKB_FCLONE_CLONE;
454 }
455
Alexander Lobakin5381b232021-02-13 14:11:00 +0000456 return skb;
Alexander Lobakindf1ae022021-02-13 14:11:39 +0000457
Alexander Lobakin5381b232021-02-13 14:11:00 +0000458nodata:
459 kmem_cache_free(cache, skb);
Alexander Lobakindf1ae022021-02-13 14:11:39 +0000460 return NULL;
Alexander Lobakin5381b232021-02-13 14:11:00 +0000461}
462EXPORT_SYMBOL(__alloc_skb);
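/*
 * Usage sketch (illustrative only): building an outgoing packet with the
 * generic allocator via the alloc_skb() wrapper. hlen, payload and
 * payload_len are hypothetical.
 *
 *	skb = alloc_skb(hlen + payload_len, GFP_KERNEL);
 *	if (!skb)
 *		return -ENOMEM;
 *	skb_reserve(skb, hlen);				// headroom for headers pushed later
 *	skb_put_data(skb, payload, payload_len);	// copy payload into tail room
 */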
463
Sebastian Andrzej Siewior7ba7aea2019-06-07 21:20:34 +0200464/**
Alexander Duyckfd11a832014-12-09 19:40:49 -0800465 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
466 * @dev: network device to receive on
Masanari Iidad7499162015-08-24 22:56:54 +0900467 * @len: length to allocate
Alexander Duyckfd11a832014-12-09 19:40:49 -0800468 * @gfp_mask: get_free_pages mask, passed to alloc_skb
469 *
470 * Allocate a new &sk_buff and assign it a usage count of one. The
471 * buffer has NET_SKB_PAD headroom built in. Users should allocate
472 * the headroom they think they need without accounting for the
473 * built in space. The built in space is used for optimisations.
474 *
475 * %NULL is returned if there is no free memory.
476 */
Alexander Duyck94519802015-05-06 21:11:40 -0700477struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
478 gfp_t gfp_mask)
Alexander Duyckfd11a832014-12-09 19:40:49 -0800479{
Alexander Duyckb63ae8c2015-05-06 21:11:57 -0700480 struct page_frag_cache *nc;
Alexander Duyckfd11a832014-12-09 19:40:49 -0800481 struct sk_buff *skb;
Alexander Duyck94519802015-05-06 21:11:40 -0700482 bool pfmemalloc;
483 void *data;
Alexander Duyckfd11a832014-12-09 19:40:49 -0800484
Alexander Duyck94519802015-05-06 21:11:40 -0700485 len += NET_SKB_PAD;
Alexander Duyckfd11a832014-12-09 19:40:49 -0800486
Alexander Lobakin66c55602021-01-15 15:04:40 +0000487 /* If requested length is either too small or too big,
488 * we use kmalloc() for skb->head allocation.
489 */
490 if (len <= SKB_WITH_OVERHEAD(1024) ||
491 len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
Mel Gormand0164ad2015-11-06 16:28:21 -0800492 (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
Alexander Duycka080e7b2015-05-13 13:34:13 -0700493 skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
494 if (!skb)
495 goto skb_fail;
496 goto skb_success;
497 }
Alexander Duyck94519802015-05-06 21:11:40 -0700498
499 len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
500 len = SKB_DATA_ALIGN(len);
501
502 if (sk_memalloc_socks())
503 gfp_mask |= __GFP_MEMALLOC;
504
Sebastian Andrzej Siewior92dcabd2019-06-07 21:20:35 +0200505 if (in_irq() || irqs_disabled()) {
506 nc = this_cpu_ptr(&netdev_alloc_cache);
507 data = page_frag_alloc(nc, len, gfp_mask);
508 pfmemalloc = nc->pfmemalloc;
509 } else {
510 local_bh_disable();
511 nc = this_cpu_ptr(&napi_alloc_cache.page);
512 data = page_frag_alloc(nc, len, gfp_mask);
513 pfmemalloc = nc->pfmemalloc;
514 local_bh_enable();
515 }
Alexander Duyck94519802015-05-06 21:11:40 -0700516
517 if (unlikely(!data))
518 return NULL;
519
520 skb = __build_skb(data, len);
521 if (unlikely(!skb)) {
Alexander Duyck181edb22015-05-06 21:12:03 -0700522 skb_free_frag(data);
Alexander Duyck94519802015-05-06 21:11:40 -0700523 return NULL;
Christoph Hellwig7b2e4972006-08-07 16:09:04 -0700524 }
Alexander Duyckfd11a832014-12-09 19:40:49 -0800525
Alexander Duyck94519802015-05-06 21:11:40 -0700526 if (pfmemalloc)
527 skb->pfmemalloc = 1;
528 skb->head_frag = 1;
529
Alexander Duycka080e7b2015-05-13 13:34:13 -0700530skb_success:
Alexander Duyck94519802015-05-06 21:11:40 -0700531 skb_reserve(skb, NET_SKB_PAD);
532 skb->dev = dev;
533
Alexander Duycka080e7b2015-05-13 13:34:13 -0700534skb_fail:
Christoph Hellwig8af27452006-07-31 22:35:23 -0700535 return skb;
536}
David S. Millerb4ac530fc2009-02-10 02:09:24 -0800537EXPORT_SYMBOL(__netdev_alloc_skb);
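/*
 * Usage sketch (illustrative only): a copy-based RX path. Drivers normally
 * call the netdev_alloc_skb()/netdev_alloc_skb_ip_align() wrappers rather
 * than this function directly; rx_buf and rx_len are hypothetical.
 *
 *	skb = netdev_alloc_skb_ip_align(dev, rx_len);
 *	if (unlikely(!skb)) {
 *		dev->stats.rx_dropped++;
 *		return;
 *	}
 *	skb_put_data(skb, rx_buf, rx_len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_receive_skb(skb);
 */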
Linus Torvalds1da177e2005-04-16 15:20:36 -0700538
Alexander Duyckfd11a832014-12-09 19:40:49 -0800539/**
540 * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
541 * @napi: napi instance this buffer was allocated for
Masanari Iidad7499162015-08-24 22:56:54 +0900542 * @len: length to allocate
Alexander Duyckfd11a832014-12-09 19:40:49 -0800543 * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
544 *
545 * Allocate a new sk_buff for use in NAPI receive. This buffer will
546 * attempt to allocate the head from a special reserved region used
547 * only for NAPI Rx allocation. By doing this we can save several
548 * CPU cycles by avoiding having to disable and re-enable IRQs.
549 *
550 * %NULL is returned if there is no free memory.
551 */
Alexander Duyck94519802015-05-06 21:11:40 -0700552struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
553 gfp_t gfp_mask)
Alexander Duyckfd11a832014-12-09 19:40:49 -0800554{
Eric Dumazet3226b152021-01-13 08:18:19 -0800555 struct napi_alloc_cache *nc;
Alexander Duyckfd11a832014-12-09 19:40:49 -0800556 struct sk_buff *skb;
Alexander Duyck94519802015-05-06 21:11:40 -0700557 void *data;
Alexander Duyckfd11a832014-12-09 19:40:49 -0800558
Alexander Duyck94519802015-05-06 21:11:40 -0700559 len += NET_SKB_PAD + NET_IP_ALIGN;
Alexander Duyckfd11a832014-12-09 19:40:49 -0800560
Eric Dumazet3226b152021-01-13 08:18:19 -0800561 /* If requested length is either too small or too big,
562 * we use kmalloc() for skb->head allocation.
563 */
564 if (len <= SKB_WITH_OVERHEAD(1024) ||
565 len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
Mel Gormand0164ad2015-11-06 16:28:21 -0800566 (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
Alexander Lobakincfb8ec62021-02-13 14:12:49 +0000567 skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
568 NUMA_NO_NODE);
Alexander Duycka080e7b2015-05-13 13:34:13 -0700569 if (!skb)
570 goto skb_fail;
571 goto skb_success;
572 }
Alexander Duyck94519802015-05-06 21:11:40 -0700573
Eric Dumazet3226b152021-01-13 08:18:19 -0800574 nc = this_cpu_ptr(&napi_alloc_cache);
Alexander Duyck94519802015-05-06 21:11:40 -0700575 len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
576 len = SKB_DATA_ALIGN(len);
577
578 if (sk_memalloc_socks())
579 gfp_mask |= __GFP_MEMALLOC;
580
Alexander Duyck8c2dd3e2017-01-10 16:58:06 -0800581 data = page_frag_alloc(&nc->page, len, gfp_mask);
Alexander Duyck94519802015-05-06 21:11:40 -0700582 if (unlikely(!data))
583 return NULL;
584
Alexander Lobakincfb8ec62021-02-13 14:12:49 +0000585 skb = __napi_build_skb(data, len);
Alexander Duyck94519802015-05-06 21:11:40 -0700586 if (unlikely(!skb)) {
Alexander Duyck181edb22015-05-06 21:12:03 -0700587 skb_free_frag(data);
Alexander Duyck94519802015-05-06 21:11:40 -0700588 return NULL;
Alexander Duyckfd11a832014-12-09 19:40:49 -0800589 }
590
Jesper Dangaard Brouer795bb1c2016-02-08 13:14:59 +0100591 if (nc->page.pfmemalloc)
Alexander Duyck94519802015-05-06 21:11:40 -0700592 skb->pfmemalloc = 1;
593 skb->head_frag = 1;
594
Alexander Duycka080e7b2015-05-13 13:34:13 -0700595skb_success:
Alexander Duyck94519802015-05-06 21:11:40 -0700596 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
597 skb->dev = napi->dev;
598
Alexander Duycka080e7b2015-05-13 13:34:13 -0700599skb_fail:
Alexander Duyckfd11a832014-12-09 19:40:49 -0800600 return skb;
601}
602EXPORT_SYMBOL(__napi_alloc_skb);
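/*
 * Usage sketch (illustrative only): per-packet allocation from a NAPI poll
 * routine, usually via the napi_alloc_skb() wrapper. desc_data and
 * desc_len are hypothetical descriptor fields.
 *
 *	skb = napi_alloc_skb(napi, desc_len);
 *	if (unlikely(!skb))
 *		break;				// out of memory, retry on next poll
 *	skb_put_data(skb, desc_data, desc_len);
 *	skb->protocol = eth_type_trans(skb, napi->dev);
 *	napi_gro_receive(napi, skb);
 */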
603
Peter Zijlstra654bed12008-10-07 14:22:33 -0700604void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
Eric Dumazet50269e12012-03-23 23:59:33 +0000605 int size, unsigned int truesize)
Peter Zijlstra654bed12008-10-07 14:22:33 -0700606{
607 skb_fill_page_desc(skb, i, page, off, size);
608 skb->len += size;
609 skb->data_len += size;
Eric Dumazet50269e12012-03-23 23:59:33 +0000610 skb->truesize += truesize;
Peter Zijlstra654bed12008-10-07 14:22:33 -0700611}
612EXPORT_SYMBOL(skb_add_rx_frag);
613
Jason Wangf8e617e2013-11-01 14:07:47 +0800614void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
615 unsigned int truesize)
616{
617 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
618
619 skb_frag_size_add(frag, size);
620 skb->len += size;
621 skb->data_len += size;
622 skb->truesize += truesize;
623}
624EXPORT_SYMBOL(skb_coalesce_rx_frag);
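/*
 * Usage sketch (illustrative only): attaching a received page fragment to
 * an skb that already carries the headers, as drivers doing header/data
 * split do. page, offset, frag_len and buf_truesize are hypothetical.
 *
 *	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
 *			frag_len, buf_truesize);
 */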
625
Herbert Xu27b437c2006-07-13 19:26:39 -0700626static void skb_drop_list(struct sk_buff **listp)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700627{
Eric Dumazetbd8a7032013-06-24 06:26:00 -0700628 kfree_skb_list(*listp);
Herbert Xu27b437c2006-07-13 19:26:39 -0700629 *listp = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700630}
631
Herbert Xu27b437c2006-07-13 19:26:39 -0700632static inline void skb_drop_fraglist(struct sk_buff *skb)
633{
634 skb_drop_list(&skb_shinfo(skb)->frag_list);
635}
636
Linus Torvalds1da177e2005-04-16 15:20:36 -0700637static void skb_clone_fraglist(struct sk_buff *skb)
638{
639 struct sk_buff *list;
640
David S. Millerfbb398a2009-06-09 00:18:59 -0700641 skb_walk_frags(skb, list)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700642 skb_get(list);
643}
644
Eric Dumazetd3836f22012-04-27 00:33:38 +0000645static void skb_free_head(struct sk_buff *skb)
646{
Alexander Duyck181edb22015-05-06 21:12:03 -0700647 unsigned char *head = skb->head;
648
Ilias Apalodimas6a5bcd82021-06-07 21:02:38 +0200649 if (skb->head_frag) {
650 if (skb_pp_recycle(skb, head))
651 return;
Alexander Duyck181edb22015-05-06 21:12:03 -0700652 skb_free_frag(head);
Ilias Apalodimas6a5bcd82021-06-07 21:02:38 +0200653 } else {
Alexander Duyck181edb22015-05-06 21:12:03 -0700654 kfree(head);
Ilias Apalodimas6a5bcd82021-06-07 21:02:38 +0200655 }
Eric Dumazetd3836f22012-04-27 00:33:38 +0000656}
657
Adrian Bunk5bba1712006-06-29 13:02:35 -0700658static void skb_release_data(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700659{
Eric Dumazetff04a772014-09-23 18:39:30 -0700660 struct skb_shared_info *shinfo = skb_shinfo(skb);
661 int i;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700662
Eric Dumazetff04a772014-09-23 18:39:30 -0700663 if (skb->cloned &&
664 atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
665 &shinfo->dataref))
666 return;
Shirley Maa6686f22011-07-06 12:22:12 +0000667
Jonathan Lemon70c43162021-01-06 14:18:36 -0800668 skb_zcopy_clear(skb, true);
669
Eric Dumazetff04a772014-09-23 18:39:30 -0700670 for (i = 0; i < shinfo->nr_frags; i++)
Ilias Apalodimas6a5bcd82021-06-07 21:02:38 +0200671 __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
Shirley Maa6686f22011-07-06 12:22:12 +0000672
Eric Dumazetff04a772014-09-23 18:39:30 -0700673 if (shinfo->frag_list)
674 kfree_skb_list(shinfo->frag_list);
675
676 skb_free_head(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700677}
678
679/*
680 * Free an skbuff by memory without cleaning the state.
681 */
Herbert Xu2d4baff2007-11-26 23:11:19 +0800682static void kfree_skbmem(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700683{
Eric Dumazetd0bf4a92014-09-29 13:29:15 -0700684 struct sk_buff_fclones *fclones;
David S. Millerd179cd12005-08-17 14:57:30 -0700685
David S. Millerd179cd12005-08-17 14:57:30 -0700686 switch (skb->fclone) {
687 case SKB_FCLONE_UNAVAILABLE:
688 kmem_cache_free(skbuff_head_cache, skb);
Eric Dumazet6ffe75eb2014-12-03 17:04:39 -0800689 return;
David S. Millerd179cd12005-08-17 14:57:30 -0700690
691 case SKB_FCLONE_ORIG:
Eric Dumazetd0bf4a92014-09-29 13:29:15 -0700692 fclones = container_of(skb, struct sk_buff_fclones, skb1);
Eric Dumazet6ffe75eb2014-12-03 17:04:39 -0800693
694 /* We usually free the clone (TX completion) before original skb
695 * This test would have no chance to be true for the clone,
696 * while here, branch prediction will be good.
697 */
Reshetova, Elena26385952017-06-30 13:07:59 +0300698 if (refcount_read(&fclones->fclone_ref) == 1)
Eric Dumazet6ffe75eb2014-12-03 17:04:39 -0800699 goto fastpath;
David S. Millerd179cd12005-08-17 14:57:30 -0700700 break;
701
Eric Dumazet6ffe75eb2014-12-03 17:04:39 -0800702 default: /* SKB_FCLONE_CLONE */
Eric Dumazetd0bf4a92014-09-29 13:29:15 -0700703 fclones = container_of(skb, struct sk_buff_fclones, skb2);
David S. Millerd179cd12005-08-17 14:57:30 -0700704 break;
Stephen Hemminger3ff50b72007-04-20 17:09:22 -0700705 }
Reshetova, Elena26385952017-06-30 13:07:59 +0300706 if (!refcount_dec_and_test(&fclones->fclone_ref))
Eric Dumazet6ffe75eb2014-12-03 17:04:39 -0800707 return;
708fastpath:
709 kmem_cache_free(skbuff_fclone_cache, fclones);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700710}
711
Paolo Abeni0a463c72017-06-12 11:23:42 +0200712void skb_release_head_state(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700713{
Eric Dumazetadf30902009-06-02 05:19:30 +0000714 skb_dst_drop(skb);
Stephen Hemminger9c2b3322005-04-19 22:39:42 -0700715 if (skb->destructor) {
716 WARN_ON(in_irq());
Linus Torvalds1da177e2005-04-16 15:20:36 -0700717 skb->destructor(skb);
718 }
Igor Maravića3bf7ae2011-12-12 02:58:22 +0000719#if IS_ENABLED(CONFIG_NF_CONNTRACK)
Florian Westphalcb9c6832017-01-23 18:21:56 +0100720 nf_conntrack_put(skb_nfct(skb));
KOVACS Krisztian2fc72c72011-01-12 20:25:08 +0100721#endif
Florian Westphaldf5042f2018-12-18 17:15:16 +0100722 skb_ext_put(skb);
Lennert Buytenhek04a4bb52008-10-01 02:33:12 -0700723}
724
725/* Free everything but the sk_buff shell. */
726static void skb_release_all(struct sk_buff *skb)
727{
728 skb_release_head_state(skb);
Florian Westphala28b1b92017-07-23 19:54:47 +0200729 if (likely(skb->head))
730 skb_release_data(skb);
Herbert Xu2d4baff2007-11-26 23:11:19 +0800731}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700732
Herbert Xu2d4baff2007-11-26 23:11:19 +0800733/**
734 * __kfree_skb - private function
735 * @skb: buffer
736 *
737 * Free an sk_buff. Release anything attached to the buffer.
738 * Clean the state. This is an internal helper function. Users should
739 * always call kfree_skb
740 */
741
742void __kfree_skb(struct sk_buff *skb)
743{
744 skb_release_all(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700745 kfree_skbmem(skb);
746}
David S. Millerb4ac530fc2009-02-10 02:09:24 -0800747EXPORT_SYMBOL(__kfree_skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700748
749/**
Jörn Engel231d06a2006-03-20 21:28:35 -0800750 * kfree_skb - free an sk_buff
751 * @skb: buffer to free
752 *
753 * Drop a reference to the buffer and free it if the usage count has
754 * hit zero.
755 */
756void kfree_skb(struct sk_buff *skb)
757{
Paolo Abeni3889a8032017-06-12 11:23:41 +0200758 if (!skb_unref(skb))
Jörn Engel231d06a2006-03-20 21:28:35 -0800759 return;
Paolo Abeni3889a8032017-06-12 11:23:41 +0200760
Neil Hormanead2ceb2009-03-11 09:49:55 +0000761 trace_kfree_skb(skb, __builtin_return_address(0));
Jörn Engel231d06a2006-03-20 21:28:35 -0800762 __kfree_skb(skb);
763}
David S. Millerb4ac530fc2009-02-10 02:09:24 -0800764EXPORT_SYMBOL(kfree_skb);
Jörn Engel231d06a2006-03-20 21:28:35 -0800765
Eric Dumazetbd8a7032013-06-24 06:26:00 -0700766void kfree_skb_list(struct sk_buff *segs)
767{
768 while (segs) {
769 struct sk_buff *next = segs->next;
770
771 kfree_skb(segs);
772 segs = next;
773 }
774}
775EXPORT_SYMBOL(kfree_skb_list);
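/*
 * Usage sketch (illustrative only): kfree_skb() is for packets that are
 * being dropped, while kfree_skb_list() releases a whole ->next chain in
 * one call. The frame_is_valid() predicate and the segs list below are
 * hypothetical.
 *
 *	if (!frame_is_valid(skb)) {
 *		kfree_skb(skb);			// one dropped packet
 *		return NET_RX_DROP;
 *	}
 *	...
 *	kfree_skb_list(segs);			// drop an unsent segment chain
 */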
776
Willem de Bruijn64131392019-07-07 05:51:55 -0400777/* Dump skb information and contents.
778 *
779 * Must only be called from net_ratelimit()-ed paths.
780 *
Vladimir Oltean302af7c2020-10-05 17:48:38 +0300781 * Dumps whole packets if full_pkt, only headers otherwise.
Willem de Bruijn64131392019-07-07 05:51:55 -0400782 */
783void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
784{
Willem de Bruijn64131392019-07-07 05:51:55 -0400785 struct skb_shared_info *sh = skb_shinfo(skb);
786 struct net_device *dev = skb->dev;
787 struct sock *sk = skb->sk;
788 struct sk_buff *list_skb;
789 bool has_mac, has_trans;
790 int headroom, tailroom;
791 int i, len, seg_len;
792
793 if (full_pkt)
Willem de Bruijn64131392019-07-07 05:51:55 -0400794 len = skb->len;
795 else
796 len = min_t(int, skb->len, MAX_HEADER + 128);
797
798 headroom = skb_headroom(skb);
799 tailroom = skb_tailroom(skb);
800
801 has_mac = skb_mac_header_was_set(skb);
802 has_trans = skb_transport_header_was_set(skb);
803
804 printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
805 "mac=(%d,%d) net=(%d,%d) trans=%d\n"
806 "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
807 "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
808 "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n",
809 level, skb->len, headroom, skb_headlen(skb), tailroom,
810 has_mac ? skb->mac_header : -1,
811 has_mac ? skb_mac_header_len(skb) : -1,
812 skb->network_header,
813 has_trans ? skb_network_header_len(skb) : -1,
814 has_trans ? skb->transport_header : -1,
815 sh->tx_flags, sh->nr_frags,
816 sh->gso_size, sh->gso_type, sh->gso_segs,
817 skb->csum, skb->ip_summed, skb->csum_complete_sw,
818 skb->csum_valid, skb->csum_level,
819 skb->hash, skb->sw_hash, skb->l4_hash,
820 ntohs(skb->protocol), skb->pkt_type, skb->skb_iif);
821
822 if (dev)
823 printk("%sdev name=%s feat=0x%pNF\n",
824 level, dev->name, &dev->features);
825 if (sk)
Qian Caidb8051f2019-07-16 11:43:05 -0400826 printk("%ssk family=%hu type=%u proto=%u\n",
Willem de Bruijn64131392019-07-07 05:51:55 -0400827 level, sk->sk_family, sk->sk_type, sk->sk_protocol);
828
829 if (full_pkt && headroom)
830 print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET,
831 16, 1, skb->head, headroom, false);
832
833 seg_len = min_t(int, skb_headlen(skb), len);
834 if (seg_len)
835 print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET,
836 16, 1, skb->data, seg_len, false);
837 len -= seg_len;
838
839 if (full_pkt && tailroom)
840 print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET,
841 16, 1, skb_tail_pointer(skb), tailroom, false);
842
843 for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) {
844 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
845 u32 p_off, p_len, copied;
846 struct page *p;
847 u8 *vaddr;
848
Jonathan Lemonb54c9d52019-07-30 07:40:33 -0700849 skb_frag_foreach_page(frag, skb_frag_off(frag),
Willem de Bruijn64131392019-07-07 05:51:55 -0400850 skb_frag_size(frag), p, p_off, p_len,
851 copied) {
852 seg_len = min_t(int, p_len, len);
853 vaddr = kmap_atomic(p);
854 print_hex_dump(level, "skb frag: ",
855 DUMP_PREFIX_OFFSET,
856 16, 1, vaddr + p_off, seg_len, false);
857 kunmap_atomic(vaddr);
858 len -= seg_len;
859 if (!len)
860 break;
861 }
862 }
863
864 if (full_pkt && skb_has_frag_list(skb)) {
865 printk("skb fraglist:\n");
866 skb_walk_frags(skb, list_skb)
867 skb_dump(level, list_skb, true);
868 }
869}
870EXPORT_SYMBOL(skb_dump);
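/*
 * Usage sketch (illustrative only): dumping a suspicious packet from a
 * rate-limited error path, headers only.
 *
 *	if (net_ratelimit())
 *		skb_dump(KERN_WARNING, skb, false);
 */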
871
Stephen Hemmingerd1a203e2008-11-01 21:01:09 -0700872/**
Michael S. Tsirkin25121172012-11-01 09:16:28 +0000873 * skb_tx_error - report an sk_buff xmit error
874 * @skb: buffer that triggered an error
875 *
876 * Report xmit error if a device callback is tracking this skb.
877 * skb must be freed afterwards.
878 */
879void skb_tx_error(struct sk_buff *skb)
880{
Willem de Bruijn1f8b9772017-08-03 16:29:41 -0400881 skb_zcopy_clear(skb, true);
Michael S. Tsirkin25121172012-11-01 09:16:28 +0000882}
883EXPORT_SYMBOL(skb_tx_error);
884
Herbert Xube769db2020-08-22 08:23:29 +1000885#ifdef CONFIG_TRACEPOINTS
Michael S. Tsirkin25121172012-11-01 09:16:28 +0000886/**
Neil Hormanead2ceb2009-03-11 09:49:55 +0000887 * consume_skb - free an skbuff
888 * @skb: buffer to free
889 *
890 * Drop a ref to the buffer and free it if the usage count has hit zero.
891 * Functions identically to kfree_skb(), but kfree_skb() assumes that the
892 * frame is being dropped after a failure and records that in its tracepoint.
893 */
894void consume_skb(struct sk_buff *skb)
895{
Paolo Abeni3889a8032017-06-12 11:23:41 +0200896 if (!skb_unref(skb))
Neil Hormanead2ceb2009-03-11 09:49:55 +0000897 return;
Paolo Abeni3889a8032017-06-12 11:23:41 +0200898
Koki Sanagi07dc22e2010-08-23 18:46:12 +0900899 trace_consume_skb(skb);
Neil Hormanead2ceb2009-03-11 09:49:55 +0000900 __kfree_skb(skb);
901}
902EXPORT_SYMBOL(consume_skb);
Herbert Xube769db2020-08-22 08:23:29 +1000903#endif
Neil Hormanead2ceb2009-03-11 09:49:55 +0000904
Paolo Abeni0a463c72017-06-12 11:23:42 +0200905/**
Mauro Carvalho Chehabc1639be2020-11-16 11:17:58 +0100906 * __consume_stateless_skb - free an skbuff, assuming it is stateless
Paolo Abeni0a463c72017-06-12 11:23:42 +0200907 * @skb: buffer to free
908 *
Paolo Abenica2c1412017-09-06 14:44:36 +0200909 * Like consume_skb(), but this variant assumes that this is the last
910 * skb reference and all the head states have already been dropped.
Paolo Abeni0a463c72017-06-12 11:23:42 +0200911 */
Paolo Abenica2c1412017-09-06 14:44:36 +0200912void __consume_stateless_skb(struct sk_buff *skb)
Paolo Abeni0a463c72017-06-12 11:23:42 +0200913{
Paolo Abeni0a463c72017-06-12 11:23:42 +0200914 trace_consume_skb(skb);
Florian Westphal06dc75a2017-07-17 18:56:54 +0200915 skb_release_data(skb);
Paolo Abeni0a463c72017-06-12 11:23:42 +0200916 kfree_skbmem(skb);
917}
918
Alexander Lobakinf450d532021-02-13 14:12:25 +0000919static void napi_skb_cache_put(struct sk_buff *skb)
Jesper Dangaard Brouer795bb1c2016-02-08 13:14:59 +0100920{
921 struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
Alexander Lobakinf450d532021-02-13 14:12:25 +0000922 u32 i;
Jesper Dangaard Brouer795bb1c2016-02-08 13:14:59 +0100923
Alexander Lobakinf450d532021-02-13 14:12:25 +0000924 kasan_poison_object_data(skbuff_head_cache, skb);
Jesper Dangaard Brouer795bb1c2016-02-08 13:14:59 +0100925 nc->skb_cache[nc->skb_count++] = skb;
926
Jesper Dangaard Brouer795bb1c2016-02-08 13:14:59 +0100927 if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
Alexander Lobakinf450d532021-02-13 14:12:25 +0000928 for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++)
929 kasan_unpoison_object_data(skbuff_head_cache,
930 nc->skb_cache[i]);
931
932 kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_HALF,
933 nc->skb_cache + NAPI_SKB_CACHE_HALF);
934 nc->skb_count = NAPI_SKB_CACHE_HALF;
Jesper Dangaard Brouer795bb1c2016-02-08 13:14:59 +0100935 }
936}
Alexander Lobakinf450d532021-02-13 14:12:25 +0000937
Jesper Dangaard Brouer15fad712016-02-08 13:15:04 +0100938void __kfree_skb_defer(struct sk_buff *skb)
939{
Alexander Lobakin9243adf2021-02-13 14:13:09 +0000940 skb_release_all(skb);
941 napi_skb_cache_put(skb);
942}
943
944void napi_skb_free_stolen_head(struct sk_buff *skb)
945{
946 skb_dst_drop(skb);
947 skb_ext_put(skb);
Alexander Lobakinf450d532021-02-13 14:12:25 +0000948 napi_skb_cache_put(skb);
Jesper Dangaard Brouer15fad712016-02-08 13:15:04 +0100949}
Jesper Dangaard Brouer795bb1c2016-02-08 13:14:59 +0100950
951void napi_consume_skb(struct sk_buff *skb, int budget)
952{
Jesper Dangaard Brouer885eb0a2016-03-11 09:43:58 +0100953	/* Zero budget indicates a non-NAPI context called us, like netpoll */
Jesper Dangaard Brouer795bb1c2016-02-08 13:14:59 +0100954 if (unlikely(!budget)) {
Jesper Dangaard Brouer885eb0a2016-03-11 09:43:58 +0100955 dev_consume_skb_any(skb);
Jesper Dangaard Brouer795bb1c2016-02-08 13:14:59 +0100956 return;
957 }
958
Yunsheng Lin6454eca2020-11-24 18:49:29 +0800959 lockdep_assert_in_softirq();
960
Paolo Abeni76088942017-06-14 11:48:48 +0200961 if (!skb_unref(skb))
Jesper Dangaard Brouer795bb1c2016-02-08 13:14:59 +0100962 return;
Paolo Abeni76088942017-06-14 11:48:48 +0200963
Jesper Dangaard Brouer795bb1c2016-02-08 13:14:59 +0100964 /* if reaching here SKB is ready to free */
965 trace_consume_skb(skb);
966
967 /* if SKB is a clone, don't handle this case */
Eric Dumazetabbdb5a2016-03-20 11:27:47 -0700968 if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
Jesper Dangaard Brouer795bb1c2016-02-08 13:14:59 +0100969 __kfree_skb(skb);
970 return;
971 }
972
Alexander Lobakin9243adf2021-02-13 14:13:09 +0000973 skb_release_all(skb);
Alexander Lobakinf450d532021-02-13 14:12:25 +0000974 napi_skb_cache_put(skb);
Jesper Dangaard Brouer795bb1c2016-02-08 13:14:59 +0100975}
976EXPORT_SYMBOL(napi_consume_skb);
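/*
 * Usage sketch (illustrative only): a TX completion loop inside a driver's
 * NAPI poll handler passes its budget through, so the zero-budget netpoll
 * case above is handled here for every driver. The tx_ring layout is
 * hypothetical.
 *
 *	while (tx_ring->next_to_clean != tx_ring->done) {
 *		struct sk_buff *skb = tx_ring->bufs[tx_ring->next_to_clean].skb;
 *
 *		napi_consume_skb(skb, budget);
 *		tx_ring->next_to_clean++;
 *	}
 */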
977
Eric Dumazetb1937222014-09-28 22:18:47 -0700978/* Make sure a field is enclosed inside headers_start/headers_end section */
979#define CHECK_SKB_FIELD(field) \
980 BUILD_BUG_ON(offsetof(struct sk_buff, field) < \
981 offsetof(struct sk_buff, headers_start)); \
982 BUILD_BUG_ON(offsetof(struct sk_buff, field) > \
983 offsetof(struct sk_buff, headers_end)); \
984
Herbert Xudec18812007-10-14 00:37:30 -0700985static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
986{
987 new->tstamp = old->tstamp;
Eric Dumazetb1937222014-09-28 22:18:47 -0700988 /* We do not copy old->sk */
Herbert Xudec18812007-10-14 00:37:30 -0700989 new->dev = old->dev;
Eric Dumazetb1937222014-09-28 22:18:47 -0700990 memcpy(new->cb, old->cb, sizeof(old->cb));
Eric Dumazet7fee2262010-05-11 23:19:48 +0000991 skb_dst_copy(new, old);
Florian Westphaldf5042f2018-12-18 17:15:16 +0100992 __skb_ext_copy(new, old);
Eric Dumazetb1937222014-09-28 22:18:47 -0700993 __nf_copy(new, old, false);
Patrick McHardy6aa895b2008-07-14 22:49:06 -0700994
Eric Dumazetb1937222014-09-28 22:18:47 -0700995 /* Note : this field could be in headers_start/headers_end section
996 * It is not yet because we do not want to have a 16 bit hole
997 */
998 new->queue_mapping = old->queue_mapping;
Eliezer Tamir06021292013-06-10 11:39:50 +0300999
Eric Dumazetb1937222014-09-28 22:18:47 -07001000 memcpy(&new->headers_start, &old->headers_start,
1001 offsetof(struct sk_buff, headers_end) -
1002 offsetof(struct sk_buff, headers_start));
1003 CHECK_SKB_FIELD(protocol);
1004 CHECK_SKB_FIELD(csum);
1005 CHECK_SKB_FIELD(hash);
1006 CHECK_SKB_FIELD(priority);
1007 CHECK_SKB_FIELD(skb_iif);
1008 CHECK_SKB_FIELD(vlan_proto);
1009 CHECK_SKB_FIELD(vlan_tci);
1010 CHECK_SKB_FIELD(transport_header);
1011 CHECK_SKB_FIELD(network_header);
1012 CHECK_SKB_FIELD(mac_header);
1013 CHECK_SKB_FIELD(inner_protocol);
1014 CHECK_SKB_FIELD(inner_transport_header);
1015 CHECK_SKB_FIELD(inner_network_header);
1016 CHECK_SKB_FIELD(inner_mac_header);
1017 CHECK_SKB_FIELD(mark);
1018#ifdef CONFIG_NETWORK_SECMARK
1019 CHECK_SKB_FIELD(secmark);
1020#endif
Cong Wange0d10952013-08-01 11:10:25 +08001021#ifdef CONFIG_NET_RX_BUSY_POLL
Eric Dumazetb1937222014-09-28 22:18:47 -07001022 CHECK_SKB_FIELD(napi_id);
Eliezer Tamir06021292013-06-10 11:39:50 +03001023#endif
Eric Dumazet2bd82482015-02-03 23:48:24 -08001024#ifdef CONFIG_XPS
1025 CHECK_SKB_FIELD(sender_cpu);
1026#endif
Eric Dumazetb1937222014-09-28 22:18:47 -07001027#ifdef CONFIG_NET_SCHED
1028 CHECK_SKB_FIELD(tc_index);
Eric Dumazetb1937222014-09-28 22:18:47 -07001029#endif
1030
Herbert Xudec18812007-10-14 00:37:30 -07001031}
1032
Herbert Xu82c49a32009-05-22 22:11:37 +00001033/*
1034 * You should not add any new code to this function. Add it to
1035 * __copy_skb_header above instead.
1036 */
Herbert Xue0053ec2007-10-14 00:37:52 -07001037static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001038{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001039#define C(x) n->x = skb->x
1040
1041 n->next = n->prev = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001042 n->sk = NULL;
Herbert Xudec18812007-10-14 00:37:30 -07001043 __copy_skb_header(n, skb);
1044
Linus Torvalds1da177e2005-04-16 15:20:36 -07001045 C(len);
1046 C(data_len);
Alexey Dobriyan3e6b3b22007-03-16 15:00:46 -07001047 C(mac_len);
Patrick McHardy334a8132007-06-25 04:35:20 -07001048 n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
Paul Moore02f1c892008-01-07 21:56:41 -08001049 n->cloned = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001050 n->nohdr = 0;
Eric Dumazetb13dda92018-04-07 13:42:39 -07001051 n->peeked = 0;
Stefano Brivioe78bfb02018-07-13 13:21:07 +02001052 C(pfmemalloc);
Ilias Apalodimas6a5bcd82021-06-07 21:02:38 +02001053 C(pp_recycle);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001054 n->destructor = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001055 C(tail);
1056 C(end);
Paul Moore02f1c892008-01-07 21:56:41 -08001057 C(head);
Eric Dumazetd3836f22012-04-27 00:33:38 +00001058 C(head_frag);
Paul Moore02f1c892008-01-07 21:56:41 -08001059 C(data);
1060 C(truesize);
Reshetova, Elena63354792017-06-30 13:07:58 +03001061 refcount_set(&n->users, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001062
1063 atomic_inc(&(skb_shinfo(skb)->dataref));
1064 skb->cloned = 1;
1065
1066 return n;
Herbert Xue0053ec2007-10-14 00:37:52 -07001067#undef C
1068}
1069
1070/**
Jakub Kicinskida29e4b2019-06-03 15:16:58 -07001071 * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg
1072 * @first: first sk_buff of the msg
1073 */
1074struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)
1075{
1076 struct sk_buff *n;
1077
1078 n = alloc_skb(0, GFP_ATOMIC);
1079 if (!n)
1080 return NULL;
1081
1082 n->len = first->len;
1083 n->data_len = first->len;
1084 n->truesize = first->truesize;
1085
1086 skb_shinfo(n)->frag_list = first;
1087
1088 __copy_skb_header(n, first);
1089 n->destructor = NULL;
1090
1091 return n;
1092}
1093EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
1094
1095/**
Herbert Xue0053ec2007-10-14 00:37:52 -07001096 * skb_morph - morph one skb into another
1097 * @dst: the skb to receive the contents
1098 * @src: the skb to supply the contents
1099 *
1100 * This is identical to skb_clone except that the target skb is
1101 * supplied by the user.
1102 *
1103 * The target skb is returned upon exit.
1104 */
1105struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
1106{
Herbert Xu2d4baff2007-11-26 23:11:19 +08001107 skb_release_all(dst);
Herbert Xue0053ec2007-10-14 00:37:52 -07001108 return __skb_clone(dst, src);
1109}
1110EXPORT_SYMBOL_GPL(skb_morph);
1111
Sowmini Varadhan6f89dbc2018-02-15 10:49:32 -08001112int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
Willem de Bruijna91dbff2017-08-03 16:29:43 -04001113{
1114 unsigned long max_pg, num_pg, new_pg, old_pg;
1115 struct user_struct *user;
1116
1117 if (capable(CAP_IPC_LOCK) || !size)
1118 return 0;
1119
1120 num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */
1121 max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1122 user = mmp->user ? : current_user();
1123
1124 do {
1125 old_pg = atomic_long_read(&user->locked_vm);
1126 new_pg = old_pg + num_pg;
1127 if (new_pg > max_pg)
1128 return -ENOBUFS;
1129 } while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) !=
1130 old_pg);
1131
1132 if (!mmp->user) {
1133 mmp->user = get_uid(user);
1134 mmp->num_pg = num_pg;
1135 } else {
1136 mmp->num_pg += num_pg;
1137 }
1138
1139 return 0;
1140}
Sowmini Varadhan6f89dbc2018-02-15 10:49:32 -08001141EXPORT_SYMBOL_GPL(mm_account_pinned_pages);
Willem de Bruijna91dbff2017-08-03 16:29:43 -04001142
Sowmini Varadhan6f89dbc2018-02-15 10:49:32 -08001143void mm_unaccount_pinned_pages(struct mmpin *mmp)
Willem de Bruijna91dbff2017-08-03 16:29:43 -04001144{
1145 if (mmp->user) {
1146 atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
1147 free_uid(mmp->user);
1148 }
1149}
Sowmini Varadhan6f89dbc2018-02-15 10:49:32 -08001150EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
Willem de Bruijna91dbff2017-08-03 16:29:43 -04001151
Jonathan Lemon8c793822021-01-06 14:18:37 -08001152struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
Willem de Bruijn52267792017-08-03 16:29:39 -04001153{
1154 struct ubuf_info *uarg;
1155 struct sk_buff *skb;
1156
1157 WARN_ON_ONCE(!in_task());
1158
1159 skb = sock_omalloc(sk, 0, GFP_KERNEL);
1160 if (!skb)
1161 return NULL;
1162
1163 BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb));
1164 uarg = (void *)skb->cb;
Willem de Bruijna91dbff2017-08-03 16:29:43 -04001165 uarg->mmp.user = NULL;
1166
1167 if (mm_account_pinned_pages(&uarg->mmp, size)) {
1168 kfree_skb(skb);
1169 return NULL;
1170 }
Willem de Bruijn52267792017-08-03 16:29:39 -04001171
Jonathan Lemon8c793822021-01-06 14:18:37 -08001172 uarg->callback = msg_zerocopy_callback;
Willem de Bruijn4ab6c992017-08-03 16:29:42 -04001173 uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
1174 uarg->len = 1;
1175 uarg->bytelen = size;
Willem de Bruijn52267792017-08-03 16:29:39 -04001176 uarg->zerocopy = 1;
Jonathan Lemon04c2d332021-01-06 14:18:39 -08001177 uarg->flags = SKBFL_ZEROCOPY_FRAG;
Eric Dumazetc1d1b432017-08-31 16:48:22 -07001178 refcount_set(&uarg->refcnt, 1);
Willem de Bruijn52267792017-08-03 16:29:39 -04001179 sock_hold(sk);
1180
1181 return uarg;
1182}
Jonathan Lemon8c793822021-01-06 14:18:37 -08001183EXPORT_SYMBOL_GPL(msg_zerocopy_alloc);
Willem de Bruijn52267792017-08-03 16:29:39 -04001184
1185static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
1186{
1187 return container_of((void *)uarg, struct sk_buff, cb);
1188}
1189
Jonathan Lemon8c793822021-01-06 14:18:37 -08001190struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
1191 struct ubuf_info *uarg)
Willem de Bruijn4ab6c992017-08-03 16:29:42 -04001192{
1193 if (uarg) {
1194 const u32 byte_limit = 1 << 19; /* limit to a few TSO */
1195 u32 bytelen, next;
1196
1197 /* realloc only when socket is locked (TCP, UDP cork),
1198 * so uarg->len and sk_zckey access is serialized
1199 */
1200 if (!sock_owned_by_user(sk)) {
1201 WARN_ON_ONCE(1);
1202 return NULL;
1203 }
1204
1205 bytelen = uarg->bytelen + size;
1206 if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) {
1207 /* TCP can create new skb to attach new uarg */
1208 if (sk->sk_type == SOCK_STREAM)
1209 goto new_alloc;
1210 return NULL;
1211 }
1212
1213 next = (u32)atomic_read(&sk->sk_zckey);
1214 if ((u32)(uarg->id + uarg->len) == next) {
Willem de Bruijna91dbff2017-08-03 16:29:43 -04001215 if (mm_account_pinned_pages(&uarg->mmp, size))
1216 return NULL;
Willem de Bruijn4ab6c992017-08-03 16:29:42 -04001217 uarg->len++;
1218 uarg->bytelen = bytelen;
1219 atomic_set(&sk->sk_zckey, ++next);
Willem de Bruijn100f6d82019-05-30 18:01:21 -04001220
1221 /* no extra ref when appending to datagram (MSG_MORE) */
1222 if (sk->sk_type == SOCK_STREAM)
Jonathan Lemon8e044912021-01-06 14:18:41 -08001223 net_zcopy_get(uarg);
Willem de Bruijn100f6d82019-05-30 18:01:21 -04001224
Willem de Bruijn4ab6c992017-08-03 16:29:42 -04001225 return uarg;
1226 }
1227 }
1228
1229new_alloc:
Jonathan Lemon8c793822021-01-06 14:18:37 -08001230 return msg_zerocopy_alloc(sk, size);
Willem de Bruijn4ab6c992017-08-03 16:29:42 -04001231}
Jonathan Lemon8c793822021-01-06 14:18:37 -08001232EXPORT_SYMBOL_GPL(msg_zerocopy_realloc);
Willem de Bruijn4ab6c992017-08-03 16:29:42 -04001233
1234static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
1235{
1236 struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
1237 u32 old_lo, old_hi;
1238 u64 sum_len;
1239
1240 old_lo = serr->ee.ee_info;
1241 old_hi = serr->ee.ee_data;
1242 sum_len = old_hi - old_lo + 1ULL + len;
1243
1244 if (sum_len >= (1ULL << 32))
1245 return false;
1246
1247 if (lo != old_hi + 1)
1248 return false;
1249
1250 serr->ee.ee_data += len;
1251 return true;
1252}
1253
Jonathan Lemon8c793822021-01-06 14:18:37 -08001254static void __msg_zerocopy_callback(struct ubuf_info *uarg)
Willem de Bruijn52267792017-08-03 16:29:39 -04001255{
Willem de Bruijn4ab6c992017-08-03 16:29:42 -04001256 struct sk_buff *tail, *skb = skb_from_uarg(uarg);
Willem de Bruijn52267792017-08-03 16:29:39 -04001257 struct sock_exterr_skb *serr;
1258 struct sock *sk = skb->sk;
Willem de Bruijn4ab6c992017-08-03 16:29:42 -04001259 struct sk_buff_head *q;
1260 unsigned long flags;
Willem de Bruijn3bdd5ee2021-06-09 18:41:57 -04001261 bool is_zerocopy;
Willem de Bruijn4ab6c992017-08-03 16:29:42 -04001262 u32 lo, hi;
1263 u16 len;
Willem de Bruijn52267792017-08-03 16:29:39 -04001264
Willem de Bruijnccaffff2017-08-09 19:09:43 -04001265 mm_unaccount_pinned_pages(&uarg->mmp);
1266
Willem de Bruijn4ab6c992017-08-03 16:29:42 -04001267 /* if !len, there was only 1 call, and it was aborted
1268 * so do not queue a completion notification
1269 */
1270 if (!uarg->len || sock_flag(sk, SOCK_DEAD))
Willem de Bruijn52267792017-08-03 16:29:39 -04001271 goto release;
1272
Willem de Bruijn4ab6c992017-08-03 16:29:42 -04001273 len = uarg->len;
1274 lo = uarg->id;
1275 hi = uarg->id + len - 1;
Willem de Bruijn3bdd5ee2021-06-09 18:41:57 -04001276 is_zerocopy = uarg->zerocopy;
Willem de Bruijn4ab6c992017-08-03 16:29:42 -04001277
Willem de Bruijn52267792017-08-03 16:29:39 -04001278 serr = SKB_EXT_ERR(skb);
1279 memset(serr, 0, sizeof(*serr));
1280 serr->ee.ee_errno = 0;
1281 serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
Willem de Bruijn4ab6c992017-08-03 16:29:42 -04001282 serr->ee.ee_data = hi;
1283 serr->ee.ee_info = lo;
Willem de Bruijn3bdd5ee2021-06-09 18:41:57 -04001284 if (!is_zerocopy)
Willem de Bruijn52267792017-08-03 16:29:39 -04001285 serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;
1286
Willem de Bruijn4ab6c992017-08-03 16:29:42 -04001287 q = &sk->sk_error_queue;
1288 spin_lock_irqsave(&q->lock, flags);
1289 tail = skb_peek_tail(q);
1290 if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
1291 !skb_zerocopy_notify_extend(tail, lo, len)) {
1292 __skb_queue_tail(q, skb);
1293 skb = NULL;
1294 }
1295 spin_unlock_irqrestore(&q->lock, flags);
Willem de Bruijn52267792017-08-03 16:29:39 -04001296
Alexander Aringe3ae2362021-06-27 18:48:21 -04001297 sk_error_report(sk);
Willem de Bruijn52267792017-08-03 16:29:39 -04001298
1299release:
1300 consume_skb(skb);
1301 sock_put(sk);
1302}
Jonathan Lemon75518852021-01-06 14:18:31 -08001303
Jonathan Lemon8c793822021-01-06 14:18:37 -08001304void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
1305 bool success)
Jonathan Lemon75518852021-01-06 14:18:31 -08001306{
1307 uarg->zerocopy = uarg->zerocopy & success;
1308
1309 if (refcount_dec_and_test(&uarg->refcnt))
Jonathan Lemon8c793822021-01-06 14:18:37 -08001310 __msg_zerocopy_callback(uarg);
Jonathan Lemon75518852021-01-06 14:18:31 -08001311}
Jonathan Lemon8c793822021-01-06 14:18:37 -08001312EXPORT_SYMBOL_GPL(msg_zerocopy_callback);
Willem de Bruijn52267792017-08-03 16:29:39 -04001313
Jonathan Lemon8c793822021-01-06 14:18:37 -08001314void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
Willem de Bruijn52267792017-08-03 16:29:39 -04001315{
Jonathan Lemon236a6b12021-01-06 14:18:35 -08001316 struct sock *sk = skb_from_uarg(uarg)->sk;
Willem de Bruijn52267792017-08-03 16:29:39 -04001317
Jonathan Lemon236a6b12021-01-06 14:18:35 -08001318 atomic_dec(&sk->sk_zckey);
1319 uarg->len--;
Willem de Bruijn52267792017-08-03 16:29:39 -04001320
Jonathan Lemon236a6b12021-01-06 14:18:35 -08001321 if (have_uref)
Jonathan Lemon8c793822021-01-06 14:18:37 -08001322 msg_zerocopy_callback(NULL, uarg, true);
Willem de Bruijn52267792017-08-03 16:29:39 -04001323}
Jonathan Lemon8c793822021-01-06 14:18:37 -08001324EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort);
Willem de Bruijn52267792017-08-03 16:29:39 -04001325
Willem de Bruijnb5947e52018-11-30 15:32:39 -05001326int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len)
1327{
1328 return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
1329}
1330EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram);
1331
Willem de Bruijn52267792017-08-03 16:29:39 -04001332int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
1333 struct msghdr *msg, int len,
1334 struct ubuf_info *uarg)
1335{
Willem de Bruijn4ab6c992017-08-03 16:29:42 -04001336 struct ubuf_info *orig_uarg = skb_zcopy(skb);
Willem de Bruijn52267792017-08-03 16:29:39 -04001337 struct iov_iter orig_iter = msg->msg_iter;
1338 int err, orig_len = skb->len;
1339
Willem de Bruijn4ab6c992017-08-03 16:29:42 -04001340 /* An skb can only point to one uarg. This edge case happens when
1341 * TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
1342 */
1343 if (orig_uarg && uarg != orig_uarg)
1344 return -EEXIST;
1345
Willem de Bruijn52267792017-08-03 16:29:39 -04001346 err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
1347 if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
Willem de Bruijn54d431172017-10-19 12:40:39 -04001348 struct sock *save_sk = skb->sk;
1349
Willem de Bruijn52267792017-08-03 16:29:39 -04001350 /* Streams do not free skb on error. Reset to prev state. */
1351 msg->msg_iter = orig_iter;
Willem de Bruijn54d431172017-10-19 12:40:39 -04001352 skb->sk = sk;
Willem de Bruijn52267792017-08-03 16:29:39 -04001353 ___pskb_trim(skb, orig_len);
Willem de Bruijn54d431172017-10-19 12:40:39 -04001354 skb->sk = save_sk;
Willem de Bruijn52267792017-08-03 16:29:39 -04001355 return err;
1356 }
1357
Willem de Bruijn52900d22018-11-30 15:32:40 -05001358 skb_zcopy_set(skb, uarg, NULL);
Willem de Bruijn52267792017-08-03 16:29:39 -04001359 return skb->len - orig_len;
1360}
1361EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
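
/*
 * Illustrative sketch: how a stream protocol's sendmsg path might use
 * skb_zerocopy_iter_stream().  Socket locking, skb allocation and the
 * ubuf_info setup (normally obtained via msg_zerocopy_realloc()) are
 * elided; the helper name is hypothetical.
 */
static int example_zerocopy_append(struct sock *sk, struct sk_buff *skb,
				   struct msghdr *msg, int copy,
				   struct ubuf_info *uarg)
{
	int copied;

	/* Returns the number of bytes appended, or a negative error such
	 * as -EEXIST (skb already tied to another uarg) or -EFAULT.  On
	 * error the skb is restored to its previous length, so the caller
	 * can fall back to a plain copy.
	 */
	copied = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
	return copied;
}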
1362
Willem de Bruijn1f8b9772017-08-03 16:29:41 -04001363static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
Willem de Bruijn52267792017-08-03 16:29:39 -04001364 gfp_t gfp_mask)
1365{
1366 if (skb_zcopy(orig)) {
1367 if (skb_zcopy(nskb)) {
1368 /* !gfp_mask callers are verified to !skb_zcopy(nskb) */
1369 if (!gfp_mask) {
1370 WARN_ON_ONCE(1);
1371 return -ENOMEM;
1372 }
1373 if (skb_uarg(nskb) == skb_uarg(orig))
1374 return 0;
1375 if (skb_copy_ubufs(nskb, GFP_ATOMIC))
1376 return -EIO;
1377 }
Willem de Bruijn52900d22018-11-30 15:32:40 -05001378 skb_zcopy_set(nskb, skb_uarg(orig), NULL);
Willem de Bruijn52267792017-08-03 16:29:39 -04001379 }
1380 return 0;
1381}
1382
Ben Hutchings2c530402012-07-10 10:55:09 +00001383/**
1384 * skb_copy_ubufs - copy userspace skb frags buffers to kernel
Michael S. Tsirkin48c83012011-08-31 08:03:29 +00001385 * @skb: the skb to modify
1386 * @gfp_mask: allocation priority
1387 *
Jonathan Lemon06b4feb2021-01-06 14:18:38 -08001388 * This must be called on an skb with SKBFL_ZEROCOPY_ENABLE set.
Michael S. Tsirkin48c83012011-08-31 08:03:29 +00001389 * It will copy all frags into kernel memory and drop the reference
1390 * to the userspace pages.
1391 *
1392 * If this function is called from an interrupt, @gfp_mask must be
1393 * %GFP_ATOMIC.
1394 *
1395 * Returns 0 on success or a negative error code on failure
1396 * to allocate kernel memory to copy to.
1397 */
1398int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
Shirley Maa6686f22011-07-06 12:22:12 +00001399{
Shirley Maa6686f22011-07-06 12:22:12 +00001400 int num_frags = skb_shinfo(skb)->nr_frags;
1401 struct page *page, *head = NULL;
Willem de Bruijn3ece7822017-08-03 16:29:38 -04001402 int i, new_frags;
1403 u32 d_off;
Shirley Maa6686f22011-07-06 12:22:12 +00001404
Willem de Bruijn3ece7822017-08-03 16:29:38 -04001405 if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
1406 return -EINVAL;
1407
Willem de Bruijnf72c4ac2017-12-28 12:38:13 -05001408 if (!num_frags)
1409 goto release;
1410
Willem de Bruijn3ece7822017-08-03 16:29:38 -04001411 new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT;
1412 for (i = 0; i < new_frags; i++) {
Krishna Kumar02756ed2012-07-17 02:05:29 +00001413 page = alloc_page(gfp_mask);
Shirley Maa6686f22011-07-06 12:22:12 +00001414 if (!page) {
1415 while (head) {
Sunghan Suh40dadff2013-07-12 16:17:23 +09001416 struct page *next = (struct page *)page_private(head);
Shirley Maa6686f22011-07-06 12:22:12 +00001417 put_page(head);
1418 head = next;
1419 }
1420 return -ENOMEM;
1421 }
Willem de Bruijn3ece7822017-08-03 16:29:38 -04001422 set_page_private(page, (unsigned long)head);
1423 head = page;
1424 }
1425
1426 page = head;
1427 d_off = 0;
1428 for (i = 0; i < num_frags; i++) {
1429 skb_frag_t *f = &skb_shinfo(skb)->frags[i];
1430 u32 p_off, p_len, copied;
1431 struct page *p;
1432 u8 *vaddr;
Willem de Bruijnc613c202017-07-31 08:15:47 -04001433
Jonathan Lemonb54c9d52019-07-30 07:40:33 -07001434 skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f),
Willem de Bruijnc613c202017-07-31 08:15:47 -04001435 p, p_off, p_len, copied) {
Willem de Bruijn3ece7822017-08-03 16:29:38 -04001436 u32 copy, done = 0;
Willem de Bruijnc613c202017-07-31 08:15:47 -04001437 vaddr = kmap_atomic(p);
Willem de Bruijn3ece7822017-08-03 16:29:38 -04001438
1439 while (done < p_len) {
1440 if (d_off == PAGE_SIZE) {
1441 d_off = 0;
1442 page = (struct page *)page_private(page);
1443 }
1444 copy = min_t(u32, PAGE_SIZE - d_off, p_len - done);
1445 memcpy(page_address(page) + d_off,
1446 vaddr + p_off + done, copy);
1447 done += copy;
1448 d_off += copy;
1449 }
Willem de Bruijnc613c202017-07-31 08:15:47 -04001450 kunmap_atomic(vaddr);
1451 }
Shirley Maa6686f22011-07-06 12:22:12 +00001452 }
1453
1454 /* skb frags release userspace buffers */
Krishna Kumar02756ed2012-07-17 02:05:29 +00001455 for (i = 0; i < num_frags; i++)
Ian Campbella8605c62011-10-19 23:01:49 +00001456 skb_frag_unref(skb, i);
Shirley Maa6686f22011-07-06 12:22:12 +00001457
Shirley Maa6686f22011-07-06 12:22:12 +00001458 /* skb frags point to kernel buffers */
Willem de Bruijn3ece7822017-08-03 16:29:38 -04001459 for (i = 0; i < new_frags - 1; i++) {
1460 __skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE);
Sunghan Suh40dadff2013-07-12 16:17:23 +09001461 head = (struct page *)page_private(head);
Shirley Maa6686f22011-07-06 12:22:12 +00001462 }
Willem de Bruijn3ece7822017-08-03 16:29:38 -04001463 __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off);
1464 skb_shinfo(skb)->nr_frags = new_frags;
Michael S. Tsirkin48c83012011-08-31 08:03:29 +00001465
Willem de Bruijnb90ddd52017-12-20 17:37:50 -05001466release:
Willem de Bruijn1f8b9772017-08-03 16:29:41 -04001467 skb_zcopy_clear(skb, false);
Shirley Maa6686f22011-07-06 12:22:12 +00001468 return 0;
1469}
Michael S. Tsirkindcc0fb72012-07-20 09:23:20 +00001470EXPORT_SYMBOL_GPL(skb_copy_ubufs);
Shirley Maa6686f22011-07-06 12:22:12 +00001471
Herbert Xue0053ec2007-10-14 00:37:52 -07001472/**
1473 * skb_clone - duplicate an sk_buff
1474 * @skb: buffer to clone
1475 * @gfp_mask: allocation priority
1476 *
1477 * Duplicate an &sk_buff. The new one is not owned by a socket. Both
1478 * copies share the same packet data but not structure. The new
1479 * buffer has a reference count of 1. If the allocation fails, the
1480 * function returns %NULL; otherwise the new buffer is returned.
1481 *
1482 * If this function is called from an interrupt, @gfp_mask must be
1483 * %GFP_ATOMIC.
1484 */
1485
1486struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
1487{
Eric Dumazetd0bf4a92014-09-29 13:29:15 -07001488 struct sk_buff_fclones *fclones = container_of(skb,
1489 struct sk_buff_fclones,
1490 skb1);
Eric Dumazet6ffe75eb2014-12-03 17:04:39 -08001491 struct sk_buff *n;
Herbert Xue0053ec2007-10-14 00:37:52 -07001492
Michael S. Tsirkin70008aa2012-07-20 09:23:10 +00001493 if (skb_orphan_frags(skb, gfp_mask))
1494 return NULL;
Shirley Maa6686f22011-07-06 12:22:12 +00001495
Herbert Xue0053ec2007-10-14 00:37:52 -07001496 if (skb->fclone == SKB_FCLONE_ORIG &&
Reshetova, Elena26385952017-06-30 13:07:59 +03001497 refcount_read(&fclones->fclone_ref) == 1) {
Eric Dumazet6ffe75eb2014-12-03 17:04:39 -08001498 n = &fclones->skb2;
Reshetova, Elena26385952017-06-30 13:07:59 +03001499 refcount_set(&fclones->fclone_ref, 2);
Herbert Xue0053ec2007-10-14 00:37:52 -07001500 } else {
Mel Gormanc93bdd02012-07-31 16:44:19 -07001501 if (skb_pfmemalloc(skb))
1502 gfp_mask |= __GFP_MEMALLOC;
1503
Herbert Xue0053ec2007-10-14 00:37:52 -07001504 n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
1505 if (!n)
1506 return NULL;
Vegard Nossumfe55f6d2008-08-30 12:16:35 +02001507
Herbert Xue0053ec2007-10-14 00:37:52 -07001508 n->fclone = SKB_FCLONE_UNAVAILABLE;
1509 }
1510
1511 return __skb_clone(n, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001512}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08001513EXPORT_SYMBOL(skb_clone);
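
/*
 * Illustrative sketch: a caller that needs its own reference to a packet,
 * e.g. to mirror it to a tap, typically clones it.  The helper name is
 * hypothetical.
 */
static struct sk_buff *example_clone_for_tap(struct sk_buff *skb)
{
	struct sk_buff *clone;

	clone = skb_clone(skb, GFP_ATOMIC);
	if (!clone)
		return NULL;	/* allocation failed, original untouched */

	/* The clone shares packet data with skb; use skb_cow() or
	 * pskb_expand_head() before writing to it.
	 */
	return clone;
}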
Linus Torvalds1da177e2005-04-16 15:20:36 -07001514
Toshiaki Makitab0768a82018-08-03 16:58:09 +09001515void skb_headers_offset_update(struct sk_buff *skb, int off)
Pravin B Shelarf5b17292013-03-07 13:21:40 +00001516{
Eric Dumazet030737b2013-10-19 11:42:54 -07001517 /* Only adjust this if it actually is csum_start rather than csum */
1518 if (skb->ip_summed == CHECKSUM_PARTIAL)
1519 skb->csum_start += off;
Pravin B Shelarf5b17292013-03-07 13:21:40 +00001520 /* {transport,network,mac}_header and tail are relative to skb->head */
1521 skb->transport_header += off;
1522 skb->network_header += off;
1523 if (skb_mac_header_was_set(skb))
1524 skb->mac_header += off;
1525 skb->inner_transport_header += off;
1526 skb->inner_network_header += off;
Pravin B Shelaraefbd2b2013-03-07 13:21:46 +00001527 skb->inner_mac_header += off;
Pravin B Shelarf5b17292013-03-07 13:21:40 +00001528}
Toshiaki Makitab0768a82018-08-03 16:58:09 +09001529EXPORT_SYMBOL(skb_headers_offset_update);
Pravin B Shelarf5b17292013-03-07 13:21:40 +00001530
Ilya Lesokhin08303c12018-04-30 10:16:11 +03001531void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001532{
Herbert Xudec18812007-10-14 00:37:30 -07001533 __copy_skb_header(new, old);
1534
Herbert Xu79671682006-06-22 02:40:14 -07001535 skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
1536 skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
1537 skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001538}
Ilya Lesokhin08303c12018-04-30 10:16:11 +03001539EXPORT_SYMBOL(skb_copy_header);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001540
Mel Gormanc93bdd02012-07-31 16:44:19 -07001541static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
1542{
1543 if (skb_pfmemalloc(skb))
1544 return SKB_ALLOC_RX;
1545 return 0;
1546}
1547
Linus Torvalds1da177e2005-04-16 15:20:36 -07001548/**
1549 * skb_copy - create private copy of an sk_buff
1550 * @skb: buffer to copy
1551 * @gfp_mask: allocation priority
1552 *
1553 * Make a copy of both an &sk_buff and its data. This is used when the
1554 * caller wishes to modify the data and needs a private copy of the
1555 * data to alter. Returns %NULL on failure or the pointer to the buffer
1556 * on success. The returned buffer has a reference count of 1.
1557 *
1558 * As a by-product this function converts the non-linear &sk_buff to a
1559 * linear one, so that the &sk_buff becomes completely private and the caller
1560 * is allowed to modify all the data of the returned buffer. This means that
1561 * this function is not recommended for use when only the
1562 * header is going to be modified. Use pskb_copy() instead.
1563 */
1564
Al Virodd0fc662005-10-07 07:46:04 +01001565struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001566{
Eric Dumazet6602ceb2010-09-01 05:25:10 +00001567 int headerlen = skb_headroom(skb);
Alexander Duyckec47ea82012-05-04 14:26:56 +00001568 unsigned int size = skb_end_offset(skb) + skb->data_len;
Mel Gormanc93bdd02012-07-31 16:44:19 -07001569 struct sk_buff *n = __alloc_skb(size, gfp_mask,
1570 skb_alloc_rx_flag(skb), NUMA_NO_NODE);
Eric Dumazet6602ceb2010-09-01 05:25:10 +00001571
Linus Torvalds1da177e2005-04-16 15:20:36 -07001572 if (!n)
1573 return NULL;
1574
1575 /* Set the data pointer */
1576 skb_reserve(n, headerlen);
1577 /* Set the tail pointer and length */
1578 skb_put(n, skb->len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001579
Tim Hansen9f77fad2017-10-09 11:37:59 -04001580 BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001581
Ilya Lesokhin08303c12018-04-30 10:16:11 +03001582 skb_copy_header(n, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001583 return n;
1584}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08001585EXPORT_SYMBOL(skb_copy);
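
/*
 * Illustrative sketch: taking a fully private, linear copy so the payload
 * can be rewritten in place.  The helper name is hypothetical.
 */
static struct sk_buff *example_writable_copy(const struct sk_buff *skb)
{
	struct sk_buff *copy;

	copy = skb_copy(skb, GFP_ATOMIC);
	if (!copy)
		return NULL;

	/* the copy is linear and private: every byte may be modified */
	memset(copy->data, 0, copy->len);	/* e.g. scrub the packet */
	return copy;
}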
Linus Torvalds1da177e2005-04-16 15:20:36 -07001586
1587/**
Octavian Purdilabad93e92014-06-12 01:36:26 +03001588 * __pskb_copy_fclone - create copy of an sk_buff with private head.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001589 * @skb: buffer to copy
Eric Dumazet117632e2011-12-03 21:39:53 +00001590 * @headroom: headroom of new skb
Linus Torvalds1da177e2005-04-16 15:20:36 -07001591 * @gfp_mask: allocation priority
Octavian Purdilabad93e92014-06-12 01:36:26 +03001592 * @fclone: if true allocate the copy of the skb from the fclone
1593 * cache instead of the head cache; it is recommended to set this
1594 * to true for the cases where the copy will likely be cloned
Linus Torvalds1da177e2005-04-16 15:20:36 -07001595 *
1596 * Make a copy of both an &sk_buff and part of its data, located
1597 * in the header. Fragmented data remains shared. This is used when
1598 * the caller wishes to modify only the header of the &sk_buff and needs a
1599 * private copy of the header to alter. Returns %NULL on failure
1600 * or the pointer to the buffer on success.
1601 * The returned buffer has a reference count of 1.
1602 */
1603
Octavian Purdilabad93e92014-06-12 01:36:26 +03001604struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
1605 gfp_t gfp_mask, bool fclone)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001606{
Eric Dumazet117632e2011-12-03 21:39:53 +00001607 unsigned int size = skb_headlen(skb) + headroom;
Octavian Purdilabad93e92014-06-12 01:36:26 +03001608 int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0);
1609 struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE);
Eric Dumazet6602ceb2010-09-01 05:25:10 +00001610
Linus Torvalds1da177e2005-04-16 15:20:36 -07001611 if (!n)
1612 goto out;
1613
1614 /* Set the data pointer */
Eric Dumazet117632e2011-12-03 21:39:53 +00001615 skb_reserve(n, headroom);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001616 /* Set the tail pointer and length */
1617 skb_put(n, skb_headlen(skb));
1618 /* Copy the bytes */
Arnaldo Carvalho de Melod626f622007-03-27 18:55:52 -03001619 skb_copy_from_linear_data(skb, n->data, n->len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001620
Herbert Xu25f484a2006-11-07 14:57:15 -08001621 n->truesize += skb->data_len;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001622 n->data_len = skb->data_len;
1623 n->len = skb->len;
1624
1625 if (skb_shinfo(skb)->nr_frags) {
1626 int i;
1627
Willem de Bruijn1f8b9772017-08-03 16:29:41 -04001628 if (skb_orphan_frags(skb, gfp_mask) ||
1629 skb_zerocopy_clone(n, skb, gfp_mask)) {
Michael S. Tsirkin70008aa2012-07-20 09:23:10 +00001630 kfree_skb(n);
1631 n = NULL;
1632 goto out;
Shirley Maa6686f22011-07-06 12:22:12 +00001633 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001634 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1635 skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
Ian Campbellea2ab692011-08-22 23:44:58 +00001636 skb_frag_ref(skb, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001637 }
1638 skb_shinfo(n)->nr_frags = i;
1639 }
1640
David S. Miller21dc3302010-08-23 00:13:46 -07001641 if (skb_has_frag_list(skb)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001642 skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
1643 skb_clone_fraglist(n);
1644 }
1645
Ilya Lesokhin08303c12018-04-30 10:16:11 +03001646 skb_copy_header(n, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001647out:
1648 return n;
1649}
Octavian Purdilabad93e92014-06-12 01:36:26 +03001650EXPORT_SYMBOL(__pskb_copy_fclone);
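
/*
 * Illustrative sketch: when only headers need to be rewritten, pskb_copy()
 * is cheaper than skb_copy() because the paged data stays shared.  The
 * helper name is hypothetical.
 */
static struct sk_buff *example_private_headers(struct sk_buff *skb)
{
	struct sk_buff *copy;

	copy = pskb_copy(skb, GFP_ATOMIC);
	if (!copy)
		return NULL;

	/* The linear header area is now exclusively ours and may be
	 * edited, e.g. to rewrite addresses; the fragments must still be
	 * treated as read-only.
	 */
	return copy;
}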
Linus Torvalds1da177e2005-04-16 15:20:36 -07001651
1652/**
1653 * pskb_expand_head - reallocate header of &sk_buff
1654 * @skb: buffer to reallocate
1655 * @nhead: room to add at head
1656 * @ntail: room to add at tail
1657 * @gfp_mask: allocation priority
1658 *
Mathias Krausebc323832013-11-07 14:18:26 +01001659 * Expands (or creates an identical copy, if @nhead and @ntail are zero) the
1660 * header of @skb. The &sk_buff itself is not changed. The &sk_buff MUST have
Linus Torvalds1da177e2005-04-16 15:20:36 -07001661 * a reference count of 1. Returns zero on success, or a negative error code
1662 * if expansion failed. In the latter case, the &sk_buff is not changed.
1663 *
1664 * All the pointers pointing into skb header may change and must be
1665 * reloaded after call to this function.
1666 */
1667
Victor Fusco86a76ca2005-07-08 14:57:47 -07001668int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
Al Virodd0fc662005-10-07 07:46:04 +01001669 gfp_t gfp_mask)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001670{
Eric Dumazet158f3232017-01-27 07:11:27 -08001671 int i, osize = skb_end_offset(skb);
1672 int size = osize + nhead + ntail;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001673 long off;
Eric Dumazet158f3232017-01-27 07:11:27 -08001674 u8 *data;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001675
Herbert Xu4edd87a2008-10-01 07:09:38 -07001676 BUG_ON(nhead < 0);
1677
Tim Hansen9f77fad2017-10-09 11:37:59 -04001678 BUG_ON(skb_shared(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001679
1680 size = SKB_DATA_ALIGN(size);
1681
Mel Gormanc93bdd02012-07-31 16:44:19 -07001682 if (skb_pfmemalloc(skb))
1683 gfp_mask |= __GFP_MEMALLOC;
1684 data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
1685 gfp_mask, NUMA_NO_NODE, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001686 if (!data)
1687 goto nodata;
Eric Dumazet87151b82012-04-10 20:08:39 +00001688 size = SKB_WITH_OVERHEAD(ksize(data));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001689
1690 /* Copy only real data... and, alas, header. This should be
Eric Dumazet6602ceb2010-09-01 05:25:10 +00001691 * optimized for the cases when header is void.
1692 */
1693 memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
1694
1695 memcpy((struct skb_shared_info *)(data + size),
1696 skb_shinfo(skb),
Eric Dumazetfed66382010-07-22 19:09:08 +00001697 offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001698
Alexander Duyck3e245912012-05-04 14:26:51 +00001699 /*
1700 * if shinfo is shared we must drop the old head gracefully, but if it
1701 * is not we can just drop the old head and let the existing refcount
1702 * be since all we did is relocate the values
1703 */
1704 if (skb_cloned(skb)) {
Michael S. Tsirkin70008aa2012-07-20 09:23:10 +00001705 if (skb_orphan_frags(skb, gfp_mask))
1706 goto nofrags;
Willem de Bruijn1f8b9772017-08-03 16:29:41 -04001707 if (skb_zcopy(skb))
Eric Dumazetc1d1b432017-08-31 16:48:22 -07001708 refcount_inc(&skb_uarg(skb)->refcnt);
Eric Dumazet1fd63042010-09-02 23:09:32 +00001709 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
Ian Campbellea2ab692011-08-22 23:44:58 +00001710 skb_frag_ref(skb, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001711
Eric Dumazet1fd63042010-09-02 23:09:32 +00001712 if (skb_has_frag_list(skb))
1713 skb_clone_fraglist(skb);
1714
1715 skb_release_data(skb);
Alexander Duyck3e245912012-05-04 14:26:51 +00001716 } else {
1717 skb_free_head(skb);
Eric Dumazet1fd63042010-09-02 23:09:32 +00001718 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001719 off = (data + nhead) - skb->head;
1720
1721 skb->head = data;
Eric Dumazetd3836f22012-04-27 00:33:38 +00001722 skb->head_frag = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001723 skb->data += off;
Arnaldo Carvalho de Melo4305b542007-04-19 20:43:29 -07001724#ifdef NET_SKBUFF_DATA_USES_OFFSET
1725 skb->end = size;
Patrick McHardy56eb8882007-04-09 11:45:04 -07001726 off = nhead;
Arnaldo Carvalho de Melo4305b542007-04-19 20:43:29 -07001727#else
1728 skb->end = skb->head + size;
Patrick McHardy56eb8882007-04-09 11:45:04 -07001729#endif
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07001730 skb->tail += off;
Peter Pan(潘卫平)b41abb42013-06-06 21:27:21 +08001731 skb_headers_offset_update(skb, nhead);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001732 skb->cloned = 0;
Patrick McHardy334a8132007-06-25 04:35:20 -07001733 skb->hdr_len = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001734 skb->nohdr = 0;
1735 atomic_set(&skb_shinfo(skb)->dataref, 1);
Eric Dumazet158f3232017-01-27 07:11:27 -08001736
Daniel Borkmannde8f3a82017-09-25 02:25:51 +02001737 skb_metadata_clear(skb);
1738
Eric Dumazet158f3232017-01-27 07:11:27 -08001739 /* It is not generally safe to change skb->truesize.
1740 * For the moment, we really only care about the rx path, or
1741 * when skb is orphaned (not attached to a socket).
1742 */
1743 if (!skb->sk || skb->destructor == sock_edemux)
1744 skb->truesize += size - osize;
1745
Linus Torvalds1da177e2005-04-16 15:20:36 -07001746 return 0;
1747
Shirley Maa6686f22011-07-06 12:22:12 +00001748nofrags:
1749 kfree(data);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001750nodata:
1751 return -ENOMEM;
1752}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08001753EXPORT_SYMBOL(pskb_expand_head);
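
/*
 * Illustrative sketch: the usual copy-on-write pattern built on
 * pskb_expand_head(), equivalent in spirit to what skb_cow() does.  The
 * helper name is hypothetical.
 */
static int example_make_headroom(struct sk_buff *skb, unsigned int needed)
{
	unsigned int delta = 0;

	if (skb_headroom(skb) < needed)
		delta = needed - skb_headroom(skb);
	if (!delta && !skb_cloned(skb))
		return 0;	/* already writable with enough headroom */

	/* all pointers into the header may move; reload them afterwards */
	return pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC);
}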
Linus Torvalds1da177e2005-04-16 15:20:36 -07001754
1755/* Make private copy of skb with writable head and some headroom */
1756
1757struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
1758{
1759 struct sk_buff *skb2;
1760 int delta = headroom - skb_headroom(skb);
1761
1762 if (delta <= 0)
1763 skb2 = pskb_copy(skb, GFP_ATOMIC);
1764 else {
1765 skb2 = skb_clone(skb, GFP_ATOMIC);
1766 if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
1767 GFP_ATOMIC)) {
1768 kfree_skb(skb2);
1769 skb2 = NULL;
1770 }
1771 }
1772 return skb2;
1773}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08001774EXPORT_SYMBOL(skb_realloc_headroom);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001775
1776/**
1777 * skb_copy_expand - copy and expand sk_buff
1778 * @skb: buffer to copy
1779 * @newheadroom: new free bytes at head
1780 * @newtailroom: new free bytes at tail
1781 * @gfp_mask: allocation priority
1782 *
1783 * Make a copy of both an &sk_buff and its data and while doing so
1784 * allocate additional space.
1785 *
1786 * This is used when the caller wishes to modify the data and needs a
1787 * private copy of the data to alter as well as more space for new fields.
1788 * Returns %NULL on failure or the pointer to the buffer
1789 * on success. The returned buffer has a reference count of 1.
1790 *
1791 * You must pass %GFP_ATOMIC as the allocation priority if this function
1792 * is called from an interrupt.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001793 */
1794struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
Victor Fusco86a76ca2005-07-08 14:57:47 -07001795 int newheadroom, int newtailroom,
Al Virodd0fc662005-10-07 07:46:04 +01001796 gfp_t gfp_mask)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001797{
1798 /*
1799 * Allocate the copy buffer
1800 */
Mel Gormanc93bdd02012-07-31 16:44:19 -07001801 struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
1802 gfp_mask, skb_alloc_rx_flag(skb),
1803 NUMA_NO_NODE);
Patrick McHardyefd1e8d2007-04-10 18:30:09 -07001804 int oldheadroom = skb_headroom(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001805 int head_copy_len, head_copy_off;
1806
1807 if (!n)
1808 return NULL;
1809
1810 skb_reserve(n, newheadroom);
1811
1812 /* Set the tail pointer and length */
1813 skb_put(n, skb->len);
1814
Patrick McHardyefd1e8d2007-04-10 18:30:09 -07001815 head_copy_len = oldheadroom;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001816 head_copy_off = 0;
1817 if (newheadroom <= head_copy_len)
1818 head_copy_len = newheadroom;
1819 else
1820 head_copy_off = newheadroom - head_copy_len;
1821
1822 /* Copy the linear header and data. */
Tim Hansen9f77fad2017-10-09 11:37:59 -04001823 BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
1824 skb->len + head_copy_len));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001825
Ilya Lesokhin08303c12018-04-30 10:16:11 +03001826 skb_copy_header(n, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001827
Eric Dumazet030737b2013-10-19 11:42:54 -07001828 skb_headers_offset_update(n, newheadroom - oldheadroom);
Patrick McHardyefd1e8d2007-04-10 18:30:09 -07001829
Linus Torvalds1da177e2005-04-16 15:20:36 -07001830 return n;
1831}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08001832EXPORT_SYMBOL(skb_copy_expand);
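
/*
 * Illustrative sketch: copying a packet while reserving extra headroom for
 * a new outer header, e.g. before encapsulation.  The names are
 * hypothetical.
 */
static struct sk_buff *example_copy_for_encap(const struct sk_buff *skb,
					      unsigned int encap_len)
{
	/* private copy with room for encap_len more bytes at the front */
	return skb_copy_expand(skb, skb_headroom(skb) + encap_len,
			       skb_tailroom(skb), GFP_ATOMIC);
}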
Linus Torvalds1da177e2005-04-16 15:20:36 -07001833
1834/**
Florian Fainellicd0a1372017-08-22 15:12:14 -07001835 * __skb_pad - zero pad the tail of an skb
Linus Torvalds1da177e2005-04-16 15:20:36 -07001836 * @skb: buffer to pad
1837 * @pad: space to pad
Florian Fainellicd0a1372017-08-22 15:12:14 -07001838 * @free_on_error: free buffer on error
Linus Torvalds1da177e2005-04-16 15:20:36 -07001839 *
1840 * Ensure that a buffer is followed by a padding area that is zero
1841 * filled. Used by network drivers which may DMA or transfer data
1842 * beyond the buffer end onto the wire.
1843 *
Florian Fainellicd0a1372017-08-22 15:12:14 -07001844 * May return error in out of memory cases. The skb is freed on error
1845 * if @free_on_error is true.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001846 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001847
Florian Fainellicd0a1372017-08-22 15:12:14 -07001848int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001849{
Herbert Xu5b057c62006-06-23 02:06:41 -07001850 int err;
1851 int ntail;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001852
Linus Torvalds1da177e2005-04-16 15:20:36 -07001853 /* If the skbuff is non linear tailroom is always zero.. */
Herbert Xu5b057c62006-06-23 02:06:41 -07001854 if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001855 memset(skb->data+skb->len, 0, pad);
Herbert Xu5b057c62006-06-23 02:06:41 -07001856 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001857 }
Herbert Xu5b057c62006-06-23 02:06:41 -07001858
Arnaldo Carvalho de Melo4305b542007-04-19 20:43:29 -07001859 ntail = skb->data_len + pad - (skb->end - skb->tail);
Herbert Xu5b057c62006-06-23 02:06:41 -07001860 if (likely(skb_cloned(skb) || ntail > 0)) {
1861 err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
1862 if (unlikely(err))
1863 goto free_skb;
1864 }
1865
1866 /* FIXME: The use of this function with non-linear skb's really needs
1867 * to be audited.
1868 */
1869 err = skb_linearize(skb);
1870 if (unlikely(err))
1871 goto free_skb;
1872
1873 memset(skb->data + skb->len, 0, pad);
1874 return 0;
1875
1876free_skb:
Florian Fainellicd0a1372017-08-22 15:12:14 -07001877 if (free_on_error)
1878 kfree_skb(skb);
Herbert Xu5b057c62006-06-23 02:06:41 -07001879 return err;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001880}
Florian Fainellicd0a1372017-08-22 15:12:14 -07001881EXPORT_SYMBOL(__skb_pad);
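
/*
 * Illustrative sketch: a driver padding a short frame before DMA.  Note
 * that __skb_pad() only zero-fills the area past the tail; it does not
 * change skb->len (skb_put_padto() also extends the length).  The helper
 * name is hypothetical.
 */
static int example_pad_short_frame(struct sk_buff *skb, unsigned int min_len)
{
	if (skb->len >= min_len)
		return 0;

	/* on failure the skb is freed for us because free_on_error is true */
	return __skb_pad(skb, min_len - skb->len, true);
}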
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001882
Ilpo Järvinen0dde3e12008-03-27 17:43:41 -07001883/**
Mathias Krause0c7ddf32013-11-07 14:18:24 +01001884 * pskb_put - add data to the tail of a potentially fragmented buffer
1885 * @skb: start of the buffer to use
1886 * @tail: tail fragment of the buffer to use
1887 * @len: amount of data to add
1888 *
1889 * This function extends the used data area of the potentially
1890 * fragmented buffer. @tail must be the last fragment of @skb -- or
1891 * @skb itself. If this would exceed the total buffer size the kernel
1892 * will panic. A pointer to the first byte of the extra data is
1893 * returned.
1894 */
1895
Johannes Berg4df864c2017-06-16 14:29:21 +02001896void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
Mathias Krause0c7ddf32013-11-07 14:18:24 +01001897{
1898 if (tail != skb) {
1899 skb->data_len += len;
1900 skb->len += len;
1901 }
1902 return skb_put(tail, len);
1903}
1904EXPORT_SYMBOL_GPL(pskb_put);
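
/*
 * Illustrative sketch: appending a trailer to a buffer that may end in a
 * fragment chain, as IPsec ESP does.  @tail must be the last fragment of
 * @skb (or @skb itself) and is assumed to have been given enough tailroom
 * already, e.g. via skb_cow_data().  The helper name is hypothetical.
 */
static void *example_append_trailer(struct sk_buff *skb, struct sk_buff *tail,
				    int trailer_len)
{
	/* returns a pointer to the first byte of the new trailer area */
	return pskb_put(skb, tail, trailer_len);
}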
1905
1906/**
Ilpo Järvinen0dde3e12008-03-27 17:43:41 -07001907 * skb_put - add data to a buffer
1908 * @skb: buffer to use
1909 * @len: amount of data to add
1910 *
1911 * This function extends the used data area of the buffer. If this would
1912 * exceed the total buffer size the kernel will panic. A pointer to the
1913 * first byte of the extra data is returned.
1914 */
Johannes Berg4df864c2017-06-16 14:29:21 +02001915void *skb_put(struct sk_buff *skb, unsigned int len)
Ilpo Järvinen0dde3e12008-03-27 17:43:41 -07001916{
Johannes Berg4df864c2017-06-16 14:29:21 +02001917 void *tmp = skb_tail_pointer(skb);
Ilpo Järvinen0dde3e12008-03-27 17:43:41 -07001918 SKB_LINEAR_ASSERT(skb);
1919 skb->tail += len;
1920 skb->len += len;
1921 if (unlikely(skb->tail > skb->end))
1922 skb_over_panic(skb, len, __builtin_return_address(0));
1923 return tmp;
1924}
1925EXPORT_SYMBOL(skb_put);
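
/*
 * Illustrative sketch: the classic way to build an outgoing packet with
 * skb_put().  The 128-byte headroom and the helper name are arbitrary
 * illustrative choices.
 */
static struct sk_buff *example_build_packet(const void *payload,
					    unsigned int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(128 + len, GFP_KERNEL);
	if (!skb)
		return NULL;

	skb_reserve(skb, 128);			/* headroom for lower layers */
	memcpy(skb_put(skb, len), payload, len);	/* tail grows by len */
	return skb;
}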
1926
Ilpo Järvinen6be8ac22008-03-27 17:47:24 -07001927/**
Ilpo Järvinenc2aa2702008-03-27 17:52:40 -07001928 * skb_push - add data to the start of a buffer
1929 * @skb: buffer to use
1930 * @len: amount of data to add
1931 *
1932 * This function extends the used data area of the buffer at the buffer
1933 * start. If this would exceed the total buffer headroom the kernel will
1934 * panic. A pointer to the first byte of the extra data is returned.
1935 */
Johannes Bergd58ff352017-06-16 14:29:23 +02001936void *skb_push(struct sk_buff *skb, unsigned int len)
Ilpo Järvinenc2aa2702008-03-27 17:52:40 -07001937{
1938 skb->data -= len;
1939 skb->len += len;
Ganesh Goudar9aba2f82018-08-02 15:34:52 +05301940 if (unlikely(skb->data < skb->head))
Ilpo Järvinenc2aa2702008-03-27 17:52:40 -07001941 skb_under_panic(skb, len, __builtin_return_address(0));
1942 return skb->data;
1943}
1944EXPORT_SYMBOL(skb_push);
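
/*
 * Illustrative sketch: prepending a protocol header with skb_push() once
 * writable headroom is guaranteed.  struct example_hdr and its layout are
 * hypothetical.
 */
struct example_hdr {
	__be16	proto;
	__be16	len;
};

static int example_push_header(struct sk_buff *skb, __be16 proto)
{
	struct example_hdr *hdr;

	if (skb_cow_head(skb, sizeof(*hdr)))	/* ensure writable headroom */
		return -ENOMEM;

	hdr = skb_push(skb, sizeof(*hdr));	/* data/len now include hdr */
	hdr->proto = proto;
	hdr->len = cpu_to_be16(skb->len);
	return 0;
}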
1945
1946/**
Ilpo Järvinen6be8ac22008-03-27 17:47:24 -07001947 * skb_pull - remove data from the start of a buffer
1948 * @skb: buffer to use
1949 * @len: amount of data to remove
1950 *
1951 * This function removes data from the start of a buffer, returning
1952 * the memory to the headroom. A pointer to the next data in the buffer
1953 * is returned. Once the data has been pulled, future pushes will overwrite
1954 * the old data.
1955 */
Johannes Bergaf728682017-06-16 14:29:22 +02001956void *skb_pull(struct sk_buff *skb, unsigned int len)
Ilpo Järvinen6be8ac22008-03-27 17:47:24 -07001957{
David S. Miller47d29642010-05-02 02:21:44 -07001958 return skb_pull_inline(skb, len);
Ilpo Järvinen6be8ac22008-03-27 17:47:24 -07001959}
1960EXPORT_SYMBOL(skb_pull);
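
/*
 * Illustrative sketch: stripping a fixed-size header on the receive path.
 * The 8-byte header length and the helper name are arbitrary illustrative
 * choices.
 */
static int example_strip_header(struct sk_buff *skb)
{
	const unsigned int hdrlen = 8;

	if (!pskb_may_pull(skb, hdrlen))	/* runt packet */
		return -EINVAL;

	/* parse skb->data[0..hdrlen-1] here, then advance past it */
	skb_pull(skb, hdrlen);
	return 0;
}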
1961
Ilpo Järvinen419ae742008-03-27 17:54:01 -07001962/**
1963 * skb_trim - remove end from a buffer
1964 * @skb: buffer to alter
1965 * @len: new length
1966 *
1967 * Cut the length of a buffer down by removing data from the tail. If
1968 * the buffer is already under the length specified it is not modified.
1969 * The skb must be linear.
1970 */
1971void skb_trim(struct sk_buff *skb, unsigned int len)
1972{
1973 if (skb->len > len)
1974 __skb_trim(skb, len);
1975}
1976EXPORT_SYMBOL(skb_trim);
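
/*
 * Illustrative sketch: removing a trailer (e.g. padding or an FCS) from
 * the end of a packet.  pskb_trim() is used so that non-linear buffers are
 * handled via ___pskb_trim() below; plain skb_trim() is only valid for
 * linear skbs.  The helper name is hypothetical.
 */
static int example_remove_trailer(struct sk_buff *skb, unsigned int trailer_len)
{
	if (skb->len < trailer_len)
		return -EINVAL;

	return pskb_trim(skb, skb->len - trailer_len);
}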
1977
Herbert Xu3cc0e872006-06-09 16:13:38 -07001978/* Trims skb to length len. It can change skb pointers.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001979 */
1980
Herbert Xu3cc0e872006-06-09 16:13:38 -07001981int ___pskb_trim(struct sk_buff *skb, unsigned int len)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001982{
Herbert Xu27b437c2006-07-13 19:26:39 -07001983 struct sk_buff **fragp;
1984 struct sk_buff *frag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001985 int offset = skb_headlen(skb);
1986 int nfrags = skb_shinfo(skb)->nr_frags;
1987 int i;
Herbert Xu27b437c2006-07-13 19:26:39 -07001988 int err;
1989
1990 if (skb_cloned(skb) &&
1991 unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
1992 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001993
Herbert Xuf4d26fb2006-07-30 20:20:28 -07001994 i = 0;
1995 if (offset >= len)
1996 goto drop_pages;
1997
1998 for (; i < nfrags; i++) {
Eric Dumazet9e903e02011-10-18 21:00:24 +00001999 int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);
Herbert Xu27b437c2006-07-13 19:26:39 -07002000
2001 if (end < len) {
2002 offset = end;
2003 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002004 }
Herbert Xu27b437c2006-07-13 19:26:39 -07002005
Eric Dumazet9e903e02011-10-18 21:00:24 +00002006 skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);
Herbert Xu27b437c2006-07-13 19:26:39 -07002007
Herbert Xuf4d26fb2006-07-30 20:20:28 -07002008drop_pages:
Herbert Xu27b437c2006-07-13 19:26:39 -07002009 skb_shinfo(skb)->nr_frags = i;
2010
2011 for (; i < nfrags; i++)
Ian Campbellea2ab692011-08-22 23:44:58 +00002012 skb_frag_unref(skb, i);
Herbert Xu27b437c2006-07-13 19:26:39 -07002013
David S. Miller21dc3302010-08-23 00:13:46 -07002014 if (skb_has_frag_list(skb))
Herbert Xu27b437c2006-07-13 19:26:39 -07002015 skb_drop_fraglist(skb);
Herbert Xuf4d26fb2006-07-30 20:20:28 -07002016 goto done;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002017 }
2018
Herbert Xu27b437c2006-07-13 19:26:39 -07002019 for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
2020 fragp = &frag->next) {
2021 int end = offset + frag->len;
2022
2023 if (skb_shared(frag)) {
2024 struct sk_buff *nfrag;
2025
2026 nfrag = skb_clone(frag, GFP_ATOMIC);
2027 if (unlikely(!nfrag))
2028 return -ENOMEM;
2029
2030 nfrag->next = frag->next;
Eric Dumazet85bb2a62012-04-19 02:24:53 +00002031 consume_skb(frag);
Herbert Xu27b437c2006-07-13 19:26:39 -07002032 frag = nfrag;
2033 *fragp = frag;
2034 }
2035
2036 if (end < len) {
2037 offset = end;
2038 continue;
2039 }
2040
2041 if (end > len &&
2042 unlikely((err = pskb_trim(frag, len - offset))))
2043 return err;
2044
2045 if (frag->next)
2046 skb_drop_list(&frag->next);
2047 break;
2048 }
2049
Herbert Xuf4d26fb2006-07-30 20:20:28 -07002050done:
Herbert Xu27b437c2006-07-13 19:26:39 -07002051 if (len > skb_headlen(skb)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002052 skb->data_len -= skb->len - len;
2053 skb->len = len;
2054 } else {
Herbert Xu27b437c2006-07-13 19:26:39 -07002055 skb->len = len;
2056 skb->data_len = 0;
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07002057 skb_set_tail_pointer(skb, len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002058 }
2059
Eric Dumazetc21b48c2017-04-26 09:07:46 -07002060 if (!skb->sk || skb->destructor == sock_edemux)
2061 skb_condense(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002062 return 0;
2063}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08002064EXPORT_SYMBOL(___pskb_trim);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002065
Eric Dumazet88078d92018-04-18 11:43:15 -07002066/* Note: use pskb_trim_rcsum() instead of calling this directly
2067 */
2068int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
2069{
2070 if (skb->ip_summed == CHECKSUM_COMPLETE) {
2071 int delta = skb->len - len;
2072
Dimitris Michailidisd55bef502018-10-19 17:07:13 -07002073 skb->csum = csum_block_sub(skb->csum,
2074 skb_checksum(skb, len, delta, 0),
2075 len);
Vasily Averin54970a22020-12-14 22:07:39 +03002076 } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
2077 int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len;
2078 int offset = skb_checksum_start_offset(skb) + skb->csum_offset;
2079
2080 if (offset + sizeof(__sum16) > hdlen)
2081 return -EINVAL;
Eric Dumazet88078d92018-04-18 11:43:15 -07002082 }
2083 return __pskb_trim(skb, len);
2084}
2085EXPORT_SYMBOL(pskb_trim_rcsum_slow);
2086
Linus Torvalds1da177e2005-04-16 15:20:36 -07002087/**
2088 * __pskb_pull_tail - advance tail of skb header
2089 * @skb: buffer to reallocate
2090 * @delta: number of bytes to advance tail
2091 *
2092 * The function makes sense only on a fragmented &sk_buff;
2093 * it expands the header, moving its tail forward and copying the necessary
2094 * data from the fragmented part.
2095 *
2096 * &sk_buff MUST have reference count of 1.
2097 *
2098 * Returns %NULL (and &sk_buff does not change) if pull failed
2099 * or value of new tail of skb in the case of success.
2100 *
2101 * All the pointers pointing into skb header may change and must be
2102 * reloaded after call to this function.
2103 */
2104
2105/* Moves tail of skb head forward, copying data from fragmented part,
2106 * when it is necessary.
2107 * 1. It may fail due to malloc failure.
2108 * 2. It may change skb pointers.
2109 *
2110 * It is pretty complicated. Luckily, it is called only in exceptional cases.
2111 */
Johannes Bergaf728682017-06-16 14:29:22 +02002112void *__pskb_pull_tail(struct sk_buff *skb, int delta)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002113{
2114	/* If the skb does not have enough free space at the tail, get a new one
2115 * plus 128 bytes for future expansions. If we have enough
2116 * room at tail, reallocate without expansion only if skb is cloned.
2117 */
Arnaldo Carvalho de Melo4305b542007-04-19 20:43:29 -07002118 int i, k, eat = (skb->tail + delta) - skb->end;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002119
2120 if (eat > 0 || skb_cloned(skb)) {
2121 if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
2122 GFP_ATOMIC))
2123 return NULL;
2124 }
2125
Tim Hansen9f77fad2017-10-09 11:37:59 -04002126 BUG_ON(skb_copy_bits(skb, skb_headlen(skb),
2127 skb_tail_pointer(skb), delta));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002128
2129	/* Optimization: no fragments, no reason to preestimate the
2130 * size of pulled pages. Superb.
2131 */
David S. Miller21dc3302010-08-23 00:13:46 -07002132 if (!skb_has_frag_list(skb))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002133 goto pull_pages;
2134
2135 /* Estimate size of pulled pages. */
2136 eat = delta;
2137 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
Eric Dumazet9e903e02011-10-18 21:00:24 +00002138 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
2139
2140 if (size >= eat)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002141 goto pull_pages;
Eric Dumazet9e903e02011-10-18 21:00:24 +00002142 eat -= size;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002143 }
2144
2145	/* If we need to update the frag list, we are in trouble.
Wenhua Shi09001b02017-10-14 18:51:36 +02002146	 * Certainly, it is possible to add an offset to skb data,
2147	 * but taking into account that pulling is expected to
2148	 * be a very rare operation, it is worth fighting against
2149	 * further bloating of the skb head and crucifying ourselves here instead.
2150	 * Pure masochism, indeed. 8)8)
2151 */
2152 if (eat) {
2153 struct sk_buff *list = skb_shinfo(skb)->frag_list;
2154 struct sk_buff *clone = NULL;
2155 struct sk_buff *insp = NULL;
2156
2157 do {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002158 if (list->len <= eat) {
2159 /* Eaten as whole. */
2160 eat -= list->len;
2161 list = list->next;
2162 insp = list;
2163 } else {
2164 /* Eaten partially. */
2165
2166 if (skb_shared(list)) {
2167 /* Sucks! We need to fork list. :-( */
2168 clone = skb_clone(list, GFP_ATOMIC);
2169 if (!clone)
2170 return NULL;
2171 insp = list->next;
2172 list = clone;
2173 } else {
2174 /* This may be pulled without
2175 * problems. */
2176 insp = list;
2177 }
2178 if (!pskb_pull(list, eat)) {
Wei Yongjunf3fbbe02009-02-25 00:37:32 +00002179 kfree_skb(clone);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002180 return NULL;
2181 }
2182 break;
2183 }
2184 } while (eat);
2185
2186 /* Free pulled out fragments. */
2187 while ((list = skb_shinfo(skb)->frag_list) != insp) {
2188 skb_shinfo(skb)->frag_list = list->next;
2189 kfree_skb(list);
2190 }
2191 /* And insert new clone at head. */
2192 if (clone) {
2193 clone->next = list;
2194 skb_shinfo(skb)->frag_list = clone;
2195 }
2196 }
2197 /* Success! Now we may commit changes to skb data. */
2198
2199pull_pages:
2200 eat = delta;
2201 k = 0;
2202 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
Eric Dumazet9e903e02011-10-18 21:00:24 +00002203 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
2204
2205 if (size <= eat) {
Ian Campbellea2ab692011-08-22 23:44:58 +00002206 skb_frag_unref(skb, i);
Eric Dumazet9e903e02011-10-18 21:00:24 +00002207 eat -= size;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002208 } else {
Jonathan Lemonb54c9d52019-07-30 07:40:33 -07002209 skb_frag_t *frag = &skb_shinfo(skb)->frags[k];
2210
2211 *frag = skb_shinfo(skb)->frags[i];
Linus Torvalds1da177e2005-04-16 15:20:36 -07002212 if (eat) {
Jonathan Lemonb54c9d52019-07-30 07:40:33 -07002213 skb_frag_off_add(frag, eat);
2214 skb_frag_size_sub(frag, eat);
linzhang3ccc6c62017-07-17 17:25:02 +08002215 if (!i)
2216 goto end;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002217 eat = 0;
2218 }
2219 k++;
2220 }
2221 }
2222 skb_shinfo(skb)->nr_frags = k;
2223
linzhang3ccc6c62017-07-17 17:25:02 +08002224end:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002225 skb->tail += delta;
2226 skb->data_len -= delta;
2227
Willem de Bruijn1f8b9772017-08-03 16:29:41 -04002228 if (!skb->data_len)
2229 skb_zcopy_clear(skb, false);
2230
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07002231 return skb_tail_pointer(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002232}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08002233EXPORT_SYMBOL(__pskb_pull_tail);
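
/*
 * Illustrative sketch: callers normally reach __pskb_pull_tail() through
 * pskb_may_pull(), which returns immediately when the requested bytes are
 * already in the linear area and otherwise pulls them out of the
 * fragments.  The helper name is hypothetical.
 */
static int example_ensure_linear_header(struct sk_buff *skb,
					unsigned int hdrlen)
{
	if (!pskb_may_pull(skb, hdrlen))
		return -EINVAL;	/* shorter than hdrlen or allocation failed */

	/* skb->data now has at least hdrlen contiguous, readable bytes */
	return 0;
}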
Linus Torvalds1da177e2005-04-16 15:20:36 -07002234
Eric Dumazet22019b12011-07-29 18:37:31 +00002235/**
2236 * skb_copy_bits - copy bits from skb to kernel buffer
2237 * @skb: source skb
2238 * @offset: offset in source
2239 * @to: destination buffer
2240 * @len: number of bytes to copy
2241 *
2242 * Copy the specified number of bytes from the source skb to the
2243 * destination buffer.
2244 *
2245 * CAUTION ! :
2246 * If its prototype is ever changed,
2247 * check arch/{*}/net/{*}.S files,
2248 * since it is called from BPF assembly code.
2249 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002250int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
2251{
David S. Miller1a028e52007-04-27 15:21:23 -07002252 int start = skb_headlen(skb);
David S. Millerfbb398a2009-06-09 00:18:59 -07002253 struct sk_buff *frag_iter;
2254 int i, copy;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002255
2256 if (offset > (int)skb->len - len)
2257 goto fault;
2258
2259 /* Copy header. */
David S. Miller1a028e52007-04-27 15:21:23 -07002260 if ((copy = start - offset) > 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002261 if (copy > len)
2262 copy = len;
Arnaldo Carvalho de Melod626f622007-03-27 18:55:52 -03002263 skb_copy_from_linear_data_offset(skb, offset, to, copy);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002264 if ((len -= copy) == 0)
2265 return 0;
2266 offset += copy;
2267 to += copy;
2268 }
2269
2270 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
David S. Miller1a028e52007-04-27 15:21:23 -07002271 int end;
Eric Dumazet51c56b02012-04-05 11:35:15 +02002272 skb_frag_t *f = &skb_shinfo(skb)->frags[i];
Linus Torvalds1da177e2005-04-16 15:20:36 -07002273
Ilpo Järvinen547b7922008-07-25 21:43:18 -07002274 WARN_ON(start > offset + len);
David S. Miller1a028e52007-04-27 15:21:23 -07002275
Eric Dumazet51c56b02012-04-05 11:35:15 +02002276 end = start + skb_frag_size(f);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002277 if ((copy = end - offset) > 0) {
Willem de Bruijnc613c202017-07-31 08:15:47 -04002278 u32 p_off, p_len, copied;
2279 struct page *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002280 u8 *vaddr;
2281
2282 if (copy > len)
2283 copy = len;
2284
Willem de Bruijnc613c202017-07-31 08:15:47 -04002285 skb_frag_foreach_page(f,
Jonathan Lemonb54c9d52019-07-30 07:40:33 -07002286 skb_frag_off(f) + offset - start,
Willem de Bruijnc613c202017-07-31 08:15:47 -04002287 copy, p, p_off, p_len, copied) {
2288 vaddr = kmap_atomic(p);
2289 memcpy(to + copied, vaddr + p_off, p_len);
2290 kunmap_atomic(vaddr);
2291 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002292
2293 if ((len -= copy) == 0)
2294 return 0;
2295 offset += copy;
2296 to += copy;
2297 }
David S. Miller1a028e52007-04-27 15:21:23 -07002298 start = end;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002299 }
2300
David S. Millerfbb398a2009-06-09 00:18:59 -07002301 skb_walk_frags(skb, frag_iter) {
2302 int end;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002303
David S. Millerfbb398a2009-06-09 00:18:59 -07002304 WARN_ON(start > offset + len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002305
David S. Millerfbb398a2009-06-09 00:18:59 -07002306 end = start + frag_iter->len;
2307 if ((copy = end - offset) > 0) {
2308 if (copy > len)
2309 copy = len;
2310 if (skb_copy_bits(frag_iter, offset - start, to, copy))
2311 goto fault;
2312 if ((len -= copy) == 0)
2313 return 0;
2314 offset += copy;
2315 to += copy;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002316 }
David S. Millerfbb398a2009-06-09 00:18:59 -07002317 start = end;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002318 }
Shirley Maa6686f22011-07-06 12:22:12 +00002319
Linus Torvalds1da177e2005-04-16 15:20:36 -07002320 if (!len)
2321 return 0;
2322
2323fault:
2324 return -EFAULT;
2325}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08002326EXPORT_SYMBOL(skb_copy_bits);
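
/*
 * Illustrative sketch: peeking at a field of a possibly non-linear skb by
 * copying it into a local buffer.  The helper name is hypothetical.
 */
static int example_peek_u32(const struct sk_buff *skb, int offset, u32 *val)
{
	u32 tmp;

	/* works across the linear area, page frags and the frag list */
	if (skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
		return -EFAULT;	/* offset + len exceeds skb->len */

	*val = tmp;
	return 0;
}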
Linus Torvalds1da177e2005-04-16 15:20:36 -07002327
Jens Axboe9c55e012007-11-06 23:30:13 -08002328/*
2329 * Callback from splice_to_pipe(), if we need to release some pages
2330 * at the end of the spd in case we error'ed out in filling the pipe.
2331 */
2332static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
2333{
Jarek Poplawski8b9d3722009-01-19 17:03:56 -08002334 put_page(spd->pages[i]);
2335}
Jens Axboe9c55e012007-11-06 23:30:13 -08002336
David S. Millera108d5f2012-04-23 23:06:11 -04002337static struct page *linear_to_page(struct page *page, unsigned int *len,
2338 unsigned int *offset,
Eric Dumazet18aafc62013-01-11 14:46:37 +00002339 struct sock *sk)
Jarek Poplawski8b9d3722009-01-19 17:03:56 -08002340{
Eric Dumazet5640f762012-09-23 23:04:42 +00002341 struct page_frag *pfrag = sk_page_frag(sk);
Jarek Poplawski8b9d3722009-01-19 17:03:56 -08002342
Eric Dumazet5640f762012-09-23 23:04:42 +00002343 if (!sk_page_frag_refill(sk, pfrag))
2344 return NULL;
Jarek Poplawski4fb66992009-02-01 00:41:42 -08002345
Eric Dumazet5640f762012-09-23 23:04:42 +00002346 *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset);
Jarek Poplawski4fb66992009-02-01 00:41:42 -08002347
Eric Dumazet5640f762012-09-23 23:04:42 +00002348 memcpy(page_address(pfrag->page) + pfrag->offset,
2349 page_address(page) + *offset, *len);
2350 *offset = pfrag->offset;
2351 pfrag->offset += *len;
Jarek Poplawski4fb66992009-02-01 00:41:42 -08002352
Eric Dumazet5640f762012-09-23 23:04:42 +00002353 return pfrag->page;
Jens Axboe9c55e012007-11-06 23:30:13 -08002354}
2355
Eric Dumazet41c73a02012-04-22 12:26:16 +00002356static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
2357 struct page *page,
2358 unsigned int offset)
2359{
2360 return spd->nr_pages &&
2361 spd->pages[spd->nr_pages - 1] == page &&
2362 (spd->partial[spd->nr_pages - 1].offset +
2363 spd->partial[spd->nr_pages - 1].len == offset);
2364}
2365
Jens Axboe9c55e012007-11-06 23:30:13 -08002366/*
2367 * Fill page/offset/length into spd, if it can hold more pages.
2368 */
David S. Millera108d5f2012-04-23 23:06:11 -04002369static bool spd_fill_page(struct splice_pipe_desc *spd,
2370 struct pipe_inode_info *pipe, struct page *page,
2371 unsigned int *len, unsigned int offset,
Eric Dumazet18aafc62013-01-11 14:46:37 +00002372 bool linear,
David S. Millera108d5f2012-04-23 23:06:11 -04002373 struct sock *sk)
Jens Axboe9c55e012007-11-06 23:30:13 -08002374{
Eric Dumazet41c73a02012-04-22 12:26:16 +00002375 if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
David S. Millera108d5f2012-04-23 23:06:11 -04002376 return true;
Jens Axboe9c55e012007-11-06 23:30:13 -08002377
Jarek Poplawski8b9d3722009-01-19 17:03:56 -08002378 if (linear) {
Eric Dumazet18aafc62013-01-11 14:46:37 +00002379 page = linear_to_page(page, len, &offset, sk);
Jarek Poplawski8b9d3722009-01-19 17:03:56 -08002380 if (!page)
David S. Millera108d5f2012-04-23 23:06:11 -04002381 return true;
Eric Dumazet41c73a02012-04-22 12:26:16 +00002382 }
2383 if (spd_can_coalesce(spd, page, offset)) {
2384 spd->partial[spd->nr_pages - 1].len += *len;
David S. Millera108d5f2012-04-23 23:06:11 -04002385 return false;
Eric Dumazet41c73a02012-04-22 12:26:16 +00002386 }
2387 get_page(page);
Jens Axboe9c55e012007-11-06 23:30:13 -08002388 spd->pages[spd->nr_pages] = page;
Jarek Poplawski4fb66992009-02-01 00:41:42 -08002389 spd->partial[spd->nr_pages].len = *len;
Jens Axboe9c55e012007-11-06 23:30:13 -08002390 spd->partial[spd->nr_pages].offset = offset;
Jens Axboe9c55e012007-11-06 23:30:13 -08002391 spd->nr_pages++;
Jarek Poplawski8b9d3722009-01-19 17:03:56 -08002392
David S. Millera108d5f2012-04-23 23:06:11 -04002393 return false;
Jens Axboe9c55e012007-11-06 23:30:13 -08002394}
2395
David S. Millera108d5f2012-04-23 23:06:11 -04002396static bool __splice_segment(struct page *page, unsigned int poff,
2397 unsigned int plen, unsigned int *off,
Eric Dumazet18aafc62013-01-11 14:46:37 +00002398 unsigned int *len,
Eric Dumazetd7ccf7c2012-04-23 23:35:04 -04002399 struct splice_pipe_desc *spd, bool linear,
David S. Millera108d5f2012-04-23 23:06:11 -04002400 struct sock *sk,
2401 struct pipe_inode_info *pipe)
Octavian Purdila2870c432008-07-15 00:49:11 -07002402{
2403 if (!*len)
David S. Millera108d5f2012-04-23 23:06:11 -04002404 return true;
Octavian Purdila2870c432008-07-15 00:49:11 -07002405
2406 /* skip this segment if already processed */
2407 if (*off >= plen) {
2408 *off -= plen;
David S. Millera108d5f2012-04-23 23:06:11 -04002409 return false;
Octavian Purdiladb43a282008-06-27 17:27:21 -07002410 }
Jens Axboe9c55e012007-11-06 23:30:13 -08002411
Octavian Purdila2870c432008-07-15 00:49:11 -07002412 /* ignore any bits we already processed */
Eric Dumazet9ca1b222013-01-05 21:31:18 +00002413 poff += *off;
2414 plen -= *off;
2415 *off = 0;
Octavian Purdila2870c432008-07-15 00:49:11 -07002416
Eric Dumazet18aafc62013-01-11 14:46:37 +00002417 do {
2418 unsigned int flen = min(*len, plen);
Octavian Purdila2870c432008-07-15 00:49:11 -07002419
Eric Dumazet18aafc62013-01-11 14:46:37 +00002420 if (spd_fill_page(spd, pipe, page, &flen, poff,
2421 linear, sk))
2422 return true;
2423 poff += flen;
2424 plen -= flen;
2425 *len -= flen;
2426 } while (*len && plen);
Octavian Purdila2870c432008-07-15 00:49:11 -07002427
David S. Millera108d5f2012-04-23 23:06:11 -04002428 return false;
Octavian Purdila2870c432008-07-15 00:49:11 -07002429}
2430
2431/*
David S. Millera108d5f2012-04-23 23:06:11 -04002432 * Map linear and fragment data from the skb to spd. It reports true if the
Octavian Purdila2870c432008-07-15 00:49:11 -07002433 * pipe is full or if we already spliced the requested length.
2434 */
David S. Millera108d5f2012-04-23 23:06:11 -04002435static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
2436 unsigned int *offset, unsigned int *len,
2437 struct splice_pipe_desc *spd, struct sock *sk)
Octavian Purdila2870c432008-07-15 00:49:11 -07002438{
2439 int seg;
Tom Herbertfa9835e2016-03-07 14:11:04 -08002440 struct sk_buff *iter;
Octavian Purdila2870c432008-07-15 00:49:11 -07002441
Eric Dumazet1d0c0b32012-04-27 02:10:03 +00002442	/* map the linear part:
Alexander Duyck2996d312012-05-02 18:18:42 +00002443 * If skb->head_frag is set, this 'linear' part is backed by a
2444 * fragment, and if the head is not shared with any clones then
2445 * we can avoid a copy since we own the head portion of this page.
Jens Axboe9c55e012007-11-06 23:30:13 -08002446 */
Octavian Purdila2870c432008-07-15 00:49:11 -07002447 if (__splice_segment(virt_to_page(skb->data),
2448 (unsigned long) skb->data & (PAGE_SIZE - 1),
2449 skb_headlen(skb),
Eric Dumazet18aafc62013-01-11 14:46:37 +00002450 offset, len, spd,
Alexander Duyck3a7c1ee42012-05-03 01:09:42 +00002451 skb_head_is_locked(skb),
Eric Dumazet1d0c0b32012-04-27 02:10:03 +00002452 sk, pipe))
David S. Millera108d5f2012-04-23 23:06:11 -04002453 return true;
Jens Axboe9c55e012007-11-06 23:30:13 -08002454
2455 /*
2456 * then map the fragments
2457 */
Jens Axboe9c55e012007-11-06 23:30:13 -08002458 for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
2459 const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
2460
Ian Campbellea2ab692011-08-22 23:44:58 +00002461 if (__splice_segment(skb_frag_page(f),
Jonathan Lemonb54c9d52019-07-30 07:40:33 -07002462 skb_frag_off(f), skb_frag_size(f),
Eric Dumazet18aafc62013-01-11 14:46:37 +00002463 offset, len, spd, false, sk, pipe))
David S. Millera108d5f2012-04-23 23:06:11 -04002464 return true;
Jens Axboe9c55e012007-11-06 23:30:13 -08002465 }
2466
Tom Herbertfa9835e2016-03-07 14:11:04 -08002467 skb_walk_frags(skb, iter) {
2468 if (*offset >= iter->len) {
2469 *offset -= iter->len;
2470 continue;
2471 }
2472 /* __skb_splice_bits() only fails if the output has no room
2473 * left, so no point in going over the frag_list for the error
2474 * case.
2475 */
2476 if (__skb_splice_bits(iter, pipe, offset, len, spd, sk))
2477 return true;
2478 }
2479
David S. Millera108d5f2012-04-23 23:06:11 -04002480 return false;
Jens Axboe9c55e012007-11-06 23:30:13 -08002481}
2482
2483/*
2484 * Map data from the skb to a pipe. Should handle both the linear part,
Tom Herbertfa9835e2016-03-07 14:11:04 -08002485 * the fragments, and the frag list.
Jens Axboe9c55e012007-11-06 23:30:13 -08002486 */
Hannes Frederic Sowaa60e3cc2015-05-21 17:00:00 +02002487int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
Jens Axboe9c55e012007-11-06 23:30:13 -08002488 struct pipe_inode_info *pipe, unsigned int tlen,
Al Viro25869262016-09-17 21:02:10 -04002489 unsigned int flags)
Jens Axboe9c55e012007-11-06 23:30:13 -08002490{
Eric Dumazet41c73a02012-04-22 12:26:16 +00002491 struct partial_page partial[MAX_SKB_FRAGS];
2492 struct page *pages[MAX_SKB_FRAGS];
Jens Axboe9c55e012007-11-06 23:30:13 -08002493 struct splice_pipe_desc spd = {
2494 .pages = pages,
2495 .partial = partial,
Eric Dumazet047fe362012-06-12 15:24:40 +02002496 .nr_pages_max = MAX_SKB_FRAGS,
Miklos Szeredi28a625c2014-01-22 19:36:57 +01002497 .ops = &nosteal_pipe_buf_ops,
Jens Axboe9c55e012007-11-06 23:30:13 -08002498 .spd_release = sock_spd_release,
2499 };
Jens Axboe35f3d142010-05-20 10:43:18 +02002500 int ret = 0;
2501
Tom Herbertfa9835e2016-03-07 14:11:04 -08002502 __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);
Jens Axboe9c55e012007-11-06 23:30:13 -08002503
Hannes Frederic Sowaa60e3cc2015-05-21 17:00:00 +02002504 if (spd.nr_pages)
Al Viro25869262016-09-17 21:02:10 -04002505 ret = splice_to_pipe(pipe, &spd);
Jens Axboe9c55e012007-11-06 23:30:13 -08002506
Jens Axboe35f3d142010-05-20 10:43:18 +02002507 return ret;
Jens Axboe9c55e012007-11-06 23:30:13 -08002508}
Hannes Frederic Sowa2b514572015-05-21 17:00:01 +02002509EXPORT_SYMBOL_GPL(skb_splice_bits);
Jens Axboe9c55e012007-11-06 23:30:13 -08002510
Cong Wang0739cd22021-03-30 19:32:24 -07002511static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg,
2512 struct kvec *vec, size_t num, size_t size)
2513{
2514 struct socket *sock = sk->sk_socket;
2515
2516 if (!sock)
2517 return -EINVAL;
2518 return kernel_sendmsg(sock, msg, vec, num, size);
2519}
2520
2521static int sendpage_unlocked(struct sock *sk, struct page *page, int offset,
2522 size_t size, int flags)
2523{
2524 struct socket *sock = sk->sk_socket;
2525
2526 if (!sock)
2527 return -EINVAL;
2528 return kernel_sendpage(sock, page, offset, size, flags);
2529}
2530
2531typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg,
2532 struct kvec *vec, size_t num, size_t size);
2533typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset,
2534 size_t size, int flags);
2535static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
2536 int len, sendmsg_func sendmsg, sendpage_func sendpage)
Tom Herbert20bf50d2017-07-28 16:22:42 -07002537{
2538 unsigned int orig_len = len;
2539 struct sk_buff *head = skb;
2540 unsigned short fragidx;
2541 int slen, ret;
2542
2543do_frag_list:
2544
2545 /* Deal with head data */
2546 while (offset < skb_headlen(skb) && len) {
2547 struct kvec kv;
2548 struct msghdr msg;
2549
2550 slen = min_t(int, len, skb_headlen(skb) - offset);
2551 kv.iov_base = skb->data + offset;
John Fastabenddb5980d2017-08-15 22:31:34 -07002552 kv.iov_len = slen;
Tom Herbert20bf50d2017-07-28 16:22:42 -07002553 memset(&msg, 0, sizeof(msg));
John Fastabendbd95e6782019-05-24 08:01:00 -07002554 msg.msg_flags = MSG_DONTWAIT;
Tom Herbert20bf50d2017-07-28 16:22:42 -07002555
Cong Wang0739cd22021-03-30 19:32:24 -07002556 ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked,
2557 sendmsg_unlocked, sk, &msg, &kv, 1, slen);
Tom Herbert20bf50d2017-07-28 16:22:42 -07002558 if (ret <= 0)
2559 goto error;
2560
2561 offset += ret;
2562 len -= ret;
2563 }
2564
2565 /* All the data was skb head? */
2566 if (!len)
2567 goto out;
2568
2569 /* Make offset relative to start of frags */
2570 offset -= skb_headlen(skb);
2571
2572 /* Find where we are in frag list */
2573 for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
2574 skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx];
2575
Matthew Wilcox (Oracle)d8e18a52019-07-22 20:08:26 -07002576 if (offset < skb_frag_size(frag))
Tom Herbert20bf50d2017-07-28 16:22:42 -07002577 break;
2578
Matthew Wilcox (Oracle)d8e18a52019-07-22 20:08:26 -07002579 offset -= skb_frag_size(frag);
Tom Herbert20bf50d2017-07-28 16:22:42 -07002580 }
2581
2582 for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
2583 skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx];
2584
Matthew Wilcox (Oracle)d8e18a52019-07-22 20:08:26 -07002585 slen = min_t(size_t, len, skb_frag_size(frag) - offset);
Tom Herbert20bf50d2017-07-28 16:22:42 -07002586
2587 while (slen) {
Cong Wang0739cd22021-03-30 19:32:24 -07002588 ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked,
2589 sendpage_unlocked, sk,
2590 skb_frag_page(frag),
2591 skb_frag_off(frag) + offset,
2592 slen, MSG_DONTWAIT);
Tom Herbert20bf50d2017-07-28 16:22:42 -07002593 if (ret <= 0)
2594 goto error;
2595
2596 len -= ret;
2597 offset += ret;
2598 slen -= ret;
2599 }
2600
2601 offset = 0;
2602 }
2603
2604 if (len) {
2605 /* Process any frag lists */
2606
2607 if (skb == head) {
2608 if (skb_has_frag_list(skb)) {
2609 skb = skb_shinfo(skb)->frag_list;
2610 goto do_frag_list;
2611 }
2612 } else if (skb->next) {
2613 skb = skb->next;
2614 goto do_frag_list;
2615 }
2616 }
2617
2618out:
2619 return orig_len - len;
2620
2621error:
2622 return orig_len == len ? ret : orig_len - len;
2623}
Cong Wang0739cd22021-03-30 19:32:24 -07002624
2625/* Send skb data on a socket. Socket must be locked. */
2626int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
2627 int len)
2628{
2629 return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked,
2630 kernel_sendpage_locked);
2631}
Tom Herbert20bf50d2017-07-28 16:22:42 -07002632EXPORT_SYMBOL_GPL(skb_send_sock_locked);
2633
Cong Wang0739cd22021-03-30 19:32:24 -07002634/* Send skb data on a socket. Socket must be unlocked. */
2635int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
2636{
2637 return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked,
2638 sendpage_unlocked);
2639}
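
/*
 * Illustrative sketch (not part of this file): forward an skb's payload over
 * a socket with skb_send_sock_locked(), retrying on short sends. The caller,
 * locking context and error policy are assumptions made for this example.
 */
static int example_forward_skb(struct sock *sk, struct sk_buff *skb)
{
	int offset = 0, remaining = skb->len;

	lock_sock(sk);
	while (remaining) {
		int sent = skb_send_sock_locked(sk, skb, offset, remaining);

		if (sent <= 0) {
			release_sock(sk);
			/* Treat a zero return as a broken peer (assumption). */
			return sent < 0 ? sent : -EPIPE;
		}
		offset += sent;
		remaining -= sent;
	}
	release_sock(sk);
	return 0;
}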
2640
Herbert Xu357b40a2005-04-19 22:30:14 -07002641/**
2642 * skb_store_bits - store bits from kernel buffer to skb
2643 * @skb: destination buffer
2644 * @offset: offset in destination
2645 * @from: source buffer
2646 * @len: number of bytes to copy
2647 *
2648 * Copy the specified number of bytes from the source buffer to the
2649 * destination skb. This function handles all the messy bits of
2650 * traversing fragment lists and such.
2651 */
2652
Stephen Hemminger0c6fcc82007-04-20 16:40:01 -07002653int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
Herbert Xu357b40a2005-04-19 22:30:14 -07002654{
David S. Miller1a028e52007-04-27 15:21:23 -07002655 int start = skb_headlen(skb);
David S. Millerfbb398a2009-06-09 00:18:59 -07002656 struct sk_buff *frag_iter;
2657 int i, copy;
Herbert Xu357b40a2005-04-19 22:30:14 -07002658
2659 if (offset > (int)skb->len - len)
2660 goto fault;
2661
David S. Miller1a028e52007-04-27 15:21:23 -07002662 if ((copy = start - offset) > 0) {
Herbert Xu357b40a2005-04-19 22:30:14 -07002663 if (copy > len)
2664 copy = len;
Arnaldo Carvalho de Melo27d7ff42007-03-31 11:55:19 -03002665 skb_copy_to_linear_data_offset(skb, offset, from, copy);
Herbert Xu357b40a2005-04-19 22:30:14 -07002666 if ((len -= copy) == 0)
2667 return 0;
2668 offset += copy;
2669 from += copy;
2670 }
2671
2672 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2673 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
David S. Miller1a028e52007-04-27 15:21:23 -07002674 int end;
Herbert Xu357b40a2005-04-19 22:30:14 -07002675
Ilpo Järvinen547b7922008-07-25 21:43:18 -07002676 WARN_ON(start > offset + len);
David S. Miller1a028e52007-04-27 15:21:23 -07002677
Eric Dumazet9e903e02011-10-18 21:00:24 +00002678 end = start + skb_frag_size(frag);
Herbert Xu357b40a2005-04-19 22:30:14 -07002679 if ((copy = end - offset) > 0) {
Willem de Bruijnc613c202017-07-31 08:15:47 -04002680 u32 p_off, p_len, copied;
2681 struct page *p;
Herbert Xu357b40a2005-04-19 22:30:14 -07002682 u8 *vaddr;
2683
2684 if (copy > len)
2685 copy = len;
2686
Willem de Bruijnc613c202017-07-31 08:15:47 -04002687 skb_frag_foreach_page(frag,
Jonathan Lemonb54c9d52019-07-30 07:40:33 -07002688 skb_frag_off(frag) + offset - start,
Willem de Bruijnc613c202017-07-31 08:15:47 -04002689 copy, p, p_off, p_len, copied) {
2690 vaddr = kmap_atomic(p);
2691 memcpy(vaddr + p_off, from + copied, p_len);
2692 kunmap_atomic(vaddr);
2693 }
Herbert Xu357b40a2005-04-19 22:30:14 -07002694
2695 if ((len -= copy) == 0)
2696 return 0;
2697 offset += copy;
2698 from += copy;
2699 }
David S. Miller1a028e52007-04-27 15:21:23 -07002700 start = end;
Herbert Xu357b40a2005-04-19 22:30:14 -07002701 }
2702
David S. Millerfbb398a2009-06-09 00:18:59 -07002703 skb_walk_frags(skb, frag_iter) {
2704 int end;
Herbert Xu357b40a2005-04-19 22:30:14 -07002705
David S. Millerfbb398a2009-06-09 00:18:59 -07002706 WARN_ON(start > offset + len);
Herbert Xu357b40a2005-04-19 22:30:14 -07002707
David S. Millerfbb398a2009-06-09 00:18:59 -07002708 end = start + frag_iter->len;
2709 if ((copy = end - offset) > 0) {
2710 if (copy > len)
2711 copy = len;
2712 if (skb_store_bits(frag_iter, offset - start,
2713 from, copy))
2714 goto fault;
2715 if ((len -= copy) == 0)
2716 return 0;
2717 offset += copy;
2718 from += copy;
Herbert Xu357b40a2005-04-19 22:30:14 -07002719 }
David S. Millerfbb398a2009-06-09 00:18:59 -07002720 start = end;
Herbert Xu357b40a2005-04-19 22:30:14 -07002721 }
2722 if (!len)
2723 return 0;
2724
2725fault:
2726 return -EFAULT;
2727}
Herbert Xu357b40a2005-04-19 22:30:14 -07002728EXPORT_SYMBOL(skb_store_bits);
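
/*
 * Illustrative sketch (not part of this file): overwrite @len bytes at
 * @offset of a possibly non-linear skb. Using skb_ensure_writable() first is
 * one possible way to make shared or cloned data safe to modify; that policy,
 * and the omission of any checksum fixup (e.g. for CHECKSUM_COMPLETE), are
 * assumptions made for this example.
 */
static int example_patch_bytes(struct sk_buff *skb, int offset,
			       const void *val, int len)
{
	int err;

	/* Make sure the bytes we are about to touch are private to us. */
	err = skb_ensure_writable(skb, offset + len);
	if (err)
		return err;

	return skb_store_bits(skb, offset, val, len);
}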
2729
Linus Torvalds1da177e2005-04-16 15:20:36 -07002730/* Checksum skb data. */
Daniel Borkmann2817a332013-10-30 11:50:51 +01002731__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
2732 __wsum csum, const struct skb_checksum_ops *ops)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002733{
David S. Miller1a028e52007-04-27 15:21:23 -07002734 int start = skb_headlen(skb);
2735 int i, copy = start - offset;
David S. Millerfbb398a2009-06-09 00:18:59 -07002736 struct sk_buff *frag_iter;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002737 int pos = 0;
2738
2739 /* Checksum header. */
2740 if (copy > 0) {
2741 if (copy > len)
2742 copy = len;
Matteo Croce2544af02019-05-29 17:13:48 +02002743 csum = INDIRECT_CALL_1(ops->update, csum_partial_ext,
2744 skb->data + offset, copy, csum);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002745 if ((len -= copy) == 0)
2746 return csum;
2747 offset += copy;
2748 pos = copy;
2749 }
2750
2751 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
David S. Miller1a028e52007-04-27 15:21:23 -07002752 int end;
Eric Dumazet51c56b02012-04-05 11:35:15 +02002753 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
Linus Torvalds1da177e2005-04-16 15:20:36 -07002754
Ilpo Järvinen547b7922008-07-25 21:43:18 -07002755 WARN_ON(start > offset + len);
David S. Miller1a028e52007-04-27 15:21:23 -07002756
Eric Dumazet51c56b02012-04-05 11:35:15 +02002757 end = start + skb_frag_size(frag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002758 if ((copy = end - offset) > 0) {
Willem de Bruijnc613c202017-07-31 08:15:47 -04002759 u32 p_off, p_len, copied;
2760 struct page *p;
Al Viro44bb9362006-11-14 21:36:14 -08002761 __wsum csum2;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002762 u8 *vaddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002763
2764 if (copy > len)
2765 copy = len;
Willem de Bruijnc613c202017-07-31 08:15:47 -04002766
2767 skb_frag_foreach_page(frag,
Jonathan Lemonb54c9d52019-07-30 07:40:33 -07002768 skb_frag_off(frag) + offset - start,
Willem de Bruijnc613c202017-07-31 08:15:47 -04002769 copy, p, p_off, p_len, copied) {
2770 vaddr = kmap_atomic(p);
Matteo Croce2544af02019-05-29 17:13:48 +02002771 csum2 = INDIRECT_CALL_1(ops->update,
2772 csum_partial_ext,
2773 vaddr + p_off, p_len, 0);
Willem de Bruijnc613c202017-07-31 08:15:47 -04002774 kunmap_atomic(vaddr);
Matteo Croce2544af02019-05-29 17:13:48 +02002775 csum = INDIRECT_CALL_1(ops->combine,
2776 csum_block_add_ext, csum,
2777 csum2, pos, p_len);
Willem de Bruijnc613c202017-07-31 08:15:47 -04002778 pos += p_len;
2779 }
2780
Linus Torvalds1da177e2005-04-16 15:20:36 -07002781 if (!(len -= copy))
2782 return csum;
2783 offset += copy;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002784 }
David S. Miller1a028e52007-04-27 15:21:23 -07002785 start = end;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002786 }
2787
David S. Millerfbb398a2009-06-09 00:18:59 -07002788 skb_walk_frags(skb, frag_iter) {
2789 int end;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002790
David S. Millerfbb398a2009-06-09 00:18:59 -07002791 WARN_ON(start > offset + len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002792
David S. Millerfbb398a2009-06-09 00:18:59 -07002793 end = start + frag_iter->len;
2794 if ((copy = end - offset) > 0) {
2795 __wsum csum2;
2796 if (copy > len)
2797 copy = len;
Daniel Borkmann2817a332013-10-30 11:50:51 +01002798 csum2 = __skb_checksum(frag_iter, offset - start,
2799 copy, 0, ops);
Matteo Croce2544af02019-05-29 17:13:48 +02002800 csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext,
2801 csum, csum2, pos, copy);
David S. Millerfbb398a2009-06-09 00:18:59 -07002802 if ((len -= copy) == 0)
2803 return csum;
2804 offset += copy;
2805 pos += copy;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002806 }
David S. Millerfbb398a2009-06-09 00:18:59 -07002807 start = end;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002808 }
Kris Katterjohn09a62662006-01-08 22:24:28 -08002809 BUG_ON(len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002810
2811 return csum;
2812}
Daniel Borkmann2817a332013-10-30 11:50:51 +01002813EXPORT_SYMBOL(__skb_checksum);
2814
2815__wsum skb_checksum(const struct sk_buff *skb, int offset,
2816 int len, __wsum csum)
2817{
2818 const struct skb_checksum_ops ops = {
Daniel Borkmanncea80ea2013-11-04 17:10:25 +01002819 .update = csum_partial_ext,
Daniel Borkmann2817a332013-10-30 11:50:51 +01002820 .combine = csum_block_add_ext,
2821 };
2822
2823 return __skb_checksum(skb, offset, len, csum, &ops);
2824}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08002825EXPORT_SYMBOL(skb_checksum);
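
/*
 * Illustrative sketch (not part of this file): fold the checksum of @len
 * bytes starting at @offset into a 16-bit Internet checksum, letting
 * skb_checksum() walk the head, the frags and the frag_list.
 */
static __sum16 example_csum_range(const struct sk_buff *skb,
				  int offset, int len)
{
	return csum_fold(skb_checksum(skb, offset, len, 0));
}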
Linus Torvalds1da177e2005-04-16 15:20:36 -07002826
2827/* Both of above in one bottle. */
2828
Al Viro81d77662006-11-14 21:37:33 -08002829__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
Al Viro8d5930d2020-07-10 20:07:10 -04002830 u8 *to, int len)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002831{
David S. Miller1a028e52007-04-27 15:21:23 -07002832 int start = skb_headlen(skb);
2833 int i, copy = start - offset;
David S. Millerfbb398a2009-06-09 00:18:59 -07002834 struct sk_buff *frag_iter;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002835 int pos = 0;
Al Viro8d5930d2020-07-10 20:07:10 -04002836 __wsum csum = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002837
2838 /* Copy header. */
2839 if (copy > 0) {
2840 if (copy > len)
2841 copy = len;
2842 csum = csum_partial_copy_nocheck(skb->data + offset, to,
Al Virocc44c172020-07-11 00:12:07 -04002843 copy);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002844 if ((len -= copy) == 0)
2845 return csum;
2846 offset += copy;
2847 to += copy;
2848 pos = copy;
2849 }
2850
2851 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
David S. Miller1a028e52007-04-27 15:21:23 -07002852 int end;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002853
Ilpo Järvinen547b7922008-07-25 21:43:18 -07002854 WARN_ON(start > offset + len);
David S. Miller1a028e52007-04-27 15:21:23 -07002855
Eric Dumazet9e903e02011-10-18 21:00:24 +00002856 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002857 if ((copy = end - offset) > 0) {
Willem de Bruijnc613c202017-07-31 08:15:47 -04002858 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2859 u32 p_off, p_len, copied;
2860 struct page *p;
Al Viro50842052006-11-14 21:36:34 -08002861 __wsum csum2;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002862 u8 *vaddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002863
2864 if (copy > len)
2865 copy = len;
Willem de Bruijnc613c202017-07-31 08:15:47 -04002866
2867 skb_frag_foreach_page(frag,
Jonathan Lemonb54c9d52019-07-30 07:40:33 -07002868 skb_frag_off(frag) + offset - start,
Willem de Bruijnc613c202017-07-31 08:15:47 -04002869 copy, p, p_off, p_len, copied) {
2870 vaddr = kmap_atomic(p);
2871 csum2 = csum_partial_copy_nocheck(vaddr + p_off,
2872 to + copied,
Al Virocc44c172020-07-11 00:12:07 -04002873 p_len);
Willem de Bruijnc613c202017-07-31 08:15:47 -04002874 kunmap_atomic(vaddr);
2875 csum = csum_block_add(csum, csum2, pos);
2876 pos += p_len;
2877 }
2878
Linus Torvalds1da177e2005-04-16 15:20:36 -07002879 if (!(len -= copy))
2880 return csum;
2881 offset += copy;
2882 to += copy;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002883 }
David S. Miller1a028e52007-04-27 15:21:23 -07002884 start = end;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002885 }
2886
David S. Millerfbb398a2009-06-09 00:18:59 -07002887 skb_walk_frags(skb, frag_iter) {
2888 __wsum csum2;
2889 int end;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002890
David S. Millerfbb398a2009-06-09 00:18:59 -07002891 WARN_ON(start > offset + len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002892
David S. Millerfbb398a2009-06-09 00:18:59 -07002893 end = start + frag_iter->len;
2894 if ((copy = end - offset) > 0) {
2895 if (copy > len)
2896 copy = len;
2897 csum2 = skb_copy_and_csum_bits(frag_iter,
2898 offset - start,
Al Viro8d5930d2020-07-10 20:07:10 -04002899 to, copy);
David S. Millerfbb398a2009-06-09 00:18:59 -07002900 csum = csum_block_add(csum, csum2, pos);
2901 if ((len -= copy) == 0)
2902 return csum;
2903 offset += copy;
2904 to += copy;
2905 pos += copy;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002906 }
David S. Millerfbb398a2009-06-09 00:18:59 -07002907 start = end;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002908 }
Kris Katterjohn09a62662006-01-08 22:24:28 -08002909 BUG_ON(len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002910 return csum;
2911}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08002912EXPORT_SYMBOL(skb_copy_and_csum_bits);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002913
Cong Wang49f8e832018-11-08 14:05:42 -08002914__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
2915{
2916 __sum16 sum;
2917
2918 sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
Cong Wang14641932018-11-26 09:31:26 -08002919 /* See comments in __skb_checksum_complete(). */
Cong Wang49f8e832018-11-08 14:05:42 -08002920 if (likely(!sum)) {
2921 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
2922 !skb->csum_complete_sw)
Cong Wang7fe50ac2018-11-12 14:47:18 -08002923 netdev_rx_csum_fault(skb->dev, skb);
Cong Wang49f8e832018-11-08 14:05:42 -08002924 }
2925 if (!skb_shared(skb))
2926 skb->csum_valid = !sum;
2927 return sum;
2928}
2929EXPORT_SYMBOL(__skb_checksum_complete_head);
2930
Cong Wang14641932018-11-26 09:31:26 -08002931/* This function assumes skb->csum already holds the pseudo header's checksum,
2932 * which has been changed from the hardware checksum, for example, by
2933 * __skb_checksum_validate_complete(). Also, the original skb->csum must
2934 * have been validated unsuccessfully for the CHECKSUM_COMPLETE case.
2935 *
2936 * It returns non-zero if the recomputed checksum is still invalid, otherwise
2937 * zero. The new checksum is stored back into skb->csum unless the skb is
2938 * shared.
2939 */
Cong Wang49f8e832018-11-08 14:05:42 -08002940__sum16 __skb_checksum_complete(struct sk_buff *skb)
2941{
2942 __wsum csum;
2943 __sum16 sum;
2944
2945 csum = skb_checksum(skb, 0, skb->len, 0);
2946
Cong Wang49f8e832018-11-08 14:05:42 -08002947 sum = csum_fold(csum_add(skb->csum, csum));
Cong Wang14641932018-11-26 09:31:26 -08002948 /* This check is inverted, because we already knew the hardware
2949 * checksum is invalid before calling this function. So, if the
2950 * re-computed checksum is valid instead, then we have a mismatch
2951 * between the original skb->csum and skb_checksum(). This means either
2952 * the original hardware checksum is incorrect or we screw up skb->csum
2953 * when moving skb->data around.
2954 */
Cong Wang49f8e832018-11-08 14:05:42 -08002955 if (likely(!sum)) {
2956 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
2957 !skb->csum_complete_sw)
Cong Wang7fe50ac2018-11-12 14:47:18 -08002958 netdev_rx_csum_fault(skb->dev, skb);
Cong Wang49f8e832018-11-08 14:05:42 -08002959 }
2960
2961 if (!skb_shared(skb)) {
2962 /* Save full packet checksum */
2963 skb->csum = csum;
2964 skb->ip_summed = CHECKSUM_COMPLETE;
2965 skb->csum_complete_sw = 1;
2966 skb->csum_valid = !sum;
2967 }
2968
2969 return sum;
2970}
2971EXPORT_SYMBOL(__skb_checksum_complete);
2972
Davide Caratti96178132017-05-18 15:44:37 +02002973static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum)
2974{
2975 net_warn_ratelimited(
2976 "%s: attempt to compute crc32c without libcrc32c.ko\n",
2977 __func__);
2978 return 0;
2979}
2980
2981static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2,
2982 int offset, int len)
2983{
2984 net_warn_ratelimited(
2985 "%s: attempt to compute crc32c without libcrc32c.ko\n",
2986 __func__);
2987 return 0;
2988}
2989
2990static const struct skb_checksum_ops default_crc32c_ops = {
2991 .update = warn_crc32c_csum_update,
2992 .combine = warn_crc32c_csum_combine,
2993};
2994
2995const struct skb_checksum_ops *crc32c_csum_stub __read_mostly =
2996 &default_crc32c_ops;
2997EXPORT_SYMBOL(crc32c_csum_stub);
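
/*
 * Illustrative sketch (not part of this file): CRC32c over an skb via the
 * crc32c_csum_stub ops, in the spirit of skb_crc32c_csum_help(). Unless a
 * real crc32c implementation has been registered (libcrc32c loaded), the
 * stub above only warns and yields 0; the seed and byte-order handling here
 * are assumptions made for this example.
 */
static __le32 example_skb_crc32c(const struct sk_buff *skb, unsigned int offset)
{
	__wsum crc;

	crc = __skb_checksum(skb, offset, skb->len - offset, ~(__wsum)0,
			     crc32c_csum_stub);
	return cpu_to_le32(~(__force u32)crc);
}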
2998
Thomas Grafaf2806f2013-12-13 15:22:17 +01002999 /**
3000 * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
3001 * @from: source buffer
3002 *
3003 * Calculates the amount of linear headroom needed in the 'to' skb passed
3004 * into skb_zerocopy().
3005 */
3006unsigned int
3007skb_zerocopy_headlen(const struct sk_buff *from)
3008{
3009 unsigned int hlen = 0;
3010
3011 if (!from->head_frag ||
3012 skb_headlen(from) < L1_CACHE_BYTES ||
3013 skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
3014 hlen = skb_headlen(from);
3015
3016 if (skb_has_frag_list(from))
3017 hlen = from->len;
3018
3019 return hlen;
3020}
3021EXPORT_SYMBOL_GPL(skb_zerocopy_headlen);
3022
3023/**
3024 * skb_zerocopy - Zero copy skb to skb
3025 * @to: destination buffer
Masanari Iida7fceb4d2014-01-29 01:05:28 +09003026 * @from: source buffer
Thomas Grafaf2806f2013-12-13 15:22:17 +01003027 * @len: number of bytes to copy from source buffer
3028 * @hlen: size of linear headroom in destination buffer
3029 *
3030 * Copies up to `len` bytes from `from` to `to` by creating references
3031 * to the frags in the source buffer.
3032 *
3033 * The `hlen` as calculated by skb_zerocopy_headlen() specifies the
3034 * headroom in the `to` buffer.
Zoltan Kiss36d5fe62014-03-26 22:37:45 +00003035 *
3036 * Return value:
3037 * 0: everything is OK
3038 * -ENOMEM: couldn't orphan frags of @from due to lack of memory
3039 * -EFAULT: skb_copy_bits() found some problem with skb geometry
Thomas Grafaf2806f2013-12-13 15:22:17 +01003040 */
Zoltan Kiss36d5fe62014-03-26 22:37:45 +00003041int
3042skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
Thomas Grafaf2806f2013-12-13 15:22:17 +01003043{
3044 int i, j = 0;
3045 int plen = 0; /* length of skb->head fragment */
Zoltan Kiss36d5fe62014-03-26 22:37:45 +00003046 int ret;
Thomas Grafaf2806f2013-12-13 15:22:17 +01003047 struct page *page;
3048 unsigned int offset;
3049
3050 BUG_ON(!from->head_frag && !hlen);
3051
3052 /* don't bother with small payloads */
Zoltan Kiss36d5fe62014-03-26 22:37:45 +00003053 if (len <= skb_tailroom(to))
3054 return skb_copy_bits(from, 0, skb_put(to, len), len);
Thomas Grafaf2806f2013-12-13 15:22:17 +01003055
3056 if (hlen) {
Zoltan Kiss36d5fe62014-03-26 22:37:45 +00003057 ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
3058 if (unlikely(ret))
3059 return ret;
Thomas Grafaf2806f2013-12-13 15:22:17 +01003060 len -= hlen;
3061 } else {
3062 plen = min_t(int, skb_headlen(from), len);
3063 if (plen) {
3064 page = virt_to_head_page(from->head);
3065 offset = from->data - (unsigned char *)page_address(page);
3066 __skb_fill_page_desc(to, 0, page, offset, plen);
3067 get_page(page);
3068 j = 1;
3069 len -= plen;
3070 }
3071 }
3072
3073 to->truesize += len + plen;
3074 to->len += len + plen;
3075 to->data_len += len + plen;
3076
Zoltan Kiss36d5fe62014-03-26 22:37:45 +00003077 if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
3078 skb_tx_error(from);
3079 return -ENOMEM;
3080 }
Willem de Bruijn1f8b9772017-08-03 16:29:41 -04003081 skb_zerocopy_clone(to, from, GFP_ATOMIC);
Zoltan Kiss36d5fe62014-03-26 22:37:45 +00003082
Thomas Grafaf2806f2013-12-13 15:22:17 +01003083 for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
Matthew Wilcox (Oracle)d8e18a52019-07-22 20:08:26 -07003084 int size;
3085
Thomas Grafaf2806f2013-12-13 15:22:17 +01003086 if (!len)
3087 break;
3088 skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
Matthew Wilcox (Oracle)d8e18a52019-07-22 20:08:26 -07003089 size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]),
3090 len);
3091 skb_frag_size_set(&skb_shinfo(to)->frags[j], size);
3092 len -= size;
Thomas Grafaf2806f2013-12-13 15:22:17 +01003093 skb_frag_ref(to, j);
3094 j++;
3095 }
3096 skb_shinfo(to)->nr_frags = j;
Zoltan Kiss36d5fe62014-03-26 22:37:45 +00003097
3098 return 0;
Thomas Grafaf2806f2013-12-13 15:22:17 +01003099}
3100EXPORT_SYMBOL_GPL(skb_zerocopy);
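
/*
 * Illustrative sketch (not part of this file): build a new skb whose frags
 * reference @from's pages, reserving the linear headroom suggested by
 * skb_zerocopy_headlen(), roughly as the Open vSwitch upcall path does.
 * The allocation size and GFP flags are assumptions made for this example.
 */
static struct sk_buff *example_zerocopy_clone(struct sk_buff *from)
{
	unsigned int hlen = skb_zerocopy_headlen(from);
	struct sk_buff *to;
	int err;

	to = alloc_skb(hlen, GFP_ATOMIC);
	if (!to)
		return NULL;

	err = skb_zerocopy(to, from, from->len, hlen);
	if (err) {
		kfree_skb(to);
		return NULL;
	}
	return to;
}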
3101
Linus Torvalds1da177e2005-04-16 15:20:36 -07003102void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
3103{
Al Virod3bc23e2006-11-14 21:24:49 -08003104 __wsum csum;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003105 long csstart;
3106
Patrick McHardy84fa7932006-08-29 16:44:56 -07003107 if (skb->ip_summed == CHECKSUM_PARTIAL)
Michał Mirosław55508d62010-12-14 15:24:08 +00003108 csstart = skb_checksum_start_offset(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003109 else
3110 csstart = skb_headlen(skb);
3111
Kris Katterjohn09a62662006-01-08 22:24:28 -08003112 BUG_ON(csstart > skb_headlen(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003113
Arnaldo Carvalho de Melod626f622007-03-27 18:55:52 -03003114 skb_copy_from_linear_data(skb, to, csstart);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003115
3116 csum = 0;
3117 if (csstart != skb->len)
3118 csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
Al Viro8d5930d2020-07-10 20:07:10 -04003119 skb->len - csstart);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003120
Patrick McHardy84fa7932006-08-29 16:44:56 -07003121 if (skb->ip_summed == CHECKSUM_PARTIAL) {
Al Viroff1dcad2006-11-20 18:07:29 -08003122 long csstuff = csstart + skb->csum_offset;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003123
Al Virod3bc23e2006-11-14 21:24:49 -08003124 *((__sum16 *)(to + csstuff)) = csum_fold(csum);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003125 }
3126}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08003127EXPORT_SYMBOL(skb_copy_and_csum_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003128
3129/**
3130 * skb_dequeue - remove from the head of the queue
3131 * @list: list to dequeue from
3132 *
3133 * Remove the head of the list. The list lock is taken so the function
3134 * may be used safely with other locking list functions. The head item is
3135 * returned or %NULL if the list is empty.
3136 */
3137
3138struct sk_buff *skb_dequeue(struct sk_buff_head *list)
3139{
3140 unsigned long flags;
3141 struct sk_buff *result;
3142
3143 spin_lock_irqsave(&list->lock, flags);
3144 result = __skb_dequeue(list);
3145 spin_unlock_irqrestore(&list->lock, flags);
3146 return result;
3147}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08003148EXPORT_SYMBOL(skb_dequeue);
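
/*
 * Illustrative sketch (not part of this file): drain a driver-private
 * sk_buff_head, handing each packet to a handler. The queue is assumed to
 * have been set up with skb_queue_head_init(); the handler is a placeholder
 * for this example.
 */
static void example_drain_queue(struct sk_buff_head *list,
				void (*handler)(struct sk_buff *skb))
{
	struct sk_buff *skb;

	/* skb_dequeue() takes the list lock, so this is safe against
	 * concurrent skb_queue_tail() producers.
	 */
	while ((skb = skb_dequeue(list)) != NULL)
		handler(skb);
}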
Linus Torvalds1da177e2005-04-16 15:20:36 -07003149
3150/**
3151 * skb_dequeue_tail - remove from the tail of the queue
3152 * @list: list to dequeue from
3153 *
3154 * Remove the tail of the list. The list lock is taken so the function
3155 * may be used safely with other locking list functions. The tail item is
3156 * returned or %NULL if the list is empty.
3157 */
3158struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
3159{
3160 unsigned long flags;
3161 struct sk_buff *result;
3162
3163 spin_lock_irqsave(&list->lock, flags);
3164 result = __skb_dequeue_tail(list);
3165 spin_unlock_irqrestore(&list->lock, flags);
3166 return result;
3167}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08003168EXPORT_SYMBOL(skb_dequeue_tail);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003169
3170/**
3171 * skb_queue_purge - empty a list
3172 * @list: list to empty
3173 *
3174 * Delete all buffers on an &sk_buff list. Each buffer is removed from
3175 * the list and one reference dropped. This function takes the list
3176 * lock and is atomic with respect to other list locking functions.
3177 */
3178void skb_queue_purge(struct sk_buff_head *list)
3179{
3180 struct sk_buff *skb;
3181 while ((skb = skb_dequeue(list)) != NULL)
3182 kfree_skb(skb);
3183}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08003184EXPORT_SYMBOL(skb_queue_purge);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003185
3186/**
Yaogong Wang9f5afea2016-09-07 14:49:28 -07003187 * skb_rbtree_purge - empty a skb rbtree
3188 * @root: root of the rbtree to empty
Peter Oskolkov385114d2018-08-02 23:34:38 +00003189 * Return value: the sum of truesizes of all purged skbs.
Yaogong Wang9f5afea2016-09-07 14:49:28 -07003190 *
3191 * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
3192 * the rbtree and one reference dropped. This function does not take
3193 * any lock. Synchronization should be handled by the caller (e.g., TCP
3194 * out-of-order queue is protected by the socket lock).
3195 */
Peter Oskolkov385114d2018-08-02 23:34:38 +00003196unsigned int skb_rbtree_purge(struct rb_root *root)
Yaogong Wang9f5afea2016-09-07 14:49:28 -07003197{
Eric Dumazet7c905842017-09-23 12:39:12 -07003198 struct rb_node *p = rb_first(root);
Peter Oskolkov385114d2018-08-02 23:34:38 +00003199 unsigned int sum = 0;
Yaogong Wang9f5afea2016-09-07 14:49:28 -07003200
Eric Dumazet7c905842017-09-23 12:39:12 -07003201 while (p) {
3202 struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
3203
3204 p = rb_next(p);
3205 rb_erase(&skb->rbnode, root);
Peter Oskolkov385114d2018-08-02 23:34:38 +00003206 sum += skb->truesize;
Yaogong Wang9f5afea2016-09-07 14:49:28 -07003207 kfree_skb(skb);
Eric Dumazet7c905842017-09-23 12:39:12 -07003208 }
Peter Oskolkov385114d2018-08-02 23:34:38 +00003209 return sum;
Yaogong Wang9f5afea2016-09-07 14:49:28 -07003210}
3211
3212/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07003213 * skb_queue_head - queue a buffer at the list head
3214 * @list: list to use
3215 * @newsk: buffer to queue
3216 *
3217 * Queue a buffer at the start of the list. This function takes the
3218 * list lock and can be used safely with other locking &sk_buff functions
3219 * safely.
3220 *
3221 * A buffer cannot be placed on two lists at the same time.
3222 */
3223void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
3224{
3225 unsigned long flags;
3226
3227 spin_lock_irqsave(&list->lock, flags);
3228 __skb_queue_head(list, newsk);
3229 spin_unlock_irqrestore(&list->lock, flags);
3230}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08003231EXPORT_SYMBOL(skb_queue_head);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003232
3233/**
3234 * skb_queue_tail - queue a buffer at the list tail
3235 * @list: list to use
3236 * @newsk: buffer to queue
3237 *
3238 * Queue a buffer at the tail of the list. This function takes the
3239 * list lock and can be used safely with other locking &sk_buff
3240 * functions.
3241 *
3242 * A buffer cannot be placed on two lists at the same time.
3243 */
3244void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
3245{
3246 unsigned long flags;
3247
3248 spin_lock_irqsave(&list->lock, flags);
3249 __skb_queue_tail(list, newsk);
3250 spin_unlock_irqrestore(&list->lock, flags);
3251}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08003252EXPORT_SYMBOL(skb_queue_tail);
David S. Miller8728b832005-08-09 19:25:21 -07003253
Linus Torvalds1da177e2005-04-16 15:20:36 -07003254/**
3255 * skb_unlink - remove a buffer from a list
3256 * @skb: buffer to remove
David S. Miller8728b832005-08-09 19:25:21 -07003257 * @list: list to use
Linus Torvalds1da177e2005-04-16 15:20:36 -07003258 *
David S. Miller8728b832005-08-09 19:25:21 -07003259 * Remove a packet from a list. The list locks are taken and this
3260 * function is atomic with respect to other list locked calls
Linus Torvalds1da177e2005-04-16 15:20:36 -07003261 *
David S. Miller8728b832005-08-09 19:25:21 -07003262 * You must know what list the SKB is on.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003263 */
David S. Miller8728b832005-08-09 19:25:21 -07003264void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003265{
David S. Miller8728b832005-08-09 19:25:21 -07003266 unsigned long flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003267
David S. Miller8728b832005-08-09 19:25:21 -07003268 spin_lock_irqsave(&list->lock, flags);
3269 __skb_unlink(skb, list);
3270 spin_unlock_irqrestore(&list->lock, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003271}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08003272EXPORT_SYMBOL(skb_unlink);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003273
Linus Torvalds1da177e2005-04-16 15:20:36 -07003274/**
3275 * skb_append - append a buffer
3276 * @old: buffer to insert after
3277 * @newsk: buffer to insert
David S. Miller8728b832005-08-09 19:25:21 -07003278 * @list: list to use
Linus Torvalds1da177e2005-04-16 15:20:36 -07003279 *
3280 * Place a packet after a given packet in a list. The list locks are taken
3281 * and this function is atomic with respect to other list locked calls.
3282 * A buffer cannot be placed on two lists at the same time.
3283 */
David S. Miller8728b832005-08-09 19:25:21 -07003284void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003285{
3286 unsigned long flags;
3287
David S. Miller8728b832005-08-09 19:25:21 -07003288 spin_lock_irqsave(&list->lock, flags);
Gerrit Renker7de6c032008-04-14 00:05:09 -07003289 __skb_queue_after(list, old, newsk);
David S. Miller8728b832005-08-09 19:25:21 -07003290 spin_unlock_irqrestore(&list->lock, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003291}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08003292EXPORT_SYMBOL(skb_append);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003293
Linus Torvalds1da177e2005-04-16 15:20:36 -07003294static inline void skb_split_inside_header(struct sk_buff *skb,
3295 struct sk_buff* skb1,
3296 const u32 len, const int pos)
3297{
3298 int i;
3299
Arnaldo Carvalho de Melod626f622007-03-27 18:55:52 -03003300 skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len),
3301 pos - len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003302 /* And move data appendix as is. */
3303 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
3304 skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
3305
3306 skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
3307 skb_shinfo(skb)->nr_frags = 0;
3308 skb1->data_len = skb->data_len;
3309 skb1->len += skb1->data_len;
3310 skb->data_len = 0;
3311 skb->len = len;
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07003312 skb_set_tail_pointer(skb, len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003313}
3314
3315static inline void skb_split_no_header(struct sk_buff *skb,
3316 struct sk_buff* skb1,
3317 const u32 len, int pos)
3318{
3319 int i, k = 0;
3320 const int nfrags = skb_shinfo(skb)->nr_frags;
3321
3322 skb_shinfo(skb)->nr_frags = 0;
3323 skb1->len = skb1->data_len = skb->len - len;
3324 skb->len = len;
3325 skb->data_len = len - pos;
3326
3327 for (i = 0; i < nfrags; i++) {
Eric Dumazet9e903e02011-10-18 21:00:24 +00003328 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003329
3330 if (pos + size > len) {
3331 skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
3332
3333 if (pos < len) {
3334 /* Split frag.
3335 * We have two variants in this case:
3336 * 1. Move all the frag to the second
3337 * part, if it is possible. F.e.
3338 * this approach is mandatory for TUX,
3339 * where splitting is expensive.
3340 * 2. Split the frag accurately; this is what we do here.
3341 */
Ian Campbellea2ab692011-08-22 23:44:58 +00003342 skb_frag_ref(skb, i);
Jonathan Lemonb54c9d52019-07-30 07:40:33 -07003343 skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos);
Eric Dumazet9e903e02011-10-18 21:00:24 +00003344 skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
3345 skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003346 skb_shinfo(skb)->nr_frags++;
3347 }
3348 k++;
3349 } else
3350 skb_shinfo(skb)->nr_frags++;
3351 pos += size;
3352 }
3353 skb_shinfo(skb1)->nr_frags = k;
3354}
3355
3356/**
3357 * skb_split - Split fragmented skb to two parts at length len.
3358 * @skb: the buffer to split
3359 * @skb1: the buffer to receive the second part
3360 * @len: new length for skb
3361 */
3362void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
3363{
3364 int pos = skb_headlen(skb);
3365
Jonathan Lemon06b4feb2021-01-06 14:18:38 -08003366 skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG;
Willem de Bruijn1f8b9772017-08-03 16:29:41 -04003367 skb_zerocopy_clone(skb1, skb, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003368 if (len < pos) /* Split line is inside header. */
3369 skb_split_inside_header(skb, skb1, len, pos);
3370 else /* Second chunk has no header, nothing to copy. */
3371 skb_split_no_header(skb, skb1, len, pos);
3372}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08003373EXPORT_SYMBOL(skb_split);
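
/*
 * Illustrative sketch (not part of this file): split @skb at @len into @skb
 * and a newly allocated tail skb, roughly what TCP does when fragmenting a
 * queued segment. The headroom/tailroom sizing and GFP flags are assumptions
 * made for this example; GSO bookkeeping is deliberately left out.
 */
static struct sk_buff *example_split(struct sk_buff *skb, u32 len)
{
	struct sk_buff *tail;

	if (len >= skb->len)
		return NULL;

	/* Reserve matching headroom plus enough tailroom in case the split
	 * point falls inside skb's linear header.
	 */
	tail = alloc_skb(skb_headroom(skb) + skb_headlen(skb), GFP_ATOMIC);
	if (!tail)
		return NULL;

	skb_reserve(tail, skb_headroom(skb));
	skb_split(skb, tail, len);
	return tail;
}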
Linus Torvalds1da177e2005-04-16 15:20:36 -07003374
Ilpo Järvinen9f782db2008-11-25 13:57:01 -08003375/* Shifting from/to a cloned skb is a no-go.
3376 *
3377 * Caller cannot keep skb_shinfo related pointers past calling here!
3378 */
Ilpo Järvinen832d11c2008-11-24 21:20:15 -08003379static int skb_prepare_for_shift(struct sk_buff *skb)
3380{
Marco Elver097b9142021-02-01 17:04:20 +01003381 int ret = 0;
3382
3383 if (skb_cloned(skb)) {
3384 /* Save and restore truesize: pskb_expand_head() may reallocate
3385 * memory where ksize(kmalloc(S)) != ksize(kmalloc(S)), but we
3386 * cannot change truesize at this point.
3387 */
3388 unsigned int save_truesize = skb->truesize;
3389
3390 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
3391 skb->truesize = save_truesize;
3392 }
3393 return ret;
Ilpo Järvinen832d11c2008-11-24 21:20:15 -08003394}
3395
3396/**
3397 * skb_shift - Shifts paged data partially from skb to another
3398 * @tgt: buffer into which tail data gets added
3399 * @skb: buffer from which the paged data comes from
3400 * @shiftlen: shift up to this many bytes
3401 *
3402 * Attempts to shift up to shiftlen worth of bytes, which may be less than
Feng King20e994a2011-11-21 01:47:11 +00003403 * the length of the skb, from skb to tgt. Returns the number of bytes shifted.
Ilpo Järvinen832d11c2008-11-24 21:20:15 -08003404 * It's up to the caller to free skb if everything was shifted.
3405 *
3406 * If @tgt runs out of frags, the whole operation is aborted.
3407 *
3408 * Skb cannot include anything else but paged data while tgt is allowed
3409 * to have non-paged data as well.
3410 *
3411 * TODO: full sized shift could be optimized but that would need
3412 * specialized skb free'er to handle frags without up-to-date nr_frags.
3413 */
3414int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
3415{
3416 int from, to, merge, todo;
Matthew Wilcox (Oracle)d8e18a52019-07-22 20:08:26 -07003417 skb_frag_t *fragfrom, *fragto;
Ilpo Järvinen832d11c2008-11-24 21:20:15 -08003418
3419 BUG_ON(shiftlen > skb->len);
Eric Dumazetf8071cd2016-11-15 12:51:50 -08003420
3421 if (skb_headlen(skb))
3422 return 0;
Willem de Bruijn1f8b9772017-08-03 16:29:41 -04003423 if (skb_zcopy(tgt) || skb_zcopy(skb))
3424 return 0;
Ilpo Järvinen832d11c2008-11-24 21:20:15 -08003425
3426 todo = shiftlen;
3427 from = 0;
3428 to = skb_shinfo(tgt)->nr_frags;
3429 fragfrom = &skb_shinfo(skb)->frags[from];
3430
3431 /* Actual merge is delayed until the point when we know we can
3432 * commit all, so that we don't have to undo partial changes
3433 */
3434 if (!to ||
Ian Campbellea2ab692011-08-22 23:44:58 +00003435 !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
Jonathan Lemonb54c9d52019-07-30 07:40:33 -07003436 skb_frag_off(fragfrom))) {
Ilpo Järvinen832d11c2008-11-24 21:20:15 -08003437 merge = -1;
3438 } else {
3439 merge = to - 1;
3440
Eric Dumazet9e903e02011-10-18 21:00:24 +00003441 todo -= skb_frag_size(fragfrom);
Ilpo Järvinen832d11c2008-11-24 21:20:15 -08003442 if (todo < 0) {
3443 if (skb_prepare_for_shift(skb) ||
3444 skb_prepare_for_shift(tgt))
3445 return 0;
3446
Ilpo Järvinen9f782db2008-11-25 13:57:01 -08003447 /* All previous frag pointers might be stale! */
3448 fragfrom = &skb_shinfo(skb)->frags[from];
Ilpo Järvinen832d11c2008-11-24 21:20:15 -08003449 fragto = &skb_shinfo(tgt)->frags[merge];
3450
Eric Dumazet9e903e02011-10-18 21:00:24 +00003451 skb_frag_size_add(fragto, shiftlen);
3452 skb_frag_size_sub(fragfrom, shiftlen);
Jonathan Lemonb54c9d52019-07-30 07:40:33 -07003453 skb_frag_off_add(fragfrom, shiftlen);
Ilpo Järvinen832d11c2008-11-24 21:20:15 -08003454
3455 goto onlymerged;
3456 }
3457
3458 from++;
3459 }
3460
3461 /* Skip full, not-fitting skb to avoid expensive operations */
3462 if ((shiftlen == skb->len) &&
3463 (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
3464 return 0;
3465
3466 if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
3467 return 0;
3468
3469 while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
3470 if (to == MAX_SKB_FRAGS)
3471 return 0;
3472
3473 fragfrom = &skb_shinfo(skb)->frags[from];
3474 fragto = &skb_shinfo(tgt)->frags[to];
3475
Eric Dumazet9e903e02011-10-18 21:00:24 +00003476 if (todo >= skb_frag_size(fragfrom)) {
Ilpo Järvinen832d11c2008-11-24 21:20:15 -08003477 *fragto = *fragfrom;
Eric Dumazet9e903e02011-10-18 21:00:24 +00003478 todo -= skb_frag_size(fragfrom);
Ilpo Järvinen832d11c2008-11-24 21:20:15 -08003479 from++;
3480 to++;
3481
3482 } else {
Ian Campbellea2ab692011-08-22 23:44:58 +00003483 __skb_frag_ref(fragfrom);
Jonathan Lemonb54c9d52019-07-30 07:40:33 -07003484 skb_frag_page_copy(fragto, fragfrom);
3485 skb_frag_off_copy(fragto, fragfrom);
Eric Dumazet9e903e02011-10-18 21:00:24 +00003486 skb_frag_size_set(fragto, todo);
Ilpo Järvinen832d11c2008-11-24 21:20:15 -08003487
Jonathan Lemonb54c9d52019-07-30 07:40:33 -07003488 skb_frag_off_add(fragfrom, todo);
Eric Dumazet9e903e02011-10-18 21:00:24 +00003489 skb_frag_size_sub(fragfrom, todo);
Ilpo Järvinen832d11c2008-11-24 21:20:15 -08003490 todo = 0;
3491
3492 to++;
3493 break;
3494 }
3495 }
3496
3497 /* Ready to "commit" this state change to tgt */
3498 skb_shinfo(tgt)->nr_frags = to;
3499
3500 if (merge >= 0) {
3501 fragfrom = &skb_shinfo(skb)->frags[0];
3502 fragto = &skb_shinfo(tgt)->frags[merge];
3503
Eric Dumazet9e903e02011-10-18 21:00:24 +00003504 skb_frag_size_add(fragto, skb_frag_size(fragfrom));
Ilias Apalodimas6a5bcd82021-06-07 21:02:38 +02003505 __skb_frag_unref(fragfrom, skb->pp_recycle);
Ilpo Järvinen832d11c2008-11-24 21:20:15 -08003506 }
3507
3508 /* Reposition in the original skb */
3509 to = 0;
3510 while (from < skb_shinfo(skb)->nr_frags)
3511 skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
3512 skb_shinfo(skb)->nr_frags = to;
3513
3514 BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);
3515
3516onlymerged:
3517 /* Most likely the tgt won't ever need its checksum anymore; skb, on
3518 * the other hand, might need it if it has to be resent.
3519 */
3520 tgt->ip_summed = CHECKSUM_PARTIAL;
3521 skb->ip_summed = CHECKSUM_PARTIAL;
3522
3523 /* Yak, is it really working this way? Some helper please? */
3524 skb->len -= shiftlen;
3525 skb->data_len -= shiftlen;
3526 skb->truesize -= shiftlen;
3527 tgt->len += shiftlen;
3528 tgt->data_len += shiftlen;
3529 tgt->truesize += shiftlen;
3530
3531 return shiftlen;
3532}
3533
Thomas Graf677e90e2005-06-23 20:59:51 -07003534/**
3535 * skb_prepare_seq_read - Prepare a sequential read of skb data
3536 * @skb: the buffer to read
3537 * @from: lower offset of data to be read
3538 * @to: upper offset of data to be read
3539 * @st: state variable
3540 *
3541 * Initializes the specified state variable. Must be called before
3542 * invoking skb_seq_read() for the first time.
3543 */
3544void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
3545 unsigned int to, struct skb_seq_state *st)
3546{
3547 st->lower_offset = from;
3548 st->upper_offset = to;
3549 st->root_skb = st->cur_skb = skb;
3550 st->frag_idx = st->stepped_offset = 0;
3551 st->frag_data = NULL;
Willem de Bruijn97550f62021-01-09 17:18:33 -05003552 st->frag_off = 0;
Thomas Graf677e90e2005-06-23 20:59:51 -07003553}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08003554EXPORT_SYMBOL(skb_prepare_seq_read);
Thomas Graf677e90e2005-06-23 20:59:51 -07003555
3556/**
3557 * skb_seq_read - Sequentially read skb data
3558 * @consumed: number of bytes consumed by the caller so far
3559 * @data: destination pointer for data to be returned
3560 * @st: state variable
3561 *
Mathias Krausebc323832013-11-07 14:18:26 +01003562 * Reads a block of skb data at @consumed relative to the
Thomas Graf677e90e2005-06-23 20:59:51 -07003563 * lower offset specified to skb_prepare_seq_read(). Assigns
Mathias Krausebc323832013-11-07 14:18:26 +01003564 * the head of the data block to @data and returns the length
Thomas Graf677e90e2005-06-23 20:59:51 -07003565 * of the block or 0 if the end of the skb data or the upper
3566 * offset has been reached.
3567 *
3568 * The caller is not required to consume all of the data
Mathias Krausebc323832013-11-07 14:18:26 +01003569 * returned, i.e. @consumed is typically set to the number
Thomas Graf677e90e2005-06-23 20:59:51 -07003570 * of bytes already consumed and the next call to
3571 * skb_seq_read() will return the remaining part of the block.
3572 *
Lucas De Marchi25985ed2011-03-30 22:57:33 -03003573 * Note 1: The size of each block of data returned can be arbitrary;
Masanari Iidae793c0f2014-09-04 23:44:36 +09003574 * this limitation is the cost of zerocopy sequential
Thomas Graf677e90e2005-06-23 20:59:51 -07003575 * reads of potentially non-linear data.
3576 *
Randy Dunlapbc2cda12008-02-13 15:03:25 -08003577 * Note 2: Fragment lists within fragments are not implemented
Thomas Graf677e90e2005-06-23 20:59:51 -07003578 * at the moment; state->root_skb could be replaced with
3579 * a stack for this purpose.
3580 */
3581unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
3582 struct skb_seq_state *st)
3583{
3584 unsigned int block_limit, abs_offset = consumed + st->lower_offset;
3585 skb_frag_t *frag;
3586
Wedson Almeida Filhoaeb193e2013-06-23 23:33:48 -07003587 if (unlikely(abs_offset >= st->upper_offset)) {
3588 if (st->frag_data) {
3589 kunmap_atomic(st->frag_data);
3590 st->frag_data = NULL;
3591 }
Thomas Graf677e90e2005-06-23 20:59:51 -07003592 return 0;
Wedson Almeida Filhoaeb193e2013-06-23 23:33:48 -07003593 }
Thomas Graf677e90e2005-06-23 20:59:51 -07003594
3595next_skb:
Herbert Xu95e3b242009-01-29 16:07:52 -08003596 block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;
Thomas Graf677e90e2005-06-23 20:59:51 -07003597
Thomas Chenault995b3372009-05-18 21:43:27 -07003598 if (abs_offset < block_limit && !st->frag_data) {
Herbert Xu95e3b242009-01-29 16:07:52 -08003599 *data = st->cur_skb->data + (abs_offset - st->stepped_offset);
Thomas Graf677e90e2005-06-23 20:59:51 -07003600 return block_limit - abs_offset;
3601 }
3602
3603 if (st->frag_idx == 0 && !st->frag_data)
3604 st->stepped_offset += skb_headlen(st->cur_skb);
3605
3606 while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
Willem de Bruijn97550f62021-01-09 17:18:33 -05003607 unsigned int pg_idx, pg_off, pg_sz;
Thomas Graf677e90e2005-06-23 20:59:51 -07003608
Willem de Bruijn97550f62021-01-09 17:18:33 -05003609 frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
3610
3611 pg_idx = 0;
3612 pg_off = skb_frag_off(frag);
3613 pg_sz = skb_frag_size(frag);
3614
3615 if (skb_frag_must_loop(skb_frag_page(frag))) {
3616 pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT;
3617 pg_off = offset_in_page(pg_off + st->frag_off);
3618 pg_sz = min_t(unsigned int, pg_sz - st->frag_off,
3619 PAGE_SIZE - pg_off);
3620 }
3621
3622 block_limit = pg_sz + st->stepped_offset;
Thomas Graf677e90e2005-06-23 20:59:51 -07003623 if (abs_offset < block_limit) {
3624 if (!st->frag_data)
Willem de Bruijn97550f62021-01-09 17:18:33 -05003625 st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx);
Thomas Graf677e90e2005-06-23 20:59:51 -07003626
Willem de Bruijn97550f62021-01-09 17:18:33 -05003627 *data = (u8 *)st->frag_data + pg_off +
Thomas Graf677e90e2005-06-23 20:59:51 -07003628 (abs_offset - st->stepped_offset);
3629
3630 return block_limit - abs_offset;
3631 }
3632
3633 if (st->frag_data) {
Eric Dumazet51c56b02012-04-05 11:35:15 +02003634 kunmap_atomic(st->frag_data);
Thomas Graf677e90e2005-06-23 20:59:51 -07003635 st->frag_data = NULL;
3636 }
3637
Willem de Bruijn97550f62021-01-09 17:18:33 -05003638 st->stepped_offset += pg_sz;
3639 st->frag_off += pg_sz;
3640 if (st->frag_off == skb_frag_size(frag)) {
3641 st->frag_off = 0;
3642 st->frag_idx++;
3643 }
Thomas Graf677e90e2005-06-23 20:59:51 -07003644 }
3645
Olaf Kirch5b5a60d2007-06-23 23:11:52 -07003646 if (st->frag_data) {
Eric Dumazet51c56b02012-04-05 11:35:15 +02003647 kunmap_atomic(st->frag_data);
Olaf Kirch5b5a60d2007-06-23 23:11:52 -07003648 st->frag_data = NULL;
3649 }
3650
David S. Miller21dc3302010-08-23 00:13:46 -07003651 if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
Shyam Iyer71b33462009-01-29 16:12:42 -08003652 st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
Thomas Graf677e90e2005-06-23 20:59:51 -07003653 st->frag_idx = 0;
3654 goto next_skb;
Shyam Iyer71b33462009-01-29 16:12:42 -08003655 } else if (st->cur_skb->next) {
3656 st->cur_skb = st->cur_skb->next;
Herbert Xu95e3b242009-01-29 16:07:52 -08003657 st->frag_idx = 0;
Thomas Graf677e90e2005-06-23 20:59:51 -07003658 goto next_skb;
3659 }
3660
3661 return 0;
3662}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08003663EXPORT_SYMBOL(skb_seq_read);
Thomas Graf677e90e2005-06-23 20:59:51 -07003664
3665/**
3666 * skb_abort_seq_read - Abort a sequential read of skb data
3667 * @st: state variable
3668 *
3669 * Must be called if skb_seq_read() was not called until it
3670 * returned 0.
3671 */
3672void skb_abort_seq_read(struct skb_seq_state *st)
3673{
3674 if (st->frag_data)
Eric Dumazet51c56b02012-04-05 11:35:15 +02003675 kunmap_atomic(st->frag_data);
Thomas Graf677e90e2005-06-23 20:59:51 -07003676}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08003677EXPORT_SYMBOL(skb_abort_seq_read);
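
/*
 * Illustrative sketch (not part of this file): walk a byte range of an skb
 * with the sequential read API above. The consumer callback is a placeholder
 * for this example; skb_abort_seq_read() is only required when the walk
 * stops before skb_seq_read() has returned 0.
 */
static void example_walk_payload(struct sk_buff *skb, unsigned int from,
				 unsigned int to,
				 void (*consume)(const u8 *data, unsigned int len))
{
	struct skb_seq_state st;
	unsigned int consumed = 0;
	const u8 *data;
	unsigned int len;

	skb_prepare_seq_read(skb, from, to, &st);
	while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
		consume(data, len);
		consumed += len;
	}
	/* skb_seq_read() returned 0, so no skb_abort_seq_read() is needed. */
}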
Thomas Graf677e90e2005-06-23 20:59:51 -07003678
Thomas Graf3fc7e8a2005-06-23 21:00:17 -07003679#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
3680
3681static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
3682 struct ts_config *conf,
3683 struct ts_state *state)
3684{
3685 return skb_seq_read(offset, text, TS_SKB_CB(state));
3686}
3687
3688static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
3689{
3690 skb_abort_seq_read(TS_SKB_CB(state));
3691}
3692
3693/**
3694 * skb_find_text - Find a text pattern in skb data
3695 * @skb: the buffer to look in
3696 * @from: search offset
3697 * @to: search limit
3698 * @config: textsearch configuration
Thomas Graf3fc7e8a2005-06-23 21:00:17 -07003699 *
3700 * Finds a pattern in the skb data according to the specified
3701 * textsearch configuration. Use textsearch_next() to retrieve
3702 * subsequent occurrences of the pattern. Returns the offset
3703 * to the first occurrence or UINT_MAX if no match was found.
3704 */
3705unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
Bojan Prtvar059a2442015-02-22 11:46:35 +01003706 unsigned int to, struct ts_config *config)
Thomas Graf3fc7e8a2005-06-23 21:00:17 -07003707{
Bojan Prtvar059a2442015-02-22 11:46:35 +01003708 struct ts_state state;
Phil Oesterf72b9482006-06-26 00:00:57 -07003709 unsigned int ret;
3710
Willem de Bruijnb228c9b2021-03-01 15:09:44 +00003711 BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb));
3712
Thomas Graf3fc7e8a2005-06-23 21:00:17 -07003713 config->get_next_block = skb_ts_get_next_block;
3714 config->finish = skb_ts_finish;
3715
Bojan Prtvar059a2442015-02-22 11:46:35 +01003716 skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state));
Thomas Graf3fc7e8a2005-06-23 21:00:17 -07003717
Bojan Prtvar059a2442015-02-22 11:46:35 +01003718 ret = textsearch_find(config, &state);
Phil Oesterf72b9482006-06-26 00:00:57 -07003719 return (ret <= to - from ? ret : UINT_MAX);
Thomas Graf3fc7e8a2005-06-23 21:00:17 -07003720}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08003721EXPORT_SYMBOL(skb_find_text);
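
/*
 * Illustrative sketch (not part of this file): search an skb for an ASCII
 * pattern with skb_find_text(). The "kmp" algorithm name, TS_AUTOLOAD and
 * GFP_KERNEL are assumptions made for this example; textsearch_prepare()
 * and textsearch_destroy() come from <linux/textsearch.h>.
 */
static bool example_skb_contains(struct sk_buff *skb, const char *pattern)
{
	struct ts_config *conf;
	unsigned int pos;

	conf = textsearch_prepare("kmp", pattern, strlen(pattern),
				  GFP_KERNEL, TS_AUTOLOAD);
	if (IS_ERR(conf))
		return false;

	pos = skb_find_text(skb, 0, skb->len, conf);
	textsearch_destroy(conf);

	return pos != UINT_MAX;
}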
Thomas Graf3fc7e8a2005-06-23 21:00:17 -07003722
Hannes Frederic Sowabe12a1f2015-05-21 16:59:58 +02003723int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
3724 int offset, size_t size)
3725{
3726 int i = skb_shinfo(skb)->nr_frags;
3727
3728 if (skb_can_coalesce(skb, i, page, offset)) {
3729 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
3730 } else if (i < MAX_SKB_FRAGS) {
3731 get_page(page);
3732 skb_fill_page_desc(skb, i, page, offset, size);
3733 } else {
3734 return -EMSGSIZE;
3735 }
3736
3737 return 0;
3738}
3739EXPORT_SYMBOL_GPL(skb_append_pagefrags);
3740
Herbert Xucbb042f2006-03-20 22:43:56 -08003741/**
3742 * skb_pull_rcsum - pull skb and update receive checksum
3743 * @skb: buffer to update
Herbert Xucbb042f2006-03-20 22:43:56 -08003744 * @len: length of data pulled
3745 *
3746 * This function performs an skb_pull on the packet and updates
Urs Thuermannfee54fa2008-02-12 22:03:25 -08003747 * the CHECKSUM_COMPLETE checksum. It should be used in
Patrick McHardy84fa7932006-08-29 16:44:56 -07003748 * receive path processing instead of skb_pull unless you know
3749 * that the checksum difference is zero (e.g., a valid IP header)
3750 * or you are setting ip_summed to CHECKSUM_NONE.
Herbert Xucbb042f2006-03-20 22:43:56 -08003751 */
Johannes Bergaf728682017-06-16 14:29:22 +02003752void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
Herbert Xucbb042f2006-03-20 22:43:56 -08003753{
Pravin B Shelar31b33df2015-09-28 17:24:25 -07003754 unsigned char *data = skb->data;
3755
Herbert Xucbb042f2006-03-20 22:43:56 -08003756 BUG_ON(len > skb->len);
Pravin B Shelar31b33df2015-09-28 17:24:25 -07003757 __skb_pull(skb, len);
3758 skb_postpull_rcsum(skb, data, len);
3759 return skb->data;
Herbert Xucbb042f2006-03-20 22:43:56 -08003760}
Arnaldo Carvalho de Melof94691a2006-03-20 22:47:55 -08003761EXPORT_SYMBOL_GPL(skb_pull_rcsum);
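
/*
 * Illustrative sketch (not part of this file): strip an @hlen byte
 * encapsulation header while keeping a CHECKSUM_COMPLETE value consistent,
 * as tunnel receive paths do. The length check and the header reset policy
 * are assumptions made for this example.
 */
static int example_strip_outer_header(struct sk_buff *skb, unsigned int hlen)
{
	if (!pskb_may_pull(skb, hlen))
		return -EINVAL;

	skb_pull_rcsum(skb, hlen);
	skb_reset_network_header(skb);

	return 0;
}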
3762
Yonghong Song13acc942018-03-21 16:31:03 -07003763static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb)
3764{
3765 skb_frag_t head_frag;
3766 struct page *page;
3767
3768 page = virt_to_head_page(frag_skb->head);
Matthew Wilcox (Oracle)d8e18a52019-07-22 20:08:26 -07003769 __skb_frag_set_page(&head_frag, page);
Jonathan Lemonb54c9d52019-07-30 07:40:33 -07003770 skb_frag_off_set(&head_frag, frag_skb->data -
3771 (unsigned char *)page_address(page));
Matthew Wilcox (Oracle)d8e18a52019-07-22 20:08:26 -07003772 skb_frag_size_set(&head_frag, skb_headlen(frag_skb));
Yonghong Song13acc942018-03-21 16:31:03 -07003773 return head_frag;
3774}
3775
Steffen Klassert3a1296a2020-01-25 11:26:44 +01003776struct sk_buff *skb_segment_list(struct sk_buff *skb,
3777 netdev_features_t features,
3778 unsigned int offset)
3779{
3780 struct sk_buff *list_skb = skb_shinfo(skb)->frag_list;
3781 unsigned int tnl_hlen = skb_tnl_header_len(skb);
3782 unsigned int delta_truesize = 0;
3783 unsigned int delta_len = 0;
3784 struct sk_buff *tail = NULL;
Dongseok Yi53475c52021-01-08 11:28:38 +09003785 struct sk_buff *nskb, *tmp;
3786 int err;
Steffen Klassert3a1296a2020-01-25 11:26:44 +01003787
3788 skb_push(skb, -skb_network_offset(skb) + offset);
3789
3790 skb_shinfo(skb)->frag_list = NULL;
3791
3792 do {
3793 nskb = list_skb;
3794 list_skb = list_skb->next;
3795
Dongseok Yi53475c52021-01-08 11:28:38 +09003796 err = 0;
3797 if (skb_shared(nskb)) {
3798 tmp = skb_clone(nskb, GFP_ATOMIC);
3799 if (tmp) {
3800 consume_skb(nskb);
3801 nskb = tmp;
3802 err = skb_unclone(nskb, GFP_ATOMIC);
3803 } else {
3804 err = -ENOMEM;
3805 }
3806 }
3807
Steffen Klassert3a1296a2020-01-25 11:26:44 +01003808 if (!tail)
3809 skb->next = nskb;
3810 else
3811 tail->next = nskb;
3812
Dongseok Yi53475c52021-01-08 11:28:38 +09003813 if (unlikely(err)) {
3814 nskb->next = list_skb;
3815 goto err_linearize;
3816 }
3817
Steffen Klassert3a1296a2020-01-25 11:26:44 +01003818 tail = nskb;
3819
3820 delta_len += nskb->len;
3821 delta_truesize += nskb->truesize;
3822
3823 skb_push(nskb, -skb_network_offset(nskb) + offset);
3824
Florian Westphalcf673ed2020-03-30 18:51:29 +02003825 skb_release_head_state(nskb);
Steffen Klassert3a1296a2020-01-25 11:26:44 +01003826 __copy_skb_header(nskb, skb);
3827
3828 skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb));
3829 skb_copy_from_linear_data_offset(skb, -tnl_hlen,
3830 nskb->data - tnl_hlen,
3831 offset + tnl_hlen);
3832
3833 if (skb_needs_linearize(nskb, features) &&
3834 __skb_linearize(nskb))
3835 goto err_linearize;
3836
3837 } while (list_skb);
3838
3839 skb->truesize = skb->truesize - delta_truesize;
3840 skb->data_len = skb->data_len - delta_len;
3841 skb->len = skb->len - delta_len;
3842
3843 skb_gso_reset(skb);
3844
3845 skb->prev = tail;
3846
3847 if (skb_needs_linearize(skb, features) &&
3848 __skb_linearize(skb))
3849 goto err_linearize;
3850
3851 skb_get(skb);
3852
3853 return skb;
3854
3855err_linearize:
3856 kfree_skb_list(skb->next);
3857 skb->next = NULL;
3858 return ERR_PTR(-ENOMEM);
3859}
3860EXPORT_SYMBOL_GPL(skb_segment_list);
3861
3862int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
3863{
3864 if (unlikely(p->len + skb->len >= 65536))
3865 return -E2BIG;
3866
3867 if (NAPI_GRO_CB(p)->last == p)
3868 skb_shinfo(p)->frag_list = skb;
3869 else
3870 NAPI_GRO_CB(p)->last->next = skb;
3871
3872 skb_pull(skb, skb_gro_offset(skb));
3873
3874 NAPI_GRO_CB(p)->last = skb;
3875 NAPI_GRO_CB(p)->count++;
3876 p->data_len += skb->len;
3877 p->truesize += skb->truesize;
3878 p->len += skb->len;
3879
3880 NAPI_GRO_CB(skb)->same_flow = 1;
3881
3882 return 0;
3883}
Steffen Klassert3a1296a2020-01-25 11:26:44 +01003884
Herbert Xuf4c50d92006-06-22 03:02:40 -07003885/**
3886 * skb_segment - Perform protocol segmentation on skb.
Michael S. Tsirkindf5771f2014-03-10 18:29:19 +02003887 * @head_skb: buffer to segment
Herbert Xu576a30e2006-06-27 13:22:38 -07003888 * @features: features for the output path (see dev->features)
Herbert Xuf4c50d92006-06-22 03:02:40 -07003889 *
3890 * This function performs segmentation on the given skb. It returns
Ben Hutchings4c821d72008-04-13 21:52:48 -07003891 * a pointer to the first in a list of new skbs for the segments.
3892 * In case of error it returns ERR_PTR(err).
Herbert Xuf4c50d92006-06-22 03:02:40 -07003893 */
Michael S. Tsirkindf5771f2014-03-10 18:29:19 +02003894struct sk_buff *skb_segment(struct sk_buff *head_skb,
3895 netdev_features_t features)
Herbert Xuf4c50d92006-06-22 03:02:40 -07003896{
3897 struct sk_buff *segs = NULL;
3898 struct sk_buff *tail = NULL;
Michael S. Tsirkin1a4ceda2014-03-10 19:27:59 +02003899 struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
Michael S. Tsirkindf5771f2014-03-10 18:29:19 +02003900 skb_frag_t *frag = skb_shinfo(head_skb)->frags;
3901 unsigned int mss = skb_shinfo(head_skb)->gso_size;
3902 unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
Michael S. Tsirkin1fd819e2014-03-10 19:28:08 +02003903 struct sk_buff *frag_skb = head_skb;
Herbert Xuf4c50d92006-06-22 03:02:40 -07003904 unsigned int offset = doffset;
Michael S. Tsirkindf5771f2014-03-10 18:29:19 +02003905 unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
Alexander Duyck802ab552016-04-10 21:45:03 -04003906 unsigned int partial_segs = 0;
Herbert Xuf4c50d92006-06-22 03:02:40 -07003907 unsigned int headroom;
Alexander Duyck802ab552016-04-10 21:45:03 -04003908 unsigned int len = head_skb->len;
Pravin B Shelarec5f0612013-03-07 09:28:01 +00003909 __be16 proto;
Alexander Duyck36c98382016-05-02 09:38:18 -07003910 bool csum, sg;
Michael S. Tsirkindf5771f2014-03-10 18:29:19 +02003911 int nfrags = skb_shinfo(head_skb)->nr_frags;
Herbert Xuf4c50d92006-06-22 03:02:40 -07003912 int err = -ENOMEM;
3913 int i = 0;
3914 int pos;
3915
Shmulik Ladkani3dcbdb12019-09-06 12:23:50 +03003916 if (list_skb && !list_skb->head_frag && skb_headlen(list_skb) &&
3917 (skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY)) {
3918 /* gso_size is untrusted, and we have a frag_list with a linear
3919 * non head_frag head.
3920 *
3921		 * (we assume checking the first list_skb member suffices;
3922		 * i.e. if any of the list_skb members has a non-head_frag
3923		 * head, then the first one does too).
3924 *
3925 * If head_skb's headlen does not fit requested gso_size, it
3926 * means that the frag_list members do NOT terminate on exact
3927 * gso_size boundaries. Hence we cannot perform skb_frag_t page
3928 * sharing. Therefore we must fallback to copying the frag_list
3929 * skbs; we do so by disabling SG.
3930 */
3931 if (mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb))
3932 features &= ~NETIF_F_SG;
3933 }
3934
Wei-Chun Chao5882a072014-06-08 23:48:54 -07003935 __skb_push(head_skb, doffset);
Miaohe Lin2f631132020-08-01 17:36:05 +08003936 proto = skb_network_protocol(head_skb, NULL);
Pravin B Shelarec5f0612013-03-07 09:28:01 +00003937 if (unlikely(!proto))
3938 return ERR_PTR(-EINVAL);
3939
Alexander Duyck36c98382016-05-02 09:38:18 -07003940 sg = !!(features & NETIF_F_SG);
Alexander Duyckf245d072016-02-05 15:28:26 -08003941 csum = !!can_checksum_protocol(features, proto);
Tom Herbert7e2b10c2014-06-04 17:20:02 -07003942
Steffen Klassert07b26c92016-09-19 12:58:47 +02003943 if (sg && csum && (mss != GSO_BY_FRAGS)) {
3944 if (!(features & NETIF_F_GSO_PARTIAL)) {
3945 struct sk_buff *iter;
Ilan Tayari43170c42017-04-19 21:26:07 +03003946 unsigned int frag_len;
Steffen Klassert07b26c92016-09-19 12:58:47 +02003947
3948 if (!list_skb ||
3949 !net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
3950 goto normal;
3951
Ilan Tayari43170c42017-04-19 21:26:07 +03003952 /* If we get here then all the required
3953 * GSO features except frag_list are supported.
3954			 * Try to split the SKB into multiple GSO SKBs
3955 * with no frag_list.
3956 * Currently we can do that only when the buffers don't
3957 * have a linear part and all the buffers except
3958 * the last are of the same length.
Steffen Klassert07b26c92016-09-19 12:58:47 +02003959 */
Ilan Tayari43170c42017-04-19 21:26:07 +03003960 frag_len = list_skb->len;
Steffen Klassert07b26c92016-09-19 12:58:47 +02003961 skb_walk_frags(head_skb, iter) {
Ilan Tayari43170c42017-04-19 21:26:07 +03003962 if (frag_len != iter->len && iter->next)
3963 goto normal;
Ilan Tayarieaffadb2017-04-08 02:07:08 +03003964 if (skb_headlen(iter) && !iter->head_frag)
Steffen Klassert07b26c92016-09-19 12:58:47 +02003965 goto normal;
3966
3967 len -= iter->len;
3968 }
Ilan Tayari43170c42017-04-19 21:26:07 +03003969
3970 if (len != frag_len)
3971 goto normal;
Steffen Klassert07b26c92016-09-19 12:58:47 +02003972 }
3973
3974 /* GSO partial only requires that we trim off any excess that
3975 * doesn't fit into an MSS sized block, so take care of that
3976 * now.
3977 */
Alexander Duyck802ab552016-04-10 21:45:03 -04003978 partial_segs = len / mss;
Alexander Duyckd7fb5a82016-05-02 09:38:12 -07003979 if (partial_segs > 1)
3980 mss *= partial_segs;
3981 else
3982 partial_segs = 0;
Alexander Duyck802ab552016-04-10 21:45:03 -04003983 }
3984
Steffen Klassert07b26c92016-09-19 12:58:47 +02003985normal:
Michael S. Tsirkindf5771f2014-03-10 18:29:19 +02003986 headroom = skb_headroom(head_skb);
3987 pos = skb_headlen(head_skb);
Herbert Xuf4c50d92006-06-22 03:02:40 -07003988
3989 do {
3990 struct sk_buff *nskb;
Michael S. Tsirkin8cb19902014-03-10 18:29:04 +02003991 skb_frag_t *nskb_frag;
Herbert Xuc8884ed2006-10-29 15:59:41 -08003992 int hsize;
Herbert Xuf4c50d92006-06-22 03:02:40 -07003993 int size;
3994
Marcelo Ricardo Leitner3953c462016-06-02 15:05:40 -03003995 if (unlikely(mss == GSO_BY_FRAGS)) {
3996 len = list_skb->len;
3997 } else {
3998 len = head_skb->len - offset;
3999 if (len > mss)
4000 len = mss;
4001 }
Herbert Xuf4c50d92006-06-22 03:02:40 -07004002
Michael S. Tsirkindf5771f2014-03-10 18:29:19 +02004003 hsize = skb_headlen(head_skb) - offset;
Herbert Xuf4c50d92006-06-22 03:02:40 -07004004
Xin Longdbd50f22021-01-15 17:36:38 +08004005 if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) &&
Michael S. Tsirkin1a4ceda2014-03-10 19:27:59 +02004006 (skb_headlen(list_skb) == len || sg)) {
4007 BUG_ON(skb_headlen(list_skb) > len);
Herbert Xu89319d382008-12-15 23:26:06 -08004008
Herbert Xu9d8506c2013-11-21 11:10:04 -08004009 i = 0;
Michael S. Tsirkin1a4ceda2014-03-10 19:27:59 +02004010 nfrags = skb_shinfo(list_skb)->nr_frags;
4011 frag = skb_shinfo(list_skb)->frags;
Michael S. Tsirkin1fd819e2014-03-10 19:28:08 +02004012 frag_skb = list_skb;
Michael S. Tsirkin1a4ceda2014-03-10 19:27:59 +02004013 pos += skb_headlen(list_skb);
Herbert Xu9d8506c2013-11-21 11:10:04 -08004014
4015 while (pos < offset + len) {
4016 BUG_ON(i >= nfrags);
4017
Michael S. Tsirkin4e1beba2014-03-10 18:29:14 +02004018 size = skb_frag_size(frag);
Herbert Xu9d8506c2013-11-21 11:10:04 -08004019 if (pos + size > offset + len)
4020 break;
4021
4022 i++;
4023 pos += size;
Michael S. Tsirkin4e1beba2014-03-10 18:29:14 +02004024 frag++;
Herbert Xu9d8506c2013-11-21 11:10:04 -08004025 }
4026
Michael S. Tsirkin1a4ceda2014-03-10 19:27:59 +02004027 nskb = skb_clone(list_skb, GFP_ATOMIC);
4028 list_skb = list_skb->next;
Herbert Xu89319d382008-12-15 23:26:06 -08004029
4030 if (unlikely(!nskb))
4031 goto err;
4032
Herbert Xu9d8506c2013-11-21 11:10:04 -08004033 if (unlikely(pskb_trim(nskb, len))) {
4034 kfree_skb(nskb);
4035 goto err;
4036 }
4037
Alexander Duyckec47ea82012-05-04 14:26:56 +00004038 hsize = skb_end_offset(nskb);
Herbert Xu89319d382008-12-15 23:26:06 -08004039 if (skb_cow_head(nskb, doffset + headroom)) {
4040 kfree_skb(nskb);
4041 goto err;
4042 }
4043
Alexander Duyckec47ea82012-05-04 14:26:56 +00004044 nskb->truesize += skb_end_offset(nskb) - hsize;
Herbert Xu89319d382008-12-15 23:26:06 -08004045 skb_release_head_state(nskb);
4046 __skb_push(nskb, doffset);
4047 } else {
Paolo Abeni00b229f2021-01-19 17:56:56 +01004048 if (hsize < 0)
4049 hsize = 0;
Xin Longdbd50f22021-01-15 17:36:38 +08004050 if (hsize > len || !sg)
4051 hsize = len;
Xin Longdbd50f22021-01-15 17:36:38 +08004052
Mel Gormanc93bdd02012-07-31 16:44:19 -07004053 nskb = __alloc_skb(hsize + doffset + headroom,
Michael S. Tsirkindf5771f2014-03-10 18:29:19 +02004054 GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
Mel Gormanc93bdd02012-07-31 16:44:19 -07004055 NUMA_NO_NODE);
Herbert Xu89319d382008-12-15 23:26:06 -08004056
4057 if (unlikely(!nskb))
4058 goto err;
4059
4060 skb_reserve(nskb, headroom);
4061 __skb_put(nskb, doffset);
4062 }
Herbert Xuf4c50d92006-06-22 03:02:40 -07004063
4064 if (segs)
4065 tail->next = nskb;
4066 else
4067 segs = nskb;
4068 tail = nskb;
4069
Michael S. Tsirkindf5771f2014-03-10 18:29:19 +02004070 __copy_skb_header(nskb, head_skb);
Herbert Xuf4c50d92006-06-22 03:02:40 -07004071
Eric Dumazet030737b2013-10-19 11:42:54 -07004072 skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
Vlad Yasevichfcdfe3a2014-07-31 10:33:06 -04004073 skb_reset_mac_len(nskb);
Pravin B Shelar68c33162013-02-14 14:02:41 +00004074
Michael S. Tsirkindf5771f2014-03-10 18:29:19 +02004075 skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
Pravin B Shelar68c33162013-02-14 14:02:41 +00004076 nskb->data - tnl_hlen,
4077 doffset + tnl_hlen);
Herbert Xu89319d382008-12-15 23:26:06 -08004078
Herbert Xu9d8506c2013-11-21 11:10:04 -08004079 if (nskb->len == len + doffset)
Simon Horman1cdbcb72013-05-19 15:46:49 +00004080 goto perform_csum_check;
Herbert Xu89319d382008-12-15 23:26:06 -08004081
Alexander Duyck7fbeffe2016-02-05 15:27:43 -08004082 if (!sg) {
Yadu Kishore1454c9f2020-03-17 14:08:38 +05304083 if (!csum) {
4084 if (!nskb->remcsum_offload)
4085 nskb->ip_summed = CHECKSUM_NONE;
4086 SKB_GSO_CB(nskb)->csum =
4087 skb_copy_and_csum_bits(head_skb, offset,
4088 skb_put(nskb,
4089 len),
Al Viro8d5930d2020-07-10 20:07:10 -04004090 len);
Yadu Kishore1454c9f2020-03-17 14:08:38 +05304091 SKB_GSO_CB(nskb)->csum_start =
4092 skb_headroom(nskb) + doffset;
4093 } else {
4094 skb_copy_bits(head_skb, offset,
4095 skb_put(nskb, len),
4096 len);
4097 }
Herbert Xuf4c50d92006-06-22 03:02:40 -07004098 continue;
4099 }
4100
Michael S. Tsirkin8cb19902014-03-10 18:29:04 +02004101 nskb_frag = skb_shinfo(nskb)->frags;
Herbert Xuf4c50d92006-06-22 03:02:40 -07004102
Michael S. Tsirkindf5771f2014-03-10 18:29:19 +02004103 skb_copy_from_linear_data_offset(head_skb, offset,
Arnaldo Carvalho de Melod626f622007-03-27 18:55:52 -03004104 skb_put(nskb, hsize), hsize);
Herbert Xuf4c50d92006-06-22 03:02:40 -07004105
Jonathan Lemon06b4feb2021-01-06 14:18:38 -08004106 skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags &
4107 SKBFL_SHARED_FRAG;
Eric Dumazetcef401d2013-01-25 20:34:37 +00004108
Willem de Bruijnbf5c25d2017-12-22 19:00:17 -05004109 if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
4110 skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
4111 goto err;
4112
Herbert Xu9d8506c2013-11-21 11:10:04 -08004113 while (pos < offset + len) {
4114 if (i >= nfrags) {
Herbert Xu9d8506c2013-11-21 11:10:04 -08004115 i = 0;
Michael S. Tsirkin1a4ceda2014-03-10 19:27:59 +02004116 nfrags = skb_shinfo(list_skb)->nr_frags;
4117 frag = skb_shinfo(list_skb)->frags;
Michael S. Tsirkin1fd819e2014-03-10 19:28:08 +02004118 frag_skb = list_skb;
Yonghong Song13acc942018-03-21 16:31:03 -07004119 if (!skb_headlen(list_skb)) {
4120 BUG_ON(!nfrags);
4121 } else {
4122 BUG_ON(!list_skb->head_frag);
Herbert Xu9d8506c2013-11-21 11:10:04 -08004123
Yonghong Song13acc942018-03-21 16:31:03 -07004124 /* to make room for head_frag. */
4125 i--;
4126 frag--;
4127 }
Willem de Bruijnbf5c25d2017-12-22 19:00:17 -05004128 if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
4129 skb_zerocopy_clone(nskb, frag_skb,
4130 GFP_ATOMIC))
4131 goto err;
4132
Michael S. Tsirkin1a4ceda2014-03-10 19:27:59 +02004133 list_skb = list_skb->next;
Herbert Xu9d8506c2013-11-21 11:10:04 -08004134 }
4135
4136 if (unlikely(skb_shinfo(nskb)->nr_frags >=
4137 MAX_SKB_FRAGS)) {
4138 net_warn_ratelimited(
4139 "skb_segment: too many frags: %u %u\n",
4140 pos, mss);
Eric Dumazetff907a12018-07-19 16:04:38 -07004141 err = -EINVAL;
Herbert Xu9d8506c2013-11-21 11:10:04 -08004142 goto err;
4143 }
4144
Yonghong Song13acc942018-03-21 16:31:03 -07004145 *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag;
Michael S. Tsirkin8cb19902014-03-10 18:29:04 +02004146 __skb_frag_ref(nskb_frag);
4147 size = skb_frag_size(nskb_frag);
Herbert Xuf4c50d92006-06-22 03:02:40 -07004148
4149 if (pos < offset) {
Jonathan Lemonb54c9d52019-07-30 07:40:33 -07004150 skb_frag_off_add(nskb_frag, offset - pos);
Michael S. Tsirkin8cb19902014-03-10 18:29:04 +02004151 skb_frag_size_sub(nskb_frag, offset - pos);
Herbert Xuf4c50d92006-06-22 03:02:40 -07004152 }
4153
Herbert Xu89319d382008-12-15 23:26:06 -08004154 skb_shinfo(nskb)->nr_frags++;
Herbert Xuf4c50d92006-06-22 03:02:40 -07004155
4156 if (pos + size <= offset + len) {
4157 i++;
Michael S. Tsirkin4e1beba2014-03-10 18:29:14 +02004158 frag++;
Herbert Xuf4c50d92006-06-22 03:02:40 -07004159 pos += size;
4160 } else {
Michael S. Tsirkin8cb19902014-03-10 18:29:04 +02004161 skb_frag_size_sub(nskb_frag, pos + size - (offset + len));
Herbert Xu89319d382008-12-15 23:26:06 -08004162 goto skip_fraglist;
Herbert Xuf4c50d92006-06-22 03:02:40 -07004163 }
4164
Michael S. Tsirkin8cb19902014-03-10 18:29:04 +02004165 nskb_frag++;
Herbert Xuf4c50d92006-06-22 03:02:40 -07004166 }
4167
Herbert Xu89319d382008-12-15 23:26:06 -08004168skip_fraglist:
Herbert Xuf4c50d92006-06-22 03:02:40 -07004169 nskb->data_len = len - hsize;
4170 nskb->len += nskb->data_len;
4171 nskb->truesize += nskb->data_len;
Pravin B Shelarec5f0612013-03-07 09:28:01 +00004172
Simon Horman1cdbcb72013-05-19 15:46:49 +00004173perform_csum_check:
Alexander Duyck7fbeffe2016-02-05 15:27:43 -08004174 if (!csum) {
Eric Dumazetff907a12018-07-19 16:04:38 -07004175 if (skb_has_shared_frag(nskb) &&
4176 __skb_linearize(nskb))
4177 goto err;
4178
Alexander Duyck7fbeffe2016-02-05 15:27:43 -08004179 if (!nskb->remcsum_offload)
4180 nskb->ip_summed = CHECKSUM_NONE;
Alexander Duyck76443452016-02-05 15:27:37 -08004181 SKB_GSO_CB(nskb)->csum =
4182 skb_checksum(nskb, doffset,
4183 nskb->len - doffset, 0);
Tom Herbert7e2b10c2014-06-04 17:20:02 -07004184 SKB_GSO_CB(nskb)->csum_start =
Alexander Duyck76443452016-02-05 15:27:37 -08004185 skb_headroom(nskb) + doffset;
Pravin B Shelarec5f0612013-03-07 09:28:01 +00004186 }
Michael S. Tsirkindf5771f2014-03-10 18:29:19 +02004187 } while ((offset += len) < head_skb->len);
Herbert Xuf4c50d92006-06-22 03:02:40 -07004188
Eric Dumazetbec3cfd2014-10-03 20:59:19 -07004189 /* Some callers want to get the end of the list.
4190 * Put it in segs->prev to avoid walking the list.
4191 * (see validate_xmit_skb_list() for example)
4192 */
4193 segs->prev = tail;
Toshiaki Makita432c8562014-10-27 10:30:51 -07004194
Alexander Duyck802ab552016-04-10 21:45:03 -04004195 if (partial_segs) {
Steffen Klassert07b26c92016-09-19 12:58:47 +02004196 struct sk_buff *iter;
Alexander Duyck802ab552016-04-10 21:45:03 -04004197 int type = skb_shinfo(head_skb)->gso_type;
Steffen Klassert07b26c92016-09-19 12:58:47 +02004198 unsigned short gso_size = skb_shinfo(head_skb)->gso_size;
Alexander Duyck802ab552016-04-10 21:45:03 -04004199
4200 /* Update type to add partial and then remove dodgy if set */
Steffen Klassert07b26c92016-09-19 12:58:47 +02004201 type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL;
Alexander Duyck802ab552016-04-10 21:45:03 -04004202 type &= ~SKB_GSO_DODGY;
4203
4204 /* Update GSO info and prepare to start updating headers on
4205 * our way back down the stack of protocols.
4206 */
Steffen Klassert07b26c92016-09-19 12:58:47 +02004207 for (iter = segs; iter; iter = iter->next) {
4208 skb_shinfo(iter)->gso_size = gso_size;
4209 skb_shinfo(iter)->gso_segs = partial_segs;
4210 skb_shinfo(iter)->gso_type = type;
4211 SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset;
4212 }
4213
4214 if (tail->len - doffset <= gso_size)
4215 skb_shinfo(tail)->gso_size = 0;
4216 else if (tail != segs)
4217 skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size);
Alexander Duyck802ab552016-04-10 21:45:03 -04004218 }
4219
Toshiaki Makita432c8562014-10-27 10:30:51 -07004220 /* Following permits correct backpressure, for protocols
4221 * using skb_set_owner_w().
4222	 * The idea is to transfer ownership from head_skb to the last segment.
4223 */
4224 if (head_skb->destructor == sock_wfree) {
4225 swap(tail->truesize, head_skb->truesize);
4226 swap(tail->destructor, head_skb->destructor);
4227 swap(tail->sk, head_skb->sk);
4228 }
Herbert Xuf4c50d92006-06-22 03:02:40 -07004229 return segs;
4230
4231err:
Eric Dumazet289dccb2013-12-20 14:29:08 -08004232 kfree_skb_list(segs);
Herbert Xuf4c50d92006-06-22 03:02:40 -07004233 return ERR_PTR(err);
4234}
Herbert Xuf4c50d92006-06-22 03:02:40 -07004235EXPORT_SYMBOL_GPL(skb_segment);
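
/* Hypothetical sketch (not part of this file): the usual shape of a
 * protocol's .gso_segment callback built on top of skb_segment().  The
 * function name and the per-segment fixups are placeholders.
 */
static struct sk_buff *example_gso_segment(struct sk_buff *skb,
					   netdev_features_t features)
{
	struct sk_buff *segs;

	segs = skb_segment(skb, features);
	if (IS_ERR(segs))
		return segs;	/* propagate ERR_PTR() to the caller */

	/* Every segment now carries a copy of the original headers; a
	 * real callback walks the list here and patches per-segment
	 * fields (lengths, sequence numbers, checksums, ...).
	 */
	return segs;
}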
4236
David Millerd4546c22018-06-24 14:13:49 +09004237int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
Herbert Xu71d93b32008-12-15 23:42:33 -08004238{
Eric Dumazet8a291112013-10-08 09:02:23 -07004239 struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
Herbert Xu67147ba2009-05-26 18:50:22 +00004240 unsigned int offset = skb_gro_offset(skb);
4241 unsigned int headlen = skb_headlen(skb);
Eric Dumazet8a291112013-10-08 09:02:23 -07004242 unsigned int len = skb_gro_len(skb);
Eric Dumazet715dc1f2012-05-02 23:33:21 +00004243 unsigned int delta_truesize;
David Millerd4546c22018-06-24 14:13:49 +09004244 struct sk_buff *lp;
Herbert Xu71d93b32008-12-15 23:42:33 -08004245
Steffen Klassert0ab03f32019-04-02 08:16:03 +02004246 if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush))
Herbert Xu71d93b32008-12-15 23:42:33 -08004247 return -E2BIG;
4248
Eric Dumazet29e98242014-05-16 11:34:37 -07004249 lp = NAPI_GRO_CB(p)->last;
Eric Dumazet8a291112013-10-08 09:02:23 -07004250 pinfo = skb_shinfo(lp);
4251
4252 if (headlen <= offset) {
Herbert Xu42da6992009-05-26 18:50:19 +00004253 skb_frag_t *frag;
Herbert Xu66e92fc2009-05-26 18:50:32 +00004254 skb_frag_t *frag2;
Herbert Xu9aaa1562009-05-26 18:50:33 +00004255 int i = skbinfo->nr_frags;
4256 int nr_frags = pinfo->nr_frags + i;
Herbert Xu42da6992009-05-26 18:50:19 +00004257
Herbert Xu66e92fc2009-05-26 18:50:32 +00004258 if (nr_frags > MAX_SKB_FRAGS)
Eric Dumazet8a291112013-10-08 09:02:23 -07004259 goto merge;
Herbert Xu81705ad2009-01-29 14:19:51 +00004260
Eric Dumazet8a291112013-10-08 09:02:23 -07004261 offset -= headlen;
Herbert Xu9aaa1562009-05-26 18:50:33 +00004262 pinfo->nr_frags = nr_frags;
4263 skbinfo->nr_frags = 0;
Herbert Xuf5572062009-01-14 20:40:03 -08004264
Herbert Xu9aaa1562009-05-26 18:50:33 +00004265 frag = pinfo->frags + nr_frags;
4266 frag2 = skbinfo->frags + i;
Herbert Xu66e92fc2009-05-26 18:50:32 +00004267 do {
4268 *--frag = *--frag2;
4269 } while (--i);
4270
Jonathan Lemonb54c9d52019-07-30 07:40:33 -07004271 skb_frag_off_add(frag, offset);
Eric Dumazet9e903e02011-10-18 21:00:24 +00004272 skb_frag_size_sub(frag, offset);
Herbert Xu66e92fc2009-05-26 18:50:32 +00004273
Eric Dumazet715dc1f2012-05-02 23:33:21 +00004274 /* all fragments truesize : remove (head size + sk_buff) */
Alexander Duyckec47ea82012-05-04 14:26:56 +00004275 delta_truesize = skb->truesize -
4276 SKB_TRUESIZE(skb_end_offset(skb));
Eric Dumazet715dc1f2012-05-02 23:33:21 +00004277
Herbert Xuf5572062009-01-14 20:40:03 -08004278 skb->truesize -= skb->data_len;
4279 skb->len -= skb->data_len;
4280 skb->data_len = 0;
4281
Eric Dumazet715dc1f2012-05-02 23:33:21 +00004282 NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
Herbert Xu5d38a072009-01-04 16:13:40 -08004283 goto done;
Eric Dumazetd7e88832012-04-30 08:10:34 +00004284 } else if (skb->head_frag) {
4285 int nr_frags = pinfo->nr_frags;
4286 skb_frag_t *frag = pinfo->frags + nr_frags;
4287 struct page *page = virt_to_head_page(skb->head);
4288 unsigned int first_size = headlen - offset;
4289 unsigned int first_offset;
4290
4291 if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
Eric Dumazet8a291112013-10-08 09:02:23 -07004292 goto merge;
Eric Dumazetd7e88832012-04-30 08:10:34 +00004293
4294 first_offset = skb->data -
4295 (unsigned char *)page_address(page) +
4296 offset;
4297
4298 pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
4299
Matthew Wilcox (Oracle)d8e18a52019-07-22 20:08:26 -07004300 __skb_frag_set_page(frag, page);
Jonathan Lemonb54c9d52019-07-30 07:40:33 -07004301 skb_frag_off_set(frag, first_offset);
Eric Dumazetd7e88832012-04-30 08:10:34 +00004302 skb_frag_size_set(frag, first_size);
4303
4304 memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
4305		/* We don't need to clear skbinfo->nr_frags here */
4306
Eric Dumazet715dc1f2012-05-02 23:33:21 +00004307 delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
Eric Dumazetd7e88832012-04-30 08:10:34 +00004308 NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
4309 goto done;
Eric Dumazet8a291112013-10-08 09:02:23 -07004310 }
Herbert Xu71d93b32008-12-15 23:42:33 -08004311
4312merge:
Eric Dumazet715dc1f2012-05-02 23:33:21 +00004313 delta_truesize = skb->truesize;
Herbert Xu67147ba2009-05-26 18:50:22 +00004314 if (offset > headlen) {
Michal Schmidtd1dc7ab2011-01-24 12:08:48 +00004315 unsigned int eat = offset - headlen;
4316
Jonathan Lemonb54c9d52019-07-30 07:40:33 -07004317 skb_frag_off_add(&skbinfo->frags[0], eat);
Eric Dumazet9e903e02011-10-18 21:00:24 +00004318 skb_frag_size_sub(&skbinfo->frags[0], eat);
Michal Schmidtd1dc7ab2011-01-24 12:08:48 +00004319 skb->data_len -= eat;
4320 skb->len -= eat;
Herbert Xu67147ba2009-05-26 18:50:22 +00004321 offset = headlen;
Herbert Xu56035022009-02-05 21:26:52 -08004322 }
4323
Herbert Xu67147ba2009-05-26 18:50:22 +00004324 __skb_pull(skb, offset);
Herbert Xu56035022009-02-05 21:26:52 -08004325
Eric Dumazet29e98242014-05-16 11:34:37 -07004326 if (NAPI_GRO_CB(p)->last == p)
Eric Dumazet8a291112013-10-08 09:02:23 -07004327 skb_shinfo(p)->frag_list = skb;
4328 else
4329 NAPI_GRO_CB(p)->last->next = skb;
Eric Dumazetc3c7c252012-12-06 13:54:59 +00004330 NAPI_GRO_CB(p)->last = skb;
Eric Dumazetf4a775d2014-09-22 16:29:32 -07004331 __skb_header_release(skb);
Eric Dumazet8a291112013-10-08 09:02:23 -07004332 lp = p;
Herbert Xu71d93b32008-12-15 23:42:33 -08004333
Herbert Xu5d38a072009-01-04 16:13:40 -08004334done:
4335 NAPI_GRO_CB(p)->count++;
Herbert Xu37fe4732009-01-17 19:48:13 +00004336 p->data_len += len;
Eric Dumazet715dc1f2012-05-02 23:33:21 +00004337 p->truesize += delta_truesize;
Herbert Xu37fe4732009-01-17 19:48:13 +00004338 p->len += len;
Eric Dumazet8a291112013-10-08 09:02:23 -07004339 if (lp != p) {
4340 lp->data_len += len;
4341 lp->truesize += delta_truesize;
4342 lp->len += len;
4343 }
Herbert Xu71d93b32008-12-15 23:42:33 -08004344 NAPI_GRO_CB(skb)->same_flow = 1;
4345 return 0;
4346}
Herbert Xu71d93b32008-12-15 23:42:33 -08004347
Florian Westphaldf5042f2018-12-18 17:15:16 +01004348#ifdef CONFIG_SKB_EXTENSIONS
4349#define SKB_EXT_ALIGN_VALUE 8
4350#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE)
4351
4352static const u8 skb_ext_type_len[] = {
4353#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
4354 [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info),
4355#endif
Florian Westphal41650792018-12-18 17:15:27 +01004356#ifdef CONFIG_XFRM
4357 [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path),
4358#endif
Paul Blakey95a72332019-09-04 16:56:37 +03004359#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
4360 [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext),
4361#endif
Mat Martineau3ee17bc2020-01-09 07:59:19 -08004362#if IS_ENABLED(CONFIG_MPTCP)
4363 [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext),
4364#endif
Florian Westphaldf5042f2018-12-18 17:15:16 +01004365};
4366
4367static __always_inline unsigned int skb_ext_total_length(void)
4368{
4369 return SKB_EXT_CHUNKSIZEOF(struct skb_ext) +
4370#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
4371 skb_ext_type_len[SKB_EXT_BRIDGE_NF] +
4372#endif
Florian Westphal41650792018-12-18 17:15:27 +01004373#ifdef CONFIG_XFRM
4374 skb_ext_type_len[SKB_EXT_SEC_PATH] +
4375#endif
Paul Blakey95a72332019-09-04 16:56:37 +03004376#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
4377 skb_ext_type_len[TC_SKB_EXT] +
4378#endif
Mat Martineau3ee17bc2020-01-09 07:59:19 -08004379#if IS_ENABLED(CONFIG_MPTCP)
4380 skb_ext_type_len[SKB_EXT_MPTCP] +
4381#endif
Florian Westphaldf5042f2018-12-18 17:15:16 +01004382 0;
4383}
4384
4385static void skb_extensions_init(void)
4386{
4387 BUILD_BUG_ON(SKB_EXT_NUM >= 8);
4388 BUILD_BUG_ON(skb_ext_total_length() > 255);
4389
4390 skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache",
4391 SKB_EXT_ALIGN_VALUE * skb_ext_total_length(),
4392 0,
4393 SLAB_HWCACHE_ALIGN|SLAB_PANIC,
4394 NULL);
4395}
4396#else
4397static void skb_extensions_init(void) {}
4398#endif
4399
Linus Torvalds1da177e2005-04-16 15:20:36 -07004400void __init skb_init(void)
4401{
Kees Cook79a8a642018-02-07 17:44:38 -08004402 skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache",
Linus Torvalds1da177e2005-04-16 15:20:36 -07004403 sizeof(struct sk_buff),
4404 0,
Alexey Dobriyane5d679f332006-08-26 19:25:52 -07004405 SLAB_HWCACHE_ALIGN|SLAB_PANIC,
Kees Cook79a8a642018-02-07 17:44:38 -08004406 offsetof(struct sk_buff, cb),
4407 sizeof_field(struct sk_buff, cb),
Paul Mundt20c2df82007-07-20 10:11:58 +09004408 NULL);
David S. Millerd179cd12005-08-17 14:57:30 -07004409 skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
Eric Dumazetd0bf4a92014-09-29 13:29:15 -07004410 sizeof(struct sk_buff_fclones),
David S. Millerd179cd12005-08-17 14:57:30 -07004411 0,
Alexey Dobriyane5d679f332006-08-26 19:25:52 -07004412 SLAB_HWCACHE_ALIGN|SLAB_PANIC,
Paul Mundt20c2df82007-07-20 10:11:58 +09004413 NULL);
Florian Westphaldf5042f2018-12-18 17:15:16 +01004414 skb_extensions_init();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004415}
4416
David S. Miller51c739d2007-10-30 21:29:29 -07004417static int
Jason A. Donenfeld48a1df62017-06-04 04:16:22 +02004418__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len,
4419 unsigned int recursion_level)
David Howells716ea3a2007-04-02 20:19:53 -07004420{
David S. Miller1a028e52007-04-27 15:21:23 -07004421 int start = skb_headlen(skb);
4422 int i, copy = start - offset;
David S. Millerfbb398a2009-06-09 00:18:59 -07004423 struct sk_buff *frag_iter;
David Howells716ea3a2007-04-02 20:19:53 -07004424 int elt = 0;
4425
Jason A. Donenfeld48a1df62017-06-04 04:16:22 +02004426 if (unlikely(recursion_level >= 24))
4427 return -EMSGSIZE;
4428
David Howells716ea3a2007-04-02 20:19:53 -07004429 if (copy > 0) {
4430 if (copy > len)
4431 copy = len;
Jens Axboe642f149032007-10-24 11:20:47 +02004432 sg_set_buf(sg, skb->data + offset, copy);
David Howells716ea3a2007-04-02 20:19:53 -07004433 elt++;
4434 if ((len -= copy) == 0)
4435 return elt;
4436 offset += copy;
4437 }
4438
4439 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
David S. Miller1a028e52007-04-27 15:21:23 -07004440 int end;
David Howells716ea3a2007-04-02 20:19:53 -07004441
Ilpo Järvinen547b7922008-07-25 21:43:18 -07004442 WARN_ON(start > offset + len);
David S. Miller1a028e52007-04-27 15:21:23 -07004443
Eric Dumazet9e903e02011-10-18 21:00:24 +00004444 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
David Howells716ea3a2007-04-02 20:19:53 -07004445 if ((copy = end - offset) > 0) {
4446 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
Jason A. Donenfeld48a1df62017-06-04 04:16:22 +02004447 if (unlikely(elt && sg_is_last(&sg[elt - 1])))
4448 return -EMSGSIZE;
David Howells716ea3a2007-04-02 20:19:53 -07004449
4450 if (copy > len)
4451 copy = len;
Ian Campbellea2ab692011-08-22 23:44:58 +00004452 sg_set_page(&sg[elt], skb_frag_page(frag), copy,
Jonathan Lemonb54c9d52019-07-30 07:40:33 -07004453 skb_frag_off(frag) + offset - start);
David Howells716ea3a2007-04-02 20:19:53 -07004454 elt++;
4455 if (!(len -= copy))
4456 return elt;
4457 offset += copy;
4458 }
David S. Miller1a028e52007-04-27 15:21:23 -07004459 start = end;
David Howells716ea3a2007-04-02 20:19:53 -07004460 }
4461
David S. Millerfbb398a2009-06-09 00:18:59 -07004462 skb_walk_frags(skb, frag_iter) {
Jason A. Donenfeld48a1df62017-06-04 04:16:22 +02004463 int end, ret;
David Howells716ea3a2007-04-02 20:19:53 -07004464
David S. Millerfbb398a2009-06-09 00:18:59 -07004465 WARN_ON(start > offset + len);
David Howells716ea3a2007-04-02 20:19:53 -07004466
David S. Millerfbb398a2009-06-09 00:18:59 -07004467 end = start + frag_iter->len;
4468 if ((copy = end - offset) > 0) {
Jason A. Donenfeld48a1df62017-06-04 04:16:22 +02004469 if (unlikely(elt && sg_is_last(&sg[elt - 1])))
4470 return -EMSGSIZE;
4471
David S. Millerfbb398a2009-06-09 00:18:59 -07004472 if (copy > len)
4473 copy = len;
Jason A. Donenfeld48a1df62017-06-04 04:16:22 +02004474 ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start,
4475 copy, recursion_level + 1);
4476 if (unlikely(ret < 0))
4477 return ret;
4478 elt += ret;
David S. Millerfbb398a2009-06-09 00:18:59 -07004479 if ((len -= copy) == 0)
4480 return elt;
4481 offset += copy;
David Howells716ea3a2007-04-02 20:19:53 -07004482 }
David S. Millerfbb398a2009-06-09 00:18:59 -07004483 start = end;
David Howells716ea3a2007-04-02 20:19:53 -07004484 }
4485 BUG_ON(len);
4486 return elt;
4487}
4488
Jason A. Donenfeld48a1df62017-06-04 04:16:22 +02004489/**
4490 * skb_to_sgvec - Fill a scatter-gather list from a socket buffer
4491 * @skb: Socket buffer containing the buffers to be mapped
4492 * @sg: The scatter-gather list to map into
4493 * @offset: The offset into the buffer's contents to start mapping
4494 * @len: Length of buffer space to be mapped
4495 *
4496 * Fill the specified scatter-gather list with mappings/pointers into a
4497 * region of the buffer space attached to a socket buffer. Returns either
4498 * the number of scatterlist items used, or -EMSGSIZE if the contents
4499 * could not fit.
4500 */
4501int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
4502{
4503 int nsg = __skb_to_sgvec(skb, sg, offset, len, 0);
4504
4505 if (nsg <= 0)
4506 return nsg;
4507
4508 sg_mark_end(&sg[nsg - 1]);
4509
4510 return nsg;
4511}
4512EXPORT_SYMBOL_GPL(skb_to_sgvec);
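
/* Hypothetical sketch (not part of this file): mapping a whole skb into
 * a scatterlist, roughly as IPsec transforms do.  The on-stack table is
 * for illustration only; it is large enough for the linear head plus
 * MAX_SKB_FRAGS page fragments of a non-frag_list skb.
 */
static int example_map_skb(struct sk_buff *skb)
{
	struct scatterlist sg[MAX_SKB_FRAGS + 1];
	int nsg;

	sg_init_table(sg, ARRAY_SIZE(sg));

	nsg = skb_to_sgvec(skb, sg, 0, skb->len);
	if (nsg < 0)
		return nsg;	/* e.g. -EMSGSIZE if the table is too small */

	/* sg[0..nsg-1] now point at the skb head and its frag pages. */
	return nsg;
}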
4513
Fan Du25a91d82014-01-18 09:54:23 +08004514/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only maps the skb to the
4515 * given sglist without marking the sg which contains the last skb data as the
4516 * end. So the caller can manipulate the sg list at will when padding new data
4517 * after the first call, without calling sg_unmark_end to expand the sg list.
4518 *
4519 * Scenario to use skb_to_sgvec_nomark:
4520 * 1. sg_init_table
4521 * 2. skb_to_sgvec_nomark(payload1)
4522 * 3. skb_to_sgvec_nomark(payload2)
4523 *
4524 * This is equivalent to:
4525 * 1. sg_init_table
4526 * 2. skb_to_sgvec(payload1)
4527 * 3. sg_unmark_end
4528 * 4. skb_to_sgvec(payload2)
4529 *
4530 * When mapping multiple payloads conditionally, skb_to_sgvec_nomark
4531 * is preferable.
4532 */
4533int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
4534 int offset, int len)
4535{
Jason A. Donenfeld48a1df62017-06-04 04:16:22 +02004536 return __skb_to_sgvec(skb, sg, offset, len, 0);
Fan Du25a91d82014-01-18 09:54:23 +08004537}
4538EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);
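
/* Hypothetical sketch (not part of this file) of the scenario described
 * in the comment above: two payload ranges mapped into one table with a
 * single end marker placed once the final length is known.  Offsets and
 * lengths are illustrative.
 */
static int example_map_two_payloads(struct sk_buff *skb,
				    struct scatterlist *sg, int nents,
				    int len1, int len2)
{
	int n1, n2;

	sg_init_table(sg, nents);

	n1 = skb_to_sgvec_nomark(skb, sg, 0, len1);
	if (n1 < 0)
		return n1;

	n2 = skb_to_sgvec_nomark(skb, sg + n1, len1, len2);
	if (n2 < 0)
		return n2;

	sg_mark_end(&sg[n1 + n2 - 1]);	/* terminate the list exactly once */
	return n1 + n2;
}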
4539
David S. Miller51c739d2007-10-30 21:29:29 -07004540
David S. Miller51c739d2007-10-30 21:29:29 -07004541
David Howells716ea3a2007-04-02 20:19:53 -07004542/**
4543 * skb_cow_data - Check that a socket buffer's data buffers are writable
4544 * @skb: The socket buffer to check.
4545 * @tailbits: Amount of trailing space to be added
4546 * @trailer: Returned pointer to the skb where the @tailbits space begins
4547 *
4548 * Make sure that the data buffers attached to a socket buffer are
4549 * writable. If they are not, private copies are made of the data buffers
4550 * and the socket buffer is set to use these instead.
4551 *
4552 * If @tailbits is given, make sure that there is space to write @tailbits
4553 * bytes of data beyond current end of socket buffer. @trailer will be
4554 * set to point to the skb in which this space begins.
4555 *
4556 * The number of scatterlist elements required to completely map the
4557 * COW'd and extended socket buffer will be returned.
4558 */
4559int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
4560{
4561 int copyflag;
4562 int elt;
4563 struct sk_buff *skb1, **skb_p;
4564
4565 /* If skb is cloned or its head is paged, reallocate
4566 * head pulling out all the pages (pages are considered not writable
4567 * at the moment even if they are anonymous).
4568 */
4569 if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
Miaohe Linc15fc192020-08-01 17:30:23 +08004570 !__pskb_pull_tail(skb, __skb_pagelen(skb)))
David Howells716ea3a2007-04-02 20:19:53 -07004571 return -ENOMEM;
4572
4573 /* Easy case. Most of packets will go this way. */
David S. Miller21dc3302010-08-23 00:13:46 -07004574 if (!skb_has_frag_list(skb)) {
David Howells716ea3a2007-04-02 20:19:53 -07004575		/* A little trouble: not enough space for the trailer.
4576		 * This should not happen when the stack is tuned to generate
4577		 * good frames. OK, on a miss we reallocate and reserve even more
4578		 * space; 128 bytes is fair. */
4579
4580 if (skb_tailroom(skb) < tailbits &&
4581 pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
4582 return -ENOMEM;
4583
4584 /* Voila! */
4585 *trailer = skb;
4586 return 1;
4587 }
4588
4589	/* Misery. We are in trouble, going to mince the fragments... */
4590
4591 elt = 1;
4592 skb_p = &skb_shinfo(skb)->frag_list;
4593 copyflag = 0;
4594
4595 while ((skb1 = *skb_p) != NULL) {
4596 int ntail = 0;
4597
4598 /* The fragment is partially pulled by someone,
4599 * this can happen on input. Copy it and everything
4600 * after it. */
4601
4602 if (skb_shared(skb1))
4603 copyflag = 1;
4604
4605 /* If the skb is the last, worry about trailer. */
4606
4607 if (skb1->next == NULL && tailbits) {
4608 if (skb_shinfo(skb1)->nr_frags ||
David S. Miller21dc3302010-08-23 00:13:46 -07004609 skb_has_frag_list(skb1) ||
David Howells716ea3a2007-04-02 20:19:53 -07004610 skb_tailroom(skb1) < tailbits)
4611 ntail = tailbits + 128;
4612 }
4613
4614 if (copyflag ||
4615 skb_cloned(skb1) ||
4616 ntail ||
4617 skb_shinfo(skb1)->nr_frags ||
David S. Miller21dc3302010-08-23 00:13:46 -07004618 skb_has_frag_list(skb1)) {
David Howells716ea3a2007-04-02 20:19:53 -07004619 struct sk_buff *skb2;
4620
4621 /* Fuck, we are miserable poor guys... */
4622 if (ntail == 0)
4623 skb2 = skb_copy(skb1, GFP_ATOMIC);
4624 else
4625 skb2 = skb_copy_expand(skb1,
4626 skb_headroom(skb1),
4627 ntail,
4628 GFP_ATOMIC);
4629 if (unlikely(skb2 == NULL))
4630 return -ENOMEM;
4631
4632 if (skb1->sk)
4633 skb_set_owner_w(skb2, skb1->sk);
4634
4635 /* Looking around. Are we still alive?
4636 * OK, link new skb, drop old one */
4637
4638 skb2->next = skb1->next;
4639 *skb_p = skb2;
4640 kfree_skb(skb1);
4641 skb1 = skb2;
4642 }
4643 elt++;
4644 *trailer = skb1;
4645 skb_p = &skb1->next;
4646 }
4647
4648 return elt;
4649}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08004650EXPORT_SYMBOL_GPL(skb_cow_data);
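
/* Hypothetical sketch (not part of this file), modelled on how IPsec
 * consumes skb_cow_data(): make the buffers writable, reserve 'tailen'
 * trailer bytes, then size the scatterlist from the returned count.  A
 * real user hands 'sg' to the crypto layer instead of freeing it here.
 */
static int example_cow_and_map(struct sk_buff *skb, int tailen)
{
	struct sk_buff *trailer;
	struct scatterlist *sg;
	int nfrags, ret;

	nfrags = skb_cow_data(skb, tailen, &trailer);
	if (nfrags < 0)
		return nfrags;

	sg = kmalloc_array(nfrags, sizeof(*sg), GFP_ATOMIC);
	if (!sg)
		return -ENOMEM;

	sg_init_table(sg, nfrags);
	ret = skb_to_sgvec(skb, sg, 0, skb->len);

	kfree(sg);
	return ret;
}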
David Howells716ea3a2007-04-02 20:19:53 -07004651
Eric Dumazetb1faf562010-05-31 23:44:05 -07004652static void sock_rmem_free(struct sk_buff *skb)
4653{
4654 struct sock *sk = skb->sk;
4655
4656 atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
4657}
4658
Soheil Hassas Yeganeh8605330a2017-03-18 17:02:59 -04004659static void skb_set_err_queue(struct sk_buff *skb)
4660{
4661 /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING.
4662 * So, it is safe to (mis)use it to mark skbs on the error queue.
4663 */
4664 skb->pkt_type = PACKET_OUTGOING;
4665 BUILD_BUG_ON(PACKET_OUTGOING == 0);
4666}
4667
Eric Dumazetb1faf562010-05-31 23:44:05 -07004668/*
4669 * Note: We don't mem-charge error packets (no sk_forward_alloc changes)
4670 */
4671int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
4672{
4673 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
Eric Dumazetebb3b782019-10-10 20:17:44 -07004674 (unsigned int)READ_ONCE(sk->sk_rcvbuf))
Eric Dumazetb1faf562010-05-31 23:44:05 -07004675 return -ENOMEM;
4676
4677 skb_orphan(skb);
4678 skb->sk = sk;
4679 skb->destructor = sock_rmem_free;
4680 atomic_add(skb->truesize, &sk->sk_rmem_alloc);
Soheil Hassas Yeganeh8605330a2017-03-18 17:02:59 -04004681 skb_set_err_queue(skb);
Eric Dumazetb1faf562010-05-31 23:44:05 -07004682
Eric Dumazetabb57ea2011-05-18 02:21:31 -04004683 /* before exiting rcu section, make sure dst is refcounted */
4684 skb_dst_force(skb);
4685
Eric Dumazetb1faf562010-05-31 23:44:05 -07004686 skb_queue_tail(&sk->sk_error_queue, skb);
4687 if (!sock_flag(sk, SOCK_DEAD))
Alexander Aringe3ae2362021-06-27 18:48:21 -04004688 sk_error_report(sk);
Eric Dumazetb1faf562010-05-31 23:44:05 -07004689 return 0;
4690}
4691EXPORT_SYMBOL(sock_queue_err_skb);
4692
Soheil Hassas Yeganeh83a1a1a2016-11-30 14:01:08 -05004693static bool is_icmp_err_skb(const struct sk_buff *skb)
4694{
4695 return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
4696 SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6);
4697}
4698
Willem de Bruijn364a9e92014-08-31 21:30:27 -04004699struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
4700{
4701 struct sk_buff_head *q = &sk->sk_error_queue;
Soheil Hassas Yeganeh83a1a1a2016-11-30 14:01:08 -05004702 struct sk_buff *skb, *skb_next = NULL;
4703 bool icmp_next = false;
Eric Dumazet997d5c32015-02-18 05:47:55 -08004704 unsigned long flags;
Willem de Bruijn364a9e92014-08-31 21:30:27 -04004705
Eric Dumazet997d5c32015-02-18 05:47:55 -08004706 spin_lock_irqsave(&q->lock, flags);
Willem de Bruijn364a9e92014-08-31 21:30:27 -04004707 skb = __skb_dequeue(q);
Soheil Hassas Yeganeh38b25792017-06-02 12:38:22 -04004708 if (skb && (skb_next = skb_peek(q))) {
Soheil Hassas Yeganeh83a1a1a2016-11-30 14:01:08 -05004709 icmp_next = is_icmp_err_skb(skb_next);
Soheil Hassas Yeganeh38b25792017-06-02 12:38:22 -04004710 if (icmp_next)
Willem de Bruijn985f7332020-11-26 10:12:20 -05004711 sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
Soheil Hassas Yeganeh38b25792017-06-02 12:38:22 -04004712 }
Eric Dumazet997d5c32015-02-18 05:47:55 -08004713 spin_unlock_irqrestore(&q->lock, flags);
Willem de Bruijn364a9e92014-08-31 21:30:27 -04004714
Soheil Hassas Yeganeh83a1a1a2016-11-30 14:01:08 -05004715 if (is_icmp_err_skb(skb) && !icmp_next)
4716 sk->sk_err = 0;
4717
4718 if (skb_next)
Alexander Aringe3ae2362021-06-27 18:48:21 -04004719 sk_error_report(sk);
Willem de Bruijn364a9e92014-08-31 21:30:27 -04004720
4721 return skb;
4722}
4723EXPORT_SYMBOL(sock_dequeue_err_skb);
4724
Alexander Duyckcab41c42014-09-10 18:05:26 -04004725/**
4726 * skb_clone_sk - create clone of skb, and take reference to socket
4727 * @skb: the skb to clone
4728 *
4729 * This function creates a clone of a buffer that holds a reference on
4730 * sk_refcnt. Buffers created via this function are meant to be
4731 * returned using sock_queue_err_skb, or freed via kfree_skb.
4732 *
4733 * When passing buffers allocated with this function to sock_queue_err_skb
4734 * it is necessary to wrap the call with sock_hold/sock_put in order to
4735 * prevent the socket from being released prior to being enqueued on
4736 * the sk_error_queue.
4737 */
Alexander Duyck62bccb82014-09-04 13:31:35 -04004738struct sk_buff *skb_clone_sk(struct sk_buff *skb)
4739{
4740 struct sock *sk = skb->sk;
4741 struct sk_buff *clone;
4742
Reshetova, Elena41c6d652017-06-30 13:08:01 +03004743 if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
Alexander Duyck62bccb82014-09-04 13:31:35 -04004744 return NULL;
4745
4746 clone = skb_clone(skb, GFP_ATOMIC);
4747 if (!clone) {
4748 sock_put(sk);
4749 return NULL;
4750 }
4751
4752 clone->sk = sk;
4753 clone->destructor = sock_efree;
4754
4755 return clone;
4756}
4757EXPORT_SYMBOL(skb_clone_sk);
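
/* Hypothetical sketch (not part of this file) of the pattern described
 * in the comment above: queue a clone on the error queue while holding
 * a temporary socket reference so the socket cannot go away before the
 * skb is enqueued.
 */
static void example_queue_clone_on_error_queue(struct sk_buff *skb)
{
	struct sk_buff *clone;
	struct sock *sk;

	clone = skb_clone_sk(skb);
	if (!clone)
		return;		/* socket already gone or clone failed */

	sk = clone->sk;
	sock_hold(sk);
	if (sock_queue_err_skb(sk, clone))
		kfree_skb(clone);
	sock_put(sk);
}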
4758
Alexander Duyck37846ef2014-09-04 13:31:10 -04004759static void __skb_complete_tx_timestamp(struct sk_buff *skb,
4760 struct sock *sk,
Soheil Hassas Yeganeh4ef1b282017-03-18 17:03:00 -04004761 int tstype,
4762 bool opt_stats)
Patrick Ohlyac45f602009-02-12 05:03:37 +00004763{
Patrick Ohlyac45f602009-02-12 05:03:37 +00004764 struct sock_exterr_skb *serr;
Patrick Ohlyac45f602009-02-12 05:03:37 +00004765 int err;
4766
Soheil Hassas Yeganeh4ef1b282017-03-18 17:03:00 -04004767 BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb));
4768
Patrick Ohlyac45f602009-02-12 05:03:37 +00004769 serr = SKB_EXT_ERR(skb);
4770 memset(serr, 0, sizeof(*serr));
4771 serr->ee.ee_errno = ENOMSG;
4772 serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
Willem de Bruijne7fd2882014-08-04 22:11:48 -04004773 serr->ee.ee_info = tstype;
Soheil Hassas Yeganeh4ef1b282017-03-18 17:03:00 -04004774 serr->opt_stats = opt_stats;
Willem de Bruijn1862d622017-04-12 19:24:35 -04004775 serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
Willem de Bruijn4ed2d762014-08-04 22:11:49 -04004776 if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
Willem de Bruijn09c2d252014-08-04 22:11:47 -04004777 serr->ee.ee_data = skb_shinfo(skb)->tskey;
WANG Congac5cc972015-12-16 23:39:04 -08004778 if (sk->sk_protocol == IPPROTO_TCP &&
4779 sk->sk_type == SOCK_STREAM)
Willem de Bruijn4ed2d762014-08-04 22:11:49 -04004780 serr->ee.ee_data -= sk->sk_tskey;
4781 }
Eric Dumazet29030372010-05-29 00:20:48 -07004782
Patrick Ohlyac45f602009-02-12 05:03:37 +00004783 err = sock_queue_err_skb(sk, skb);
Eric Dumazet29030372010-05-29 00:20:48 -07004784
Patrick Ohlyac45f602009-02-12 05:03:37 +00004785 if (err)
4786 kfree_skb(skb);
4787}
Alexander Duyck37846ef2014-09-04 13:31:10 -04004788
Willem de Bruijnb245be12015-01-30 13:29:32 -05004789static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly)
4790{
4791 bool ret;
4792
4793 if (likely(sysctl_tstamp_allow_data || tsonly))
4794 return true;
4795
4796 read_lock_bh(&sk->sk_callback_lock);
4797 ret = sk->sk_socket && sk->sk_socket->file &&
4798 file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW);
4799 read_unlock_bh(&sk->sk_callback_lock);
4800 return ret;
4801}
4802
Alexander Duyck37846ef2014-09-04 13:31:10 -04004803void skb_complete_tx_timestamp(struct sk_buff *skb,
4804 struct skb_shared_hwtstamps *hwtstamps)
4805{
4806 struct sock *sk = skb->sk;
4807
Willem de Bruijnb245be12015-01-30 13:29:32 -05004808 if (!skb_may_tx_timestamp(sk, false))
Willem de Bruijn35b99df2017-12-13 14:41:06 -05004809 goto err;
Willem de Bruijnb245be12015-01-30 13:29:32 -05004810
Eric Dumazet9ac25fc2017-03-03 21:01:03 -08004811 /* Take a reference to prevent skb_orphan() from freeing the socket,
4812 * but only if the socket refcount is not zero.
4813 */
Reshetova, Elena41c6d652017-06-30 13:08:01 +03004814 if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
Eric Dumazet9ac25fc2017-03-03 21:01:03 -08004815 *skb_hwtstamps(skb) = *hwtstamps;
Soheil Hassas Yeganeh4ef1b282017-03-18 17:03:00 -04004816 __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
Eric Dumazet9ac25fc2017-03-03 21:01:03 -08004817 sock_put(sk);
Willem de Bruijn35b99df2017-12-13 14:41:06 -05004818 return;
Eric Dumazet9ac25fc2017-03-03 21:01:03 -08004819 }
Willem de Bruijn35b99df2017-12-13 14:41:06 -05004820
4821err:
4822 kfree_skb(skb);
Alexander Duyck37846ef2014-09-04 13:31:10 -04004823}
4824EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
4825
4826void __skb_tstamp_tx(struct sk_buff *orig_skb,
Yousuk Seunge7ed11e2021-01-20 12:41:55 -08004827 const struct sk_buff *ack_skb,
Alexander Duyck37846ef2014-09-04 13:31:10 -04004828 struct skb_shared_hwtstamps *hwtstamps,
4829 struct sock *sk, int tstype)
4830{
4831 struct sk_buff *skb;
Soheil Hassas Yeganeh4ef1b282017-03-18 17:03:00 -04004832 bool tsonly, opt_stats = false;
Alexander Duyck37846ef2014-09-04 13:31:10 -04004833
Willem de Bruijn3a8dd972015-03-11 15:43:55 -04004834 if (!sk)
4835 return;
4836
Miroslav Lichvarb50a5c72017-05-19 17:52:40 +02004837 if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
4838 skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
4839 return;
4840
Willem de Bruijn3a8dd972015-03-11 15:43:55 -04004841 tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
4842 if (!skb_may_tx_timestamp(sk, tsonly))
Alexander Duyck37846ef2014-09-04 13:31:10 -04004843 return;
4844
Francis Yan1c885802016-11-27 23:07:18 -08004845 if (tsonly) {
4846#ifdef CONFIG_INET
4847 if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
4848 sk->sk_protocol == IPPROTO_TCP &&
Soheil Hassas Yeganeh4ef1b282017-03-18 17:03:00 -04004849 sk->sk_type == SOCK_STREAM) {
Yousuk Seunge7ed11e2021-01-20 12:41:55 -08004850 skb = tcp_get_timestamping_opt_stats(sk, orig_skb,
4851 ack_skb);
Soheil Hassas Yeganeh4ef1b282017-03-18 17:03:00 -04004852 opt_stats = true;
4853 } else
Francis Yan1c885802016-11-27 23:07:18 -08004854#endif
4855 skb = alloc_skb(0, GFP_ATOMIC);
4856 } else {
Willem de Bruijn49ca0d82015-01-30 13:29:31 -05004857 skb = skb_clone(orig_skb, GFP_ATOMIC);
Francis Yan1c885802016-11-27 23:07:18 -08004858 }
Alexander Duyck37846ef2014-09-04 13:31:10 -04004859 if (!skb)
4860 return;
4861
Willem de Bruijn49ca0d82015-01-30 13:29:31 -05004862 if (tsonly) {
Willem de Bruijnfff88032017-06-08 11:35:03 -04004863 skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags &
4864 SKBTX_ANY_TSTAMP;
Willem de Bruijn49ca0d82015-01-30 13:29:31 -05004865 skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey;
4866 }
4867
4868 if (hwtstamps)
4869 *skb_hwtstamps(skb) = *hwtstamps;
4870 else
4871 skb->tstamp = ktime_get_real();
4872
Soheil Hassas Yeganeh4ef1b282017-03-18 17:03:00 -04004873 __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
Alexander Duyck37846ef2014-09-04 13:31:10 -04004874}
Willem de Bruijne7fd2882014-08-04 22:11:48 -04004875EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
4876
4877void skb_tstamp_tx(struct sk_buff *orig_skb,
4878 struct skb_shared_hwtstamps *hwtstamps)
4879{
Yousuk Seunge7ed11e2021-01-20 12:41:55 -08004880 return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk,
Willem de Bruijne7fd2882014-08-04 22:11:48 -04004881 SCM_TSTAMP_SND);
4882}
Patrick Ohlyac45f602009-02-12 05:03:37 +00004883EXPORT_SYMBOL_GPL(skb_tstamp_tx);
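
/* Hypothetical sketch (not part of this file): how a driver's TX
 * completion path might report a hardware timestamp.  Converting the
 * device clock to nanoseconds is assumed to have happened already.
 */
static void example_report_hw_tx_tstamp(struct sk_buff *skb, u64 ns)
{
	struct skb_shared_hwtstamps hwts;

	/* Only report if the driver marked this skb for timestamping. */
	if (!(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS))
		return;

	memset(&hwts, 0, sizeof(hwts));
	hwts.hwtstamp = ns_to_ktime(ns);
	skb_tstamp_tx(skb, &hwts);
}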
4884
Johannes Berg6e3e9392011-11-09 10:15:42 +01004885void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
4886{
4887 struct sock *sk = skb->sk;
4888 struct sock_exterr_skb *serr;
Eric Dumazetdd4f1072017-03-03 21:01:02 -08004889 int err = 1;
Johannes Berg6e3e9392011-11-09 10:15:42 +01004890
4891 skb->wifi_acked_valid = 1;
4892 skb->wifi_acked = acked;
4893
4894 serr = SKB_EXT_ERR(skb);
4895 memset(serr, 0, sizeof(*serr));
4896 serr->ee.ee_errno = ENOMSG;
4897 serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
4898
Eric Dumazetdd4f1072017-03-03 21:01:02 -08004899 /* Take a reference to prevent skb_orphan() from freeing the socket,
4900 * but only if the socket refcount is not zero.
4901 */
Reshetova, Elena41c6d652017-06-30 13:08:01 +03004902 if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
Eric Dumazetdd4f1072017-03-03 21:01:02 -08004903 err = sock_queue_err_skb(sk, skb);
4904 sock_put(sk);
4905 }
Johannes Berg6e3e9392011-11-09 10:15:42 +01004906 if (err)
4907 kfree_skb(skb);
4908}
4909EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
4910
Rusty Russellf35d9d82008-02-04 23:49:54 -05004911/**
4912 * skb_partial_csum_set - set up and verify partial csum values for packet
4913 * @skb: the skb to set
4914 * @start: the number of bytes after skb->data to start checksumming.
4915 * @off: the offset from start to place the checksum.
4916 *
4917 * For untrusted partially-checksummed packets, we need to make sure the values
4918 * for skb->csum_start and skb->csum_offset are valid so we don't oops.
4919 *
4920 * This function checks and sets those values and skb->ip_summed: if this
4921 * returns false you should drop the packet.
4922 */
4923bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
4924{
Eric Dumazet52b5d6f2018-10-10 06:59:35 -07004925 u32 csum_end = (u32)start + (u32)off + sizeof(__sum16);
4926 u32 csum_start = skb_headroom(skb) + (u32)start;
4927
4928 if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) {
4929 net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n",
4930 start, off, skb_headroom(skb), skb_headlen(skb));
Rusty Russellf35d9d82008-02-04 23:49:54 -05004931 return false;
4932 }
4933 skb->ip_summed = CHECKSUM_PARTIAL;
Eric Dumazet52b5d6f2018-10-10 06:59:35 -07004934 skb->csum_start = csum_start;
Rusty Russellf35d9d82008-02-04 23:49:54 -05004935 skb->csum_offset = off;
Jason Wange5d5dec2013-03-26 23:11:20 +00004936 skb_set_transport_header(skb, start);
Rusty Russellf35d9d82008-02-04 23:49:54 -05004937 return true;
4938}
David S. Millerb4ac530fc2009-02-10 02:09:24 -08004939EXPORT_SYMBOL_GPL(skb_partial_csum_set);
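
/* Hypothetical sketch (not part of this file): validating untrusted
 * checksum metadata (virtio-net/tun style) before accepting a packet.
 * 'start' and 'off' come straight from the untrusted producer.
 */
static int example_set_partial_csum(struct sk_buff *skb, u16 start, u16 off)
{
	if (!skb_partial_csum_set(skb, start, off))
		return -EINVAL;	/* caller should drop the packet */

	/* skb->ip_summed is now CHECKSUM_PARTIAL and csum_start/csum_offset
	 * are known to lie within the linear data.
	 */
	return 0;
}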
Rusty Russellf35d9d82008-02-04 23:49:54 -05004940
Paul Durranted1f50c2014-01-09 10:02:46 +00004941static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len,
4942 unsigned int max)
4943{
4944 if (skb_headlen(skb) >= len)
4945 return 0;
4946
4947	/* If we need to pull up, then pull up to the max so we
4948 * won't need to do it again.
4949 */
4950 if (max > skb->len)
4951 max = skb->len;
4952
4953 if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL)
4954 return -ENOMEM;
4955
4956 if (skb_headlen(skb) < len)
4957 return -EPROTO;
4958
4959 return 0;
4960}
4961
Jan Beulichf9708b42014-03-11 13:56:05 +00004962#define MAX_TCP_HDR_LEN (15 * 4)
4963
4964static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb,
4965 typeof(IPPROTO_IP) proto,
4966 unsigned int off)
4967{
Kees Cook161d1792020-02-19 22:23:04 -08004968 int err;
Jan Beulichf9708b42014-03-11 13:56:05 +00004969
Kees Cook161d1792020-02-19 22:23:04 -08004970 switch (proto) {
Jan Beulichf9708b42014-03-11 13:56:05 +00004971 case IPPROTO_TCP:
4972 err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr),
4973 off + MAX_TCP_HDR_LEN);
4974 if (!err && !skb_partial_csum_set(skb, off,
4975 offsetof(struct tcphdr,
4976 check)))
4977 err = -EPROTO;
4978 return err ? ERR_PTR(err) : &tcp_hdr(skb)->check;
4979
4980 case IPPROTO_UDP:
4981 err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr),
4982 off + sizeof(struct udphdr));
4983 if (!err && !skb_partial_csum_set(skb, off,
4984 offsetof(struct udphdr,
4985 check)))
4986 err = -EPROTO;
4987 return err ? ERR_PTR(err) : &udp_hdr(skb)->check;
4988 }
4989
4990 return ERR_PTR(-EPROTO);
4991}
4992
Paul Durranted1f50c2014-01-09 10:02:46 +00004993/* This value should be large enough to cover a tagged ethernet header plus
4994 * maximally sized IP and TCP or UDP headers.
4995 */
4996#define MAX_IP_HDR_LEN 128
4997
Jan Beulichf9708b42014-03-11 13:56:05 +00004998static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate)
Paul Durranted1f50c2014-01-09 10:02:46 +00004999{
5000 unsigned int off;
5001 bool fragment;
Jan Beulichf9708b42014-03-11 13:56:05 +00005002 __sum16 *csum;
Paul Durranted1f50c2014-01-09 10:02:46 +00005003 int err;
5004
5005 fragment = false;
5006
5007 err = skb_maybe_pull_tail(skb,
5008 sizeof(struct iphdr),
5009 MAX_IP_HDR_LEN);
5010 if (err < 0)
5011 goto out;
5012
Miaohe Lin11f920d2020-08-06 19:57:18 +08005013 if (ip_is_fragment(ip_hdr(skb)))
Paul Durranted1f50c2014-01-09 10:02:46 +00005014 fragment = true;
5015
5016 off = ip_hdrlen(skb);
5017
5018 err = -EPROTO;
5019
5020 if (fragment)
5021 goto out;
5022
Jan Beulichf9708b42014-03-11 13:56:05 +00005023 csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off);
5024 if (IS_ERR(csum))
5025 return PTR_ERR(csum);
Paul Durranted1f50c2014-01-09 10:02:46 +00005026
Jan Beulichf9708b42014-03-11 13:56:05 +00005027 if (recalculate)
5028 *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
5029 ip_hdr(skb)->daddr,
5030 skb->len - off,
5031 ip_hdr(skb)->protocol, 0);
Paul Durranted1f50c2014-01-09 10:02:46 +00005032 err = 0;
5033
5034out:
5035 return err;
5036}
5037
5038/* This value should be large enough to cover a tagged ethernet header plus
5039 * an IPv6 header, all options, and a maximal TCP or UDP header.
5040 */
5041#define MAX_IPV6_HDR_LEN 256
5042
5043#define OPT_HDR(type, skb, off) \
5044 (type *)(skb_network_header(skb) + (off))
5045
5046static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
5047{
5048 int err;
5049 u8 nexthdr;
5050 unsigned int off;
5051 unsigned int len;
5052 bool fragment;
5053 bool done;
Jan Beulichf9708b42014-03-11 13:56:05 +00005054 __sum16 *csum;
Paul Durranted1f50c2014-01-09 10:02:46 +00005055
5056 fragment = false;
5057 done = false;
5058
5059 off = sizeof(struct ipv6hdr);
5060
5061 err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN);
5062 if (err < 0)
5063 goto out;
5064
5065 nexthdr = ipv6_hdr(skb)->nexthdr;
5066
5067 len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
5068 while (off <= len && !done) {
5069 switch (nexthdr) {
5070 case IPPROTO_DSTOPTS:
5071 case IPPROTO_HOPOPTS:
5072 case IPPROTO_ROUTING: {
5073 struct ipv6_opt_hdr *hp;
5074
5075 err = skb_maybe_pull_tail(skb,
5076 off +
5077 sizeof(struct ipv6_opt_hdr),
5078 MAX_IPV6_HDR_LEN);
5079 if (err < 0)
5080 goto out;
5081
5082 hp = OPT_HDR(struct ipv6_opt_hdr, skb, off);
5083 nexthdr = hp->nexthdr;
5084 off += ipv6_optlen(hp);
5085 break;
5086 }
5087 case IPPROTO_AH: {
5088 struct ip_auth_hdr *hp;
5089
5090 err = skb_maybe_pull_tail(skb,
5091 off +
5092 sizeof(struct ip_auth_hdr),
5093 MAX_IPV6_HDR_LEN);
5094 if (err < 0)
5095 goto out;
5096
5097 hp = OPT_HDR(struct ip_auth_hdr, skb, off);
5098 nexthdr = hp->nexthdr;
5099 off += ipv6_authlen(hp);
5100 break;
5101 }
5102 case IPPROTO_FRAGMENT: {
5103 struct frag_hdr *hp;
5104
5105 err = skb_maybe_pull_tail(skb,
5106 off +
5107 sizeof(struct frag_hdr),
5108 MAX_IPV6_HDR_LEN);
5109 if (err < 0)
5110 goto out;
5111
5112 hp = OPT_HDR(struct frag_hdr, skb, off);
5113
5114 if (hp->frag_off & htons(IP6_OFFSET | IP6_MF))
5115 fragment = true;
5116
5117 nexthdr = hp->nexthdr;
5118 off += sizeof(struct frag_hdr);
5119 break;
5120 }
5121 default:
5122 done = true;
5123 break;
5124 }
5125 }
5126
5127 err = -EPROTO;
5128
5129 if (!done || fragment)
5130 goto out;
5131
Jan Beulichf9708b42014-03-11 13:56:05 +00005132 csum = skb_checksum_setup_ip(skb, nexthdr, off);
5133 if (IS_ERR(csum))
5134 return PTR_ERR(csum);
Paul Durranted1f50c2014-01-09 10:02:46 +00005135
Jan Beulichf9708b42014-03-11 13:56:05 +00005136 if (recalculate)
5137 *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
5138 &ipv6_hdr(skb)->daddr,
5139 skb->len - off, nexthdr, 0);
Paul Durranted1f50c2014-01-09 10:02:46 +00005140 err = 0;
5141
5142out:
5143 return err;
5144}
5145
5146/**
5147 * skb_checksum_setup - set up partial checksum offset
5148 * @skb: the skb to set up
5149 * @recalculate: if true the pseudo-header checksum will be recalculated
5150 */
5151int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
5152{
5153 int err;
5154
5155 switch (skb->protocol) {
5156 case htons(ETH_P_IP):
Jan Beulichf9708b42014-03-11 13:56:05 +00005157 err = skb_checksum_setup_ipv4(skb, recalculate);
Paul Durranted1f50c2014-01-09 10:02:46 +00005158 break;
5159
5160 case htons(ETH_P_IPV6):
5161 err = skb_checksum_setup_ipv6(skb, recalculate);
5162 break;
5163
5164 default:
5165 err = -EPROTO;
5166 break;
5167 }
5168
5169 return err;
5170}
5171EXPORT_SYMBOL(skb_checksum_setup);
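
/* Hypothetical sketch (not part of this file): a backend driver fixing
 * up checksum metadata on packets from an untrusted producer before
 * injecting them into the stack.  'csum_validated' indicates whether
 * the producer claims the transport checksum is already correct.
 */
static int example_fixup_guest_csum(struct sk_buff *skb, bool csum_validated)
{
	int err;

	/* Recalculate the pseudo-header checksum unless it was already
	 * validated; in both cases csum_start/csum_offset are checked
	 * and set up for CHECKSUM_PARTIAL handling.
	 */
	err = skb_checksum_setup(skb, !csum_validated);
	if (err)
		return err;	/* unsupported or malformed headers: drop */

	return 0;
}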
5172
Linus Lüssing9afd85c2015-05-02 14:01:07 +02005173/**
5174 * skb_checksum_maybe_trim - maybe trims the given skb
5175 * @skb: the skb to check
5176 * @transport_len: the data length beyond the network header
5177 *
5178 * Checks whether the given skb has data beyond the given transport length.
5179 * If so, returns a cloned skb trimmed to this transport length.
5180 * Otherwise returns the provided skb. Returns NULL in error cases
5181 * (e.g. transport_len exceeds skb length or out-of-memory).
5182 *
Linus Lüssinga5169932015-08-13 05:54:07 +02005183 * Caller needs to set the skb transport header and free any returned skb if it
5184 * differs from the provided skb.
Linus Lüssing9afd85c2015-05-02 14:01:07 +02005185 */
5186static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb,
5187 unsigned int transport_len)
5188{
5189 struct sk_buff *skb_chk;
5190 unsigned int len = skb_transport_offset(skb) + transport_len;
5191 int ret;
5192
Linus Lüssinga5169932015-08-13 05:54:07 +02005193 if (skb->len < len)
Linus Lüssing9afd85c2015-05-02 14:01:07 +02005194 return NULL;
Linus Lüssinga5169932015-08-13 05:54:07 +02005195 else if (skb->len == len)
Linus Lüssing9afd85c2015-05-02 14:01:07 +02005196 return skb;
Linus Lüssing9afd85c2015-05-02 14:01:07 +02005197
5198 skb_chk = skb_clone(skb, GFP_ATOMIC);
Linus Lüssing9afd85c2015-05-02 14:01:07 +02005199 if (!skb_chk)
5200 return NULL;
5201
5202 ret = pskb_trim_rcsum(skb_chk, len);
5203 if (ret) {
5204 kfree_skb(skb_chk);
5205 return NULL;
5206 }
5207
5208 return skb_chk;
5209}
5210
5211/**
5212 * skb_checksum_trimmed - validate checksum of an skb
5213 * @skb: the skb to check
5214 * @transport_len: the data length beyond the network header
5215 * @skb_chkf: checksum function to use
5216 *
5217 * Applies the given checksum function skb_chkf to the provided skb.
5218 * Returns a checked and maybe trimmed skb. Returns NULL on error.
5219 *
5220 * If the skb has data beyond the given transport length, then a
5221 * trimmed & cloned skb is checked and returned.
5222 *
Linus Lüssinga5169932015-08-13 05:54:07 +02005223 * Caller needs to set the skb transport header and free any returned skb if it
5224 * differs from the provided skb.
Linus Lüssing9afd85c2015-05-02 14:01:07 +02005225 */
5226struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
5227 unsigned int transport_len,
5228 __sum16(*skb_chkf)(struct sk_buff *skb))
5229{
5230 struct sk_buff *skb_chk;
5231 unsigned int offset = skb_transport_offset(skb);
Linus Lüssingfcba67c2015-05-05 00:19:35 +02005232 __sum16 ret;
Linus Lüssing9afd85c2015-05-02 14:01:07 +02005233
5234 skb_chk = skb_checksum_maybe_trim(skb, transport_len);
5235 if (!skb_chk)
Linus Lüssinga5169932015-08-13 05:54:07 +02005236 goto err;
Linus Lüssing9afd85c2015-05-02 14:01:07 +02005237
Linus Lüssinga5169932015-08-13 05:54:07 +02005238 if (!pskb_may_pull(skb_chk, offset))
5239 goto err;
Linus Lüssing9afd85c2015-05-02 14:01:07 +02005240
Linus Lüssing9b368812016-02-24 04:21:42 +01005241 skb_pull_rcsum(skb_chk, offset);
Linus Lüssing9afd85c2015-05-02 14:01:07 +02005242 ret = skb_chkf(skb_chk);
Linus Lüssing9b368812016-02-24 04:21:42 +01005243 skb_push_rcsum(skb_chk, offset);
Linus Lüssing9afd85c2015-05-02 14:01:07 +02005244
Linus Lüssinga5169932015-08-13 05:54:07 +02005245 if (ret)
5246 goto err;
Linus Lüssing9afd85c2015-05-02 14:01:07 +02005247
5248 return skb_chk;
Linus Lüssinga5169932015-08-13 05:54:07 +02005249
5250err:
5251 if (skb_chk && skb_chk != skb)
5252 kfree_skb(skb_chk);
5253
5254 return NULL;
5255
Linus Lüssing9afd85c2015-05-02 14:01:07 +02005256}
5257EXPORT_SYMBOL(skb_checksum_trimmed);
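
/*
 * Example (illustrative sketch): validating a transport header that uses a
 * plain Internet checksum with skb_checksum_trimmed(), roughly the pattern
 * the IGMP/MLD checking helpers follow. The callback runs with skb->data
 * already pulled to the transport header. example_simple_csum() and
 * example_check() are invented names.
 */
static __sum16 example_simple_csum(struct sk_buff *skb)
{
	/* zero result means the checksum over the transport data is valid */
	return csum_fold(skb_checksum(skb, 0, skb->len, 0));
}

static struct sk_buff *example_check(struct sk_buff *skb,
				     unsigned int transport_len)
{
	/* caller must have set the transport header, and must free the
	 * returned skb if it differs from @skb
	 */
	return skb_checksum_trimmed(skb, transport_len, example_simple_csum);
}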
5258
Ben Hutchings4497b072008-06-19 16:22:28 -07005259void __skb_warn_lro_forwarding(const struct sk_buff *skb)
5260{
Joe Perchese87cc472012-05-13 21:56:26 +00005261 net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
5262 skb->dev->name);
Ben Hutchings4497b072008-06-19 16:22:28 -07005263}
Ben Hutchings4497b072008-06-19 16:22:28 -07005264EXPORT_SYMBOL(__skb_warn_lro_forwarding);
Eric Dumazetbad43ca2012-05-19 03:02:02 +00005265
5266void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
5267{
Eric Dumazet3d861f62012-10-22 09:03:40 +00005268 if (head_stolen) {
5269 skb_release_head_state(skb);
Eric Dumazetbad43ca2012-05-19 03:02:02 +00005270 kmem_cache_free(skbuff_head_cache, skb);
Eric Dumazet3d861f62012-10-22 09:03:40 +00005271 } else {
Eric Dumazetbad43ca2012-05-19 03:02:02 +00005272 __kfree_skb(skb);
Eric Dumazet3d861f62012-10-22 09:03:40 +00005273 }
Eric Dumazetbad43ca2012-05-19 03:02:02 +00005274}
5275EXPORT_SYMBOL(kfree_skb_partial);
5276
5277/**
5278 * skb_try_coalesce - try to merge skb to prior one
5279 * @to: prior buffer
5280 * @from: buffer to add
5281 * @fragstolen: pointer to boolean
Randy Dunlapc6c4b972012-06-08 14:01:44 +00005282 * @delta_truesize: how much more was allocated than was requested
Eric Dumazetbad43ca2012-05-19 03:02:02 +00005283 */
5284bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
5285 bool *fragstolen, int *delta_truesize)
5286{
Eric Dumazetc818fa92017-10-04 10:48:35 -07005287 struct skb_shared_info *to_shinfo, *from_shinfo;
Eric Dumazetbad43ca2012-05-19 03:02:02 +00005288 int i, delta, len = from->len;
5289
5290 *fragstolen = false;
5291
5292 if (skb_cloned(to))
5293 return false;
5294
Ilias Apalodimas6a5bcd82021-06-07 21:02:38 +02005295	/* The page pool signature of struct page will eventually let us tell
5296	 * which pages can be recycled, but for now do not allow slab-allocated
5297	 * and page_pool-allocated SKBs to be coalesced with each other.
5298 */
5299 if (to->pp_recycle != from->pp_recycle)
5300 return false;
5301
Eric Dumazetbad43ca2012-05-19 03:02:02 +00005302 if (len <= skb_tailroom(to)) {
Eric Dumazete93a0432014-09-15 04:19:52 -07005303 if (len)
5304 BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
Eric Dumazetbad43ca2012-05-19 03:02:02 +00005305 *delta_truesize = 0;
5306 return true;
5307 }
5308
Eric Dumazetc818fa92017-10-04 10:48:35 -07005309 to_shinfo = skb_shinfo(to);
5310 from_shinfo = skb_shinfo(from);
5311 if (to_shinfo->frag_list || from_shinfo->frag_list)
Eric Dumazetbad43ca2012-05-19 03:02:02 +00005312 return false;
Willem de Bruijn1f8b9772017-08-03 16:29:41 -04005313 if (skb_zcopy(to) || skb_zcopy(from))
5314 return false;
Eric Dumazetbad43ca2012-05-19 03:02:02 +00005315
5316 if (skb_headlen(from) != 0) {
5317 struct page *page;
5318 unsigned int offset;
5319
Eric Dumazetc818fa92017-10-04 10:48:35 -07005320 if (to_shinfo->nr_frags +
5321 from_shinfo->nr_frags >= MAX_SKB_FRAGS)
Eric Dumazetbad43ca2012-05-19 03:02:02 +00005322 return false;
5323
5324 if (skb_head_is_locked(from))
5325 return false;
5326
5327 delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
5328
5329 page = virt_to_head_page(from->head);
5330 offset = from->data - (unsigned char *)page_address(page);
5331
Eric Dumazetc818fa92017-10-04 10:48:35 -07005332 skb_fill_page_desc(to, to_shinfo->nr_frags,
Eric Dumazetbad43ca2012-05-19 03:02:02 +00005333 page, offset, skb_headlen(from));
5334 *fragstolen = true;
5335 } else {
Eric Dumazetc818fa92017-10-04 10:48:35 -07005336 if (to_shinfo->nr_frags +
5337 from_shinfo->nr_frags > MAX_SKB_FRAGS)
Eric Dumazetbad43ca2012-05-19 03:02:02 +00005338 return false;
5339
Weiping Panf4b549a2012-09-28 20:15:30 +00005340 delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from));
Eric Dumazetbad43ca2012-05-19 03:02:02 +00005341 }
5342
5343 WARN_ON_ONCE(delta < len);
5344
Eric Dumazetc818fa92017-10-04 10:48:35 -07005345 memcpy(to_shinfo->frags + to_shinfo->nr_frags,
5346 from_shinfo->frags,
5347 from_shinfo->nr_frags * sizeof(skb_frag_t));
5348 to_shinfo->nr_frags += from_shinfo->nr_frags;
Eric Dumazetbad43ca2012-05-19 03:02:02 +00005349
5350 if (!skb_cloned(from))
Eric Dumazetc818fa92017-10-04 10:48:35 -07005351 from_shinfo->nr_frags = 0;
Eric Dumazetbad43ca2012-05-19 03:02:02 +00005352
Li RongQing8ea853f2012-09-18 16:53:21 +00005353 /* if the skb is not cloned this does nothing
5354 * since we set nr_frags to 0.
5355 */
Eric Dumazetc818fa92017-10-04 10:48:35 -07005356 for (i = 0; i < from_shinfo->nr_frags; i++)
5357 __skb_frag_ref(&from_shinfo->frags[i]);
Eric Dumazetbad43ca2012-05-19 03:02:02 +00005358
5359 to->truesize += delta;
5360 to->len += len;
5361 to->data_len += len;
5362
5363 *delta_truesize = delta;
5364 return true;
5365}
5366EXPORT_SYMBOL(skb_try_coalesce);
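
/*
 * Example (illustrative sketch): merging a freshly received skb into the
 * tail of a receive queue, the pattern TCP's coalescing code is built on.
 * example_try_coalesce_tail() is an invented name and the truesize
 * accounting is only hinted at in a comment.
 */
static bool example_try_coalesce_tail(struct sk_buff *tail,
				      struct sk_buff *skb)
{
	bool fragstolen;
	int delta;

	if (!skb_try_coalesce(tail, skb, &fragstolen, &delta))
		return false;

	/* charge 'delta' extra bytes of truesize to the queue owner here,
	 * then release the donor skb; its data now lives in 'tail'.
	 */
	kfree_skb_partial(skb, fragstolen);
	return true;
}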
Nicolas Dichtel621e84d2013-06-26 16:11:27 +02005367
5368/**
Nicolas Dichtel8b27f272013-09-02 15:34:56 +02005369 * skb_scrub_packet - scrub an skb
Nicolas Dichtel621e84d2013-06-26 16:11:27 +02005370 *
5371 * @skb: buffer to clean
Nicolas Dichtel8b27f272013-09-02 15:34:56 +02005372 * @xnet: packet is crossing netns
Nicolas Dichtel621e84d2013-06-26 16:11:27 +02005373 *
Nicolas Dichtel8b27f272013-09-02 15:34:56 +02005374 * skb_scrub_packet can be used after encapsulating or decapsulating a packet
5375 * into/from a tunnel. Some information has to be cleared during these
5376 * operations.
5377 * skb_scrub_packet can also be used to clean a skb before injecting it into
5378 * another namespace (@xnet == true). We have to clear all information in the
5379 * skb that could impact namespace isolation.
Nicolas Dichtel621e84d2013-06-26 16:11:27 +02005380 */
Nicolas Dichtel8b27f272013-09-02 15:34:56 +02005381void skb_scrub_packet(struct sk_buff *skb, bool xnet)
Nicolas Dichtel621e84d2013-06-26 16:11:27 +02005382{
Nicolas Dichtel621e84d2013-06-26 16:11:27 +02005383 skb->pkt_type = PACKET_HOST;
5384 skb->skb_iif = 0;
WANG Cong60ff7462014-05-04 16:39:18 -07005385 skb->ignore_df = 0;
Nicolas Dichtel621e84d2013-06-26 16:11:27 +02005386 skb_dst_drop(skb);
Florian Westphal174e2382019-09-26 20:37:05 +02005387 skb_ext_reset(skb);
Florian Westphal895b5c92019-09-29 20:54:03 +02005388 nf_reset_ct(skb);
Nicolas Dichtel621e84d2013-06-26 16:11:27 +02005389 nf_reset_trace(skb);
Herbert Xu213dd742015-04-16 09:03:27 +08005390
Petr Machata6f9a5062018-11-19 16:11:07 +00005391#ifdef CONFIG_NET_SWITCHDEV
5392 skb->offload_fwd_mark = 0;
Ido Schimmel875e8932018-12-04 08:15:10 +00005393 skb->offload_l3_fwd_mark = 0;
Petr Machata6f9a5062018-11-19 16:11:07 +00005394#endif
5395
Herbert Xu213dd742015-04-16 09:03:27 +08005396 if (!xnet)
5397 return;
5398
Ye Yin2b5ec1a2017-10-26 16:57:05 +08005399 ipvs_reset(skb);
Herbert Xu213dd742015-04-16 09:03:27 +08005400 skb->mark = 0;
Jesus Sanchez-Palenciac47d8c22018-07-03 15:42:47 -07005401 skb->tstamp = 0;
Nicolas Dichtel621e84d2013-06-26 16:11:27 +02005402}
5403EXPORT_SYMBOL_GPL(skb_scrub_packet);
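
/*
 * Example (illustrative sketch): handing an skb to a peer device that may
 * live in a different net namespace, similar in spirit to what veth-style
 * drivers do. example_xmit_to_peer() is an invented name.
 */
static int example_xmit_to_peer(struct sk_buff *skb, struct net_device *peer)
{
	bool xnet = !net_eq(dev_net(skb->dev), dev_net(peer));

	skb_scrub_packet(skb, xnet);	/* drop state that must not leak */
	skb->dev = peer;

	return dev_queue_xmit(skb);
}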
Florian Westphalde960aa2014-01-26 10:58:16 +01005404
5405/**
5406 * skb_gso_transport_seglen - Return length of individual segments of a gso packet
5407 *
5408 * @skb: GSO skb
5409 *
5410 * skb_gso_transport_seglen is used to determine the real size of the
5411 * individual segments, including Layer4 headers (TCP/UDP).
5412 *
5413 * The MAC/L2 or network (IP, IPv6) headers are not accounted for.
5414 */
Daniel Axtensa4a77712018-03-01 17:13:40 +11005415static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
Florian Westphalde960aa2014-01-26 10:58:16 +01005416{
5417 const struct skb_shared_info *shinfo = skb_shinfo(skb);
Florian Westphalf993bc22014-10-20 13:49:18 +02005418 unsigned int thlen = 0;
Florian Westphalde960aa2014-01-26 10:58:16 +01005419
Florian Westphalf993bc22014-10-20 13:49:18 +02005420 if (skb->encapsulation) {
5421 thlen = skb_inner_transport_header(skb) -
5422 skb_transport_header(skb);
Florian Westphal6d39d582014-04-09 10:28:50 +02005423
Florian Westphalf993bc22014-10-20 13:49:18 +02005424 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
5425 thlen += inner_tcp_hdrlen(skb);
5426 } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
5427 thlen = tcp_hdrlen(skb);
Daniel Axtens1dd27cd2018-03-09 14:06:09 +11005428 } else if (unlikely(skb_is_gso_sctp(skb))) {
Marcelo Ricardo Leitner90017ac2016-06-02 15:05:43 -03005429 thlen = sizeof(struct sctphdr);
Willem de Bruijnee80d1e2018-04-26 13:42:16 -04005430 } else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
5431 thlen = sizeof(struct udphdr);
Florian Westphalf993bc22014-10-20 13:49:18 +02005432 }
Florian Westphal6d39d582014-04-09 10:28:50 +02005433 /* UFO sets gso_size to the size of the fragmentation
5434 * payload, i.e. the size of the L4 (UDP) header is already
5435 * accounted for.
5436 */
Florian Westphalf993bc22014-10-20 13:49:18 +02005437 return thlen + shinfo->gso_size;
Florian Westphalde960aa2014-01-26 10:58:16 +01005438}
Daniel Axtensa4a77712018-03-01 17:13:40 +11005439
5440/**
5441 * skb_gso_network_seglen - Return length of individual segments of a gso packet
5442 *
5443 * @skb: GSO skb
5444 *
5445 * skb_gso_network_seglen is used to determine the real size of the
5446 * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP).
5447 *
5448 * The MAC/L2 header is not accounted for.
5449 */
5450static unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
5451{
5452 unsigned int hdr_len = skb_transport_header(skb) -
5453 skb_network_header(skb);
5454
5455 return hdr_len + skb_gso_transport_seglen(skb);
5456}
5457
5458/**
5459 * skb_gso_mac_seglen - Return length of individual segments of a gso packet
5460 *
5461 * @skb: GSO skb
5462 *
5463 * skb_gso_mac_seglen is used to determine the real size of the
5464 * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4
5465 * headers (TCP/UDP).
5466 */
5467static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
5468{
5469 unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
5470
5471 return hdr_len + skb_gso_transport_seglen(skb);
5472}
Vlad Yasevich0d5501c2014-08-08 14:42:13 -04005473
Marcelo Ricardo Leitnerae7ef812016-06-02 15:05:41 -03005474/**
Daniel Axtens2b16f042018-01-31 14:15:33 +11005475 * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS
5476 *
5477 * There are a couple of instances where we have a GSO skb, and we
5478 * want to determine what size it would be after it is segmented.
5479 *
5480 * We might want to check:
5481 * - L3+L4+payload size (e.g. IP forwarding)
5482 * - L2+L3+L4+payload size (e.g. sanity check before passing to driver)
5483 *
5484 * This is a helper to do that correctly considering GSO_BY_FRAGS.
5485 *
Mathieu Malaterre49682bf2018-10-31 13:16:58 +01005486 * @skb: GSO skb
5487 *
Daniel Axtens2b16f042018-01-31 14:15:33 +11005488 * @seg_len: The segmented length (from skb_gso_*_seglen). In the
5489 * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS].
5490 *
5491 * @max_len: The maximum permissible length.
5492 *
5493 * Returns true if the segmented length <= max length.
5494 */
5495static inline bool skb_gso_size_check(const struct sk_buff *skb,
5496 unsigned int seg_len,
5497 unsigned int max_len) {
5498 const struct skb_shared_info *shinfo = skb_shinfo(skb);
5499 const struct sk_buff *iter;
5500
5501 if (shinfo->gso_size != GSO_BY_FRAGS)
5502 return seg_len <= max_len;
5503
5504 /* Undo this so we can re-use header sizes */
5505 seg_len -= GSO_BY_FRAGS;
5506
5507 skb_walk_frags(skb, iter) {
5508 if (seg_len + skb_headlen(iter) > max_len)
5509 return false;
5510 }
5511
5512 return true;
5513}
5514
5515/**
Daniel Axtens779b7932018-03-01 17:13:37 +11005516 * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU?
Marcelo Ricardo Leitnerae7ef812016-06-02 15:05:41 -03005517 *
5518 * @skb: GSO skb
David S. Miller76f21b92016-06-03 22:56:28 -07005519 * @mtu: MTU to validate against
Marcelo Ricardo Leitnerae7ef812016-06-02 15:05:41 -03005520 *
Daniel Axtens779b7932018-03-01 17:13:37 +11005521 * skb_gso_validate_network_len validates if a given skb will fit a
5522 * wanted MTU once split. It considers L3 headers, L4 headers, and the
5523 * payload.
Marcelo Ricardo Leitnerae7ef812016-06-02 15:05:41 -03005524 */
Daniel Axtens779b7932018-03-01 17:13:37 +11005525bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu)
Marcelo Ricardo Leitnerae7ef812016-06-02 15:05:41 -03005526{
Daniel Axtens2b16f042018-01-31 14:15:33 +11005527 return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu);
Marcelo Ricardo Leitnerae7ef812016-06-02 15:05:41 -03005528}
Daniel Axtens779b7932018-03-01 17:13:37 +11005529EXPORT_SYMBOL_GPL(skb_gso_validate_network_len);
Marcelo Ricardo Leitnerae7ef812016-06-02 15:05:41 -03005530
Daniel Axtens2b16f042018-01-31 14:15:33 +11005531/**
5532 * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length?
5533 *
5534 * @skb: GSO skb
5535 * @len: length to validate against
5536 *
5537 * skb_gso_validate_mac_len validates if a given skb will fit a wanted
5538 * length once split, including L2, L3 and L4 headers and the payload.
5539 */
5540bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len)
5541{
5542 return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len);
5543}
5544EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len);
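
/*
 * Example (illustrative sketch): an MTU check for a forwarding path that
 * must not reject GSO packets whose individual segments would still fit,
 * similar to the checks done by IPv4/IPv6 forwarding. example_exceeds_mtu()
 * is an invented name.
 */
static bool example_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* a GSO packet larger than the MTU is fine as long as every
	 * L3+L4+payload segment fits once it is split
	 */
	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}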
5545
Vlad Yasevich0d5501c2014-08-08 14:42:13 -04005546static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
5547{
Yuya Kusakabed85e8be2019-04-16 10:22:28 +09005548 int mac_len, meta_len;
5549 void *meta;
Toshiaki Makita4bbb3e02018-03-13 14:51:27 +09005550
Vlad Yasevich0d5501c2014-08-08 14:42:13 -04005551 if (skb_cow(skb, skb_headroom(skb)) < 0) {
5552 kfree_skb(skb);
5553 return NULL;
5554 }
5555
Toshiaki Makita4bbb3e02018-03-13 14:51:27 +09005556 mac_len = skb->data - skb_mac_header(skb);
Toshiaki Makitaae474572018-03-29 19:05:29 +09005557 if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) {
5558 memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb),
5559 mac_len - VLAN_HLEN - ETH_TLEN);
5560 }
Yuya Kusakabed85e8be2019-04-16 10:22:28 +09005561
5562 meta_len = skb_metadata_len(skb);
5563 if (meta_len) {
5564 meta = skb_metadata_end(skb) - meta_len;
5565 memmove(meta + VLAN_HLEN, meta, meta_len);
5566 }
5567
Vlad Yasevich0d5501c2014-08-08 14:42:13 -04005568 skb->mac_header += VLAN_HLEN;
5569 return skb;
5570}
5571
5572struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
5573{
5574 struct vlan_hdr *vhdr;
5575 u16 vlan_tci;
5576
Jiri Pirkodf8a39d2015-01-13 17:13:44 +01005577 if (unlikely(skb_vlan_tag_present(skb))) {
Vlad Yasevich0d5501c2014-08-08 14:42:13 -04005578 /* vlan_tci is already set-up so leave this for another time */
5579 return skb;
5580 }
5581
5582 skb = skb_share_check(skb, GFP_ATOMIC);
5583 if (unlikely(!skb))
5584 goto err_free;
Miaohe Lin55eff0e2020-08-15 04:44:31 -04005585 /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */
5586 if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short))))
Vlad Yasevich0d5501c2014-08-08 14:42:13 -04005587 goto err_free;
5588
5589 vhdr = (struct vlan_hdr *)skb->data;
5590 vlan_tci = ntohs(vhdr->h_vlan_TCI);
5591 __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci);
5592
5593 skb_pull_rcsum(skb, VLAN_HLEN);
5594 vlan_set_encap_proto(skb, vhdr);
5595
5596 skb = skb_reorder_vlan_header(skb);
5597 if (unlikely(!skb))
5598 goto err_free;
5599
5600 skb_reset_network_header(skb);
Alexander Lobakin8be33ec2020-11-09 23:47:23 +00005601 if (!skb_transport_header_was_set(skb))
5602 skb_reset_transport_header(skb);
Vlad Yasevich0d5501c2014-08-08 14:42:13 -04005603 skb_reset_mac_len(skb);
5604
5605 return skb;
5606
5607err_free:
5608 kfree_skb(skb);
5609 return NULL;
5610}
5611EXPORT_SYMBOL(skb_vlan_untag);
Eric Dumazet2e4e4412014-09-17 04:49:49 -07005612
Jiri Pirkoe2195122014-11-19 14:05:01 +01005613int skb_ensure_writable(struct sk_buff *skb, int write_len)
5614{
5615 if (!pskb_may_pull(skb, write_len))
5616 return -ENOMEM;
5617
5618 if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
5619 return 0;
5620
5621 return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
5622}
5623EXPORT_SYMBOL(skb_ensure_writable);
5624
Shmulik Ladkanibfca4c52016-09-19 19:11:09 +03005625/* remove VLAN header from packet and update csum accordingly.
5626 * expects a non-skb_vlan_tag_present skb with a vlan tag in the payload
5627 */
5628int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
Jiri Pirko93515d52014-11-19 14:05:02 +01005629{
5630 struct vlan_hdr *vhdr;
Shmulik Ladkanib6a79202016-09-29 12:10:41 +03005631 int offset = skb->data - skb_mac_header(skb);
Jiri Pirko93515d52014-11-19 14:05:02 +01005632 int err;
5633
Shmulik Ladkanib6a79202016-09-29 12:10:41 +03005634 if (WARN_ONCE(offset,
5635 "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n",
5636 offset)) {
5637 return -EINVAL;
5638 }
5639
Jiri Pirko93515d52014-11-19 14:05:02 +01005640 err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
5641 if (unlikely(err))
Shmulik Ladkanib6a79202016-09-29 12:10:41 +03005642 return err;
Jiri Pirko93515d52014-11-19 14:05:02 +01005643
5644 skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
5645
5646 vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
5647 *vlan_tci = ntohs(vhdr->h_vlan_TCI);
5648
5649 memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
5650 __skb_pull(skb, VLAN_HLEN);
5651
5652 vlan_set_encap_proto(skb, vhdr);
5653 skb->mac_header += VLAN_HLEN;
5654
5655 if (skb_network_offset(skb) < ETH_HLEN)
5656 skb_set_network_header(skb, ETH_HLEN);
5657
5658 skb_reset_mac_len(skb);
Jiri Pirko93515d52014-11-19 14:05:02 +01005659
5660 return err;
5661}
Shmulik Ladkanibfca4c52016-09-19 19:11:09 +03005662EXPORT_SYMBOL(__skb_vlan_pop);
Jiri Pirko93515d52014-11-19 14:05:02 +01005663
Shmulik Ladkanib6a79202016-09-29 12:10:41 +03005664/* Pop a vlan tag either from hwaccel or from payload.
5665 * Expects skb->data at mac header.
5666 */
Jiri Pirko93515d52014-11-19 14:05:02 +01005667int skb_vlan_pop(struct sk_buff *skb)
5668{
5669 u16 vlan_tci;
5670 __be16 vlan_proto;
5671 int err;
5672
Jiri Pirkodf8a39d2015-01-13 17:13:44 +01005673 if (likely(skb_vlan_tag_present(skb))) {
Michał Mirosławb18175242018-11-09 00:18:02 +01005674 __vlan_hwaccel_clear_tag(skb);
Jiri Pirko93515d52014-11-19 14:05:02 +01005675 } else {
Shmulik Ladkaniecf4ee42016-09-20 12:48:37 +03005676 if (unlikely(!eth_type_vlan(skb->protocol)))
Jiri Pirko93515d52014-11-19 14:05:02 +01005677 return 0;
5678
5679 err = __skb_vlan_pop(skb, &vlan_tci);
5680 if (err)
5681 return err;
5682 }
5683 /* move next vlan tag to hw accel tag */
Shmulik Ladkaniecf4ee42016-09-20 12:48:37 +03005684 if (likely(!eth_type_vlan(skb->protocol)))
Jiri Pirko93515d52014-11-19 14:05:02 +01005685 return 0;
5686
5687 vlan_proto = skb->protocol;
5688 err = __skb_vlan_pop(skb, &vlan_tci);
5689 if (unlikely(err))
5690 return err;
5691
5692 __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
5693 return 0;
5694}
5695EXPORT_SYMBOL(skb_vlan_pop);
5696
Shmulik Ladkanib6a79202016-09-29 12:10:41 +03005697/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present).
5698 * Expects skb->data at mac header.
5699 */
Jiri Pirko93515d52014-11-19 14:05:02 +01005700int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
5701{
Jiri Pirkodf8a39d2015-01-13 17:13:44 +01005702 if (skb_vlan_tag_present(skb)) {
Shmulik Ladkanib6a79202016-09-29 12:10:41 +03005703 int offset = skb->data - skb_mac_header(skb);
Jiri Pirko93515d52014-11-19 14:05:02 +01005704 int err;
5705
Shmulik Ladkanib6a79202016-09-29 12:10:41 +03005706 if (WARN_ONCE(offset,
5707 "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n",
5708 offset)) {
5709 return -EINVAL;
5710 }
5711
Jiri Pirko93515d52014-11-19 14:05:02 +01005712 err = __vlan_insert_tag(skb, skb->vlan_proto,
Jiri Pirkodf8a39d2015-01-13 17:13:44 +01005713 skb_vlan_tag_get(skb));
Shmulik Ladkanib6a79202016-09-29 12:10:41 +03005714 if (err)
Jiri Pirko93515d52014-11-19 14:05:02 +01005715 return err;
Daniel Borkmann9241e2d2016-04-16 02:27:58 +02005716
Jiri Pirko93515d52014-11-19 14:05:02 +01005717 skb->protocol = skb->vlan_proto;
5718 skb->mac_len += VLAN_HLEN;
Jiri Pirko93515d52014-11-19 14:05:02 +01005719
Daniel Borkmann6b83d282016-02-20 00:29:30 +01005720 skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
Jiri Pirko93515d52014-11-19 14:05:02 +01005721 }
5722 __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
5723 return 0;
5724}
5725EXPORT_SYMBOL(skb_vlan_push);
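
/*
 * Example (illustrative sketch): re-tagging a frame, roughly what a TC or
 * Open vSwitch style action pipeline does with these helpers. VID 100 and
 * example_retag() are arbitrary/invented; skb->data must be at the mac
 * header for both calls.
 */
static int example_retag(struct sk_buff *skb)
{
	int err;

	err = skb_vlan_pop(skb);	/* strip the outer tag, if any */
	if (err)
		return err;

	return skb_vlan_push(skb, htons(ETH_P_8021Q), 100);
}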
5726
Guillaume Nault19fbcb32020-10-03 00:44:28 +02005727/**
5728 * skb_eth_pop() - Drop the Ethernet header at the head of a packet
5729 *
5730 * @skb: Socket buffer to modify
5731 *
5732 * Drop the Ethernet header of @skb.
5733 *
5734 * Expects that skb->data points to the mac header and that no VLAN tags are
5735 * present.
5736 *
5737 * Returns 0 on success, -errno otherwise.
5738 */
5739int skb_eth_pop(struct sk_buff *skb)
5740{
5741 if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) ||
5742 skb_network_offset(skb) < ETH_HLEN)
5743 return -EPROTO;
5744
5745 skb_pull_rcsum(skb, ETH_HLEN);
5746 skb_reset_mac_header(skb);
5747 skb_reset_mac_len(skb);
5748
5749 return 0;
5750}
5751EXPORT_SYMBOL(skb_eth_pop);
5752
5753/**
5754 * skb_eth_push() - Add a new Ethernet header at the head of a packet
5755 *
5756 * @skb: Socket buffer to modify
5757 * @dst: Destination MAC address of the new header
5758 * @src: Source MAC address of the new header
5759 *
5760 * Prepend @skb with a new Ethernet header.
5761 *
5762 * Expects that skb->data points to the mac header, which must be empty.
5763 *
5764 * Returns 0 on success, -errno otherwise.
5765 */
5766int skb_eth_push(struct sk_buff *skb, const unsigned char *dst,
5767 const unsigned char *src)
5768{
5769 struct ethhdr *eth;
5770 int err;
5771
5772 if (skb_network_offset(skb) || skb_vlan_tag_present(skb))
5773 return -EPROTO;
5774
5775 err = skb_cow_head(skb, sizeof(*eth));
5776 if (err < 0)
5777 return err;
5778
5779 skb_push(skb, sizeof(*eth));
5780 skb_reset_mac_header(skb);
5781 skb_reset_mac_len(skb);
5782
5783 eth = eth_hdr(skb);
5784 ether_addr_copy(eth->h_dest, dst);
5785 ether_addr_copy(eth->h_source, src);
5786 eth->h_proto = skb->protocol;
5787
5788 skb_postpush_rcsum(skb, eth, sizeof(*eth));
5789
5790 return 0;
5791}
5792EXPORT_SYMBOL(skb_eth_push);
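
/*
 * Example (illustrative sketch): replacing the Ethernet header of a frame,
 * e.g. as part of a decap/re-encap action. example_replace_eth() and the
 * address parameters are inventions for the example; skb->data must be at
 * the mac header.
 */
static int example_replace_eth(struct sk_buff *skb, const unsigned char *dst,
			       const unsigned char *src)
{
	int err;

	err = skb_eth_pop(skb);		/* leaves an empty mac header */
	if (err)
		return err;

	return skb_eth_push(skb, dst, src);
}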
5793
John Hurley8822e272019-07-07 15:01:54 +01005794/* Update the ethertype of hdr and the skb csum value if required. */
5795static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr,
5796 __be16 ethertype)
5797{
5798 if (skb->ip_summed == CHECKSUM_COMPLETE) {
5799 __be16 diff[] = { ~hdr->h_proto, ethertype };
5800
5801 skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
5802 }
5803
5804 hdr->h_proto = ethertype;
5805}
5806
5807/**
Martin Varghesee7dbfed2019-12-21 08:50:01 +05305808 * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of
5809 * the packet
John Hurley8822e272019-07-07 15:01:54 +01005810 *
5811 * @skb: buffer
5812 * @mpls_lse: MPLS label stack entry to push
5813 * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848)
Davide Carattifa4e0f82019-10-12 13:55:07 +02005814 * @mac_len: length of the MAC header
Martin Varghesee7dbfed2019-12-21 08:50:01 +05305815 * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is
5816 * ethernet
John Hurley8822e272019-07-07 15:01:54 +01005817 *
5818 * Expects skb->data at mac header.
5819 *
5820 * Returns 0 on success, -errno otherwise.
5821 */
Davide Carattifa4e0f82019-10-12 13:55:07 +02005822int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto,
Martin Varghesed04ac222019-12-05 05:57:22 +05305823 int mac_len, bool ethernet)
John Hurley8822e272019-07-07 15:01:54 +01005824{
5825 struct mpls_shim_hdr *lse;
5826 int err;
5827
5828 if (unlikely(!eth_p_mpls(mpls_proto)))
5829 return -EINVAL;
5830
5831 /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */
5832 if (skb->encapsulation)
5833 return -EINVAL;
5834
5835 err = skb_cow_head(skb, MPLS_HLEN);
5836 if (unlikely(err))
5837 return err;
5838
5839 if (!skb->inner_protocol) {
Martin Varghesee7dbfed2019-12-21 08:50:01 +05305840 skb_set_inner_network_header(skb, skb_network_offset(skb));
John Hurley8822e272019-07-07 15:01:54 +01005841 skb_set_inner_protocol(skb, skb->protocol);
5842 }
5843
5844 skb_push(skb, MPLS_HLEN);
5845 memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
Davide Carattifa4e0f82019-10-12 13:55:07 +02005846 mac_len);
John Hurley8822e272019-07-07 15:01:54 +01005847 skb_reset_mac_header(skb);
Davide Carattifa4e0f82019-10-12 13:55:07 +02005848 skb_set_network_header(skb, mac_len);
Martin Varghesee7dbfed2019-12-21 08:50:01 +05305849 skb_reset_mac_len(skb);
John Hurley8822e272019-07-07 15:01:54 +01005850
5851 lse = mpls_hdr(skb);
5852 lse->label_stack_entry = mpls_lse;
5853 skb_postpush_rcsum(skb, lse, MPLS_HLEN);
5854
Guillaume Nault4296adc2020-10-02 21:53:08 +02005855 if (ethernet && mac_len >= ETH_HLEN)
John Hurley8822e272019-07-07 15:01:54 +01005856 skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto);
5857 skb->protocol = mpls_proto;
5858
5859 return 0;
5860}
5861EXPORT_SYMBOL_GPL(skb_mpls_push);
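
/*
 * Example (illustrative sketch): pushing a single bottom-of-stack MPLS label
 * in front of the network header, loosely modelled on how MPLS push actions
 * use this helper. Label 100 and TTL 64 are arbitrary; example_push_label()
 * is an invented name and the MPLS_LS_* constants come from
 * <uapi/linux/mpls.h>.
 */
static int example_push_label(struct sk_buff *skb)
{
	__be32 lse = cpu_to_be32((100 << MPLS_LS_LABEL_SHIFT) |
				 (64 << MPLS_LS_TTL_SHIFT) |
				 (1 << MPLS_LS_S_SHIFT));

	/* skb->data must be at the mac header */
	return skb_mpls_push(skb, lse, htons(ETH_P_MPLS_UC),
			     skb->mac_len, true);
}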
5862
Eric Dumazet2e4e4412014-09-17 04:49:49 -07005863/**
John Hurleyed246ce2019-07-07 15:01:55 +01005864 * skb_mpls_pop() - pop the outermost MPLS header
5865 *
5866 * @skb: buffer
5867 * @next_proto: ethertype of header after popped MPLS header
Davide Carattifa4e0f82019-10-12 13:55:07 +02005868 * @mac_len: length of the MAC header
Martin Varghese76f99f92019-12-21 08:50:23 +05305869 * @ethernet: flag to indicate if the packet is ethernet
John Hurleyed246ce2019-07-07 15:01:55 +01005870 *
5871 * Expects skb->data at mac header.
5872 *
5873 * Returns 0 on success, -errno otherwise.
5874 */
Martin Varghese040b5cf2019-12-02 10:49:51 +05305875int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len,
5876 bool ethernet)
John Hurleyed246ce2019-07-07 15:01:55 +01005877{
5878 int err;
5879
5880 if (unlikely(!eth_p_mpls(skb->protocol)))
Davide Carattidedc5a02019-10-12 13:55:06 +02005881 return 0;
John Hurleyed246ce2019-07-07 15:01:55 +01005882
Davide Carattifa4e0f82019-10-12 13:55:07 +02005883 err = skb_ensure_writable(skb, mac_len + MPLS_HLEN);
John Hurleyed246ce2019-07-07 15:01:55 +01005884 if (unlikely(err))
5885 return err;
5886
5887 skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
5888 memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
Davide Carattifa4e0f82019-10-12 13:55:07 +02005889 mac_len);
John Hurleyed246ce2019-07-07 15:01:55 +01005890
5891 __skb_pull(skb, MPLS_HLEN);
5892 skb_reset_mac_header(skb);
Davide Carattifa4e0f82019-10-12 13:55:07 +02005893 skb_set_network_header(skb, mac_len);
John Hurleyed246ce2019-07-07 15:01:55 +01005894
Guillaume Nault4296adc2020-10-02 21:53:08 +02005895 if (ethernet && mac_len >= ETH_HLEN) {
John Hurleyed246ce2019-07-07 15:01:55 +01005896 struct ethhdr *hdr;
5897
5898 /* use mpls_hdr() to get ethertype to account for VLANs. */
5899 hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
5900 skb_mod_eth_type(skb, hdr, next_proto);
5901 }
5902 skb->protocol = next_proto;
5903
5904 return 0;
5905}
5906EXPORT_SYMBOL_GPL(skb_mpls_pop);
5907
5908/**
John Hurleyd27cf5c2019-07-07 15:01:56 +01005909 * skb_mpls_update_lse() - modify outermost MPLS header and update csum
5910 *
5911 * @skb: buffer
5912 * @mpls_lse: new MPLS label stack entry to update to
5913 *
5914 * Expects skb->data at mac header.
5915 *
5916 * Returns 0 on success, -errno otherwise.
5917 */
5918int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse)
5919{
5920 int err;
5921
5922 if (unlikely(!eth_p_mpls(skb->protocol)))
5923 return -EINVAL;
5924
5925 err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
5926 if (unlikely(err))
5927 return err;
5928
5929 if (skb->ip_summed == CHECKSUM_COMPLETE) {
5930 __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse };
5931
5932 skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
5933 }
5934
5935 mpls_hdr(skb)->label_stack_entry = mpls_lse;
5936
5937 return 0;
5938}
5939EXPORT_SYMBOL_GPL(skb_mpls_update_lse);
5940
5941/**
John Hurley2a2ea502019-07-07 15:01:57 +01005942 * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header
5943 *
5944 * @skb: buffer
5945 *
5946 * Expects skb->data at mac header.
5947 *
5948 * Returns 0 on success, -errno otherwise.
5949 */
5950int skb_mpls_dec_ttl(struct sk_buff *skb)
5951{
5952 u32 lse;
5953 u8 ttl;
5954
5955 if (unlikely(!eth_p_mpls(skb->protocol)))
5956 return -EINVAL;
5957
Davide Caratti13de4ed2020-12-03 10:58:21 +01005958 if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN))
5959 return -ENOMEM;
5960
John Hurley2a2ea502019-07-07 15:01:57 +01005961 lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry);
5962 ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
5963 if (!--ttl)
5964 return -EINVAL;
5965
5966 lse &= ~MPLS_LS_TTL_MASK;
5967 lse |= ttl << MPLS_LS_TTL_SHIFT;
5968
5969 return skb_mpls_update_lse(skb, cpu_to_be32(lse));
5970}
5971EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl);
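
/*
 * Example (illustrative sketch): an MPLS label swap built from the helpers
 * above - decrement the TTL, then rewrite the label while keeping the TC, S
 * and TTL bits. example_swap_label() is an invented name.
 */
static int example_swap_label(struct sk_buff *skb, u32 new_label)
{
	u32 lse;
	int err;

	err = skb_mpls_dec_ttl(skb);
	if (err)
		return err;

	lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry);
	lse &= ~MPLS_LS_LABEL_MASK;
	lse |= (new_label << MPLS_LS_LABEL_SHIFT) & MPLS_LS_LABEL_MASK;

	return skb_mpls_update_lse(skb, cpu_to_be32(lse));
}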
5972
5973/**
Eric Dumazet2e4e4412014-09-17 04:49:49 -07005974 * alloc_skb_with_frags - allocate skb with page frags
5975 *
Masanari Iidade3f0d02014-10-09 12:58:08 +09005976 * @header_len: size of linear part
5977 * @data_len: needed length in frags
5978 * @max_page_order: max page order desired.
5979 * @errcode: pointer to error code if any
5980 * @gfp_mask: allocation mask
Eric Dumazet2e4e4412014-09-17 04:49:49 -07005981 *
5982 * This can be used to allocate a paged skb, given a maximal order for frags.
5983 */
5984struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
5985 unsigned long data_len,
5986 int max_page_order,
5987 int *errcode,
5988 gfp_t gfp_mask)
5989{
5990 int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
5991 unsigned long chunk;
5992 struct sk_buff *skb;
5993 struct page *page;
Eric Dumazet2e4e4412014-09-17 04:49:49 -07005994 int i;
5995
5996 *errcode = -EMSGSIZE;
5997	/* Note this test could be relaxed, if we succeed in allocating
5998 * high order pages...
5999 */
6000 if (npages > MAX_SKB_FRAGS)
6001 return NULL;
6002
Eric Dumazet2e4e4412014-09-17 04:49:49 -07006003 *errcode = -ENOBUFS;
David Rientjesf8c468e2019-01-02 13:01:43 -08006004 skb = alloc_skb(header_len, gfp_mask);
Eric Dumazet2e4e4412014-09-17 04:49:49 -07006005 if (!skb)
6006 return NULL;
6007
6008 skb->truesize += npages << PAGE_SHIFT;
6009
6010 for (i = 0; npages > 0; i++) {
6011 int order = max_page_order;
6012
6013 while (order) {
6014 if (npages >= 1 << order) {
Mel Gormand0164ad2015-11-06 16:28:21 -08006015 page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
Eric Dumazet2e4e4412014-09-17 04:49:49 -07006016 __GFP_COMP |
Michal Hockod14b56f2018-06-28 17:53:06 +02006017 __GFP_NOWARN,
Eric Dumazet2e4e4412014-09-17 04:49:49 -07006018 order);
6019 if (page)
6020 goto fill_page;
6021 /* Do not retry other high order allocations */
6022 order = 1;
6023 max_page_order = 0;
6024 }
6025 order--;
6026 }
6027 page = alloc_page(gfp_mask);
6028 if (!page)
6029 goto failure;
6030fill_page:
6031 chunk = min_t(unsigned long, data_len,
6032 PAGE_SIZE << order);
6033 skb_fill_page_desc(skb, i, page, 0, chunk);
6034 data_len -= chunk;
6035 npages -= 1 << order;
6036 }
6037 return skb;
6038
6039failure:
6040 kfree_skb(skb);
6041 return NULL;
6042}
6043EXPORT_SYMBOL(alloc_skb_with_frags);
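
/*
 * Example (illustrative sketch): allocating a mostly-paged skb for a large
 * payload, similar to what sock_alloc_send_pskb() does. The 128-byte linear
 * part and the order cap are arbitrary choices; example_alloc_paged() is an
 * invented name.
 */
static struct sk_buff *example_alloc_paged(unsigned long payload)
{
	struct sk_buff *skb;
	int err;

	skb = alloc_skb_with_frags(128, payload, PAGE_ALLOC_COSTLY_ORDER,
				   &err, GFP_KERNEL);
	if (!skb)
		return ERR_PTR(err);

	/* the frags are allocated but empty: the caller still has to copy
	 * the payload in and update skb->len / skb->data_len accordingly
	 */
	return skb;
}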
Sowmini Varadhan6fa01cc2016-04-22 18:36:35 -07006044
6045/* carve out the first off bytes from skb when off < headlen */
6046static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
6047 const int headlen, gfp_t gfp_mask)
6048{
6049 int i;
6050 int size = skb_end_offset(skb);
6051 int new_hlen = headlen - off;
6052 u8 *data;
Sowmini Varadhan6fa01cc2016-04-22 18:36:35 -07006053
6054 size = SKB_DATA_ALIGN(size);
6055
6056 if (skb_pfmemalloc(skb))
6057 gfp_mask |= __GFP_MEMALLOC;
6058 data = kmalloc_reserve(size +
6059 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
6060 gfp_mask, NUMA_NO_NODE, NULL);
6061 if (!data)
6062 return -ENOMEM;
6063
6064 size = SKB_WITH_OVERHEAD(ksize(data));
6065
6066 /* Copy real data, and all frags */
6067 skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
6068 skb->len -= off;
6069
6070 memcpy((struct skb_shared_info *)(data + size),
6071 skb_shinfo(skb),
6072 offsetof(struct skb_shared_info,
6073 frags[skb_shinfo(skb)->nr_frags]));
6074 if (skb_cloned(skb)) {
6075 /* drop the old head gracefully */
6076 if (skb_orphan_frags(skb, gfp_mask)) {
6077 kfree(data);
6078 return -ENOMEM;
6079 }
6080 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
6081 skb_frag_ref(skb, i);
6082 if (skb_has_frag_list(skb))
6083 skb_clone_fraglist(skb);
6084 skb_release_data(skb);
6085 } else {
6086		/* we can reuse the existing refcount - all we did was
6087		 * relocate values
6088 */
6089 skb_free_head(skb);
6090 }
6091
Sowmini Varadhan6fa01cc2016-04-22 18:36:35 -07006092 skb->head = data;
6093 skb->data = data;
6094 skb->head_frag = 0;
6095#ifdef NET_SKBUFF_DATA_USES_OFFSET
6096 skb->end = size;
Sowmini Varadhan6fa01cc2016-04-22 18:36:35 -07006097#else
6098 skb->end = skb->head + size;
6099#endif
6100 skb_set_tail_pointer(skb, skb_headlen(skb));
6101 skb_headers_offset_update(skb, 0);
6102 skb->cloned = 0;
6103 skb->hdr_len = 0;
6104 skb->nohdr = 0;
6105 atomic_set(&skb_shinfo(skb)->dataref, 1);
6106
6107 return 0;
6108}
6109
6110static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);
6111
6112/* carve out the first eat bytes from skb's frag_list. May recurse into
6113 * pskb_carve()
6114 */
6115static int pskb_carve_frag_list(struct sk_buff *skb,
6116 struct skb_shared_info *shinfo, int eat,
6117 gfp_t gfp_mask)
6118{
6119 struct sk_buff *list = shinfo->frag_list;
6120 struct sk_buff *clone = NULL;
6121 struct sk_buff *insp = NULL;
6122
6123 do {
6124 if (!list) {
6125 pr_err("Not enough bytes to eat. Want %d\n", eat);
6126 return -EFAULT;
6127 }
6128 if (list->len <= eat) {
6129 /* Eaten as whole. */
6130 eat -= list->len;
6131 list = list->next;
6132 insp = list;
6133 } else {
6134 /* Eaten partially. */
6135 if (skb_shared(list)) {
6136 clone = skb_clone(list, gfp_mask);
6137 if (!clone)
6138 return -ENOMEM;
6139 insp = list->next;
6140 list = clone;
6141 } else {
6142 /* This may be pulled without problems. */
6143 insp = list;
6144 }
6145 if (pskb_carve(list, eat, gfp_mask) < 0) {
6146 kfree_skb(clone);
6147 return -ENOMEM;
6148 }
6149 break;
6150 }
6151 } while (eat);
6152
6153 /* Free pulled out fragments. */
6154 while ((list = shinfo->frag_list) != insp) {
6155 shinfo->frag_list = list->next;
6156 kfree_skb(list);
6157 }
6158 /* And insert new clone at head. */
6159 if (clone) {
6160 clone->next = list;
6161 shinfo->frag_list = clone;
6162 }
6163 return 0;
6164}
6165
6166/* carve off first len bytes from skb. Split line (off) is in the
6167 * non-linear part of skb
6168 */
6169static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
6170 int pos, gfp_t gfp_mask)
6171{
6172 int i, k = 0;
6173 int size = skb_end_offset(skb);
6174 u8 *data;
6175 const int nfrags = skb_shinfo(skb)->nr_frags;
6176 struct skb_shared_info *shinfo;
Sowmini Varadhan6fa01cc2016-04-22 18:36:35 -07006177
6178 size = SKB_DATA_ALIGN(size);
6179
6180 if (skb_pfmemalloc(skb))
6181 gfp_mask |= __GFP_MEMALLOC;
6182 data = kmalloc_reserve(size +
6183 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
6184 gfp_mask, NUMA_NO_NODE, NULL);
6185 if (!data)
6186 return -ENOMEM;
6187
6188 size = SKB_WITH_OVERHEAD(ksize(data));
6189
6190 memcpy((struct skb_shared_info *)(data + size),
Miaohe Line3ec1e82020-08-15 04:48:53 -04006191 skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
Sowmini Varadhan6fa01cc2016-04-22 18:36:35 -07006192 if (skb_orphan_frags(skb, gfp_mask)) {
6193 kfree(data);
6194 return -ENOMEM;
6195 }
6196 shinfo = (struct skb_shared_info *)(data + size);
6197 for (i = 0; i < nfrags; i++) {
6198 int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]);
6199
6200 if (pos + fsize > off) {
6201 shinfo->frags[k] = skb_shinfo(skb)->frags[i];
6202
6203 if (pos < off) {
6204 /* Split frag.
6205 * We have two variants in this case:
6206 * 1. Move all the frag to the second
6207 * part, if it is possible. F.e.
6208 * this approach is mandatory for TUX,
6209 * where splitting is expensive.
6210				 * 2. Split accurately. This is what we do here.
6211 */
Jonathan Lemonb54c9d52019-07-30 07:40:33 -07006212 skb_frag_off_add(&shinfo->frags[0], off - pos);
Sowmini Varadhan6fa01cc2016-04-22 18:36:35 -07006213 skb_frag_size_sub(&shinfo->frags[0], off - pos);
6214 }
6215 skb_frag_ref(skb, i);
6216 k++;
6217 }
6218 pos += fsize;
6219 }
6220 shinfo->nr_frags = k;
6221 if (skb_has_frag_list(skb))
6222 skb_clone_fraglist(skb);
6223
Miaohe Lineabe8612020-08-15 04:46:41 -04006224 /* split line is in frag list */
6225 if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) {
6226 /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
6227 if (skb_has_frag_list(skb))
6228 kfree_skb_list(skb_shinfo(skb)->frag_list);
6229 kfree(data);
6230 return -ENOMEM;
Sowmini Varadhan6fa01cc2016-04-22 18:36:35 -07006231 }
6232 skb_release_data(skb);
6233
Sowmini Varadhan6fa01cc2016-04-22 18:36:35 -07006234 skb->head = data;
6235 skb->head_frag = 0;
6236 skb->data = data;
6237#ifdef NET_SKBUFF_DATA_USES_OFFSET
6238 skb->end = size;
Sowmini Varadhan6fa01cc2016-04-22 18:36:35 -07006239#else
6240 skb->end = skb->head + size;
6241#endif
6242 skb_reset_tail_pointer(skb);
6243 skb_headers_offset_update(skb, 0);
6244 skb->cloned = 0;
6245 skb->hdr_len = 0;
6246 skb->nohdr = 0;
6247 skb->len -= off;
6248 skb->data_len = skb->len;
6249 atomic_set(&skb_shinfo(skb)->dataref, 1);
6250 return 0;
6251}
6252
6253/* remove len bytes from the beginning of the skb */
6254static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp)
6255{
6256 int headlen = skb_headlen(skb);
6257
6258 if (len < headlen)
6259 return pskb_carve_inside_header(skb, len, headlen, gfp);
6260 else
6261 return pskb_carve_inside_nonlinear(skb, len, headlen, gfp);
6262}
6263
6264/* Extract to_copy bytes starting at off from skb, and return this in
6265 * a new skb
6266 */
6267struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
6268 int to_copy, gfp_t gfp)
6269{
6270 struct sk_buff *clone = skb_clone(skb, gfp);
6271
6272 if (!clone)
6273 return NULL;
6274
6275 if (pskb_carve(clone, off, gfp) < 0 ||
6276 pskb_trim(clone, to_copy)) {
6277 kfree_skb(clone);
6278 return NULL;
6279 }
6280 return clone;
6281}
6282EXPORT_SYMBOL(pskb_extract);
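
/*
 * Example (illustrative sketch): splitting a large skb into fixed-size
 * records without copying payload, in the spirit of how RDS uses this
 * helper. example_split_records() is an invented name.
 */
static int example_split_records(struct sk_buff *skb, int rec_len,
				 struct sk_buff_head *list)
{
	int off = 0;

	while (off + rec_len <= skb->len) {
		/* each record is a clone carved down to [off, off + rec_len) */
		struct sk_buff *rec = pskb_extract(skb, off, rec_len,
						   GFP_KERNEL);

		if (!rec)
			return -ENOMEM;

		__skb_queue_tail(list, rec);
		off += rec_len;
	}

	return 0;
}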
Eric Dumazetc8c8b122016-12-07 09:19:33 -08006283
6284/**
6285 * skb_condense - try to get rid of fragments/frag_list if possible
6286 * @skb: buffer
6287 *
6288 * Can be used to save memory before skb is added to a busy queue.
6289 * If packet has bytes in frags and enough tail room in skb->head,
6290 * pull all of them, so that we can free the frags right now and adjust
6291 * truesize.
6292 * Notes:
6293 * We do not reallocate skb->head, thus this cannot fail.
6294 * Caller must re-evaluate skb->truesize if needed.
6295 */
6296void skb_condense(struct sk_buff *skb)
6297{
Eric Dumazet3174fed2016-12-09 08:02:05 -08006298 if (skb->data_len) {
6299 if (skb->data_len > skb->end - skb->tail ||
6300 skb_cloned(skb))
6301 return;
Eric Dumazetc8c8b122016-12-07 09:19:33 -08006302
Eric Dumazet3174fed2016-12-09 08:02:05 -08006303 /* Nice, we can free page frag(s) right now */
6304 __pskb_pull_tail(skb, skb->data_len);
6305 }
6306	/* At this point, skb->truesize might be overestimated,
6307	 * because the skb had fragments, and fragments do not report
6308	 * their truesize.
6309	 * When we pulled their content into skb->head, the fragments
6310	 * were freed, but __pskb_pull_tail() could not possibly
6311	 * adjust skb->truesize, not knowing the frag truesize.
Eric Dumazetc8c8b122016-12-07 09:19:33 -08006312 */
6313 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
6314}
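
/*
 * Example (illustrative sketch): condensing a small packet before parking
 * it on a queue for later processing, similar to what TCP's backlog path
 * does before accounting memory. example_park() is an invented name and the
 * memory accounting is only indicated by a comment.
 */
static void example_park(struct sk_buff_head *q, struct sk_buff *skb)
{
	skb_condense(skb);

	/* skb->truesize may have changed: re-read it here before charging
	 * it to whatever memory budget guards this queue
	 */
	__skb_queue_tail(q, skb);
}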
Florian Westphaldf5042f2018-12-18 17:15:16 +01006315
6316#ifdef CONFIG_SKB_EXTENSIONS
6317static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
6318{
6319 return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE);
6320}
6321
Paolo Abeni8b69a802020-01-09 07:59:24 -08006322/**
6323 * __skb_ext_alloc - allocate a new skb extensions storage
6324 *
Florian Westphal4930f482020-05-16 10:46:23 +02006325 * @flags: See kmalloc().
6326 *
Paolo Abeni8b69a802020-01-09 07:59:24 -08006327 * Returns the newly allocated pointer. The pointer can later be attached to a
6328 * skb via __skb_ext_set().
6329 * Note: caller must handle the skb_ext as an opaque data.
6330 */
Florian Westphal4930f482020-05-16 10:46:23 +02006331struct skb_ext *__skb_ext_alloc(gfp_t flags)
Florian Westphaldf5042f2018-12-18 17:15:16 +01006332{
Florian Westphal4930f482020-05-16 10:46:23 +02006333 struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags);
Florian Westphaldf5042f2018-12-18 17:15:16 +01006334
6335 if (new) {
6336 memset(new->offset, 0, sizeof(new->offset));
6337 refcount_set(&new->refcnt, 1);
6338 }
6339
6340 return new;
6341}
6342
Florian Westphal41650792018-12-18 17:15:27 +01006343static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old,
6344 unsigned int old_active)
Florian Westphaldf5042f2018-12-18 17:15:16 +01006345{
6346 struct skb_ext *new;
6347
6348 if (refcount_read(&old->refcnt) == 1)
6349 return old;
6350
6351 new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
6352 if (!new)
6353 return NULL;
6354
6355 memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE);
6356 refcount_set(&new->refcnt, 1);
6357
Florian Westphal41650792018-12-18 17:15:27 +01006358#ifdef CONFIG_XFRM
6359 if (old_active & (1 << SKB_EXT_SEC_PATH)) {
6360 struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH);
6361 unsigned int i;
6362
6363 for (i = 0; i < sp->len; i++)
6364 xfrm_state_hold(sp->xvec[i]);
6365 }
6366#endif
Florian Westphaldf5042f2018-12-18 17:15:16 +01006367 __skb_ext_put(old);
6368 return new;
6369}
6370
6371/**
Paolo Abeni8b69a802020-01-09 07:59:24 -08006372 * __skb_ext_set - attach the specified extension storage to this skb
6373 * @skb: buffer
6374 * @id: extension id
6375 * @ext: extension storage previously allocated via __skb_ext_alloc()
6376 *
6377 * Existing extensions, if any, are cleared.
6378 *
6379 * Returns the pointer to the extension.
6380 */
6381void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
6382 struct skb_ext *ext)
6383{
6384 unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext);
6385
6386 skb_ext_put(skb);
6387 newlen = newoff + skb_ext_type_len[id];
6388 ext->chunks = newlen;
6389 ext->offset[id] = newoff;
6390 skb->extensions = ext;
6391 skb->active_extensions = 1 << id;
6392 return skb_ext_get_ptr(ext, id);
6393}
6394
6395/**
Florian Westphaldf5042f2018-12-18 17:15:16 +01006396 * skb_ext_add - allocate space for given extension, COW if needed
6397 * @skb: buffer
6398 * @id: extension to allocate space for
6399 *
6400 * Allocates enough space for the given extension.
6401 * If the extension is already present, a pointer to that extension
6402 * is returned.
6403 *
6404 * If the skb was cloned, COW applies and the returned memory can be
6405 * modified without changing the extension space of clones buffers.
6406 * modified without changing the extension space of cloned buffers.
6407 * Returns pointer to the extension or NULL on allocation failure.
6408 */
6409void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
6410{
6411 struct skb_ext *new, *old = NULL;
6412 unsigned int newlen, newoff;
6413
6414 if (skb->active_extensions) {
6415 old = skb->extensions;
6416
Florian Westphal41650792018-12-18 17:15:27 +01006417 new = skb_ext_maybe_cow(old, skb->active_extensions);
Florian Westphaldf5042f2018-12-18 17:15:16 +01006418 if (!new)
6419 return NULL;
6420
Paolo Abeni682ec852018-12-21 19:03:15 +01006421 if (__skb_ext_exist(new, id))
Florian Westphaldf5042f2018-12-18 17:15:16 +01006422 goto set_active;
Florian Westphaldf5042f2018-12-18 17:15:16 +01006423
Paolo Abenie94e50b2018-12-21 19:03:13 +01006424 newoff = new->chunks;
Florian Westphaldf5042f2018-12-18 17:15:16 +01006425 } else {
6426 newoff = SKB_EXT_CHUNKSIZEOF(*new);
6427
Florian Westphal4930f482020-05-16 10:46:23 +02006428 new = __skb_ext_alloc(GFP_ATOMIC);
Florian Westphaldf5042f2018-12-18 17:15:16 +01006429 if (!new)
6430 return NULL;
6431 }
6432
6433 newlen = newoff + skb_ext_type_len[id];
6434 new->chunks = newlen;
6435 new->offset[id] = newoff;
Florian Westphaldf5042f2018-12-18 17:15:16 +01006436set_active:
Paolo Abeni682ec852018-12-21 19:03:15 +01006437 skb->extensions = new;
Florian Westphaldf5042f2018-12-18 17:15:16 +01006438 skb->active_extensions |= 1 << id;
6439 return skb_ext_get_ptr(new, id);
6440}
6441EXPORT_SYMBOL(skb_ext_add);
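
#ifdef CONFIG_XFRM
/*
 * Example (illustrative sketch): attaching the sec_path extension to an skb,
 * roughly what secpath_set() in the XFRM code does with this API.
 * example_get_sec_path() is an invented name.
 */
static struct sec_path *example_get_sec_path(struct sk_buff *skb)
{
	struct sec_path *sp;

	/* COWs the extension block if the skb was cloned and returns the
	 * existing entry if SKB_EXT_SEC_PATH is already attached
	 */
	sp = skb_ext_add(skb, SKB_EXT_SEC_PATH);
	if (!sp)
		return NULL;

	/* freshly added extension space is not zeroed; a real user
	 * initialises the returned structure before use
	 */
	return sp;
}
#endif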
6442
Florian Westphal41650792018-12-18 17:15:27 +01006443#ifdef CONFIG_XFRM
6444static void skb_ext_put_sp(struct sec_path *sp)
6445{
6446 unsigned int i;
6447
6448 for (i = 0; i < sp->len; i++)
6449 xfrm_state_put(sp->xvec[i]);
6450}
6451#endif
6452
Florian Westphaldf5042f2018-12-18 17:15:16 +01006453void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
6454{
6455 struct skb_ext *ext = skb->extensions;
6456
6457 skb->active_extensions &= ~(1 << id);
6458 if (skb->active_extensions == 0) {
6459 skb->extensions = NULL;
6460 __skb_ext_put(ext);
Florian Westphal41650792018-12-18 17:15:27 +01006461#ifdef CONFIG_XFRM
6462 } else if (id == SKB_EXT_SEC_PATH &&
6463 refcount_read(&ext->refcnt) == 1) {
6464 struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH);
6465
6466 skb_ext_put_sp(sp);
6467 sp->len = 0;
6468#endif
Florian Westphaldf5042f2018-12-18 17:15:16 +01006469 }
6470}
6471EXPORT_SYMBOL(__skb_ext_del);
6472
6473void __skb_ext_put(struct skb_ext *ext)
6474{
6475	/* If this is the last clone, nothing can increment
6476 * it after check passes. Avoids one atomic op.
6477 */
6478 if (refcount_read(&ext->refcnt) == 1)
6479 goto free_now;
6480
6481 if (!refcount_dec_and_test(&ext->refcnt))
6482 return;
6483free_now:
Florian Westphal41650792018-12-18 17:15:27 +01006484#ifdef CONFIG_XFRM
6485 if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH))
6486 skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH));
6487#endif
6488
Florian Westphaldf5042f2018-12-18 17:15:16 +01006489 kmem_cache_free(skbuff_ext_cache, ext);
6490}
6491EXPORT_SYMBOL(__skb_ext_put);
6492#endif /* CONFIG_SKB_EXTENSIONS */