blob: 614296a920c6ab189b58607726eea55bee4d0426 [file] [log] [blame]
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	$Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */
30
31#include <linux/config.h>
32#include <linux/errno.h>
33#include <linux/types.h>
34#include <linux/string.h>
35#include <linux/socket.h>
36#include <linux/net.h>
37#include <linux/netdevice.h>
38#include <linux/if_arp.h>
39#include <linux/in6.h>
40#include <linux/tcp.h>
41#include <linux/route.h>
42
43#include <linux/netfilter.h>
44#include <linux/netfilter_ipv6.h>
45
46#include <net/sock.h>
47#include <net/snmp.h>
48
49#include <net/ipv6.h>
50#include <net/ndisc.h>
51#include <net/protocol.h>
52#include <net/ip6_route.h>
53#include <net/addrconf.h>
54#include <net/rawv6.h>
55#include <net/icmp.h>
56#include <net/xfrm.h>
57#include <net/checksum.h>
58
59static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
62{
63 static u32 ipv6_fragmentation_id = 1;
64 static DEFINE_SPINLOCK(ip6_id_lock);
65
66 spin_lock_bh(&ip6_id_lock);
67 fhdr->identification = htonl(ipv6_fragmentation_id);
68 if (++ipv6_fragmentation_id == 0)
69 ipv6_fragmentation_id = 1;
70 spin_unlock_bh(&ip6_id_lock);
71}
72
73static inline int ip6_output_finish(struct sk_buff *skb)
74{
75
76 struct dst_entry *dst = skb->dst;
77 struct hh_cache *hh = dst->hh;
78
79 if (hh) {
80 int hh_alen;
81
82 read_lock_bh(&hh->hh_lock);
83 hh_alen = HH_DATA_ALIGN(hh->hh_len);
84 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
85 read_unlock_bh(&hh->hh_lock);
86 skb_push(skb, hh->hh_len);
87 return hh->hh_output(skb);
88 } else if (dst->neighbour)
89 return dst->neighbour->output(skb);
90
91 IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
92 kfree_skb(skb);
93 return -EINVAL;
94
95}
96
97/* dev_loopback_xmit for use with netfilter. */
98static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
99{
100 newskb->mac.raw = newskb->data;
101 __skb_pull(newskb, newskb->nh.raw - newskb->data);
102 newskb->pkt_type = PACKET_LOOPBACK;
103 newskb->ip_summed = CHECKSUM_UNNECESSARY;
104 BUG_TRAP(newskb->dst);
105
106 netif_rx(newskb);
107 return 0;
108}
109
110
111static int ip6_output2(struct sk_buff *skb)
112{
113 struct dst_entry *dst = skb->dst;
114 struct net_device *dev = dst->dev;
115
116 skb->protocol = htons(ETH_P_IPV6);
117 skb->dev = dev;
118
119 if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
120 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
121
122 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
123 ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
124 &skb->nh.ipv6h->saddr)) {
125 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
126
127 /* Do not check for IFF_ALLMULTI; multicast routing
128 is not supported in any case.
129 */
130 if (newskb)
131 NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
132 newskb->dev,
133 ip6_dev_loopback_xmit);
134
135 if (skb->nh.ipv6h->hop_limit == 0) {
136 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
137 kfree_skb(skb);
138 return 0;
139 }
140 }
141
142 IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
143 }
144
145 return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
146}
147
148int ip6_output(struct sk_buff *skb)
149{
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700150 if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) ||
151 dst_allfrag(skb->dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700152 return ip6_fragment(skb, ip6_output2);
153 else
154 return ip6_output2(skb);
155}
156
Linus Torvalds1da177e2005-04-16 15:20:36 -0700157/*
158 * xmit an sk_buff (used by TCP)
159 */
160
161int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
162 struct ipv6_txoptions *opt, int ipfragok)
163{
164 struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
165 struct in6_addr *first_hop = &fl->fl6_dst;
166 struct dst_entry *dst = skb->dst;
167 struct ipv6hdr *hdr;
168 u8 proto = fl->proto;
169 int seg_len = skb->len;
YOSHIFUJI Hideaki41a1f8e2005-09-08 10:19:03 +0900170 int hlimit, tclass;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700171 u32 mtu;
172
173 if (opt) {
174 int head_room;
175
176 /* First: exthdrs may take lots of space (~8K for now)
177 MAX_HEADER is not enough.
178 */
179 head_room = opt->opt_nflen + opt->opt_flen;
180 seg_len += head_room;
181 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
182
183 if (skb_headroom(skb) < head_room) {
184 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
185 kfree_skb(skb);
186 skb = skb2;
187 if (skb == NULL) {
188 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
189 return -ENOBUFS;
190 }
191 if (sk)
192 skb_set_owner_w(skb, sk);
193 }
194 if (opt->opt_flen)
195 ipv6_push_frag_opts(skb, opt, &proto);
196 if (opt->opt_nflen)
197 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
198 }
199
200 hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
201
202 /*
203 * Fill in the IPv6 header
204 */
205
Linus Torvalds1da177e2005-04-16 15:20:36 -0700206 hlimit = -1;
207 if (np)
208 hlimit = np->hop_limit;
209 if (hlimit < 0)
210 hlimit = dst_metric(dst, RTAX_HOPLIMIT);
211 if (hlimit < 0)
212 hlimit = ipv6_get_hoplimit(dst->dev);
213
YOSHIFUJI Hideaki41a1f8e2005-09-08 10:19:03 +0900214 tclass = -1;
215 if (np)
216 tclass = np->tclass;
217 if (tclass < 0)
218 tclass = 0;
219
220 *(u32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
221
Linus Torvalds1da177e2005-04-16 15:20:36 -0700222 hdr->payload_len = htons(seg_len);
223 hdr->nexthdr = proto;
224 hdr->hop_limit = hlimit;
225
226 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
227 ipv6_addr_copy(&hdr->daddr, first_hop);
228
229 mtu = dst_mtu(dst);
230 if ((skb->len <= mtu) || ipfragok) {
231 IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
Harald Welte6869c4d2005-08-09 19:24:19 -0700232 return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
233 dst_output);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700234 }
235
236 if (net_ratelimit())
237 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
238 skb->dev = dst->dev;
239 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
240 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
241 kfree_skb(skb);
242 return -EMSGSIZE;
243}
244
245/*
246 * To avoid extra problems ND packets are send through this
247 * routine. It's code duplication but I really want to avoid
248 * extra checks since ipv6_build_header is used by TCP (which
249 * is for us performance critical)
250 */
251
252int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
253 struct in6_addr *saddr, struct in6_addr *daddr,
254 int proto, int len)
255{
256 struct ipv6_pinfo *np = inet6_sk(sk);
257 struct ipv6hdr *hdr;
258 int totlen;
259
260 skb->protocol = htons(ETH_P_IPV6);
261 skb->dev = dev;
262
263 totlen = len + sizeof(struct ipv6hdr);
264
265 hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
266 skb->nh.ipv6h = hdr;
267
268 *(u32*)hdr = htonl(0x60000000);
269
270 hdr->payload_len = htons(len);
271 hdr->nexthdr = proto;
272 hdr->hop_limit = np->hop_limit;
273
274 ipv6_addr_copy(&hdr->saddr, saddr);
275 ipv6_addr_copy(&hdr->daddr, daddr);
276
277 return 0;
278}
279
280static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
281{
282 struct ip6_ra_chain *ra;
283 struct sock *last = NULL;
284
285 read_lock(&ip6_ra_lock);
286 for (ra = ip6_ra_chain; ra; ra = ra->next) {
287 struct sock *sk = ra->sk;
Andrew McDonald0bd1b592005-08-09 19:44:42 -0700288 if (sk && ra->sel == sel &&
289 (!sk->sk_bound_dev_if ||
290 sk->sk_bound_dev_if == skb->dev->ifindex)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700291 if (last) {
292 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
293 if (skb2)
294 rawv6_rcv(last, skb2);
295 }
296 last = sk;
297 }
298 }
299
300 if (last) {
301 rawv6_rcv(last, skb);
302 read_unlock(&ip6_ra_lock);
303 return 1;
304 }
305 read_unlock(&ip6_ra_lock);
306 return 0;
307}
308
309static inline int ip6_forward_finish(struct sk_buff *skb)
310{
311 return dst_output(skb);
312}
313
314int ip6_forward(struct sk_buff *skb)
315{
316 struct dst_entry *dst = skb->dst;
317 struct ipv6hdr *hdr = skb->nh.ipv6h;
318 struct inet6_skb_parm *opt = IP6CB(skb);
319
320 if (ipv6_devconf.forwarding == 0)
321 goto error;
322
323 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
324 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
325 goto drop;
326 }
327
328 skb->ip_summed = CHECKSUM_NONE;
329
330 /*
331 * We DO NOT make any processing on
332 * RA packets, pushing them to user level AS IS
333 * without ane WARRANTY that application will be able
334 * to interpret them. The reason is that we
335 * cannot make anything clever here.
336 *
337 * We are not end-node, so that if packet contains
338 * AH/ESP, we cannot make anything.
339 * Defragmentation also would be mistake, RA packets
340 * cannot be fragmented, because there is no warranty
341 * that different fragments will go along one path. --ANK
342 */
343 if (opt->ra) {
344 u8 *ptr = skb->nh.raw + opt->ra;
345 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
346 return 0;
347 }
348
349 /*
350 * check and decrement ttl
351 */
352 if (hdr->hop_limit <= 1) {
353 /* Force OUTPUT device used as source address */
354 skb->dev = dst->dev;
355 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
356 0, skb->dev);
357
358 kfree_skb(skb);
359 return -ETIMEDOUT;
360 }
361
362 if (!xfrm6_route_forward(skb)) {
363 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
364 goto drop;
365 }
366 dst = skb->dst;
367
368 /* IPv6 specs say nothing about it, but it is clear that we cannot
369 send redirects to source routed frames.
370 */
371 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
372 struct in6_addr *target = NULL;
373 struct rt6_info *rt;
374 struct neighbour *n = dst->neighbour;
375
376 /*
377 * incoming and outgoing devices are the same
378 * send a redirect.
379 */
380
381 rt = (struct rt6_info *) dst;
382 if ((rt->rt6i_flags & RTF_GATEWAY))
383 target = (struct in6_addr*)&n->primary_key;
384 else
385 target = &hdr->daddr;
386
387 /* Limit redirects both by destination (here)
388 and by source (inside ndisc_send_redirect)
389 */
390 if (xrlim_allow(dst, 1*HZ))
391 ndisc_send_redirect(skb, n, target);
392 } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
393 |IPV6_ADDR_LINKLOCAL)) {
394 /* This check is security critical. */
395 goto error;
396 }
397
398 if (skb->len > dst_mtu(dst)) {
399 /* Again, force OUTPUT device used as source address */
400 skb->dev = dst->dev;
401 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
402 IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
403 IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
404 kfree_skb(skb);
405 return -EMSGSIZE;
406 }
407
408 if (skb_cow(skb, dst->dev->hard_header_len)) {
409 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
410 goto drop;
411 }
412
413 hdr = skb->nh.ipv6h;
414
415 /* Mangling hops number delayed to point after skb COW */
416
417 hdr->hop_limit--;
418
419 IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
420 return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
421
422error:
423 IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
424drop:
425 kfree_skb(skb);
426 return -EINVAL;
427}
428
429static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
430{
431 to->pkt_type = from->pkt_type;
432 to->priority = from->priority;
433 to->protocol = from->protocol;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700434 dst_release(to->dst);
435 to->dst = dst_clone(from->dst);
436 to->dev = from->dev;
437
438#ifdef CONFIG_NET_SCHED
439 to->tc_index = from->tc_index;
440#endif
441#ifdef CONFIG_NETFILTER
442 to->nfmark = from->nfmark;
443 /* Connection association is same as pre-frag packet */
444 to->nfct = from->nfct;
445 nf_conntrack_get(to->nfct);
446 to->nfctinfo = from->nfctinfo;
447#ifdef CONFIG_BRIDGE_NETFILTER
448 nf_bridge_put(to->nf_bridge);
449 to->nf_bridge = from->nf_bridge;
450 nf_bridge_get(to->nf_bridge);
451#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700452#endif
453}
454
455int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
456{
457 u16 offset = sizeof(struct ipv6hdr);
458 struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
459 unsigned int packet_len = skb->tail - skb->nh.raw;
460 int found_rhdr = 0;
461 *nexthdr = &skb->nh.ipv6h->nexthdr;
462
463 while (offset + 1 <= packet_len) {
464
465 switch (**nexthdr) {
466
467 case NEXTHDR_HOP:
468 case NEXTHDR_ROUTING:
469 case NEXTHDR_DEST:
470 if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
471 if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
472 offset += ipv6_optlen(exthdr);
473 *nexthdr = &exthdr->nexthdr;
474 exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
475 break;
476 default :
477 return offset;
478 }
479 }
480
481 return offset;
482}
483
484static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
485{
486 struct net_device *dev;
487 struct sk_buff *frag;
488 struct rt6_info *rt = (struct rt6_info*)skb->dst;
489 struct ipv6hdr *tmp_hdr;
490 struct frag_hdr *fh;
491 unsigned int mtu, hlen, left, len;
492 u32 frag_id = 0;
493 int ptr, offset = 0, err=0;
494 u8 *prevhdr, nexthdr = 0;
495
496 dev = rt->u.dst.dev;
497 hlen = ip6_find_1stfragopt(skb, &prevhdr);
498 nexthdr = *prevhdr;
499
500 mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
501
502 if (skb_shinfo(skb)->frag_list) {
503 int first_len = skb_pagelen(skb);
504
505 if (first_len - hlen > mtu ||
506 ((first_len - hlen) & 7) ||
507 skb_cloned(skb))
508 goto slow_path;
509
510 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
511 /* Correct geometry. */
512 if (frag->len > mtu ||
513 ((frag->len & 7) && frag->next) ||
514 skb_headroom(frag) < hlen)
515 goto slow_path;
516
Linus Torvalds1da177e2005-04-16 15:20:36 -0700517 /* Partially cloned skb? */
518 if (skb_shared(frag))
519 goto slow_path;
Herbert Xu2fdba6b2005-05-18 22:52:33 -0700520
521 BUG_ON(frag->sk);
522 if (skb->sk) {
523 sock_hold(skb->sk);
524 frag->sk = skb->sk;
525 frag->destructor = sock_wfree;
526 skb->truesize -= frag->truesize;
527 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700528 }
529
530 err = 0;
531 offset = 0;
532 frag = skb_shinfo(skb)->frag_list;
533 skb_shinfo(skb)->frag_list = NULL;
534 /* BUILD HEADER */
535
536 tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
537 if (!tmp_hdr) {
538 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
539 return -ENOMEM;
540 }
541
542 *prevhdr = NEXTHDR_FRAGMENT;
543 memcpy(tmp_hdr, skb->nh.raw, hlen);
544 __skb_pull(skb, hlen);
545 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
546 skb->nh.raw = __skb_push(skb, hlen);
547 memcpy(skb->nh.raw, tmp_hdr, hlen);
548
549 ipv6_select_ident(skb, fh);
550 fh->nexthdr = nexthdr;
551 fh->reserved = 0;
552 fh->frag_off = htons(IP6_MF);
553 frag_id = fh->identification;
554
555 first_len = skb_pagelen(skb);
556 skb->data_len = first_len - skb_headlen(skb);
557 skb->len = first_len;
558 skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
559
560
561 for (;;) {
562 /* Prepare header of the next frame,
563 * before previous one went down. */
564 if (frag) {
565 frag->ip_summed = CHECKSUM_NONE;
566 frag->h.raw = frag->data;
567 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
568 frag->nh.raw = __skb_push(frag, hlen);
569 memcpy(frag->nh.raw, tmp_hdr, hlen);
570 offset += skb->len - hlen - sizeof(struct frag_hdr);
571 fh->nexthdr = nexthdr;
572 fh->reserved = 0;
573 fh->frag_off = htons(offset);
574 if (frag->next != NULL)
575 fh->frag_off |= htons(IP6_MF);
576 fh->identification = frag_id;
577 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
578 ip6_copy_metadata(frag, skb);
579 }
580
581 err = output(skb);
582 if (err || !frag)
583 break;
584
585 skb = frag;
586 frag = skb->next;
587 skb->next = NULL;
588 }
589
590 if (tmp_hdr)
591 kfree(tmp_hdr);
592
593 if (err == 0) {
594 IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
595 return 0;
596 }
597
598 while (frag) {
599 skb = frag->next;
600 kfree_skb(frag);
601 frag = skb;
602 }
603
604 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
605 return err;
606 }
607
608slow_path:
609 left = skb->len - hlen; /* Space per frame */
610 ptr = hlen; /* Where to start from */
611
612 /*
613 * Fragment the datagram.
614 */
615
616 *prevhdr = NEXTHDR_FRAGMENT;
617
618 /*
619 * Keep copying data until we run out.
620 */
621 while(left > 0) {
622 len = left;
623 /* IF: it doesn't fit, use 'mtu' - the data space left */
624 if (len > mtu)
625 len = mtu;
626 /* IF: we are not sending upto and including the packet end
627 then align the next start on an eight byte boundary */
628 if (len < left) {
629 len &= ~7;
630 }
631 /*
632 * Allocate buffer.
633 */
634
635 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
Patrick McHardy64ce2072005-08-09 20:50:53 -0700636 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700637 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
638 err = -ENOMEM;
639 goto fail;
640 }
641
642 /*
643 * Set up data on packet
644 */
645
646 ip6_copy_metadata(frag, skb);
647 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
648 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
649 frag->nh.raw = frag->data;
650 fh = (struct frag_hdr*)(frag->data + hlen);
651 frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
652
653 /*
654 * Charge the memory for the fragment to any owner
655 * it might possess
656 */
657 if (skb->sk)
658 skb_set_owner_w(frag, skb->sk);
659
660 /*
661 * Copy the packet header into the new buffer.
662 */
663 memcpy(frag->nh.raw, skb->data, hlen);
664
665 /*
666 * Build fragment header.
667 */
668 fh->nexthdr = nexthdr;
669 fh->reserved = 0;
Yan Zhengf36d6ab2005-10-03 14:19:15 -0700670 if (!frag_id) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700671 ipv6_select_ident(skb, fh);
672 frag_id = fh->identification;
673 } else
674 fh->identification = frag_id;
675
676 /*
677 * Copy a block of the IP datagram.
678 */
679 if (skb_copy_bits(skb, ptr, frag->h.raw, len))
680 BUG();
681 left -= len;
682
683 fh->frag_off = htons(offset);
684 if (left > 0)
685 fh->frag_off |= htons(IP6_MF);
686 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
687
688 ptr += len;
689 offset += len;
690
691 /*
692 * Put this fragment into the sending queue.
693 */
694
695 IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
696
697 err = output(frag);
698 if (err)
699 goto fail;
700 }
701 kfree_skb(skb);
702 IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
703 return err;
704
705fail:
706 kfree_skb(skb);
707 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
708 return err;
709}
710
711int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
712{
713 int err = 0;
714
715 *dst = NULL;
716 if (sk) {
717 struct ipv6_pinfo *np = inet6_sk(sk);
718
719 *dst = sk_dst_check(sk, np->dst_cookie);
720 if (*dst) {
721 struct rt6_info *rt = (struct rt6_info*)*dst;
722
723 /* Yes, checking route validity in not connected
724 case is not very simple. Take into account,
725 that we do not support routing by source, TOS,
726 and MSG_DONTROUTE --ANK (980726)
727
728 1. If route was host route, check that
729 cached destination is current.
730 If it is network route, we still may
731 check its validity using saved pointer
732 to the last used address: daddr_cache.
733 We do not want to save whole address now,
734 (because main consumer of this service
735 is tcp, which has not this problem),
736 so that the last trick works only on connected
737 sockets.
738 2. oif also should be the same.
739 */
740
741 if (((rt->rt6i_dst.plen != 128 ||
742 !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
743 && (np->daddr_cache == NULL ||
744 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
745 || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
746 dst_release(*dst);
747 *dst = NULL;
748 }
749 }
750 }
751
752 if (*dst == NULL)
753 *dst = ip6_route_output(sk, fl);
754
755 if ((err = (*dst)->error))
756 goto out_err_release;
757
758 if (ipv6_addr_any(&fl->fl6_src)) {
759 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
760
Olaf Hering44456d32005-07-27 11:45:17 -0700761 if (err)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700762 goto out_err_release;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700763 }
764
765 return 0;
766
767out_err_release:
768 dst_release(*dst);
769 *dst = NULL;
770 return err;
771}
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700772inline int ip6_ufo_append_data(struct sock *sk,
773 int getfrag(void *from, char *to, int offset, int len,
774 int odd, struct sk_buff *skb),
775 void *from, int length, int hh_len, int fragheaderlen,
776 int transhdrlen, int mtu,unsigned int flags)
777
778{
779 struct sk_buff *skb;
780 int err;
781
782 /* There is support for UDP large send offload by network
783 * device, so create one single skb packet containing complete
784 * udp datagram
785 */
786 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
787 skb = sock_alloc_send_skb(sk,
788 hh_len + fragheaderlen + transhdrlen + 20,
789 (flags & MSG_DONTWAIT), &err);
790 if (skb == NULL)
791 return -ENOMEM;
792
793 /* reserve space for Hardware header */
794 skb_reserve(skb, hh_len);
795
796 /* create space for UDP/IP header */
797 skb_put(skb,fragheaderlen + transhdrlen);
798
799 /* initialize network header pointer */
800 skb->nh.raw = skb->data;
801
802 /* initialize protocol header pointer */
803 skb->h.raw = skb->data + fragheaderlen;
804
805 skb->ip_summed = CHECKSUM_HW;
806 skb->csum = 0;
807 sk->sk_sndmsg_off = 0;
808 }
809
810 err = skb_append_datato_frags(sk,skb, getfrag, from,
811 (length - transhdrlen));
812 if (!err) {
813 struct frag_hdr fhdr;
814
815 /* specify the length of each IP datagram fragment*/
816 skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) -
817 sizeof(struct frag_hdr);
818 ipv6_select_ident(skb, &fhdr);
819 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
820 __skb_queue_tail(&sk->sk_write_queue, skb);
821
822 return 0;
823 }
824 /* There is not enough support do UPD LSO,
825 * so follow normal path
826 */
827 kfree_skb(skb);
828
829 return err;
830}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700831
YOSHIFUJI Hideaki41a1f8e2005-09-08 10:19:03 +0900832int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
833 int offset, int len, int odd, struct sk_buff *skb),
834 void *from, int length, int transhdrlen,
835 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
836 struct rt6_info *rt, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700837{
838 struct inet_sock *inet = inet_sk(sk);
839 struct ipv6_pinfo *np = inet6_sk(sk);
840 struct sk_buff *skb;
841 unsigned int maxfraglen, fragheaderlen;
842 int exthdrlen;
843 int hh_len;
844 int mtu;
845 int copy;
846 int err;
847 int offset = 0;
848 int csummode = CHECKSUM_NONE;
849
850 if (flags&MSG_PROBE)
851 return 0;
852 if (skb_queue_empty(&sk->sk_write_queue)) {
853 /*
854 * setup for corking
855 */
856 if (opt) {
857 if (np->cork.opt == NULL) {
858 np->cork.opt = kmalloc(opt->tot_len,
859 sk->sk_allocation);
860 if (unlikely(np->cork.opt == NULL))
861 return -ENOBUFS;
862 } else if (np->cork.opt->tot_len < opt->tot_len) {
863 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
864 return -EINVAL;
865 }
866 memcpy(np->cork.opt, opt, opt->tot_len);
867 inet->cork.flags |= IPCORK_OPT;
868 /* need source address above miyazawa*/
869 }
870 dst_hold(&rt->u.dst);
871 np->cork.rt = rt;
872 inet->cork.fl = *fl;
873 np->cork.hop_limit = hlimit;
YOSHIFUJI Hideaki41a1f8e2005-09-08 10:19:03 +0900874 np->cork.tclass = tclass;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700875 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
876 if (dst_allfrag(rt->u.dst.path))
877 inet->cork.flags |= IPCORK_ALLFRAG;
878 inet->cork.length = 0;
879 sk->sk_sndmsg_page = NULL;
880 sk->sk_sndmsg_off = 0;
881 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
882 length += exthdrlen;
883 transhdrlen += exthdrlen;
884 } else {
885 rt = np->cork.rt;
886 fl = &inet->cork.fl;
887 if (inet->cork.flags & IPCORK_OPT)
888 opt = np->cork.opt;
889 transhdrlen = 0;
890 exthdrlen = 0;
891 mtu = inet->cork.fragsize;
892 }
893
894 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
895
896 fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
897 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
898
899 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
900 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
901 ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
902 return -EMSGSIZE;
903 }
904 }
905
906 /*
907 * Let's try using as much space as possible.
908 * Use MTU if total length of the message fits into the MTU.
909 * Otherwise, we need to reserve fragment header and
910 * fragment alignment (= 8-15 octects, in total).
911 *
912 * Note that we may need to "move" the data from the tail of
913 * of the buffer to the new fragment when we split
914 * the message.
915 *
916 * FIXME: It may be fragmented into multiple chunks
917 * at once if non-fragmentable extension headers
918 * are too large.
919 * --yoshfuji
920 */
921
922 inet->cork.length += length;
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700923 if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
924 (rt->u.dst.dev->features & NETIF_F_UFO)) {
925
926 if(ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
927 fragheaderlen, transhdrlen, mtu, flags))
928 goto error;
929
930 return 0;
931 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700932
933 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
934 goto alloc_new_skb;
935
936 while (length > 0) {
937 /* Check if the remaining data fits into current packet. */
938 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
939 if (copy < length)
940 copy = maxfraglen - skb->len;
941
942 if (copy <= 0) {
943 char *data;
944 unsigned int datalen;
945 unsigned int fraglen;
946 unsigned int fraggap;
947 unsigned int alloclen;
948 struct sk_buff *skb_prev;
949alloc_new_skb:
950 skb_prev = skb;
951
952 /* There's no room in the current skb */
953 if (skb_prev)
954 fraggap = skb_prev->len - maxfraglen;
955 else
956 fraggap = 0;
957
958 /*
959 * If remaining data exceeds the mtu,
960 * we know we need more fragment(s).
961 */
962 datalen = length + fraggap;
963 if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
964 datalen = maxfraglen - fragheaderlen;
965
966 fraglen = datalen + fragheaderlen;
967 if ((flags & MSG_MORE) &&
968 !(rt->u.dst.dev->features&NETIF_F_SG))
969 alloclen = mtu;
970 else
971 alloclen = datalen + fragheaderlen;
972
973 /*
974 * The last fragment gets additional space at tail.
975 * Note: we overallocate on fragments with MSG_MODE
976 * because we have no idea if we're the last one.
977 */
978 if (datalen == length + fraggap)
979 alloclen += rt->u.dst.trailer_len;
980
981 /*
982 * We just reserve space for fragment header.
983 * Note: this may be overallocation if the message
984 * (without MSG_MORE) fits into the MTU.
985 */
986 alloclen += sizeof(struct frag_hdr);
987
988 if (transhdrlen) {
989 skb = sock_alloc_send_skb(sk,
990 alloclen + hh_len,
991 (flags & MSG_DONTWAIT), &err);
992 } else {
993 skb = NULL;
994 if (atomic_read(&sk->sk_wmem_alloc) <=
995 2 * sk->sk_sndbuf)
996 skb = sock_wmalloc(sk,
997 alloclen + hh_len, 1,
998 sk->sk_allocation);
999 if (unlikely(skb == NULL))
1000 err = -ENOBUFS;
1001 }
1002 if (skb == NULL)
1003 goto error;
1004 /*
1005 * Fill in the control structures
1006 */
1007 skb->ip_summed = csummode;
1008 skb->csum = 0;
1009 /* reserve for fragmentation */
1010 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1011
1012 /*
1013 * Find where to start putting bytes
1014 */
1015 data = skb_put(skb, fraglen);
1016 skb->nh.raw = data + exthdrlen;
1017 data += fragheaderlen;
1018 skb->h.raw = data + exthdrlen;
1019
1020 if (fraggap) {
1021 skb->csum = skb_copy_and_csum_bits(
1022 skb_prev, maxfraglen,
1023 data + transhdrlen, fraggap, 0);
1024 skb_prev->csum = csum_sub(skb_prev->csum,
1025 skb->csum);
1026 data += fraggap;
1027 skb_trim(skb_prev, maxfraglen);
1028 }
1029 copy = datalen - transhdrlen - fraggap;
1030 if (copy < 0) {
1031 err = -EINVAL;
1032 kfree_skb(skb);
1033 goto error;
1034 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1035 err = -EFAULT;
1036 kfree_skb(skb);
1037 goto error;
1038 }
1039
1040 offset += copy;
1041 length -= datalen - fraggap;
1042 transhdrlen = 0;
1043 exthdrlen = 0;
1044 csummode = CHECKSUM_NONE;
1045
1046 /*
1047 * Put the packet on the pending queue
1048 */
1049 __skb_queue_tail(&sk->sk_write_queue, skb);
1050 continue;
1051 }
1052
1053 if (copy > length)
1054 copy = length;
1055
1056 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1057 unsigned int off;
1058
1059 off = skb->len;
1060 if (getfrag(from, skb_put(skb, copy),
1061 offset, copy, off, skb) < 0) {
1062 __skb_trim(skb, off);
1063 err = -EFAULT;
1064 goto error;
1065 }
1066 } else {
1067 int i = skb_shinfo(skb)->nr_frags;
1068 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1069 struct page *page = sk->sk_sndmsg_page;
1070 int off = sk->sk_sndmsg_off;
1071 unsigned int left;
1072
1073 if (page && (left = PAGE_SIZE - off) > 0) {
1074 if (copy >= left)
1075 copy = left;
1076 if (page != frag->page) {
1077 if (i == MAX_SKB_FRAGS) {
1078 err = -EMSGSIZE;
1079 goto error;
1080 }
1081 get_page(page);
1082 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1083 frag = &skb_shinfo(skb)->frags[i];
1084 }
1085 } else if(i < MAX_SKB_FRAGS) {
1086 if (copy > PAGE_SIZE)
1087 copy = PAGE_SIZE;
1088 page = alloc_pages(sk->sk_allocation, 0);
1089 if (page == NULL) {
1090 err = -ENOMEM;
1091 goto error;
1092 }
1093 sk->sk_sndmsg_page = page;
1094 sk->sk_sndmsg_off = 0;
1095
1096 skb_fill_page_desc(skb, i, page, 0, 0);
1097 frag = &skb_shinfo(skb)->frags[i];
1098 skb->truesize += PAGE_SIZE;
1099 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1100 } else {
1101 err = -EMSGSIZE;
1102 goto error;
1103 }
1104 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1105 err = -EFAULT;
1106 goto error;
1107 }
1108 sk->sk_sndmsg_off += copy;
1109 frag->size += copy;
1110 skb->len += copy;
1111 skb->data_len += copy;
1112 }
1113 offset += copy;
1114 length -= copy;
1115 }
1116 return 0;
1117error:
1118 inet->cork.length -= length;
1119 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1120 return err;
1121}
1122
1123int ip6_push_pending_frames(struct sock *sk)
1124{
1125 struct sk_buff *skb, *tmp_skb;
1126 struct sk_buff **tail_skb;
1127 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1128 struct inet_sock *inet = inet_sk(sk);
1129 struct ipv6_pinfo *np = inet6_sk(sk);
1130 struct ipv6hdr *hdr;
1131 struct ipv6_txoptions *opt = np->cork.opt;
1132 struct rt6_info *rt = np->cork.rt;
1133 struct flowi *fl = &inet->cork.fl;
1134 unsigned char proto = fl->proto;
1135 int err = 0;
1136
1137 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1138 goto out;
1139 tail_skb = &(skb_shinfo(skb)->frag_list);
1140
1141 /* move skb->data to ip header from ext header */
1142 if (skb->data < skb->nh.raw)
1143 __skb_pull(skb, skb->nh.raw - skb->data);
1144 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1145 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1146 *tail_skb = tmp_skb;
1147 tail_skb = &(tmp_skb->next);
1148 skb->len += tmp_skb->len;
1149 skb->data_len += tmp_skb->len;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001150 skb->truesize += tmp_skb->truesize;
1151 __sock_put(tmp_skb->sk);
1152 tmp_skb->destructor = NULL;
1153 tmp_skb->sk = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001154 }
1155
1156 ipv6_addr_copy(final_dst, &fl->fl6_dst);
1157 __skb_pull(skb, skb->h.raw - skb->nh.raw);
1158 if (opt && opt->opt_flen)
1159 ipv6_push_frag_opts(skb, opt, &proto);
1160 if (opt && opt->opt_nflen)
1161 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1162
1163 skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
1164
YOSHIFUJI Hideaki41a1f8e2005-09-08 10:19:03 +09001165 *(u32*)hdr = fl->fl6_flowlabel |
1166 htonl(0x60000000 | ((int)np->cork.tclass << 20));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001167
1168 if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
1169 hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
1170 else
1171 hdr->payload_len = 0;
1172 hdr->hop_limit = np->cork.hop_limit;
1173 hdr->nexthdr = proto;
1174 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1175 ipv6_addr_copy(&hdr->daddr, final_dst);
1176
1177 skb->dst = dst_clone(&rt->u.dst);
1178 IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
1179 err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
1180 if (err) {
1181 if (err > 0)
Herbert Xu3320da82005-04-19 22:32:22 -07001182 err = np->recverr ? net_xmit_errno(err) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001183 if (err)
1184 goto error;
1185 }
1186
1187out:
1188 inet->cork.flags &= ~IPCORK_OPT;
1189 if (np->cork.opt) {
1190 kfree(np->cork.opt);
1191 np->cork.opt = NULL;
1192 }
1193 if (np->cork.rt) {
1194 dst_release(&np->cork.rt->u.dst);
1195 np->cork.rt = NULL;
1196 inet->cork.flags &= ~IPCORK_ALLFRAG;
1197 }
1198 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1199 return err;
1200error:
1201 goto out;
1202}
1203
1204void ip6_flush_pending_frames(struct sock *sk)
1205{
1206 struct inet_sock *inet = inet_sk(sk);
1207 struct ipv6_pinfo *np = inet6_sk(sk);
1208 struct sk_buff *skb;
1209
1210 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1211 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1212 kfree_skb(skb);
1213 }
1214
1215 inet->cork.flags &= ~IPCORK_OPT;
1216
1217 if (np->cork.opt) {
1218 kfree(np->cork.opt);
1219 np->cork.opt = NULL;
1220 }
1221 if (np->cork.rt) {
1222 dst_release(&np->cork.rt->u.dst);
1223 np->cork.rt = NULL;
1224 inet->cork.flags &= ~IPCORK_ALLFRAG;
1225 }
1226 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1227}