blob: ea6eb44618e77faf35f6c5e04d08150a88d356b7 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * Linux INET6 implementation
3 * FIB front-end.
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 */
15
16/* Changes:
17 *
18 * YOSHIFUJI Hideaki @USAGI
19 * reworked default router selection.
20 * - respect outgoing interface
21 * - select from (probably) reachable routers (i.e.
22 * routers in REACHABLE, STALE, DELAY or PROBE states).
23 * - always select the same router if it is (probably)
24 * reachable. otherwise, round-robin the list.
25 */
26
Randy Dunlap4fc268d2006-01-11 12:17:47 -080027#include <linux/capability.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070028#include <linux/config.h>
29#include <linux/errno.h>
30#include <linux/types.h>
31#include <linux/times.h>
32#include <linux/socket.h>
33#include <linux/sockios.h>
34#include <linux/net.h>
35#include <linux/route.h>
36#include <linux/netdevice.h>
37#include <linux/in6.h>
38#include <linux/init.h>
39#include <linux/netlink.h>
40#include <linux/if_arp.h>
41
42#ifdef CONFIG_PROC_FS
43#include <linux/proc_fs.h>
44#include <linux/seq_file.h>
45#endif
46
47#include <net/snmp.h>
48#include <net/ipv6.h>
49#include <net/ip6_fib.h>
50#include <net/ip6_route.h>
51#include <net/ndisc.h>
52#include <net/addrconf.h>
53#include <net/tcp.h>
54#include <linux/rtnetlink.h>
55#include <net/dst.h>
56#include <net/xfrm.h>
57
58#include <asm/uaccess.h>
59
60#ifdef CONFIG_SYSCTL
61#include <linux/sysctl.h>
62#endif
63
/*
 * Debug verbosity for this file.  RDBG() and RT6_TRACE() print through
 * printk only when RT6_DEBUG is 3 or higher; at lower levels they expand
 * to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/*
 * When non-zero, the route input/output paths create a per-destination
 * clone (rt6_alloc_clone()) for routes that already have a nexthop.
 * When zero, such routes are used directly.
 */
#define CLONE_OFFLINK_ROUTE 0

/* Bit flags accepted in the "strict" argument of rt6_score_route()/rt6_select(). */
#define RT6_SELECT_F_IFACE	0x1	/* reject routes that do not match oif */
#define RT6_SELECT_F_REACHABLE	0x2	/* reject routes whose neighbour is not NUD_VALID */
79
Linus Torvalds1da177e2005-04-16 15:20:36 -070080static int ip6_rt_max_size = 4096;
81static int ip6_rt_gc_min_interval = HZ / 2;
82static int ip6_rt_gc_timeout = 60*HZ;
83int ip6_rt_gc_interval = 30*HZ;
84static int ip6_rt_gc_elasticity = 9;
85static int ip6_rt_mtu_expires = 10*60*HZ;
86static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
87
88static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
90static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91static void ip6_dst_destroy(struct dst_entry *);
92static void ip6_dst_ifdown(struct dst_entry *,
93 struct net_device *dev, int how);
94static int ip6_dst_gc(void);
95
96static int ip6_pkt_discard(struct sk_buff *skb);
97static int ip6_pkt_discard_out(struct sk_buff *skb);
98static void ip6_link_failure(struct sk_buff *skb);
99static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100
101static struct dst_ops ip6_dst_ops = {
102 .family = AF_INET6,
103 .protocol = __constant_htons(ETH_P_IPV6),
104 .gc = ip6_dst_gc,
105 .gc_thresh = 1024,
106 .check = ip6_dst_check,
107 .destroy = ip6_dst_destroy,
108 .ifdown = ip6_dst_ifdown,
109 .negative_advice = ip6_negative_advice,
110 .link_failure = ip6_link_failure,
111 .update_pmtu = ip6_rt_update_pmtu,
112 .entry_size = sizeof(struct rt6_info),
113};
114
115struct rt6_info ip6_null_entry = {
116 .u = {
117 .dst = {
118 .__refcnt = ATOMIC_INIT(1),
119 .__use = 1,
120 .dev = &loopback_dev,
121 .obsolete = -1,
122 .error = -ENETUNREACH,
123 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
124 .input = ip6_pkt_discard,
125 .output = ip6_pkt_discard_out,
126 .ops = &ip6_dst_ops,
127 .path = (struct dst_entry*)&ip6_null_entry,
128 }
129 },
130 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
131 .rt6i_metric = ~(u32) 0,
132 .rt6i_ref = ATOMIC_INIT(1),
133};
134
135struct fib6_node ip6_routing_table = {
136 .leaf = &ip6_null_entry,
137 .fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
138};
139
140/* Protects all the ip6 fib */
141
142DEFINE_RWLOCK(rt6_lock);
143
144
145/* allocate dst with ip6_dst_ops */
146static __inline__ struct rt6_info *ip6_dst_alloc(void)
147{
148 return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
149}
150
151static void ip6_dst_destroy(struct dst_entry *dst)
152{
153 struct rt6_info *rt = (struct rt6_info *)dst;
154 struct inet6_dev *idev = rt->rt6i_idev;
155
156 if (idev != NULL) {
157 rt->rt6i_idev = NULL;
158 in6_dev_put(idev);
159 }
160}
161
162static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
163 int how)
164{
165 struct rt6_info *rt = (struct rt6_info *)dst;
166 struct inet6_dev *idev = rt->rt6i_idev;
167
168 if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
169 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
170 if (loopback_idev != NULL) {
171 rt->rt6i_idev = loopback_idev;
172 in6_dev_put(idev);
173 }
174 }
175}
176
177static __inline__ int rt6_check_expired(const struct rt6_info *rt)
178{
179 return (rt->rt6i_flags & RTF_EXPIRES &&
180 time_after(jiffies, rt->rt6i_expires));
181}
182
/*
 *	Route lookup. The caller is expected to hold rt6_lock
 *	(either the read or the write side).
 */
186
187static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
188 int oif,
189 int strict)
190{
191 struct rt6_info *local = NULL;
192 struct rt6_info *sprt;
193
194 if (oif) {
195 for (sprt = rt; sprt; sprt = sprt->u.next) {
196 struct net_device *dev = sprt->rt6i_dev;
197 if (dev->ifindex == oif)
198 return sprt;
199 if (dev->flags & IFF_LOOPBACK) {
200 if (sprt->rt6i_idev == NULL ||
201 sprt->rt6i_idev->dev->ifindex != oif) {
202 if (strict && oif)
203 continue;
204 if (local && (!oif ||
205 local->rt6i_idev->dev->ifindex == oif))
206 continue;
207 }
208 local = sprt;
209 }
210 }
211
212 if (local)
213 return local;
214
215 if (strict)
216 return &ip6_null_entry;
217 }
218 return rt;
219}
220
221/*
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800222 * Default Router Selection (RFC 2461 6.3.6)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700223 */
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800224static int inline rt6_check_dev(struct rt6_info *rt, int oif)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700225{
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800226 struct net_device *dev = rt->rt6i_dev;
227 if (!oif || dev->ifindex == oif)
228 return 2;
229 if ((dev->flags & IFF_LOOPBACK) &&
230 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
231 return 1;
232 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700233}
234
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800235static int inline rt6_check_neigh(struct rt6_info *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700236{
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800237 struct neighbour *neigh = rt->rt6i_nexthop;
238 int m = 0;
239 if (neigh) {
240 read_lock_bh(&neigh->lock);
241 if (neigh->nud_state & NUD_VALID)
242 m = 1;
243 read_unlock_bh(&neigh->lock);
244 }
245 return m;
246}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700247
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800248static int rt6_score_route(struct rt6_info *rt, int oif,
249 int strict)
250{
251 int m = rt6_check_dev(rt, oif);
252 if (!m && (strict & RT6_SELECT_F_IFACE))
253 return -1;
254 if (rt6_check_neigh(rt))
255 m |= 4;
256 else if (strict & RT6_SELECT_F_REACHABLE)
257 return -1;
258 return m;
259}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700260
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800261static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
262 int strict)
263{
264 struct rt6_info *match = NULL, *last = NULL;
265 struct rt6_info *rt, *rt0 = *head;
266 u32 metric;
267 int mpri = -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700268
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800269 RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
270 __FUNCTION__, head, head ? *head : NULL, oif);
271
272 for (rt = rt0, metric = rt0->rt6i_metric;
273 rt && rt->rt6i_metric == metric;
274 rt = rt->u.next) {
275 int m;
276
277 if (rt6_check_expired(rt))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700278 continue;
279
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800280 last = rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700281
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800282 m = rt6_score_route(rt, oif, strict);
283 if (m < 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700284 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700285
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800286 if (m > mpri) {
287 match = rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700288 mpri = m;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700289 }
290 }
291
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800292 if (!match &&
293 (strict & RT6_SELECT_F_REACHABLE) &&
294 last && last != rt0) {
295 /* no entries matched; do round-robin */
296 *head = rt0->u.next;
297 rt0->u.next = last->u.next;
298 last->u.next = rt0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700299 }
300
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800301 RT6_TRACE("%s() => %p, score=%d\n",
302 __FUNCTION__, match, mpri);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700303
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800304 return (match ? match : &ip6_null_entry);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700305}
306
307struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
308 int oif, int strict)
309{
310 struct fib6_node *fn;
311 struct rt6_info *rt;
312
313 read_lock_bh(&rt6_lock);
314 fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
315 rt = rt6_device_match(fn->leaf, oif, strict);
316 dst_hold(&rt->u.dst);
317 rt->u.dst.__use++;
318 read_unlock_bh(&rt6_lock);
319
320 rt->u.dst.lastuse = jiffies;
321 if (rt->u.dst.error == 0)
322 return rt;
323 dst_release(&rt->u.dst);
324 return NULL;
325}
326
/* ip6_ins_rt is called with rt6_lock NOT held; it takes the write lock
   itself.  It takes a new route entry; if the addition fails for any
   reason, the route is freed.  In any case, if the caller does not hold
   a reference to it, it may be destroyed.
 */
332
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700333int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
334 void *_rtattr, struct netlink_skb_parms *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700335{
336 int err;
337
338 write_lock_bh(&rt6_lock);
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700339 err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700340 write_unlock_bh(&rt6_lock);
341
342 return err;
343}
344
YOSHIFUJI Hideaki95a9a5b2006-03-20 16:55:51 -0800345static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
346 struct in6_addr *saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700347{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700348 struct rt6_info *rt;
349
350 /*
351 * Clone the route.
352 */
353
354 rt = ip6_rt_copy(ort);
355
356 if (rt) {
YOSHIFUJI Hideaki58c4fb82005-12-21 22:56:42 +0900357 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
358 if (rt->rt6i_dst.plen != 128 &&
359 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
360 rt->rt6i_flags |= RTF_ANYCAST;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700361 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
YOSHIFUJI Hideaki58c4fb82005-12-21 22:56:42 +0900362 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700363
YOSHIFUJI Hideaki58c4fb82005-12-21 22:56:42 +0900364 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700365 rt->rt6i_dst.plen = 128;
366 rt->rt6i_flags |= RTF_CACHE;
367 rt->u.dst.flags |= DST_HOST;
368
369#ifdef CONFIG_IPV6_SUBTREES
370 if (rt->rt6i_src.plen && saddr) {
371 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
372 rt->rt6i_src.plen = 128;
373 }
374#endif
375
376 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
377
YOSHIFUJI Hideaki95a9a5b2006-03-20 16:55:51 -0800378 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700379
YOSHIFUJI Hideaki95a9a5b2006-03-20 16:55:51 -0800380 return rt;
381}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700382
YOSHIFUJI Hideaki299d9932006-03-20 16:58:32 -0800383static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
384{
385 struct rt6_info *rt = ip6_rt_copy(ort);
386 if (rt) {
387 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
388 rt->rt6i_dst.plen = 128;
389 rt->rt6i_flags |= RTF_CACHE;
390 if (rt->rt6i_flags & RTF_REJECT)
391 rt->u.dst.error = ort->u.dst.error;
392 rt->u.dst.flags |= DST_HOST;
393 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
394 }
395 return rt;
396}
397
/*
 * Used by the lookup paths after rt6_device_match(): when a strict lookup
 * ended on the null entry, walk up the tree from fn.  Reaching a root
 * node jumps to "out"; the first ancestor carrying route info jumps to
 * "restart" so the caller retries with that node.
 *
 * The macro relies on the caller's local variables rt, fn and strict and
 * on the caller's labels "out" and "restart".  It is wrapped in
 * do { } while (0) so that "BACKTRACK();" is exactly one statement and
 * cannot capture a following "else".
 */
#define BACKTRACK() \
do { \
	if (rt == &ip6_null_entry && strict) { \
		while ((fn = fn->parent) != NULL) { \
			if (fn->fn_flags & RTN_ROOT) { \
				goto out; \
			} \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
408
409
410void ip6_route_input(struct sk_buff *skb)
411{
412 struct fib6_node *fn;
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800413 struct rt6_info *rt, *nrt;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700414 int strict;
415 int attempts = 3;
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800416 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700417
418 strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL);
419
420relookup:
421 read_lock_bh(&rt6_lock);
422
423 fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
424 &skb->nh.ipv6h->saddr);
425
426restart:
427 rt = fn->leaf;
428
429 if ((rt->rt6i_flags & RTF_CACHE)) {
430 rt = rt6_device_match(rt, skb->dev->ifindex, strict);
431 BACKTRACK();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700432 goto out;
433 }
434
Yan Zheng9d17f212005-10-28 15:12:00 -0700435 rt = rt6_device_match(rt, skb->dev->ifindex, strict);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700436 BACKTRACK();
437
YOSHIFUJI Hideakifb9de912006-03-20 16:59:08 -0800438 dst_hold(&rt->u.dst);
439 read_unlock_bh(&rt6_lock);
440
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800441 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
442 nrt = rt6_alloc_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr);
443 else {
444#if CLONE_OFFLINK_ROUTE
445 nrt = rt6_alloc_clone(rt, &skb->nh.ipv6h->daddr);
446#else
447 goto out2;
448#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700449 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700450
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800451 dst_release(&rt->u.dst);
452 rt = nrt ? : &ip6_null_entry;
453
454 dst_hold(&rt->u.dst);
455 if (nrt) {
456 err = ip6_ins_rt(nrt, NULL, NULL, &NETLINK_CB(skb));
457 if (!err)
458 goto out2;
459 }
460
461 if (--attempts <= 0)
462 goto out2;
463
464 /*
465 * Race condition! In the gap, when rt6_lock was
466 * released someone could insert this route. Relookup.
467 */
468 dst_release(&rt->u.dst);
469 goto relookup;
470
471out:
472 dst_hold(&rt->u.dst);
473 read_unlock_bh(&rt6_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700474out2:
475 rt->u.dst.lastuse = jiffies;
476 rt->u.dst.__use++;
477 skb->dst = (struct dst_entry *) rt;
YOSHIFUJI Hideakifb9de912006-03-20 16:59:08 -0800478 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700479}
480
481struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
482{
483 struct fib6_node *fn;
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800484 struct rt6_info *rt, *nrt;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700485 int strict;
486 int attempts = 3;
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800487 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700488
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800489 strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700490
491relookup:
492 read_lock_bh(&rt6_lock);
493
494 fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);
495
496restart:
497 rt = fn->leaf;
498
499 if ((rt->rt6i_flags & RTF_CACHE)) {
500 rt = rt6_device_match(rt, fl->oif, strict);
501 BACKTRACK();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700502 goto out;
503 }
504 if (rt->rt6i_flags & RTF_DEFAULT) {
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800505 rt = rt6_select(&fn->leaf, fl->oif, strict | RT6_SELECT_F_REACHABLE);
506 if (rt == &ip6_null_entry)
507 rt = rt6_select(&fn->leaf, fl->oif, strict);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700508 } else {
509 rt = rt6_device_match(rt, fl->oif, strict);
510 BACKTRACK();
511 }
512
YOSHIFUJI Hideakifb9de912006-03-20 16:59:08 -0800513 dst_hold(&rt->u.dst);
514 read_unlock_bh(&rt6_lock);
515
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800516 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
YOSHIFUJI Hideakie40cf352006-03-20 16:59:27 -0800517 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800518 else {
519#if CLONE_OFFLINK_ROUTE
520 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
521#else
522 goto out2;
523#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700524 }
YOSHIFUJI Hideakie40cf352006-03-20 16:59:27 -0800525
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800526 dst_release(&rt->u.dst);
527 rt = nrt ? : &ip6_null_entry;
528
529 dst_hold(&rt->u.dst);
530 if (nrt) {
531 err = ip6_ins_rt(nrt, NULL, NULL, NULL);
532 if (!err)
533 goto out2;
534 }
535
536 if (--attempts <= 0)
537 goto out2;
538
539 /*
540 * Race condition! In the gap, when rt6_lock was
541 * released someone could insert this route. Relookup.
542 */
543 dst_release(&rt->u.dst);
544 goto relookup;
545
546out:
547 dst_hold(&rt->u.dst);
548 read_unlock_bh(&rt6_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700549out2:
550 rt->u.dst.lastuse = jiffies;
551 rt->u.dst.__use++;
552 return &rt->u.dst;
553}
554
555
556/*
557 * Destination cache support functions
558 */
559
560static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
561{
562 struct rt6_info *rt;
563
564 rt = (struct rt6_info *) dst;
565
566 if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
567 return dst;
568
569 return NULL;
570}
571
572static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
573{
574 struct rt6_info *rt = (struct rt6_info *) dst;
575
576 if (rt) {
577 if (rt->rt6i_flags & RTF_CACHE)
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700578 ip6_del_rt(rt, NULL, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700579 else
580 dst_release(dst);
581 }
582 return NULL;
583}
584
585static void ip6_link_failure(struct sk_buff *skb)
586{
587 struct rt6_info *rt;
588
589 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
590
591 rt = (struct rt6_info *) skb->dst;
592 if (rt) {
593 if (rt->rt6i_flags&RTF_CACHE) {
594 dst_set_expires(&rt->u.dst, 0);
595 rt->rt6i_flags |= RTF_EXPIRES;
596 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
597 rt->rt6i_node->fn_sernum = -1;
598 }
599}
600
601static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
602{
603 struct rt6_info *rt6 = (struct rt6_info*)dst;
604
605 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
606 rt6->rt6i_flags |= RTF_MODIFIED;
607 if (mtu < IPV6_MIN_MTU) {
608 mtu = IPV6_MIN_MTU;
609 dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
610 }
611 dst->metrics[RTAX_MTU-1] = mtu;
612 }
613}
614
/*
 * List of dst entries created by ndisc_dst_alloc(), linked through
 * dst->next and reaped by ndisc_dst_gc().  Protected by rt6_lock.
 */
static struct dst_entry *ndisc_dst_gc_list;

/* Defined further down; needed by ndisc_dst_alloc(). */
static int ipv6_get_mtu(struct net_device *dev);
618
619static inline unsigned int ipv6_advmss(unsigned int mtu)
620{
621 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
622
623 if (mtu < ip6_rt_min_advmss)
624 mtu = ip6_rt_min_advmss;
625
626 /*
627 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
628 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
629 * IPV6_MAXPLEN is also valid and means: "any MSS,
630 * rely only on pmtu discovery"
631 */
632 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
633 mtu = IPV6_MAXPLEN;
634 return mtu;
635}
636
637struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
638 struct neighbour *neigh,
639 struct in6_addr *addr,
640 int (*output)(struct sk_buff *))
641{
642 struct rt6_info *rt;
643 struct inet6_dev *idev = in6_dev_get(dev);
644
645 if (unlikely(idev == NULL))
646 return NULL;
647
648 rt = ip6_dst_alloc();
649 if (unlikely(rt == NULL)) {
650 in6_dev_put(idev);
651 goto out;
652 }
653
654 dev_hold(dev);
655 if (neigh)
656 neigh_hold(neigh);
657 else
658 neigh = ndisc_get_neigh(dev, addr);
659
660 rt->rt6i_dev = dev;
661 rt->rt6i_idev = idev;
662 rt->rt6i_nexthop = neigh;
663 atomic_set(&rt->u.dst.__refcnt, 1);
664 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
665 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
666 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
667 rt->u.dst.output = output;
668
669#if 0 /* there's no chance to use these for ndisc */
670 rt->u.dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
671 ? DST_HOST
672 : 0;
673 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
674 rt->rt6i_dst.plen = 128;
675#endif
676
677 write_lock_bh(&rt6_lock);
678 rt->u.dst.next = ndisc_dst_gc_list;
679 ndisc_dst_gc_list = &rt->u.dst;
680 write_unlock_bh(&rt6_lock);
681
682 fib6_force_start_gc();
683
684out:
685 return (struct dst_entry *)rt;
686}
687
688int ndisc_dst_gc(int *more)
689{
690 struct dst_entry *dst, *next, **pprev;
691 int freed;
692
693 next = NULL;
694 pprev = &ndisc_dst_gc_list;
695 freed = 0;
696 while ((dst = *pprev) != NULL) {
697 if (!atomic_read(&dst->__refcnt)) {
698 *pprev = dst->next;
699 dst_free(dst);
700 freed++;
701 } else {
702 pprev = &dst->next;
703 (*more)++;
704 }
705 }
706
707 return freed;
708}
709
710static int ip6_dst_gc(void)
711{
712 static unsigned expire = 30*HZ;
713 static unsigned long last_gc;
714 unsigned long now = jiffies;
715
716 if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
717 atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
718 goto out;
719
720 expire++;
721 fib6_run_gc(expire);
722 last_gc = now;
723 if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
724 expire = ip6_rt_gc_timeout>>1;
725
726out:
727 expire -= expire>>ip6_rt_gc_elasticity;
728 return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
729}
730
731/* Clean host part of a prefix. Not necessary in radix tree,
732 but results in cleaner routing tables.
733
734 Remove it only when all the things will work!
735 */
736
737static int ipv6_get_mtu(struct net_device *dev)
738{
739 int mtu = IPV6_MIN_MTU;
740 struct inet6_dev *idev;
741
742 idev = in6_dev_get(dev);
743 if (idev) {
744 mtu = idev->cnf.mtu6;
745 in6_dev_put(idev);
746 }
747 return mtu;
748}
749
750int ipv6_get_hoplimit(struct net_device *dev)
751{
752 int hoplimit = ipv6_devconf.hop_limit;
753 struct inet6_dev *idev;
754
755 idev = in6_dev_get(dev);
756 if (idev) {
757 hoplimit = idev->cnf.hop_limit;
758 in6_dev_put(idev);
759 }
760 return hoplimit;
761}
762
763/*
764 *
765 */
766
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700767int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
768 void *_rtattr, struct netlink_skb_parms *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700769{
770 int err;
771 struct rtmsg *r;
772 struct rtattr **rta;
773 struct rt6_info *rt = NULL;
774 struct net_device *dev = NULL;
775 struct inet6_dev *idev = NULL;
776 int addr_type;
777
778 rta = (struct rtattr **) _rtattr;
779
780 if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
781 return -EINVAL;
782#ifndef CONFIG_IPV6_SUBTREES
783 if (rtmsg->rtmsg_src_len)
784 return -EINVAL;
785#endif
786 if (rtmsg->rtmsg_ifindex) {
787 err = -ENODEV;
788 dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
789 if (!dev)
790 goto out;
791 idev = in6_dev_get(dev);
792 if (!idev)
793 goto out;
794 }
795
796 if (rtmsg->rtmsg_metric == 0)
797 rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
798
799 rt = ip6_dst_alloc();
800
801 if (rt == NULL) {
802 err = -ENOMEM;
803 goto out;
804 }
805
806 rt->u.dst.obsolete = -1;
YOSHIFUJI Hideaki3dd4bc62005-12-19 14:02:45 -0800807 rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700808 if (nlh && (r = NLMSG_DATA(nlh))) {
809 rt->rt6i_protocol = r->rtm_protocol;
810 } else {
811 rt->rt6i_protocol = RTPROT_BOOT;
812 }
813
814 addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
815
816 if (addr_type & IPV6_ADDR_MULTICAST)
817 rt->u.dst.input = ip6_mc_input;
818 else
819 rt->u.dst.input = ip6_forward;
820
821 rt->u.dst.output = ip6_output;
822
823 ipv6_addr_prefix(&rt->rt6i_dst.addr,
824 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
825 rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
826 if (rt->rt6i_dst.plen == 128)
827 rt->u.dst.flags = DST_HOST;
828
829#ifdef CONFIG_IPV6_SUBTREES
830 ipv6_addr_prefix(&rt->rt6i_src.addr,
831 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
832 rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
833#endif
834
835 rt->rt6i_metric = rtmsg->rtmsg_metric;
836
837 /* We cannot add true routes via loopback here,
838 they would result in kernel looping; promote them to reject routes
839 */
840 if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
841 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
842 /* hold loopback dev/idev if we haven't done so. */
843 if (dev != &loopback_dev) {
844 if (dev) {
845 dev_put(dev);
846 in6_dev_put(idev);
847 }
848 dev = &loopback_dev;
849 dev_hold(dev);
850 idev = in6_dev_get(dev);
851 if (!idev) {
852 err = -ENODEV;
853 goto out;
854 }
855 }
856 rt->u.dst.output = ip6_pkt_discard_out;
857 rt->u.dst.input = ip6_pkt_discard;
858 rt->u.dst.error = -ENETUNREACH;
859 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
860 goto install_route;
861 }
862
863 if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
864 struct in6_addr *gw_addr;
865 int gwa_type;
866
867 gw_addr = &rtmsg->rtmsg_gateway;
868 ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
869 gwa_type = ipv6_addr_type(gw_addr);
870
871 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
872 struct rt6_info *grt;
873
874 /* IPv6 strictly inhibits using not link-local
875 addresses as nexthop address.
876 Otherwise, router will not able to send redirects.
877 It is very good, but in some (rare!) circumstances
878 (SIT, PtP, NBMA NOARP links) it is handy to allow
879 some exceptions. --ANK
880 */
881 err = -EINVAL;
882 if (!(gwa_type&IPV6_ADDR_UNICAST))
883 goto out;
884
885 grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
886
887 err = -EHOSTUNREACH;
888 if (grt == NULL)
889 goto out;
890 if (dev) {
891 if (dev != grt->rt6i_dev) {
892 dst_release(&grt->u.dst);
893 goto out;
894 }
895 } else {
896 dev = grt->rt6i_dev;
897 idev = grt->rt6i_idev;
898 dev_hold(dev);
899 in6_dev_hold(grt->rt6i_idev);
900 }
901 if (!(grt->rt6i_flags&RTF_GATEWAY))
902 err = 0;
903 dst_release(&grt->u.dst);
904
905 if (err)
906 goto out;
907 }
908 err = -EINVAL;
909 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
910 goto out;
911 }
912
913 err = -ENODEV;
914 if (dev == NULL)
915 goto out;
916
917 if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
918 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
919 if (IS_ERR(rt->rt6i_nexthop)) {
920 err = PTR_ERR(rt->rt6i_nexthop);
921 rt->rt6i_nexthop = NULL;
922 goto out;
923 }
924 }
925
926 rt->rt6i_flags = rtmsg->rtmsg_flags;
927
928install_route:
929 if (rta && rta[RTA_METRICS-1]) {
930 int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
931 struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
932
933 while (RTA_OK(attr, attrlen)) {
934 unsigned flavor = attr->rta_type;
935 if (flavor) {
936 if (flavor > RTAX_MAX) {
937 err = -EINVAL;
938 goto out;
939 }
940 rt->u.dst.metrics[flavor-1] =
941 *(u32 *)RTA_DATA(attr);
942 }
943 attr = RTA_NEXT(attr, attrlen);
944 }
945 }
946
947 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
948 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
949 if (!rt->u.dst.metrics[RTAX_MTU-1])
950 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
951 if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
952 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
953 rt->u.dst.dev = dev;
954 rt->rt6i_idev = idev;
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700955 return ip6_ins_rt(rt, nlh, _rtattr, req);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700956
957out:
958 if (dev)
959 dev_put(dev);
960 if (idev)
961 in6_dev_put(idev);
962 if (rt)
963 dst_free((struct dst_entry *) rt);
964 return err;
965}
966
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700967int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700968{
969 int err;
970
971 write_lock_bh(&rt6_lock);
972
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700973 err = fib6_del(rt, nlh, _rtattr, req);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700974 dst_release(&rt->u.dst);
975
976 write_unlock_bh(&rt6_lock);
977
978 return err;
979}
980
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700981static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700982{
983 struct fib6_node *fn;
984 struct rt6_info *rt;
985 int err = -ESRCH;
986
987 read_lock_bh(&rt6_lock);
988
989 fn = fib6_locate(&ip6_routing_table,
990 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
991 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
992
993 if (fn) {
994 for (rt = fn->leaf; rt; rt = rt->u.next) {
995 if (rtmsg->rtmsg_ifindex &&
996 (rt->rt6i_dev == NULL ||
997 rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
998 continue;
999 if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1000 !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1001 continue;
1002 if (rtmsg->rtmsg_metric &&
1003 rtmsg->rtmsg_metric != rt->rt6i_metric)
1004 continue;
1005 dst_hold(&rt->u.dst);
1006 read_unlock_bh(&rt6_lock);
1007
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001008 return ip6_del_rt(rt, nlh, _rtattr, req);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001009 }
1010 }
1011 read_unlock_bh(&rt6_lock);
1012
1013 return err;
1014}
1015
1016/*
1017 * Handle redirects
1018 */
1019void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1020 struct neighbour *neigh, u8 *lladdr, int on_link)
1021{
1022 struct rt6_info *rt, *nrt;
1023
1024 /* Locate old route to this destination. */
1025 rt = rt6_lookup(dest, NULL, neigh->dev->ifindex, 1);
1026
1027 if (rt == NULL)
1028 return;
1029
1030 if (neigh->dev != rt->rt6i_dev)
1031 goto out;
1032
1033 /*
1034 * Current route is on-link; redirect is always invalid.
1035 *
1036 * Seems, previous statement is not true. It could
1037 * be node, which looks for us as on-link (f.e. proxy ndisc)
1038 * But then router serving it might decide, that we should
1039 * know truth 8)8) --ANK (980726).
1040 */
1041 if (!(rt->rt6i_flags&RTF_GATEWAY))
1042 goto out;
1043
1044 /*
1045 * RFC 2461 specifies that redirects should only be
1046 * accepted if they come from the nexthop to the target.
1047 * Due to the way default routers are chosen, this notion
1048 * is a bit fuzzy and one might need to check all default
1049 * routers.
1050 */
1051 if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway)) {
1052 if (rt->rt6i_flags & RTF_DEFAULT) {
1053 struct rt6_info *rt1;
1054
1055 read_lock(&rt6_lock);
1056 for (rt1 = ip6_routing_table.leaf; rt1; rt1 = rt1->u.next) {
1057 if (ipv6_addr_equal(saddr, &rt1->rt6i_gateway)) {
1058 dst_hold(&rt1->u.dst);
1059 dst_release(&rt->u.dst);
1060 read_unlock(&rt6_lock);
1061 rt = rt1;
1062 goto source_ok;
1063 }
1064 }
1065 read_unlock(&rt6_lock);
1066 }
1067 if (net_ratelimit())
1068 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1069 "for redirect target\n");
1070 goto out;
1071 }
1072
1073source_ok:
1074
1075 /*
1076 * We have finally decided to accept it.
1077 */
1078
1079 neigh_update(neigh, lladdr, NUD_STALE,
1080 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1081 NEIGH_UPDATE_F_OVERRIDE|
1082 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1083 NEIGH_UPDATE_F_ISROUTER))
1084 );
1085
1086 /*
1087 * Redirect received -> path was valid.
1088 * Look, redirects are sent only in response to data packets,
1089 * so that this nexthop apparently is reachable. --ANK
1090 */
1091 dst_confirm(&rt->u.dst);
1092
1093 /* Duplicate redirect: silently ignore. */
1094 if (neigh == rt->u.dst.neighbour)
1095 goto out;
1096
1097 nrt = ip6_rt_copy(rt);
1098 if (nrt == NULL)
1099 goto out;
1100
1101 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1102 if (on_link)
1103 nrt->rt6i_flags &= ~RTF_GATEWAY;
1104
1105 ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1106 nrt->rt6i_dst.plen = 128;
1107 nrt->u.dst.flags |= DST_HOST;
1108
1109 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1110 nrt->rt6i_nexthop = neigh_clone(neigh);
1111 /* Reset pmtu, it may be better */
1112 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1113 nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1114
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001115 if (ip6_ins_rt(nrt, NULL, NULL, NULL))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001116 goto out;
1117
1118 if (rt->rt6i_flags&RTF_CACHE) {
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001119 ip6_del_rt(rt, NULL, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001120 return;
1121 }
1122
1123out:
1124 dst_release(&rt->u.dst);
1125 return;
1126}
1127
1128/*
1129 * Handle ICMP "packet too big" messages
1130 * i.e. Path MTU discovery
1131 */
1132
1133void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1134 struct net_device *dev, u32 pmtu)
1135{
1136 struct rt6_info *rt, *nrt;
1137 int allfrag = 0;
1138
1139 rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1140 if (rt == NULL)
1141 return;
1142
1143 if (pmtu >= dst_mtu(&rt->u.dst))
1144 goto out;
1145
1146 if (pmtu < IPV6_MIN_MTU) {
1147 /*
1148 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1149 * MTU (1280) and a fragment header should always be included
1150 * after a node receiving Too Big message reporting PMTU is
1151 * less than the IPv6 Minimum Link MTU.
1152 */
1153 pmtu = IPV6_MIN_MTU;
1154 allfrag = 1;
1155 }
1156
1157 /* New mtu received -> path was valid.
1158 They are sent only in response to data packets,
1159 so that this nexthop apparently is reachable. --ANK
1160 */
1161 dst_confirm(&rt->u.dst);
1162
1163 /* Host route. If it is static, it would be better
1164 not to override it, but add new one, so that
1165 when cache entry will expire old pmtu
1166 would return automatically.
1167 */
1168 if (rt->rt6i_flags & RTF_CACHE) {
1169 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1170 if (allfrag)
1171 rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1172 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1173 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1174 goto out;
1175 }
1176
1177 /* Network route.
1178 Two cases are possible:
1179 1. It is connected route. Action: COW
1180 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1181 */
YOSHIFUJI Hideakid5315b502006-03-20 16:58:48 -08001182 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
YOSHIFUJI Hideakia1e78362006-03-20 16:56:32 -08001183 nrt = rt6_alloc_cow(rt, daddr, saddr);
YOSHIFUJI Hideakid5315b502006-03-20 16:58:48 -08001184 else
1185 nrt = rt6_alloc_clone(rt, daddr);
YOSHIFUJI Hideakia1e78362006-03-20 16:56:32 -08001186
YOSHIFUJI Hideakid5315b502006-03-20 16:58:48 -08001187 if (nrt) {
YOSHIFUJI Hideakia1e78362006-03-20 16:56:32 -08001188 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1189 if (allfrag)
1190 nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1191
1192 /* According to RFC 1981, detecting PMTU increase shouldn't be
1193 * happened within 5 mins, the recommended timer is 10 mins.
1194 * Here this route expiration time is set to ip6_rt_mtu_expires
1195 * which is 10 mins. After 10 mins the decreased pmtu is expired
1196 * and detecting PMTU increase will be automatically happened.
1197 */
1198 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1199 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1200
1201 ip6_ins_rt(nrt, NULL, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001202 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001203out:
1204 dst_release(&rt->u.dst);
1205}
1206
1207/*
1208 * Misc support functions
1209 */
1210
1211static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1212{
1213 struct rt6_info *rt = ip6_dst_alloc();
1214
1215 if (rt) {
1216 rt->u.dst.input = ort->u.dst.input;
1217 rt->u.dst.output = ort->u.dst.output;
1218
1219 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1220 rt->u.dst.dev = ort->u.dst.dev;
1221 if (rt->u.dst.dev)
1222 dev_hold(rt->u.dst.dev);
1223 rt->rt6i_idev = ort->rt6i_idev;
1224 if (rt->rt6i_idev)
1225 in6_dev_hold(rt->rt6i_idev);
1226 rt->u.dst.lastuse = jiffies;
1227 rt->rt6i_expires = 0;
1228
1229 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1230 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1231 rt->rt6i_metric = 0;
1232
1233 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1234#ifdef CONFIG_IPV6_SUBTREES
1235 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1236#endif
1237 }
1238 return rt;
1239}
1240
1241struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1242{
1243 struct rt6_info *rt;
1244 struct fib6_node *fn;
1245
1246 fn = &ip6_routing_table;
1247
1248 write_lock_bh(&rt6_lock);
1249 for (rt = fn->leaf; rt; rt=rt->u.next) {
1250 if (dev == rt->rt6i_dev &&
YOSHIFUJI Hideaki045927f2006-03-20 17:00:48 -08001251 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001252 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1253 break;
1254 }
1255 if (rt)
1256 dst_hold(&rt->u.dst);
1257 write_unlock_bh(&rt6_lock);
1258 return rt;
1259}
1260
1261struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1262 struct net_device *dev)
1263{
1264 struct in6_rtmsg rtmsg;
1265
1266 memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1267 rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1268 ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1269 rtmsg.rtmsg_metric = 1024;
1270 rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES;
1271
1272 rtmsg.rtmsg_ifindex = dev->ifindex;
1273
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001274 ip6_route_add(&rtmsg, NULL, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001275 return rt6_get_dflt_router(gwaddr, dev);
1276}
1277
1278void rt6_purge_dflt_routers(void)
1279{
1280 struct rt6_info *rt;
1281
1282restart:
1283 read_lock_bh(&rt6_lock);
1284 for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1285 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1286 dst_hold(&rt->u.dst);
1287
Linus Torvalds1da177e2005-04-16 15:20:36 -07001288 read_unlock_bh(&rt6_lock);
1289
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001290 ip6_del_rt(rt, NULL, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001291
1292 goto restart;
1293 }
1294 }
1295 read_unlock_bh(&rt6_lock);
1296}
1297
1298int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1299{
1300 struct in6_rtmsg rtmsg;
1301 int err;
1302
1303 switch(cmd) {
1304 case SIOCADDRT: /* Add a route */
1305 case SIOCDELRT: /* Delete a route */
1306 if (!capable(CAP_NET_ADMIN))
1307 return -EPERM;
1308 err = copy_from_user(&rtmsg, arg,
1309 sizeof(struct in6_rtmsg));
1310 if (err)
1311 return -EFAULT;
1312
1313 rtnl_lock();
1314 switch (cmd) {
1315 case SIOCADDRT:
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001316 err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001317 break;
1318 case SIOCDELRT:
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001319 err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001320 break;
1321 default:
1322 err = -EINVAL;
1323 }
1324 rtnl_unlock();
1325
1326 return err;
1327 };
1328
1329 return -EINVAL;
1330}
1331
1332/*
1333 * Drop the packet on the floor
1334 */
1335
Arnaldo Carvalho de Melo20380732005-08-16 02:18:02 -03001336static int ip6_pkt_discard(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001337{
1338 IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1339 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1340 kfree_skb(skb);
1341 return 0;
1342}
1343
Arnaldo Carvalho de Melo20380732005-08-16 02:18:02 -03001344static int ip6_pkt_discard_out(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001345{
1346 skb->dev = skb->dst->dev;
1347 return ip6_pkt_discard(skb);
1348}
1349
1350/*
1351 * Allocate a dst for local (unicast / anycast) address.
1352 */
1353
1354struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1355 const struct in6_addr *addr,
1356 int anycast)
1357{
1358 struct rt6_info *rt = ip6_dst_alloc();
1359
1360 if (rt == NULL)
1361 return ERR_PTR(-ENOMEM);
1362
1363 dev_hold(&loopback_dev);
1364 in6_dev_hold(idev);
1365
1366 rt->u.dst.flags = DST_HOST;
1367 rt->u.dst.input = ip6_input;
1368 rt->u.dst.output = ip6_output;
1369 rt->rt6i_dev = &loopback_dev;
1370 rt->rt6i_idev = idev;
1371 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1372 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1373 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1374 rt->u.dst.obsolete = -1;
1375
1376 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
YOSHIFUJI Hideaki58c4fb82005-12-21 22:56:42 +09001377 if (anycast)
1378 rt->rt6i_flags |= RTF_ANYCAST;
1379 else
Linus Torvalds1da177e2005-04-16 15:20:36 -07001380 rt->rt6i_flags |= RTF_LOCAL;
1381 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1382 if (rt->rt6i_nexthop == NULL) {
1383 dst_free((struct dst_entry *) rt);
1384 return ERR_PTR(-ENOMEM);
1385 }
1386
1387 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1388 rt->rt6i_dst.plen = 128;
1389
1390 atomic_set(&rt->u.dst.__refcnt, 1);
1391
1392 return rt;
1393}
1394
1395static int fib6_ifdown(struct rt6_info *rt, void *arg)
1396{
1397 if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1398 rt != &ip6_null_entry) {
1399 RT6_TRACE("deleted by ifdown %p\n", rt);
1400 return -1;
1401 }
1402 return 0;
1403}
1404
1405void rt6_ifdown(struct net_device *dev)
1406{
1407 write_lock_bh(&rt6_lock);
1408 fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1409 write_unlock_bh(&rt6_lock);
1410}
1411
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg
{
	struct net_device *dev;	/* device whose MTU changed */
	unsigned mtu;		/* its new MTU */
};
1417
1418static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1419{
1420 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1421 struct inet6_dev *idev;
1422
1423 /* In IPv6 pmtu discovery is not optional,
1424 so that RTAX_MTU lock cannot disable it.
1425 We still use this lock to block changes
1426 caused by addrconf/ndisc.
1427 */
1428
1429 idev = __in6_dev_get(arg->dev);
1430 if (idev == NULL)
1431 return 0;
1432
1433 /* For administrative MTU increase, there is no way to discover
1434 IPv6 PMTU increase, so PMTU increase should be updated here.
1435 Since RFC 1981 doesn't include administrative MTU increase
1436 update PMTU increase is a MUST. (i.e. jumbo frame)
1437 */
1438 /*
1439 If new MTU is less than route PMTU, this new MTU will be the
1440 lowest MTU in the path, update the route PMTU to reflect PMTU
1441 decreases; if new MTU is greater than route PMTU, and the
1442 old MTU is the lowest MTU in the path, update the route PMTU
1443 to reflect the increase. In this case if the other nodes' MTU
1444 also have the lowest MTU, TOO BIG MESSAGE will be lead to
1445 PMTU discouvery.
1446 */
1447 if (rt->rt6i_dev == arg->dev &&
1448 !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1449 (dst_mtu(&rt->u.dst) > arg->mtu ||
1450 (dst_mtu(&rt->u.dst) < arg->mtu &&
1451 dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1452 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1453 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1454 return 0;
1455}
1456
1457void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1458{
1459 struct rt6_mtu_change_arg arg;
1460
1461 arg.dev = dev;
1462 arg.mtu = mtu;
1463 read_lock_bh(&rt6_lock);
1464 fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1465 read_unlock_bh(&rt6_lock);
1466}
1467
1468static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1469 struct in6_rtmsg *rtmsg)
1470{
1471 memset(rtmsg, 0, sizeof(*rtmsg));
1472
1473 rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1474 rtmsg->rtmsg_src_len = r->rtm_src_len;
1475 rtmsg->rtmsg_flags = RTF_UP;
1476 if (r->rtm_type == RTN_UNREACHABLE)
1477 rtmsg->rtmsg_flags |= RTF_REJECT;
1478
1479 if (rta[RTA_GATEWAY-1]) {
1480 if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1481 return -EINVAL;
1482 memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1483 rtmsg->rtmsg_flags |= RTF_GATEWAY;
1484 }
1485 if (rta[RTA_DST-1]) {
1486 if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1487 return -EINVAL;
1488 memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1489 }
1490 if (rta[RTA_SRC-1]) {
1491 if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1492 return -EINVAL;
1493 memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1494 }
1495 if (rta[RTA_OIF-1]) {
1496 if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1497 return -EINVAL;
1498 memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1499 }
1500 if (rta[RTA_PRIORITY-1]) {
1501 if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1502 return -EINVAL;
1503 memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1504 }
1505 return 0;
1506}
1507
1508int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1509{
1510 struct rtmsg *r = NLMSG_DATA(nlh);
1511 struct in6_rtmsg rtmsg;
1512
1513 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1514 return -EINVAL;
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001515 return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001516}
1517
1518int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1519{
1520 struct rtmsg *r = NLMSG_DATA(nlh);
1521 struct in6_rtmsg rtmsg;
1522
1523 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1524 return -EINVAL;
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001525 return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001526}
1527
/* Per-call state passed to the walker callbacks during a route dump. */
struct rt6_rtnl_dump_arg
{
	struct sk_buff *skb;		/* buffer the dump is written into */
	struct netlink_callback *cb;	/* netlink dump callback state */
};
1533
1534static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001535 struct in6_addr *dst, struct in6_addr *src,
1536 int iif, int type, u32 pid, u32 seq,
1537 int prefix, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001538{
1539 struct rtmsg *rtm;
1540 struct nlmsghdr *nlh;
1541 unsigned char *b = skb->tail;
1542 struct rta_cacheinfo ci;
1543
1544 if (prefix) { /* user wants prefix routes only */
1545 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1546 /* success since this is not a prefix route */
1547 return 1;
1548 }
1549 }
1550
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07001551 nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001552 rtm = NLMSG_DATA(nlh);
1553 rtm->rtm_family = AF_INET6;
1554 rtm->rtm_dst_len = rt->rt6i_dst.plen;
1555 rtm->rtm_src_len = rt->rt6i_src.plen;
1556 rtm->rtm_tos = 0;
1557 rtm->rtm_table = RT_TABLE_MAIN;
1558 if (rt->rt6i_flags&RTF_REJECT)
1559 rtm->rtm_type = RTN_UNREACHABLE;
1560 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1561 rtm->rtm_type = RTN_LOCAL;
1562 else
1563 rtm->rtm_type = RTN_UNICAST;
1564 rtm->rtm_flags = 0;
1565 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1566 rtm->rtm_protocol = rt->rt6i_protocol;
1567 if (rt->rt6i_flags&RTF_DYNAMIC)
1568 rtm->rtm_protocol = RTPROT_REDIRECT;
1569 else if (rt->rt6i_flags & RTF_ADDRCONF)
1570 rtm->rtm_protocol = RTPROT_KERNEL;
1571 else if (rt->rt6i_flags&RTF_DEFAULT)
1572 rtm->rtm_protocol = RTPROT_RA;
1573
1574 if (rt->rt6i_flags&RTF_CACHE)
1575 rtm->rtm_flags |= RTM_F_CLONED;
1576
1577 if (dst) {
1578 RTA_PUT(skb, RTA_DST, 16, dst);
1579 rtm->rtm_dst_len = 128;
1580 } else if (rtm->rtm_dst_len)
1581 RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1582#ifdef CONFIG_IPV6_SUBTREES
1583 if (src) {
1584 RTA_PUT(skb, RTA_SRC, 16, src);
1585 rtm->rtm_src_len = 128;
1586 } else if (rtm->rtm_src_len)
1587 RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1588#endif
1589 if (iif)
1590 RTA_PUT(skb, RTA_IIF, 4, &iif);
1591 else if (dst) {
1592 struct in6_addr saddr_buf;
1593 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1594 RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1595 }
1596 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1597 goto rtattr_failure;
1598 if (rt->u.dst.neighbour)
1599 RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1600 if (rt->u.dst.dev)
1601 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1602 RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1603 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1604 if (rt->rt6i_expires)
1605 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1606 else
1607 ci.rta_expires = 0;
1608 ci.rta_used = rt->u.dst.__use;
1609 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1610 ci.rta_error = rt->u.dst.error;
1611 ci.rta_id = 0;
1612 ci.rta_ts = 0;
1613 ci.rta_tsage = 0;
1614 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1615 nlh->nlmsg_len = skb->tail - b;
1616 return skb->len;
1617
1618nlmsg_failure:
1619rtattr_failure:
1620 skb_trim(skb, b - skb->data);
1621 return -1;
1622}
1623
1624static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1625{
1626 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1627 int prefix;
1628
1629 if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1630 struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1631 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1632 } else
1633 prefix = 0;
1634
1635 return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1636 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001637 prefix, NLM_F_MULTI);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001638}
1639
1640static int fib6_dump_node(struct fib6_walker_t *w)
1641{
1642 int res;
1643 struct rt6_info *rt;
1644
1645 for (rt = w->leaf; rt; rt = rt->u.next) {
1646 res = rt6_dump_route(rt, w->args);
1647 if (res < 0) {
1648 /* Frame is full, suspend walking */
1649 w->leaf = rt;
1650 return 1;
1651 }
1652 BUG_TRAP(res!=0);
1653 }
1654 w->leaf = NULL;
1655 return 0;
1656}
1657
1658static void fib6_dump_end(struct netlink_callback *cb)
1659{
1660 struct fib6_walker_t *w = (void*)cb->args[0];
1661
1662 if (w) {
1663 cb->args[0] = 0;
1664 fib6_walker_unlink(w);
1665 kfree(w);
1666 }
Herbert Xuefacfbc2005-11-12 12:12:05 -08001667 cb->done = (void*)cb->args[1];
1668 cb->args[1] = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001669}
1670
1671static int fib6_dump_done(struct netlink_callback *cb)
1672{
1673 fib6_dump_end(cb);
Thomas Grafa8f74b22005-11-10 02:25:52 +01001674 return cb->done ? cb->done(cb) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001675}
1676
1677int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1678{
1679 struct rt6_rtnl_dump_arg arg;
1680 struct fib6_walker_t *w;
1681 int res;
1682
1683 arg.skb = skb;
1684 arg.cb = cb;
1685
1686 w = (void*)cb->args[0];
1687 if (w == NULL) {
1688 /* New dump:
1689 *
1690 * 1. hook callback destructor.
1691 */
1692 cb->args[1] = (long)cb->done;
1693 cb->done = fib6_dump_done;
1694
1695 /*
1696 * 2. allocate and initialize walker.
1697 */
David S. Miller9e147a12005-11-17 16:52:51 -08001698 w = kmalloc(sizeof(*w), GFP_ATOMIC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001699 if (w == NULL)
1700 return -ENOMEM;
1701 RT6_TRACE("dump<%p", w);
1702 memset(w, 0, sizeof(*w));
1703 w->root = &ip6_routing_table;
1704 w->func = fib6_dump_node;
1705 w->args = &arg;
1706 cb->args[0] = (long)w;
1707 read_lock_bh(&rt6_lock);
1708 res = fib6_walk(w);
1709 read_unlock_bh(&rt6_lock);
1710 } else {
1711 w->args = &arg;
1712 read_lock_bh(&rt6_lock);
1713 res = fib6_walk_continue(w);
1714 read_unlock_bh(&rt6_lock);
1715 }
1716#if RT6_DEBUG >= 3
1717 if (res <= 0 && skb->len == 0)
1718 RT6_TRACE("%p>dump end\n", w);
1719#endif
1720 res = res < 0 ? res : skb->len;
1721 /* res < 0 is an error. (really, impossible)
1722 res == 0 means that dump is complete, but skb still can contain data.
1723 res > 0 dump is not complete, but frame is full.
1724 */
1725 /* Destroy walker, if dump of this table is complete. */
1726 if (res <= 0)
1727 fib6_dump_end(cb);
1728 return res;
1729}
1730
1731int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1732{
1733 struct rtattr **rta = arg;
1734 int iif = 0;
1735 int err = -ENOBUFS;
1736 struct sk_buff *skb;
1737 struct flowi fl;
1738 struct rt6_info *rt;
1739
1740 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1741 if (skb == NULL)
1742 goto out;
1743
1744 /* Reserve room for dummy headers, this skb can pass
1745 through good chunk of routing engine.
1746 */
1747 skb->mac.raw = skb->data;
1748 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1749
1750 memset(&fl, 0, sizeof(fl));
1751 if (rta[RTA_SRC-1])
1752 ipv6_addr_copy(&fl.fl6_src,
1753 (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
1754 if (rta[RTA_DST-1])
1755 ipv6_addr_copy(&fl.fl6_dst,
1756 (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
1757
1758 if (rta[RTA_IIF-1])
1759 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1760
1761 if (iif) {
1762 struct net_device *dev;
1763 dev = __dev_get_by_index(iif);
1764 if (!dev) {
1765 err = -ENODEV;
1766 goto out_free;
1767 }
1768 }
1769
1770 fl.oif = 0;
1771 if (rta[RTA_OIF-1])
1772 memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1773
1774 rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1775
1776 skb->dst = &rt->u.dst;
1777
1778 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1779 err = rt6_fill_node(skb, rt,
1780 &fl.fl6_dst, &fl.fl6_src,
1781 iif,
1782 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001783 nlh->nlmsg_seq, 0, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001784 if (err < 0) {
1785 err = -EMSGSIZE;
1786 goto out_free;
1787 }
1788
1789 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1790 if (err > 0)
1791 err = 0;
1792out:
1793 return err;
1794out_free:
1795 kfree_skb(skb);
1796 goto out;
1797}
1798
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001799void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh,
1800 struct netlink_skb_parms *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001801{
1802 struct sk_buff *skb;
1803 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001804 u32 pid = current->pid;
1805 u32 seq = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001806
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001807 if (req)
1808 pid = req->pid;
1809 if (nlh)
1810 seq = nlh->nlmsg_seq;
1811
Linus Torvalds1da177e2005-04-16 15:20:36 -07001812 skb = alloc_skb(size, gfp_any());
1813 if (!skb) {
Patrick McHardyac6d4392005-08-14 19:29:52 -07001814 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001815 return;
1816 }
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001817 if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001818 kfree_skb(skb);
Patrick McHardyac6d4392005-08-14 19:29:52 -07001819 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001820 return;
1821 }
Patrick McHardyac6d4392005-08-14 19:29:52 -07001822 NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
1823 netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
Linus Torvalds1da177e2005-04-16 15:20:36 -07001824}
1825
1826/*
1827 * /proc
1828 */
1829
1830#ifdef CONFIG_PROC_FS
1831
/*
 * Record size used by the /proc offset arithmetic below.  It equals
 * the length of one line written by rt6_info_route() as long as the
 * device name fits the %8s field.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* State shared between rt6_proc_info() and rt6_info_route(). */
struct rt6_proc_arg
{
	char *buffer;	/* output buffer */
	int offset;	/* byte offset requested by the reader */
	int length;	/* number of bytes requested */
	int skip;	/* routes skipped so far to reach offset */
	int len;	/* bytes written to buffer so far */
};
1842
1843static int rt6_info_route(struct rt6_info *rt, void *p_arg)
1844{
1845 struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
1846 int i;
1847
1848 if (arg->skip < arg->offset / RT6_INFO_LEN) {
1849 arg->skip++;
1850 return 0;
1851 }
1852
1853 if (arg->len >= arg->length)
1854 return 0;
1855
1856 for (i=0; i<16; i++) {
1857 sprintf(arg->buffer + arg->len, "%02x",
1858 rt->rt6i_dst.addr.s6_addr[i]);
1859 arg->len += 2;
1860 }
1861 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
1862 rt->rt6i_dst.plen);
1863
1864#ifdef CONFIG_IPV6_SUBTREES
1865 for (i=0; i<16; i++) {
1866 sprintf(arg->buffer + arg->len, "%02x",
1867 rt->rt6i_src.addr.s6_addr[i]);
1868 arg->len += 2;
1869 }
1870 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
1871 rt->rt6i_src.plen);
1872#else
1873 sprintf(arg->buffer + arg->len,
1874 "00000000000000000000000000000000 00 ");
1875 arg->len += 36;
1876#endif
1877
1878 if (rt->rt6i_nexthop) {
1879 for (i=0; i<16; i++) {
1880 sprintf(arg->buffer + arg->len, "%02x",
1881 rt->rt6i_nexthop->primary_key[i]);
1882 arg->len += 2;
1883 }
1884 } else {
1885 sprintf(arg->buffer + arg->len,
1886 "00000000000000000000000000000000");
1887 arg->len += 32;
1888 }
1889 arg->len += sprintf(arg->buffer + arg->len,
1890 " %08x %08x %08x %08x %8s\n",
1891 rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
1892 rt->u.dst.__use, rt->rt6i_flags,
1893 rt->rt6i_dev ? rt->rt6i_dev->name : "");
1894 return 0;
1895}
1896
1897static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
1898{
1899 struct rt6_proc_arg arg;
1900 arg.buffer = buffer;
1901 arg.offset = offset;
1902 arg.length = length;
1903 arg.skip = 0;
1904 arg.len = 0;
1905
1906 read_lock_bh(&rt6_lock);
1907 fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
1908 read_unlock_bh(&rt6_lock);
1909
1910 *start = buffer;
1911 if (offset)
1912 *start += offset % RT6_INFO_LEN;
1913
1914 arg.len -= offset % RT6_INFO_LEN;
1915
1916 if (arg.len > length)
1917 arg.len = length;
1918 if (arg.len < 0)
1919 arg.len = 0;
1920
1921 return arg.len;
1922}
1923
Linus Torvalds1da177e2005-04-16 15:20:36 -07001924static int rt6_stats_seq_show(struct seq_file *seq, void *v)
1925{
1926 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
1927 rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
1928 rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
1929 rt6_stats.fib_rt_cache,
1930 atomic_read(&ip6_dst_ops.entries),
1931 rt6_stats.fib_discarded_routes);
1932
1933 return 0;
1934}
1935
1936static int rt6_stats_seq_open(struct inode *inode, struct file *file)
1937{
1938 return single_open(file, rt6_stats_seq_show, NULL);
1939}
1940
1941static struct file_operations rt6_stats_seq_fops = {
1942 .owner = THIS_MODULE,
1943 .open = rt6_stats_seq_open,
1944 .read = seq_read,
1945 .llseek = seq_lseek,
1946 .release = single_release,
1947};
1948#endif /* CONFIG_PROC_FS */
1949
1950#ifdef CONFIG_SYSCTL
1951
1952static int flush_delay;
1953
1954static
1955int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
1956 void __user *buffer, size_t *lenp, loff_t *ppos)
1957{
1958 if (write) {
1959 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
1960 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
1961 return 0;
1962 } else
1963 return -EINVAL;
1964}
1965
1966ctl_table ipv6_route_table[] = {
1967 {
1968 .ctl_name = NET_IPV6_ROUTE_FLUSH,
1969 .procname = "flush",
1970 .data = &flush_delay,
1971 .maxlen = sizeof(int),
Dave Jones89c8b3a12005-04-28 12:11:49 -07001972 .mode = 0200,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001973 .proc_handler = &ipv6_sysctl_rtcache_flush
1974 },
1975 {
1976 .ctl_name = NET_IPV6_ROUTE_GC_THRESH,
1977 .procname = "gc_thresh",
1978 .data = &ip6_dst_ops.gc_thresh,
1979 .maxlen = sizeof(int),
1980 .mode = 0644,
1981 .proc_handler = &proc_dointvec,
1982 },
1983 {
1984 .ctl_name = NET_IPV6_ROUTE_MAX_SIZE,
1985 .procname = "max_size",
1986 .data = &ip6_rt_max_size,
1987 .maxlen = sizeof(int),
1988 .mode = 0644,
1989 .proc_handler = &proc_dointvec,
1990 },
1991 {
1992 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL,
1993 .procname = "gc_min_interval",
1994 .data = &ip6_rt_gc_min_interval,
1995 .maxlen = sizeof(int),
1996 .mode = 0644,
1997 .proc_handler = &proc_dointvec_jiffies,
1998 .strategy = &sysctl_jiffies,
1999 },
2000 {
2001 .ctl_name = NET_IPV6_ROUTE_GC_TIMEOUT,
2002 .procname = "gc_timeout",
2003 .data = &ip6_rt_gc_timeout,
2004 .maxlen = sizeof(int),
2005 .mode = 0644,
2006 .proc_handler = &proc_dointvec_jiffies,
2007 .strategy = &sysctl_jiffies,
2008 },
2009 {
2010 .ctl_name = NET_IPV6_ROUTE_GC_INTERVAL,
2011 .procname = "gc_interval",
2012 .data = &ip6_rt_gc_interval,
2013 .maxlen = sizeof(int),
2014 .mode = 0644,
2015 .proc_handler = &proc_dointvec_jiffies,
2016 .strategy = &sysctl_jiffies,
2017 },
2018 {
2019 .ctl_name = NET_IPV6_ROUTE_GC_ELASTICITY,
2020 .procname = "gc_elasticity",
2021 .data = &ip6_rt_gc_elasticity,
2022 .maxlen = sizeof(int),
2023 .mode = 0644,
2024 .proc_handler = &proc_dointvec_jiffies,
2025 .strategy = &sysctl_jiffies,
2026 },
2027 {
2028 .ctl_name = NET_IPV6_ROUTE_MTU_EXPIRES,
2029 .procname = "mtu_expires",
2030 .data = &ip6_rt_mtu_expires,
2031 .maxlen = sizeof(int),
2032 .mode = 0644,
2033 .proc_handler = &proc_dointvec_jiffies,
2034 .strategy = &sysctl_jiffies,
2035 },
2036 {
2037 .ctl_name = NET_IPV6_ROUTE_MIN_ADVMSS,
2038 .procname = "min_adv_mss",
2039 .data = &ip6_rt_min_advmss,
2040 .maxlen = sizeof(int),
2041 .mode = 0644,
2042 .proc_handler = &proc_dointvec_jiffies,
2043 .strategy = &sysctl_jiffies,
2044 },
2045 {
2046 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2047 .procname = "gc_min_interval_ms",
2048 .data = &ip6_rt_gc_min_interval,
2049 .maxlen = sizeof(int),
2050 .mode = 0644,
2051 .proc_handler = &proc_dointvec_ms_jiffies,
2052 .strategy = &sysctl_ms_jiffies,
2053 },
2054 { .ctl_name = 0 }
2055};
2056
2057#endif
2058
2059void __init ip6_route_init(void)
2060{
2061 struct proc_dir_entry *p;
2062
2063 ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2064 sizeof(struct rt6_info),
2065 0, SLAB_HWCACHE_ALIGN,
2066 NULL, NULL);
2067 if (!ip6_dst_ops.kmem_cachep)
2068 panic("cannot create ip6_dst_cache");
2069
2070 fib6_init();
2071#ifdef CONFIG_PROC_FS
2072 p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2073 if (p)
2074 p->owner = THIS_MODULE;
2075
2076 proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2077#endif
2078#ifdef CONFIG_XFRM
2079 xfrm6_init();
2080#endif
2081}
2082
2083void ip6_route_cleanup(void)
2084{
2085#ifdef CONFIG_PROC_FS
2086 proc_net_remove("ipv6_route");
2087 proc_net_remove("rt6_stats");
2088#endif
2089#ifdef CONFIG_XFRM
2090 xfrm6_fini();
2091#endif
2092 rt6_ifdown(NULL);
2093 fib6_gc_cleanup();
2094 kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2095}