blob: a7030fed1a18bdc2decf6dd75724c24a1dac3572 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * Linux INET6 implementation
3 * FIB front-end.
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 */
15
16/* Changes:
17 *
18 * YOSHIFUJI Hideaki @USAGI
19 * reworked default router selection.
20 * - respect outgoing interface
21 * - select from (probably) reachable routers (i.e.
22 * routers in REACHABLE, STALE, DELAY or PROBE states).
23 * - always select the same router if it is (probably)
24 * reachable. otherwise, round-robin the list.
25 */
26
Randy Dunlap4fc268d2006-01-11 12:17:47 -080027#include <linux/capability.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070028#include <linux/config.h>
29#include <linux/errno.h>
30#include <linux/types.h>
31#include <linux/times.h>
32#include <linux/socket.h>
33#include <linux/sockios.h>
34#include <linux/net.h>
35#include <linux/route.h>
36#include <linux/netdevice.h>
37#include <linux/in6.h>
38#include <linux/init.h>
39#include <linux/netlink.h>
40#include <linux/if_arp.h>
41
42#ifdef CONFIG_PROC_FS
43#include <linux/proc_fs.h>
44#include <linux/seq_file.h>
45#endif
46
47#include <net/snmp.h>
48#include <net/ipv6.h>
49#include <net/ip6_fib.h>
50#include <net/ip6_route.h>
51#include <net/ndisc.h>
52#include <net/addrconf.h>
53#include <net/tcp.h>
54#include <linux/rtnetlink.h>
55#include <net/dst.h>
56#include <net/xfrm.h>
57
58#include <asm/uaccess.h>
59
60#ifdef CONFIG_SYSCTL
61#include <linux/sysctl.h>
62#endif
63
64/* Set to 3 to get tracing. */
65#define RT6_DEBUG 2
66
/*
 * RDBG(x) takes a fully parenthesised printk argument list;
 * RT6_TRACE(fmt, ...) prints at KERN_DEBUG level.
 */
67#if RT6_DEBUG >= 3
68#define RDBG(x) printk x
69#define RT6_TRACE(x...) printk(KERN_DEBUG x)
70#else
/* Tracing compiled out: RDBG() vanishes, RT6_TRACE() is an empty statement. */
71#define RDBG(x)
72#define RT6_TRACE(x...) do { ; } while (0)
73#endif
74
/*
 * When 0, ip6_route_input()/ip6_route_output() return the matched route
 * directly instead of calling rt6_alloc_clone() for routes that already
 * have a nexthop (or are RTF_NONEXTHOP).
 */
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -080075#define CLONE_OFFLINK_ROUTE 0
Linus Torvalds1da177e2005-04-16 15:20:36 -070076
/*
 * Flags for the "strict" argument of rt6_select()/rt6_score_route():
 *  _IFACE     - reject routes whose device does not match oif
 *  _REACHABLE - reject routes whose nexthop neighbour is not NUD_VALID
 */
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -080077#define RT6_SELECT_F_IFACE 0x1
78#define RT6_SELECT_F_REACHABLE 0x2
79
/*
 * Route cache / garbage collector tunables.  ip6_dst_gc() reads
 * ip6_rt_max_size, ip6_rt_gc_min_interval, ip6_rt_gc_timeout and
 * ip6_rt_gc_elasticity; ipv6_advmss() reads ip6_rt_min_advmss.
 * Time values are in jiffies.
 */
Linus Torvalds1da177e2005-04-16 15:20:36 -070080static int ip6_rt_max_size = 4096;
81static int ip6_rt_gc_min_interval = HZ / 2;
82static int ip6_rt_gc_timeout = 60*HZ;
83int ip6_rt_gc_interval = 30*HZ;
84static int ip6_rt_gc_elasticity = 9;
85static int ip6_rt_mtu_expires = 10*60*HZ;
86static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
87
/*
 * Forward declarations: these are referenced by the ip6_dst_ops and
 * ip6_null_entry initialisers, or by functions defined before them.
 */
88static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
90static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91static void ip6_dst_destroy(struct dst_entry *);
92static void ip6_dst_ifdown(struct dst_entry *,
93 struct net_device *dev, int how);
94static int ip6_dst_gc(void);
95
96static int ip6_pkt_discard(struct sk_buff *skb);
97static int ip6_pkt_discard_out(struct sk_buff *skb);
98static void ip6_link_failure(struct sk_buff *skb);
99static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100
/*
 * dst_ops table for IPv6 routes: wires the generic destination cache
 * callbacks to the ip6_* handlers in this file.  Each entry is a
 * struct rt6_info (entry_size); ip6_dst_alloc() allocates from this table.
 */
101static struct dst_ops ip6_dst_ops = {
102 .family = AF_INET6,
103 .protocol = __constant_htons(ETH_P_IPV6),
104 .gc = ip6_dst_gc,
105 .gc_thresh = 1024,
106 .check = ip6_dst_check,
107 .destroy = ip6_dst_destroy,
108 .ifdown = ip6_dst_ifdown,
109 .negative_advice = ip6_negative_advice,
110 .link_failure = ip6_link_failure,
111 .update_pmtu = ip6_rt_update_pmtu,
112 .entry_size = sizeof(struct rt6_info),
113};
114
/*
 * Sentinel "no route" entry.  rt6_select() and rt6_device_match() return
 * its address when nothing matches, and callers compare against
 * &ip6_null_entry.  It is a reject route on the loopback device: both
 * input and output discard the packet and dst.error is -ENETUNREACH.
 * It starts with one reference and its dst.path points back at itself.
 */
115struct rt6_info ip6_null_entry = {
116 .u = {
117 .dst = {
118 .__refcnt = ATOMIC_INIT(1),
119 .__use = 1,
120 .dev = &loopback_dev,
121 .obsolete = -1,
122 .error = -ENETUNREACH,
123 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
124 .input = ip6_pkt_discard,
125 .output = ip6_pkt_discard_out,
126 .ops = &ip6_dst_ops,
127 .path = (struct dst_entry*)&ip6_null_entry,
128 }
129 },
130 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
131 .rt6i_metric = ~(u32) 0,
132 .rt6i_ref = ATOMIC_INIT(1),
133};
134
/*
 * Root node of the IPv6 FIB tree.  Its leaf is initially the
 * ip6_null_entry sentinel, so a lookup on an empty table yields
 * "no route" rather than a NULL leaf.
 */
135struct fib6_node ip6_routing_table = {
136 .leaf = &ip6_null_entry,
137 .fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
138};
139
140/* Protects all the ip6 fib */
141
/* Readers in this file take it with read_lock_bh(), writers with write_lock_bh(). */
142DEFINE_RWLOCK(rt6_lock);
143
144
145/* allocate dst with ip6_dst_ops */
146static __inline__ struct rt6_info *ip6_dst_alloc(void)
147{
148 return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
149}
150
151static void ip6_dst_destroy(struct dst_entry *dst)
152{
153 struct rt6_info *rt = (struct rt6_info *)dst;
154 struct inet6_dev *idev = rt->rt6i_idev;
155
156 if (idev != NULL) {
157 rt->rt6i_idev = NULL;
158 in6_dev_put(idev);
159 }
160}
161
162static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
163 int how)
164{
165 struct rt6_info *rt = (struct rt6_info *)dst;
166 struct inet6_dev *idev = rt->rt6i_idev;
167
168 if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
169 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
170 if (loopback_idev != NULL) {
171 rt->rt6i_idev = loopback_idev;
172 in6_dev_put(idev);
173 }
174 }
175}
176
177static __inline__ int rt6_check_expired(const struct rt6_info *rt)
178{
179 return (rt->rt6i_flags & RTF_EXPIRES &&
180 time_after(jiffies, rt->rt6i_expires));
181}
182
183/*
184 * Route lookup. Any rt6_lock is implied.
185 */
186
187static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
188 int oif,
189 int strict)
190{
191 struct rt6_info *local = NULL;
192 struct rt6_info *sprt;
193
194 if (oif) {
195 for (sprt = rt; sprt; sprt = sprt->u.next) {
196 struct net_device *dev = sprt->rt6i_dev;
197 if (dev->ifindex == oif)
198 return sprt;
199 if (dev->flags & IFF_LOOPBACK) {
200 if (sprt->rt6i_idev == NULL ||
201 sprt->rt6i_idev->dev->ifindex != oif) {
202 if (strict && oif)
203 continue;
204 if (local && (!oif ||
205 local->rt6i_idev->dev->ifindex == oif))
206 continue;
207 }
208 local = sprt;
209 }
210 }
211
212 if (local)
213 return local;
214
215 if (strict)
216 return &ip6_null_entry;
217 }
218 return rt;
219}
220
221/*
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800222 * Default Router Selection (RFC 2461 6.3.6)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700223 */
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800224static int inline rt6_check_dev(struct rt6_info *rt, int oif)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700225{
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800226 struct net_device *dev = rt->rt6i_dev;
227 if (!oif || dev->ifindex == oif)
228 return 2;
229 if ((dev->flags & IFF_LOOPBACK) &&
230 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
231 return 1;
232 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700233}
234
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800235static int inline rt6_check_neigh(struct rt6_info *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700236{
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800237 struct neighbour *neigh = rt->rt6i_nexthop;
238 int m = 0;
239 if (neigh) {
240 read_lock_bh(&neigh->lock);
241 if (neigh->nud_state & NUD_VALID)
242 m = 1;
243 read_unlock_bh(&neigh->lock);
244 }
245 return m;
246}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700247
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800248static int rt6_score_route(struct rt6_info *rt, int oif,
249 int strict)
250{
251 int m = rt6_check_dev(rt, oif);
252 if (!m && (strict & RT6_SELECT_F_IFACE))
253 return -1;
YOSHIFUJI Hideakiebacaaa2006-03-20 17:04:53 -0800254#ifdef CONFIG_IPV6_ROUTER_PREF
255 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
256#endif
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800257 if (rt6_check_neigh(rt))
YOSHIFUJI Hideakiebacaaa2006-03-20 17:04:53 -0800258 m |= 16;
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800259 else if (strict & RT6_SELECT_F_REACHABLE)
260 return -1;
261 return m;
262}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700263
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800264static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
265 int strict)
266{
267 struct rt6_info *match = NULL, *last = NULL;
268 struct rt6_info *rt, *rt0 = *head;
269 u32 metric;
270 int mpri = -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700271
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800272 RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
273 __FUNCTION__, head, head ? *head : NULL, oif);
274
275 for (rt = rt0, metric = rt0->rt6i_metric;
276 rt && rt->rt6i_metric == metric;
277 rt = rt->u.next) {
278 int m;
279
280 if (rt6_check_expired(rt))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700281 continue;
282
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800283 last = rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700284
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800285 m = rt6_score_route(rt, oif, strict);
286 if (m < 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700287 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700288
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800289 if (m > mpri) {
290 match = rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700291 mpri = m;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700292 }
293 }
294
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800295 if (!match &&
296 (strict & RT6_SELECT_F_REACHABLE) &&
297 last && last != rt0) {
298 /* no entries matched; do round-robin */
299 *head = rt0->u.next;
300 rt0->u.next = last->u.next;
301 last->u.next = rt0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700302 }
303
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800304 RT6_TRACE("%s() => %p, score=%d\n",
305 __FUNCTION__, match, mpri);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700306
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800307 return (match ? match : &ip6_null_entry);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700308}
309
310struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
311 int oif, int strict)
312{
313 struct fib6_node *fn;
314 struct rt6_info *rt;
315
316 read_lock_bh(&rt6_lock);
317 fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
318 rt = rt6_device_match(fn->leaf, oif, strict);
319 dst_hold(&rt->u.dst);
320 rt->u.dst.__use++;
321 read_unlock_bh(&rt6_lock);
322
323 rt->u.dst.lastuse = jiffies;
324 if (rt->u.dst.error == 0)
325 return rt;
326 dst_release(&rt->u.dst);
327 return NULL;
328}
329
330/* ip6_ins_rt is called with FREE rt6_lock.
331 It takes new route entry, the addition fails by any reason the
332 route is freed. In any case, if caller does not hold it, it may
333 be destroyed.
334 */
335
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700336int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
337 void *_rtattr, struct netlink_skb_parms *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700338{
339 int err;
340
341 write_lock_bh(&rt6_lock);
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700342 err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700343 write_unlock_bh(&rt6_lock);
344
345 return err;
346}
347
YOSHIFUJI Hideaki95a9a5b2006-03-20 16:55:51 -0800348static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
349 struct in6_addr *saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700350{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700351 struct rt6_info *rt;
352
353 /*
354 * Clone the route.
355 */
356
357 rt = ip6_rt_copy(ort);
358
359 if (rt) {
YOSHIFUJI Hideaki58c4fb82005-12-21 22:56:42 +0900360 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
361 if (rt->rt6i_dst.plen != 128 &&
362 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
363 rt->rt6i_flags |= RTF_ANYCAST;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700364 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
YOSHIFUJI Hideaki58c4fb82005-12-21 22:56:42 +0900365 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700366
YOSHIFUJI Hideaki58c4fb82005-12-21 22:56:42 +0900367 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700368 rt->rt6i_dst.plen = 128;
369 rt->rt6i_flags |= RTF_CACHE;
370 rt->u.dst.flags |= DST_HOST;
371
372#ifdef CONFIG_IPV6_SUBTREES
373 if (rt->rt6i_src.plen && saddr) {
374 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
375 rt->rt6i_src.plen = 128;
376 }
377#endif
378
379 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
380
YOSHIFUJI Hideaki95a9a5b2006-03-20 16:55:51 -0800381 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700382
YOSHIFUJI Hideaki95a9a5b2006-03-20 16:55:51 -0800383 return rt;
384}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700385
YOSHIFUJI Hideaki299d9932006-03-20 16:58:32 -0800386static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
387{
388 struct rt6_info *rt = ip6_rt_copy(ort);
389 if (rt) {
390 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
391 rt->rt6i_dst.plen = 128;
392 rt->rt6i_flags |= RTF_CACHE;
393 if (rt->rt6i_flags & RTF_REJECT)
394 rt->u.dst.error = ort->u.dst.error;
395 rt->u.dst.flags |= DST_HOST;
396 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
397 }
398 return rt;
399}
400
/*
 * BACKTRACK() - climb the FIB tree when the current node had no usable
 * route.
 *
 * Expects the enclosing function to provide:
 *   rt       - the route just selected,
 *   fn       - the current struct fib6_node,
 *   restart: - label that re-runs route selection on the new fn,
 *   out:     - label taken when the walk reaches a root node.
 *
 * If rt is &ip6_null_entry, fn is moved to its parent repeatedly: a node
 * flagged RTN_ROOT ends the walk (goto out), a node flagged RTN_RTINFO
 * restarts selection there (goto restart).  If the parent chain ends
 * without either, execution simply continues after the macro.
 *
 * Wrapped in do { } while (0) so that "BACKTRACK();" is a single
 * statement and is safe inside an unbraced if/else.
 */
#define BACKTRACK() \
do { \
	if (rt == &ip6_null_entry) { \
		while ((fn = fn->parent) != NULL) { \
			if (fn->fn_flags & RTN_ROOT) \
				goto out; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
411
412
/*
 * Route an incoming packet: look up the route for the packet's
 * destination/source addresses and attach it to skb->dst with a
 * reference held.
 *
 * Selection is tried first with RT6_SELECT_F_REACHABLE; whenever control
 * reaches "out" with that flag still set, it is cleared and the lookup
 * is redone from the tree root (restart_2).  Multicast and link-local
 * destinations additionally require a device match (RT6_SELECT_F_IFACE)
 * against the receiving device.
 *
 * If the chosen route is not a cache entry and has no nexthop, a /128
 * copy is created with rt6_alloc_cow() and inserted.  A failed insertion
 * is retried via "relookup" until "attempts" is used up.
 */
413void ip6_route_input(struct sk_buff *skb)
414{
415 struct fib6_node *fn;
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800416 struct rt6_info *rt, *nrt;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700417 int strict;
418 int attempts = 3;
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800419 int err;
YOSHIFUJI Hideaki8238dd02006-03-20 17:04:35 -0800420 int reachable = RT6_SELECT_F_REACHABLE;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700421
YOSHIFUJI Hideaki118f8c12006-03-20 17:01:06 -0800422 strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700423
424relookup:
425 read_lock_bh(&rt6_lock);
426
YOSHIFUJI Hideaki8238dd02006-03-20 17:04:35 -0800427restart_2:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700428 fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
429 &skb->nh.ipv6h->saddr);
430
431restart:
YOSHIFUJI Hideaki8238dd02006-03-20 17:04:35 -0800432 rt = rt6_select(&fn->leaf, skb->dev->ifindex, strict | reachable);
/* May jump to "restart" (parent node with routes) or "out" (hit a root). */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700433 BACKTRACK();
YOSHIFUJI Hideaki8238dd02006-03-20 17:04:35 -0800434 if (rt == &ip6_null_entry ||
435 rt->rt6i_flags & RTF_CACHE)
YOSHIFUJI Hideaki1ddef0442006-03-20 17:01:24 -0800436 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700437
/* Pin the route before dropping the lock; rt6_alloc_cow() runs unlocked. */
YOSHIFUJI Hideakifb9de912006-03-20 16:59:08 -0800438 dst_hold(&rt->u.dst);
439 read_unlock_bh(&rt6_lock);
440
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800441 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
442 nrt = rt6_alloc_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr);
443 else {
444#if CLONE_OFFLINK_ROUTE
445 nrt = rt6_alloc_clone(rt, &skb->nh.ipv6h->daddr);
446#else
/* Cloning disabled: use the matched route as is (reference already held). */
447 goto out2;
448#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700449 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700450
/* Swap the held reference from the original route to the copy (or the null entry). */
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800451 dst_release(&rt->u.dst);
452 rt = nrt ? : &ip6_null_entry;
453
454 dst_hold(&rt->u.dst);
455 if (nrt) {
/* NOTE(review): &NETLINK_CB(skb) is taken from a data-path skb here — confirm the control block is meaningful to fib6_add(). */
456 err = ip6_ins_rt(nrt, NULL, NULL, &NETLINK_CB(skb));
457 if (!err)
458 goto out2;
459 }
460
461 if (--attempts <= 0)
462 goto out2;
463
464 /*
465 * Race condition! In the gap, when rt6_lock was
466 * released someone could insert this route. Relookup.
467 */
468 dst_release(&rt->u.dst);
469 goto relookup;
470
471out:
/* Reached with rt6_lock still read-held. */
YOSHIFUJI Hideaki8238dd02006-03-20 17:04:35 -0800472 if (reachable) {
473 reachable = 0;
474 goto restart_2;
475 }
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800476 dst_hold(&rt->u.dst);
477 read_unlock_bh(&rt6_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700478out2:
/* rt is referenced and the lock is released on every path to here. */
479 rt->u.dst.lastuse = jiffies;
480 rt->u.dst.__use++;
481 skb->dst = (struct dst_entry *) rt;
YOSHIFUJI Hideakifb9de912006-03-20 16:59:08 -0800482 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700483}
484
/*
 * Route an outgoing flow: look up the route for fl->fl6_dst/fl->fl6_src
 * (restricted to fl->oif where required) and return its dst_entry with a
 * reference held.  @sk is not used in this function.
 *
 * Same two-pass scheme as ip6_route_input(): selection is tried with
 * RT6_SELECT_F_REACHABLE first; whenever "out" is reached with that flag
 * still set it is cleared and the lookup restarts from the root.
 * A non-cache route without a nexthop is copied with rt6_alloc_cow() and
 * inserted, with up to "attempts" relookups if insertion fails.
 *
 * The result may be &ip6_null_entry's dst; this function never returns NULL.
 */
485struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
486{
487 struct fib6_node *fn;
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800488 struct rt6_info *rt, *nrt;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700489 int strict;
490 int attempts = 3;
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800491 int err;
YOSHIFUJI Hideaki8238dd02006-03-20 17:04:35 -0800492 int reachable = RT6_SELECT_F_REACHABLE;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700493
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800494 strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700495
496relookup:
497 read_lock_bh(&rt6_lock);
498
YOSHIFUJI Hideaki8238dd02006-03-20 17:04:35 -0800499restart_2:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700500 fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);
501
502restart:
YOSHIFUJI Hideaki8238dd02006-03-20 17:04:35 -0800503 rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
/* May jump to "restart" (parent node with routes) or "out" (hit a root). */
YOSHIFUJI Hideaki1ddef0442006-03-20 17:01:24 -0800504 BACKTRACK();
YOSHIFUJI Hideaki8238dd02006-03-20 17:04:35 -0800505 if (rt == &ip6_null_entry ||
506 rt->rt6i_flags & RTF_CACHE)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700507 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700508
/* Pin the route before dropping the lock; rt6_alloc_cow() runs unlocked. */
YOSHIFUJI Hideakifb9de912006-03-20 16:59:08 -0800509 dst_hold(&rt->u.dst);
510 read_unlock_bh(&rt6_lock);
511
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800512 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
YOSHIFUJI Hideakie40cf352006-03-20 16:59:27 -0800513 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800514 else {
515#if CLONE_OFFLINK_ROUTE
516 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
517#else
/* Cloning disabled: use the matched route as is (reference already held). */
518 goto out2;
519#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700520 }
YOSHIFUJI Hideakie40cf352006-03-20 16:59:27 -0800521
/* Swap the held reference from the original route to the copy (or the null entry). */
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800522 dst_release(&rt->u.dst);
523 rt = nrt ? : &ip6_null_entry;
524
525 dst_hold(&rt->u.dst);
526 if (nrt) {
527 err = ip6_ins_rt(nrt, NULL, NULL, NULL);
528 if (!err)
529 goto out2;
530 }
531
532 if (--attempts <= 0)
533 goto out2;
534
535 /*
536 * Race condition! In the gap, when rt6_lock was
537 * released someone could insert this route. Relookup.
538 */
539 dst_release(&rt->u.dst);
540 goto relookup;
541
542out:
/* Reached with rt6_lock still read-held. */
YOSHIFUJI Hideaki8238dd02006-03-20 17:04:35 -0800543 if (reachable) {
544 reachable = 0;
545 goto restart_2;
546 }
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800547 dst_hold(&rt->u.dst);
548 read_unlock_bh(&rt6_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700549out2:
/* rt is referenced and the lock is released on every path to here. */
550 rt->u.dst.lastuse = jiffies;
551 rt->u.dst.__use++;
552 return &rt->u.dst;
553}
554
555
556/*
557 * Destination cache support functions
558 */
559
560static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
561{
562 struct rt6_info *rt;
563
564 rt = (struct rt6_info *) dst;
565
566 if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
567 return dst;
568
569 return NULL;
570}
571
572static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
573{
574 struct rt6_info *rt = (struct rt6_info *) dst;
575
576 if (rt) {
577 if (rt->rt6i_flags & RTF_CACHE)
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700578 ip6_del_rt(rt, NULL, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700579 else
580 dst_release(dst);
581 }
582 return NULL;
583}
584
585static void ip6_link_failure(struct sk_buff *skb)
586{
587 struct rt6_info *rt;
588
589 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
590
591 rt = (struct rt6_info *) skb->dst;
592 if (rt) {
593 if (rt->rt6i_flags&RTF_CACHE) {
594 dst_set_expires(&rt->u.dst, 0);
595 rt->rt6i_flags |= RTF_EXPIRES;
596 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
597 rt->rt6i_node->fn_sernum = -1;
598 }
599}
600
601static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
602{
603 struct rt6_info *rt6 = (struct rt6_info*)dst;
604
605 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
606 rt6->rt6i_flags |= RTF_MODIFIED;
607 if (mtu < IPV6_MIN_MTU) {
608 mtu = IPV6_MIN_MTU;
609 dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
610 }
611 dst->metrics[RTAX_MTU-1] = mtu;
612 }
613}
614
/*
 * Singly linked list (through dst->next) of the entries created by
 * ndisc_dst_alloc(); ndisc_dst_gc() frees the unreferenced ones.
 */
615/* Protected by rt6_lock. */
616static struct dst_entry *ndisc_dst_gc_list;
/* Defined further down; needed by ndisc_dst_alloc(). */
617static int ipv6_get_mtu(struct net_device *dev);
618
619static inline unsigned int ipv6_advmss(unsigned int mtu)
620{
621 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
622
623 if (mtu < ip6_rt_min_advmss)
624 mtu = ip6_rt_min_advmss;
625
626 /*
627 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
628 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
629 * IPV6_MAXPLEN is also valid and means: "any MSS,
630 * rely only on pmtu discovery"
631 */
632 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
633 mtu = IPV6_MAXPLEN;
634 return mtu;
635}
636
637struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
638 struct neighbour *neigh,
639 struct in6_addr *addr,
640 int (*output)(struct sk_buff *))
641{
642 struct rt6_info *rt;
643 struct inet6_dev *idev = in6_dev_get(dev);
644
645 if (unlikely(idev == NULL))
646 return NULL;
647
648 rt = ip6_dst_alloc();
649 if (unlikely(rt == NULL)) {
650 in6_dev_put(idev);
651 goto out;
652 }
653
654 dev_hold(dev);
655 if (neigh)
656 neigh_hold(neigh);
657 else
658 neigh = ndisc_get_neigh(dev, addr);
659
660 rt->rt6i_dev = dev;
661 rt->rt6i_idev = idev;
662 rt->rt6i_nexthop = neigh;
663 atomic_set(&rt->u.dst.__refcnt, 1);
664 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
665 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
666 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
667 rt->u.dst.output = output;
668
669#if 0 /* there's no chance to use these for ndisc */
670 rt->u.dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
671 ? DST_HOST
672 : 0;
673 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
674 rt->rt6i_dst.plen = 128;
675#endif
676
677 write_lock_bh(&rt6_lock);
678 rt->u.dst.next = ndisc_dst_gc_list;
679 ndisc_dst_gc_list = &rt->u.dst;
680 write_unlock_bh(&rt6_lock);
681
682 fib6_force_start_gc();
683
684out:
685 return (struct dst_entry *)rt;
686}
687
688int ndisc_dst_gc(int *more)
689{
690 struct dst_entry *dst, *next, **pprev;
691 int freed;
692
693 next = NULL;
694 pprev = &ndisc_dst_gc_list;
695 freed = 0;
696 while ((dst = *pprev) != NULL) {
697 if (!atomic_read(&dst->__refcnt)) {
698 *pprev = dst->next;
699 dst_free(dst);
700 freed++;
701 } else {
702 pprev = &dst->next;
703 (*more)++;
704 }
705 }
706
707 return freed;
708}
709
710static int ip6_dst_gc(void)
711{
712 static unsigned expire = 30*HZ;
713 static unsigned long last_gc;
714 unsigned long now = jiffies;
715
716 if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
717 atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
718 goto out;
719
720 expire++;
721 fib6_run_gc(expire);
722 last_gc = now;
723 if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
724 expire = ip6_rt_gc_timeout>>1;
725
726out:
727 expire -= expire>>ip6_rt_gc_elasticity;
728 return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
729}
730
731/* Clean host part of a prefix. Not necessary in radix tree,
732 but results in cleaner routing tables.
733
734 Remove it only when all the things will work!
735 */
736
737static int ipv6_get_mtu(struct net_device *dev)
738{
739 int mtu = IPV6_MIN_MTU;
740 struct inet6_dev *idev;
741
742 idev = in6_dev_get(dev);
743 if (idev) {
744 mtu = idev->cnf.mtu6;
745 in6_dev_put(idev);
746 }
747 return mtu;
748}
749
750int ipv6_get_hoplimit(struct net_device *dev)
751{
752 int hoplimit = ipv6_devconf.hop_limit;
753 struct inet6_dev *idev;
754
755 idev = in6_dev_get(dev);
756 if (idev) {
757 hoplimit = idev->cnf.hop_limit;
758 in6_dev_put(idev);
759 }
760 return hoplimit;
761}
762
763/*
764 *
765 */
766
/*
 * Build a route from the request in @rtmsg and insert it into the FIB.
 *
 * @rtmsg:   route description (prefix, gateway, ifindex, metric, flags).
 *           rtmsg_metric is overwritten with IP6_RT_PRIO_USER when 0.
 * @nlh:     originating netlink header, or NULL; when present its
 *           rtm_protocol is recorded on the route, otherwise RTPROT_BOOT.
 * @_rtattr: optional struct rtattr * array; RTA_METRICS, when present,
 *           supplies per-route metrics.
 * @req:     passed through to ip6_ins_rt().
 *
 * On success ownership of the device and inet6_dev references moves to
 * the route and the result of ip6_ins_rt() is returned.  On failure all
 * references taken here are dropped, the route is freed and a negative
 * errno is returned.
 */
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700767int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
768 void *_rtattr, struct netlink_skb_parms *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700769{
770 int err;
771 struct rtmsg *r;
772 struct rtattr **rta;
773 struct rt6_info *rt = NULL;
774 struct net_device *dev = NULL;
775 struct inet6_dev *idev = NULL;
776 int addr_type;
777
778 rta = (struct rtattr **) _rtattr;
779
780 if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
781 return -EINVAL;
782#ifndef CONFIG_IPV6_SUBTREES
783 if (rtmsg->rtmsg_src_len)
784 return -EINVAL;
785#endif
/* Resolve the requested interface; both dev and idev are held from here on. */
786 if (rtmsg->rtmsg_ifindex) {
787 err = -ENODEV;
788 dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
789 if (!dev)
790 goto out;
791 idev = in6_dev_get(dev);
792 if (!idev)
793 goto out;
794 }
795
796 if (rtmsg->rtmsg_metric == 0)
797 rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
798
799 rt = ip6_dst_alloc();
800
801 if (rt == NULL) {
802 err = -ENOMEM;
803 goto out;
804 }
805
806 rt->u.dst.obsolete = -1;
YOSHIFUJI Hideaki3dd4bc62005-12-19 14:02:45 -0800807 rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700808 if (nlh && (r = NLMSG_DATA(nlh))) {
809 rt->rt6i_protocol = r->rtm_protocol;
810 } else {
811 rt->rt6i_protocol = RTPROT_BOOT;
812 }
813
814 addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
815
816 if (addr_type & IPV6_ADDR_MULTICAST)
817 rt->u.dst.input = ip6_mc_input;
818 else
819 rt->u.dst.input = ip6_forward;
820
821 rt->u.dst.output = ip6_output;
822
823 ipv6_addr_prefix(&rt->rt6i_dst.addr,
824 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
825 rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
826 if (rt->rt6i_dst.plen == 128)
827 rt->u.dst.flags = DST_HOST;
828
829#ifdef CONFIG_IPV6_SUBTREES
830 ipv6_addr_prefix(&rt->rt6i_src.addr,
831 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
832 rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
833#endif
834
835 rt->rt6i_metric = rtmsg->rtmsg_metric;
836
837 /* We cannot add true routes via loopback here,
838 they would result in kernel looping; promote them to reject routes
839 */
840 if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
841 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
842 /* hold loopback dev/idev if we haven't done so. */
843 if (dev != &loopback_dev) {
844 if (dev) {
845 dev_put(dev);
846 in6_dev_put(idev);
847 }
848 dev = &loopback_dev;
849 dev_hold(dev);
850 idev = in6_dev_get(dev);
851 if (!idev) {
852 err = -ENODEV;
853 goto out;
854 }
855 }
856 rt->u.dst.output = ip6_pkt_discard_out;
857 rt->u.dst.input = ip6_pkt_discard;
858 rt->u.dst.error = -ENETUNREACH;
859 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
860 goto install_route;
861 }
862
863 if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
864 struct in6_addr *gw_addr;
865 int gwa_type;
866
867 gw_addr = &rtmsg->rtmsg_gateway;
868 ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
869 gwa_type = ipv6_addr_type(gw_addr);
870
871 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
872 struct rt6_info *grt;
873
874 /* IPv6 strictly inhibits using not link-local
875 addresses as nexthop address.
876 Otherwise, router will not able to send redirects.
877 It is very good, but in some (rare!) circumstances
878 (SIT, PtP, NBMA NOARP links) it is handy to allow
879 some exceptions. --ANK
880 */
881 err = -EINVAL;
882 if (!(gwa_type&IPV6_ADDR_UNICAST))
883 goto out;
884
/* The gateway itself must be reachable through an existing route. */
885 grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
886
887 err = -EHOSTUNREACH;
888 if (grt == NULL)
889 goto out;
890 if (dev) {
891 if (dev != grt->rt6i_dev) {
892 dst_release(&grt->u.dst);
893 goto out;
894 }
895 } else {
/* No interface given: inherit (and hold) the gateway route's device. */
896 dev = grt->rt6i_dev;
897 idev = grt->rt6i_idev;
898 dev_hold(dev);
899 in6_dev_hold(grt->rt6i_idev);
900 }
/* err stays -EHOSTUNREACH when the gateway is itself reached via a gateway. */
901 if (!(grt->rt6i_flags&RTF_GATEWAY))
902 err = 0;
903 dst_release(&grt->u.dst);
904
905 if (err)
906 goto out;
907 }
908 err = -EINVAL;
909 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
910 goto out;
911 }
912
913 err = -ENODEV;
914 if (dev == NULL)
915 goto out;
916
917 if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
918 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
919 if (IS_ERR(rt->rt6i_nexthop)) {
920 err = PTR_ERR(rt->rt6i_nexthop);
921 rt->rt6i_nexthop = NULL;
922 goto out;
923 }
924 }
925
926 rt->rt6i_flags = rtmsg->rtmsg_flags;
927
928install_route:
929 if (rta && rta[RTA_METRICS-1]) {
930 int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
931 struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
932
933 while (RTA_OK(attr, attrlen)) {
934 unsigned flavor = attr->rta_type;
935 if (flavor) {
936 if (flavor > RTAX_MAX) {
937 err = -EINVAL;
938 goto out;
939 }
/* NOTE(review): the attribute payload length is not checked to be >= sizeof(u32) before this read — confirm RTA_OK() is sufficient. */
940 rt->u.dst.metrics[flavor-1] =
941 *(u32 *)RTA_DATA(attr);
942 }
943 attr = RTA_NEXT(attr, attrlen);
944 }
945 }
946
/* Fill in defaults for metrics the request left at zero. */
947 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
948 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
949 if (!rt->u.dst.metrics[RTAX_MTU-1])
950 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
951 if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
952 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
/* The dev/idev references are handed to the route only here, so the error path below can drop them itself. */
953 rt->u.dst.dev = dev;
954 rt->rt6i_idev = idev;
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700955 return ip6_ins_rt(rt, nlh, _rtattr, req);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700956
957out:
958 if (dev)
959 dev_put(dev);
960 if (idev)
961 in6_dev_put(idev);
962 if (rt)
963 dst_free((struct dst_entry *) rt);
964 return err;
965}
966
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700967int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700968{
969 int err;
970
971 write_lock_bh(&rt6_lock);
972
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700973 err = fib6_del(rt, nlh, _rtattr, req);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700974 dst_release(&rt->u.dst);
975
976 write_unlock_bh(&rt6_lock);
977
978 return err;
979}
980
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700981static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700982{
983 struct fib6_node *fn;
984 struct rt6_info *rt;
985 int err = -ESRCH;
986
987 read_lock_bh(&rt6_lock);
988
989 fn = fib6_locate(&ip6_routing_table,
990 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
991 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
992
993 if (fn) {
994 for (rt = fn->leaf; rt; rt = rt->u.next) {
995 if (rtmsg->rtmsg_ifindex &&
996 (rt->rt6i_dev == NULL ||
997 rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
998 continue;
999 if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1000 !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1001 continue;
1002 if (rtmsg->rtmsg_metric &&
1003 rtmsg->rtmsg_metric != rt->rt6i_metric)
1004 continue;
1005 dst_hold(&rt->u.dst);
1006 read_unlock_bh(&rt6_lock);
1007
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001008 return ip6_del_rt(rt, nlh, _rtattr, req);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001009 }
1010 }
1011 read_unlock_bh(&rt6_lock);
1012
1013 return err;
1014}
1015
1016/*
1017 * Handle redirects
1018 */
1019void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1020 struct neighbour *neigh, u8 *lladdr, int on_link)
1021{
1022 struct rt6_info *rt, *nrt;
1023
1024 /* Locate old route to this destination. */
1025 rt = rt6_lookup(dest, NULL, neigh->dev->ifindex, 1);
1026
1027 if (rt == NULL)
1028 return;
1029
1030 if (neigh->dev != rt->rt6i_dev)
1031 goto out;
1032
1033 /*
1034 * Current route is on-link; redirect is always invalid.
1035 *
1036 * Seems, previous statement is not true. It could
1037 * be node, which looks for us as on-link (f.e. proxy ndisc)
1038 * But then router serving it might decide, that we should
1039 * know truth 8)8) --ANK (980726).
1040 */
1041 if (!(rt->rt6i_flags&RTF_GATEWAY))
1042 goto out;
1043
1044 /*
1045 * RFC 2461 specifies that redirects should only be
1046 * accepted if they come from the nexthop to the target.
1047 * Due to the way default routers are chosen, this notion
1048 * is a bit fuzzy and one might need to check all default
1049 * routers.
1050 */
1051 if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway)) {
1052 if (rt->rt6i_flags & RTF_DEFAULT) {
1053 struct rt6_info *rt1;
1054
1055 read_lock(&rt6_lock);
1056 for (rt1 = ip6_routing_table.leaf; rt1; rt1 = rt1->u.next) {
1057 if (ipv6_addr_equal(saddr, &rt1->rt6i_gateway)) {
1058 dst_hold(&rt1->u.dst);
1059 dst_release(&rt->u.dst);
1060 read_unlock(&rt6_lock);
1061 rt = rt1;
1062 goto source_ok;
1063 }
1064 }
1065 read_unlock(&rt6_lock);
1066 }
1067 if (net_ratelimit())
1068 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1069 "for redirect target\n");
1070 goto out;
1071 }
1072
1073source_ok:
1074
1075 /*
1076 * We have finally decided to accept it.
1077 */
1078
1079 neigh_update(neigh, lladdr, NUD_STALE,
1080 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1081 NEIGH_UPDATE_F_OVERRIDE|
1082 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1083 NEIGH_UPDATE_F_ISROUTER))
1084 );
1085
1086 /*
1087 * Redirect received -> path was valid.
1088 * Look, redirects are sent only in response to data packets,
1089 * so that this nexthop apparently is reachable. --ANK
1090 */
1091 dst_confirm(&rt->u.dst);
1092
1093 /* Duplicate redirect: silently ignore. */
1094 if (neigh == rt->u.dst.neighbour)
1095 goto out;
1096
1097 nrt = ip6_rt_copy(rt);
1098 if (nrt == NULL)
1099 goto out;
1100
1101 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1102 if (on_link)
1103 nrt->rt6i_flags &= ~RTF_GATEWAY;
1104
1105 ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1106 nrt->rt6i_dst.plen = 128;
1107 nrt->u.dst.flags |= DST_HOST;
1108
1109 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1110 nrt->rt6i_nexthop = neigh_clone(neigh);
1111 /* Reset pmtu, it may be better */
1112 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1113 nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1114
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001115 if (ip6_ins_rt(nrt, NULL, NULL, NULL))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001116 goto out;
1117
1118 if (rt->rt6i_flags&RTF_CACHE) {
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001119 ip6_del_rt(rt, NULL, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001120 return;
1121 }
1122
1123out:
1124 dst_release(&rt->u.dst);
1125 return;
1126}
1127
1128/*
1129 * Handle ICMP "packet too big" messages
1130 * i.e. Path MTU discovery
1131 */
1132
1133void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1134 struct net_device *dev, u32 pmtu)
1135{
1136 struct rt6_info *rt, *nrt;
1137 int allfrag = 0;
1138
1139 rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1140 if (rt == NULL)
1141 return;
1142
1143 if (pmtu >= dst_mtu(&rt->u.dst))
1144 goto out;
1145
1146 if (pmtu < IPV6_MIN_MTU) {
1147 /*
1148 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1149 * MTU (1280) and a fragment header should always be included
1150 * after a node receiving Too Big message reporting PMTU is
1151 * less than the IPv6 Minimum Link MTU.
1152 */
1153 pmtu = IPV6_MIN_MTU;
1154 allfrag = 1;
1155 }
1156
1157 /* New mtu received -> path was valid.
1158 They are sent only in response to data packets,
1159 so that this nexthop apparently is reachable. --ANK
1160 */
1161 dst_confirm(&rt->u.dst);
1162
1163 /* Host route. If it is static, it would be better
1164 not to override it, but add new one, so that
1165 when cache entry will expire old pmtu
1166 would return automatically.
1167 */
1168 if (rt->rt6i_flags & RTF_CACHE) {
1169 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1170 if (allfrag)
1171 rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1172 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1173 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1174 goto out;
1175 }
1176
1177 /* Network route.
1178 Two cases are possible:
1179 1. It is connected route. Action: COW
1180 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1181 */
YOSHIFUJI Hideakid5315b502006-03-20 16:58:48 -08001182 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
YOSHIFUJI Hideakia1e78362006-03-20 16:56:32 -08001183 nrt = rt6_alloc_cow(rt, daddr, saddr);
YOSHIFUJI Hideakid5315b502006-03-20 16:58:48 -08001184 else
1185 nrt = rt6_alloc_clone(rt, daddr);
YOSHIFUJI Hideakia1e78362006-03-20 16:56:32 -08001186
YOSHIFUJI Hideakid5315b502006-03-20 16:58:48 -08001187 if (nrt) {
YOSHIFUJI Hideakia1e78362006-03-20 16:56:32 -08001188 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1189 if (allfrag)
1190 nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1191
1192 /* According to RFC 1981, detecting PMTU increase shouldn't be
1193 * happened within 5 mins, the recommended timer is 10 mins.
1194 * Here this route expiration time is set to ip6_rt_mtu_expires
1195 * which is 10 mins. After 10 mins the decreased pmtu is expired
1196 * and detecting PMTU increase will be automatically happened.
1197 */
1198 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1199 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1200
1201 ip6_ins_rt(nrt, NULL, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001202 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001203out:
1204 dst_release(&rt->u.dst);
1205}
1206
1207/*
1208 * Misc support functions
1209 */
1210
1211static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1212{
1213 struct rt6_info *rt = ip6_dst_alloc();
1214
1215 if (rt) {
1216 rt->u.dst.input = ort->u.dst.input;
1217 rt->u.dst.output = ort->u.dst.output;
1218
1219 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1220 rt->u.dst.dev = ort->u.dst.dev;
1221 if (rt->u.dst.dev)
1222 dev_hold(rt->u.dst.dev);
1223 rt->rt6i_idev = ort->rt6i_idev;
1224 if (rt->rt6i_idev)
1225 in6_dev_hold(rt->rt6i_idev);
1226 rt->u.dst.lastuse = jiffies;
1227 rt->rt6i_expires = 0;
1228
1229 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1230 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1231 rt->rt6i_metric = 0;
1232
1233 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1234#ifdef CONFIG_IPV6_SUBTREES
1235 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1236#endif
1237 }
1238 return rt;
1239}
1240
1241struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1242{
1243 struct rt6_info *rt;
1244 struct fib6_node *fn;
1245
1246 fn = &ip6_routing_table;
1247
1248 write_lock_bh(&rt6_lock);
1249 for (rt = fn->leaf; rt; rt=rt->u.next) {
1250 if (dev == rt->rt6i_dev &&
YOSHIFUJI Hideaki045927f2006-03-20 17:00:48 -08001251 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001252 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1253 break;
1254 }
1255 if (rt)
1256 dst_hold(&rt->u.dst);
1257 write_unlock_bh(&rt6_lock);
1258 return rt;
1259}
1260
1261struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
YOSHIFUJI Hideakiebacaaa2006-03-20 17:04:53 -08001262 struct net_device *dev,
1263 unsigned int pref)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001264{
1265 struct in6_rtmsg rtmsg;
1266
1267 memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1268 rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1269 ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1270 rtmsg.rtmsg_metric = 1024;
YOSHIFUJI Hideakiebacaaa2006-03-20 17:04:53 -08001271 rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1272 RTF_PREF(pref);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001273
1274 rtmsg.rtmsg_ifindex = dev->ifindex;
1275
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001276 ip6_route_add(&rtmsg, NULL, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001277 return rt6_get_dflt_router(gwaddr, dev);
1278}
1279
1280void rt6_purge_dflt_routers(void)
1281{
1282 struct rt6_info *rt;
1283
1284restart:
1285 read_lock_bh(&rt6_lock);
1286 for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1287 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1288 dst_hold(&rt->u.dst);
1289
Linus Torvalds1da177e2005-04-16 15:20:36 -07001290 read_unlock_bh(&rt6_lock);
1291
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001292 ip6_del_rt(rt, NULL, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001293
1294 goto restart;
1295 }
1296 }
1297 read_unlock_bh(&rt6_lock);
1298}
1299
1300int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1301{
1302 struct in6_rtmsg rtmsg;
1303 int err;
1304
1305 switch(cmd) {
1306 case SIOCADDRT: /* Add a route */
1307 case SIOCDELRT: /* Delete a route */
1308 if (!capable(CAP_NET_ADMIN))
1309 return -EPERM;
1310 err = copy_from_user(&rtmsg, arg,
1311 sizeof(struct in6_rtmsg));
1312 if (err)
1313 return -EFAULT;
1314
1315 rtnl_lock();
1316 switch (cmd) {
1317 case SIOCADDRT:
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001318 err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001319 break;
1320 case SIOCDELRT:
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001321 err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001322 break;
1323 default:
1324 err = -EINVAL;
1325 }
1326 rtnl_unlock();
1327
1328 return err;
1329 };
1330
1331 return -EINVAL;
1332}
1333
1334/*
1335 * Drop the packet on the floor
1336 */
1337
Arnaldo Carvalho de Melo20380732005-08-16 02:18:02 -03001338static int ip6_pkt_discard(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001339{
1340 IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1341 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1342 kfree_skb(skb);
1343 return 0;
1344}
1345
Arnaldo Carvalho de Melo20380732005-08-16 02:18:02 -03001346static int ip6_pkt_discard_out(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001347{
1348 skb->dev = skb->dst->dev;
1349 return ip6_pkt_discard(skb);
1350}
1351
1352/*
1353 * Allocate a dst for local (unicast / anycast) address.
1354 */
1355
1356struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1357 const struct in6_addr *addr,
1358 int anycast)
1359{
1360 struct rt6_info *rt = ip6_dst_alloc();
1361
1362 if (rt == NULL)
1363 return ERR_PTR(-ENOMEM);
1364
1365 dev_hold(&loopback_dev);
1366 in6_dev_hold(idev);
1367
1368 rt->u.dst.flags = DST_HOST;
1369 rt->u.dst.input = ip6_input;
1370 rt->u.dst.output = ip6_output;
1371 rt->rt6i_dev = &loopback_dev;
1372 rt->rt6i_idev = idev;
1373 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1374 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1375 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1376 rt->u.dst.obsolete = -1;
1377
1378 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
YOSHIFUJI Hideaki58c4fb82005-12-21 22:56:42 +09001379 if (anycast)
1380 rt->rt6i_flags |= RTF_ANYCAST;
1381 else
Linus Torvalds1da177e2005-04-16 15:20:36 -07001382 rt->rt6i_flags |= RTF_LOCAL;
1383 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1384 if (rt->rt6i_nexthop == NULL) {
1385 dst_free((struct dst_entry *) rt);
1386 return ERR_PTR(-ENOMEM);
1387 }
1388
1389 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1390 rt->rt6i_dst.plen = 128;
1391
1392 atomic_set(&rt->u.dst.__refcnt, 1);
1393
1394 return rt;
1395}
1396
1397static int fib6_ifdown(struct rt6_info *rt, void *arg)
1398{
1399 if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1400 rt != &ip6_null_entry) {
1401 RT6_TRACE("deleted by ifdown %p\n", rt);
1402 return -1;
1403 }
1404 return 0;
1405}
1406
1407void rt6_ifdown(struct net_device *dev)
1408{
1409 write_lock_bh(&rt6_lock);
1410 fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1411 write_unlock_bh(&rt6_lock);
1412}
1413
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg
{
	struct net_device *dev;	/* device whose MTU changed */
	unsigned mtu;		/* the new MTU */
};
1419
1420static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1421{
1422 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1423 struct inet6_dev *idev;
1424
1425 /* In IPv6 pmtu discovery is not optional,
1426 so that RTAX_MTU lock cannot disable it.
1427 We still use this lock to block changes
1428 caused by addrconf/ndisc.
1429 */
1430
1431 idev = __in6_dev_get(arg->dev);
1432 if (idev == NULL)
1433 return 0;
1434
1435 /* For administrative MTU increase, there is no way to discover
1436 IPv6 PMTU increase, so PMTU increase should be updated here.
1437 Since RFC 1981 doesn't include administrative MTU increase
1438 update PMTU increase is a MUST. (i.e. jumbo frame)
1439 */
1440 /*
1441 If new MTU is less than route PMTU, this new MTU will be the
1442 lowest MTU in the path, update the route PMTU to reflect PMTU
1443 decreases; if new MTU is greater than route PMTU, and the
1444 old MTU is the lowest MTU in the path, update the route PMTU
1445 to reflect the increase. In this case if the other nodes' MTU
1446 also have the lowest MTU, TOO BIG MESSAGE will be lead to
1447 PMTU discouvery.
1448 */
1449 if (rt->rt6i_dev == arg->dev &&
1450 !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1451 (dst_mtu(&rt->u.dst) > arg->mtu ||
1452 (dst_mtu(&rt->u.dst) < arg->mtu &&
1453 dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1454 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1455 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1456 return 0;
1457}
1458
1459void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1460{
1461 struct rt6_mtu_change_arg arg;
1462
1463 arg.dev = dev;
1464 arg.mtu = mtu;
1465 read_lock_bh(&rt6_lock);
1466 fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1467 read_unlock_bh(&rt6_lock);
1468}
1469
1470static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1471 struct in6_rtmsg *rtmsg)
1472{
1473 memset(rtmsg, 0, sizeof(*rtmsg));
1474
1475 rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1476 rtmsg->rtmsg_src_len = r->rtm_src_len;
1477 rtmsg->rtmsg_flags = RTF_UP;
1478 if (r->rtm_type == RTN_UNREACHABLE)
1479 rtmsg->rtmsg_flags |= RTF_REJECT;
1480
1481 if (rta[RTA_GATEWAY-1]) {
1482 if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1483 return -EINVAL;
1484 memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1485 rtmsg->rtmsg_flags |= RTF_GATEWAY;
1486 }
1487 if (rta[RTA_DST-1]) {
1488 if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1489 return -EINVAL;
1490 memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1491 }
1492 if (rta[RTA_SRC-1]) {
1493 if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1494 return -EINVAL;
1495 memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1496 }
1497 if (rta[RTA_OIF-1]) {
1498 if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1499 return -EINVAL;
1500 memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1501 }
1502 if (rta[RTA_PRIORITY-1]) {
1503 if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1504 return -EINVAL;
1505 memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1506 }
1507 return 0;
1508}
1509
1510int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1511{
1512 struct rtmsg *r = NLMSG_DATA(nlh);
1513 struct in6_rtmsg rtmsg;
1514
1515 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1516 return -EINVAL;
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001517 return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001518}
1519
1520int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1521{
1522 struct rtmsg *r = NLMSG_DATA(nlh);
1523 struct in6_rtmsg rtmsg;
1524
1525 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1526 return -EINVAL;
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001527 return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001528}
1529
/* Per-call context handed to rt6_dump_route() through the tree walker. */
struct rt6_rtnl_dump_arg
{
	struct sk_buff *skb;		/* message buffer being filled */
	struct netlink_callback *cb;	/* dump state of the request */
};
1535
1536static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001537 struct in6_addr *dst, struct in6_addr *src,
1538 int iif, int type, u32 pid, u32 seq,
1539 int prefix, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001540{
1541 struct rtmsg *rtm;
1542 struct nlmsghdr *nlh;
1543 unsigned char *b = skb->tail;
1544 struct rta_cacheinfo ci;
1545
1546 if (prefix) { /* user wants prefix routes only */
1547 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1548 /* success since this is not a prefix route */
1549 return 1;
1550 }
1551 }
1552
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07001553 nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001554 rtm = NLMSG_DATA(nlh);
1555 rtm->rtm_family = AF_INET6;
1556 rtm->rtm_dst_len = rt->rt6i_dst.plen;
1557 rtm->rtm_src_len = rt->rt6i_src.plen;
1558 rtm->rtm_tos = 0;
1559 rtm->rtm_table = RT_TABLE_MAIN;
1560 if (rt->rt6i_flags&RTF_REJECT)
1561 rtm->rtm_type = RTN_UNREACHABLE;
1562 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1563 rtm->rtm_type = RTN_LOCAL;
1564 else
1565 rtm->rtm_type = RTN_UNICAST;
1566 rtm->rtm_flags = 0;
1567 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1568 rtm->rtm_protocol = rt->rt6i_protocol;
1569 if (rt->rt6i_flags&RTF_DYNAMIC)
1570 rtm->rtm_protocol = RTPROT_REDIRECT;
1571 else if (rt->rt6i_flags & RTF_ADDRCONF)
1572 rtm->rtm_protocol = RTPROT_KERNEL;
1573 else if (rt->rt6i_flags&RTF_DEFAULT)
1574 rtm->rtm_protocol = RTPROT_RA;
1575
1576 if (rt->rt6i_flags&RTF_CACHE)
1577 rtm->rtm_flags |= RTM_F_CLONED;
1578
1579 if (dst) {
1580 RTA_PUT(skb, RTA_DST, 16, dst);
1581 rtm->rtm_dst_len = 128;
1582 } else if (rtm->rtm_dst_len)
1583 RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1584#ifdef CONFIG_IPV6_SUBTREES
1585 if (src) {
1586 RTA_PUT(skb, RTA_SRC, 16, src);
1587 rtm->rtm_src_len = 128;
1588 } else if (rtm->rtm_src_len)
1589 RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1590#endif
1591 if (iif)
1592 RTA_PUT(skb, RTA_IIF, 4, &iif);
1593 else if (dst) {
1594 struct in6_addr saddr_buf;
1595 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1596 RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1597 }
1598 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1599 goto rtattr_failure;
1600 if (rt->u.dst.neighbour)
1601 RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1602 if (rt->u.dst.dev)
1603 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1604 RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1605 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1606 if (rt->rt6i_expires)
1607 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1608 else
1609 ci.rta_expires = 0;
1610 ci.rta_used = rt->u.dst.__use;
1611 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1612 ci.rta_error = rt->u.dst.error;
1613 ci.rta_id = 0;
1614 ci.rta_ts = 0;
1615 ci.rta_tsage = 0;
1616 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1617 nlh->nlmsg_len = skb->tail - b;
1618 return skb->len;
1619
1620nlmsg_failure:
1621rtattr_failure:
1622 skb_trim(skb, b - skb->data);
1623 return -1;
1624}
1625
1626static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1627{
1628 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1629 int prefix;
1630
1631 if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1632 struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1633 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1634 } else
1635 prefix = 0;
1636
1637 return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1638 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001639 prefix, NLM_F_MULTI);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001640}
1641
1642static int fib6_dump_node(struct fib6_walker_t *w)
1643{
1644 int res;
1645 struct rt6_info *rt;
1646
1647 for (rt = w->leaf; rt; rt = rt->u.next) {
1648 res = rt6_dump_route(rt, w->args);
1649 if (res < 0) {
1650 /* Frame is full, suspend walking */
1651 w->leaf = rt;
1652 return 1;
1653 }
1654 BUG_TRAP(res!=0);
1655 }
1656 w->leaf = NULL;
1657 return 0;
1658}
1659
1660static void fib6_dump_end(struct netlink_callback *cb)
1661{
1662 struct fib6_walker_t *w = (void*)cb->args[0];
1663
1664 if (w) {
1665 cb->args[0] = 0;
1666 fib6_walker_unlink(w);
1667 kfree(w);
1668 }
Herbert Xuefacfbc2005-11-12 12:12:05 -08001669 cb->done = (void*)cb->args[1];
1670 cb->args[1] = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001671}
1672
1673static int fib6_dump_done(struct netlink_callback *cb)
1674{
1675 fib6_dump_end(cb);
Thomas Grafa8f74b22005-11-10 02:25:52 +01001676 return cb->done ? cb->done(cb) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001677}
1678
1679int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1680{
1681 struct rt6_rtnl_dump_arg arg;
1682 struct fib6_walker_t *w;
1683 int res;
1684
1685 arg.skb = skb;
1686 arg.cb = cb;
1687
1688 w = (void*)cb->args[0];
1689 if (w == NULL) {
1690 /* New dump:
1691 *
1692 * 1. hook callback destructor.
1693 */
1694 cb->args[1] = (long)cb->done;
1695 cb->done = fib6_dump_done;
1696
1697 /*
1698 * 2. allocate and initialize walker.
1699 */
David S. Miller9e147a12005-11-17 16:52:51 -08001700 w = kmalloc(sizeof(*w), GFP_ATOMIC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001701 if (w == NULL)
1702 return -ENOMEM;
1703 RT6_TRACE("dump<%p", w);
1704 memset(w, 0, sizeof(*w));
1705 w->root = &ip6_routing_table;
1706 w->func = fib6_dump_node;
1707 w->args = &arg;
1708 cb->args[0] = (long)w;
1709 read_lock_bh(&rt6_lock);
1710 res = fib6_walk(w);
1711 read_unlock_bh(&rt6_lock);
1712 } else {
1713 w->args = &arg;
1714 read_lock_bh(&rt6_lock);
1715 res = fib6_walk_continue(w);
1716 read_unlock_bh(&rt6_lock);
1717 }
1718#if RT6_DEBUG >= 3
1719 if (res <= 0 && skb->len == 0)
1720 RT6_TRACE("%p>dump end\n", w);
1721#endif
1722 res = res < 0 ? res : skb->len;
1723 /* res < 0 is an error. (really, impossible)
1724 res == 0 means that dump is complete, but skb still can contain data.
1725 res > 0 dump is not complete, but frame is full.
1726 */
1727 /* Destroy walker, if dump of this table is complete. */
1728 if (res <= 0)
1729 fib6_dump_end(cb);
1730 return res;
1731}
1732
1733int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1734{
1735 struct rtattr **rta = arg;
1736 int iif = 0;
1737 int err = -ENOBUFS;
1738 struct sk_buff *skb;
1739 struct flowi fl;
1740 struct rt6_info *rt;
1741
1742 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1743 if (skb == NULL)
1744 goto out;
1745
1746 /* Reserve room for dummy headers, this skb can pass
1747 through good chunk of routing engine.
1748 */
1749 skb->mac.raw = skb->data;
1750 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1751
1752 memset(&fl, 0, sizeof(fl));
1753 if (rta[RTA_SRC-1])
1754 ipv6_addr_copy(&fl.fl6_src,
1755 (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
1756 if (rta[RTA_DST-1])
1757 ipv6_addr_copy(&fl.fl6_dst,
1758 (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
1759
1760 if (rta[RTA_IIF-1])
1761 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1762
1763 if (iif) {
1764 struct net_device *dev;
1765 dev = __dev_get_by_index(iif);
1766 if (!dev) {
1767 err = -ENODEV;
1768 goto out_free;
1769 }
1770 }
1771
1772 fl.oif = 0;
1773 if (rta[RTA_OIF-1])
1774 memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1775
1776 rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1777
1778 skb->dst = &rt->u.dst;
1779
1780 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1781 err = rt6_fill_node(skb, rt,
1782 &fl.fl6_dst, &fl.fl6_src,
1783 iif,
1784 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001785 nlh->nlmsg_seq, 0, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001786 if (err < 0) {
1787 err = -EMSGSIZE;
1788 goto out_free;
1789 }
1790
1791 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1792 if (err > 0)
1793 err = 0;
1794out:
1795 return err;
1796out_free:
1797 kfree_skb(skb);
1798 goto out;
1799}
1800
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001801void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh,
1802 struct netlink_skb_parms *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001803{
1804 struct sk_buff *skb;
1805 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001806 u32 pid = current->pid;
1807 u32 seq = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001808
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001809 if (req)
1810 pid = req->pid;
1811 if (nlh)
1812 seq = nlh->nlmsg_seq;
1813
Linus Torvalds1da177e2005-04-16 15:20:36 -07001814 skb = alloc_skb(size, gfp_any());
1815 if (!skb) {
Patrick McHardyac6d4392005-08-14 19:29:52 -07001816 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001817 return;
1818 }
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001819 if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001820 kfree_skb(skb);
Patrick McHardyac6d4392005-08-14 19:29:52 -07001821 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001822 return;
1823 }
Patrick McHardyac6d4392005-08-14 19:29:52 -07001824 NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
1825 netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
Linus Torvalds1da177e2005-04-16 15:20:36 -07001826}
1827
1828/*
1829 * /proc
1830 */
1831
1832#ifdef CONFIG_PROC_FS
1833
/*
 * Length in characters of one line of the ipv6_route proc file:
 * 32+4 destination and prefix length, 32+4 source and prefix length,
 * 32 next hop, then 40+5+1 for the four numeric columns, the device
 * name and the newline.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* Cursor shared between rt6_proc_info() and rt6_info_route(). */
struct rt6_proc_arg
{
	char *buffer;	/* output buffer */
	int offset;	/* read offset requested by the caller */
	int length;	/* number of bytes requested */
	int skip;	/* routes skipped so far to reach offset */
	int len;	/* bytes written to buffer so far */
};
1844
1845static int rt6_info_route(struct rt6_info *rt, void *p_arg)
1846{
1847 struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
1848 int i;
1849
1850 if (arg->skip < arg->offset / RT6_INFO_LEN) {
1851 arg->skip++;
1852 return 0;
1853 }
1854
1855 if (arg->len >= arg->length)
1856 return 0;
1857
1858 for (i=0; i<16; i++) {
1859 sprintf(arg->buffer + arg->len, "%02x",
1860 rt->rt6i_dst.addr.s6_addr[i]);
1861 arg->len += 2;
1862 }
1863 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
1864 rt->rt6i_dst.plen);
1865
1866#ifdef CONFIG_IPV6_SUBTREES
1867 for (i=0; i<16; i++) {
1868 sprintf(arg->buffer + arg->len, "%02x",
1869 rt->rt6i_src.addr.s6_addr[i]);
1870 arg->len += 2;
1871 }
1872 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
1873 rt->rt6i_src.plen);
1874#else
1875 sprintf(arg->buffer + arg->len,
1876 "00000000000000000000000000000000 00 ");
1877 arg->len += 36;
1878#endif
1879
1880 if (rt->rt6i_nexthop) {
1881 for (i=0; i<16; i++) {
1882 sprintf(arg->buffer + arg->len, "%02x",
1883 rt->rt6i_nexthop->primary_key[i]);
1884 arg->len += 2;
1885 }
1886 } else {
1887 sprintf(arg->buffer + arg->len,
1888 "00000000000000000000000000000000");
1889 arg->len += 32;
1890 }
1891 arg->len += sprintf(arg->buffer + arg->len,
1892 " %08x %08x %08x %08x %8s\n",
1893 rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
1894 rt->u.dst.__use, rt->rt6i_flags,
1895 rt->rt6i_dev ? rt->rt6i_dev->name : "");
1896 return 0;
1897}
1898
1899static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
1900{
1901 struct rt6_proc_arg arg;
1902 arg.buffer = buffer;
1903 arg.offset = offset;
1904 arg.length = length;
1905 arg.skip = 0;
1906 arg.len = 0;
1907
1908 read_lock_bh(&rt6_lock);
1909 fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
1910 read_unlock_bh(&rt6_lock);
1911
1912 *start = buffer;
1913 if (offset)
1914 *start += offset % RT6_INFO_LEN;
1915
1916 arg.len -= offset % RT6_INFO_LEN;
1917
1918 if (arg.len > length)
1919 arg.len = length;
1920 if (arg.len < 0)
1921 arg.len = 0;
1922
1923 return arg.len;
1924}
1925
Linus Torvalds1da177e2005-04-16 15:20:36 -07001926static int rt6_stats_seq_show(struct seq_file *seq, void *v)
1927{
1928 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
1929 rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
1930 rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
1931 rt6_stats.fib_rt_cache,
1932 atomic_read(&ip6_dst_ops.entries),
1933 rt6_stats.fib_discarded_routes);
1934
1935 return 0;
1936}
1937
1938static int rt6_stats_seq_open(struct inode *inode, struct file *file)
1939{
1940 return single_open(file, rt6_stats_seq_show, NULL);
1941}
1942
1943static struct file_operations rt6_stats_seq_fops = {
1944 .owner = THIS_MODULE,
1945 .open = rt6_stats_seq_open,
1946 .read = seq_read,
1947 .llseek = seq_lseek,
1948 .release = single_release,
1949};
1950#endif /* CONFIG_PROC_FS */
1951
1952#ifdef CONFIG_SYSCTL
1953
1954static int flush_delay;
1955
1956static
1957int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
1958 void __user *buffer, size_t *lenp, loff_t *ppos)
1959{
1960 if (write) {
1961 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
1962 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
1963 return 0;
1964 } else
1965 return -EINVAL;
1966}
1967
1968ctl_table ipv6_route_table[] = {
1969 {
1970 .ctl_name = NET_IPV6_ROUTE_FLUSH,
1971 .procname = "flush",
1972 .data = &flush_delay,
1973 .maxlen = sizeof(int),
Dave Jones89c8b3a12005-04-28 12:11:49 -07001974 .mode = 0200,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001975 .proc_handler = &ipv6_sysctl_rtcache_flush
1976 },
1977 {
1978 .ctl_name = NET_IPV6_ROUTE_GC_THRESH,
1979 .procname = "gc_thresh",
1980 .data = &ip6_dst_ops.gc_thresh,
1981 .maxlen = sizeof(int),
1982 .mode = 0644,
1983 .proc_handler = &proc_dointvec,
1984 },
1985 {
1986 .ctl_name = NET_IPV6_ROUTE_MAX_SIZE,
1987 .procname = "max_size",
1988 .data = &ip6_rt_max_size,
1989 .maxlen = sizeof(int),
1990 .mode = 0644,
1991 .proc_handler = &proc_dointvec,
1992 },
1993 {
1994 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL,
1995 .procname = "gc_min_interval",
1996 .data = &ip6_rt_gc_min_interval,
1997 .maxlen = sizeof(int),
1998 .mode = 0644,
1999 .proc_handler = &proc_dointvec_jiffies,
2000 .strategy = &sysctl_jiffies,
2001 },
2002 {
2003 .ctl_name = NET_IPV6_ROUTE_GC_TIMEOUT,
2004 .procname = "gc_timeout",
2005 .data = &ip6_rt_gc_timeout,
2006 .maxlen = sizeof(int),
2007 .mode = 0644,
2008 .proc_handler = &proc_dointvec_jiffies,
2009 .strategy = &sysctl_jiffies,
2010 },
2011 {
2012 .ctl_name = NET_IPV6_ROUTE_GC_INTERVAL,
2013 .procname = "gc_interval",
2014 .data = &ip6_rt_gc_interval,
2015 .maxlen = sizeof(int),
2016 .mode = 0644,
2017 .proc_handler = &proc_dointvec_jiffies,
2018 .strategy = &sysctl_jiffies,
2019 },
2020 {
2021 .ctl_name = NET_IPV6_ROUTE_GC_ELASTICITY,
2022 .procname = "gc_elasticity",
2023 .data = &ip6_rt_gc_elasticity,
2024 .maxlen = sizeof(int),
2025 .mode = 0644,
2026 .proc_handler = &proc_dointvec_jiffies,
2027 .strategy = &sysctl_jiffies,
2028 },
2029 {
2030 .ctl_name = NET_IPV6_ROUTE_MTU_EXPIRES,
2031 .procname = "mtu_expires",
2032 .data = &ip6_rt_mtu_expires,
2033 .maxlen = sizeof(int),
2034 .mode = 0644,
2035 .proc_handler = &proc_dointvec_jiffies,
2036 .strategy = &sysctl_jiffies,
2037 },
2038 {
2039 .ctl_name = NET_IPV6_ROUTE_MIN_ADVMSS,
2040 .procname = "min_adv_mss",
2041 .data = &ip6_rt_min_advmss,
2042 .maxlen = sizeof(int),
2043 .mode = 0644,
2044 .proc_handler = &proc_dointvec_jiffies,
2045 .strategy = &sysctl_jiffies,
2046 },
2047 {
2048 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2049 .procname = "gc_min_interval_ms",
2050 .data = &ip6_rt_gc_min_interval,
2051 .maxlen = sizeof(int),
2052 .mode = 0644,
2053 .proc_handler = &proc_dointvec_ms_jiffies,
2054 .strategy = &sysctl_ms_jiffies,
2055 },
2056 { .ctl_name = 0 }
2057};
2058
2059#endif
2060
2061void __init ip6_route_init(void)
2062{
2063 struct proc_dir_entry *p;
2064
2065 ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2066 sizeof(struct rt6_info),
2067 0, SLAB_HWCACHE_ALIGN,
2068 NULL, NULL);
2069 if (!ip6_dst_ops.kmem_cachep)
2070 panic("cannot create ip6_dst_cache");
2071
2072 fib6_init();
2073#ifdef CONFIG_PROC_FS
2074 p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2075 if (p)
2076 p->owner = THIS_MODULE;
2077
2078 proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2079#endif
2080#ifdef CONFIG_XFRM
2081 xfrm6_init();
2082#endif
2083}
2084
2085void ip6_route_cleanup(void)
2086{
2087#ifdef CONFIG_PROC_FS
2088 proc_net_remove("ipv6_route");
2089 proc_net_remove("rt6_stats");
2090#endif
2091#ifdef CONFIG_XFRM
2092 xfrm6_fini();
2093#endif
2094 rt6_ifdown(NULL);
2095 fib6_gc_cleanup();
2096 kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2097}