blob: 6a068e7f81f18ba513dff2d7f5f3947982a8210f [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * Linux INET6 implementation
3 * FIB front-end.
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 */
15
16/* Changes:
17 *
18 * YOSHIFUJI Hideaki @USAGI
19 * reworked default router selection.
20 * - respect outgoing interface
21 * - select from (probably) reachable routers (i.e.
22 * routers in REACHABLE, STALE, DELAY or PROBE states).
23 * - always select the same router if it is (probably)
24 * reachable. otherwise, round-robin the list.
25 */
26
Randy Dunlap4fc268d2006-01-11 12:17:47 -080027#include <linux/capability.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070028#include <linux/config.h>
29#include <linux/errno.h>
30#include <linux/types.h>
31#include <linux/times.h>
32#include <linux/socket.h>
33#include <linux/sockios.h>
34#include <linux/net.h>
35#include <linux/route.h>
36#include <linux/netdevice.h>
37#include <linux/in6.h>
38#include <linux/init.h>
39#include <linux/netlink.h>
40#include <linux/if_arp.h>
41
42#ifdef CONFIG_PROC_FS
43#include <linux/proc_fs.h>
44#include <linux/seq_file.h>
45#endif
46
47#include <net/snmp.h>
48#include <net/ipv6.h>
49#include <net/ip6_fib.h>
50#include <net/ip6_route.h>
51#include <net/ndisc.h>
52#include <net/addrconf.h>
53#include <net/tcp.h>
54#include <linux/rtnetlink.h>
55#include <net/dst.h>
56#include <net/xfrm.h>
57
58#include <asm/uaccess.h>
59
60#ifdef CONFIG_SYSCTL
61#include <linux/sysctl.h>
62#endif
63
/*
 * Debug verbosity for this file.  With RT6_DEBUG at 3 or above, RDBG()
 * and RT6_TRACE() expand to printk() calls; below that both expand to
 * nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
74
/*
 * When non-zero, ip6_route_input()/ip6_route_output() clone routes that
 * already have a nexthop (via rt6_alloc_clone()); when zero such routes
 * are used as they are.
 */
#define CLONE_OFFLINK_ROUTE 0

/*
 * Strictness flags for rt6_select()/rt6_score_route():
 *  - IFACE:     reject routes whose device does not match the interface.
 *  - REACHABLE: reject routes whose nexthop neighbour is not NUD_VALID.
 */
#define RT6_SELECT_F_IFACE	0x1
#define RT6_SELECT_F_REACHABLE	0x2
79
Linus Torvalds1da177e2005-04-16 15:20:36 -070080static int ip6_rt_max_size = 4096;
81static int ip6_rt_gc_min_interval = HZ / 2;
82static int ip6_rt_gc_timeout = 60*HZ;
83int ip6_rt_gc_interval = 30*HZ;
84static int ip6_rt_gc_elasticity = 9;
85static int ip6_rt_mtu_expires = 10*60*HZ;
86static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
87
88static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
90static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91static void ip6_dst_destroy(struct dst_entry *);
92static void ip6_dst_ifdown(struct dst_entry *,
93 struct net_device *dev, int how);
94static int ip6_dst_gc(void);
95
96static int ip6_pkt_discard(struct sk_buff *skb);
97static int ip6_pkt_discard_out(struct sk_buff *skb);
98static void ip6_link_failure(struct sk_buff *skb);
99static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100
101static struct dst_ops ip6_dst_ops = {
102 .family = AF_INET6,
103 .protocol = __constant_htons(ETH_P_IPV6),
104 .gc = ip6_dst_gc,
105 .gc_thresh = 1024,
106 .check = ip6_dst_check,
107 .destroy = ip6_dst_destroy,
108 .ifdown = ip6_dst_ifdown,
109 .negative_advice = ip6_negative_advice,
110 .link_failure = ip6_link_failure,
111 .update_pmtu = ip6_rt_update_pmtu,
112 .entry_size = sizeof(struct rt6_info),
113};
114
115struct rt6_info ip6_null_entry = {
116 .u = {
117 .dst = {
118 .__refcnt = ATOMIC_INIT(1),
119 .__use = 1,
120 .dev = &loopback_dev,
121 .obsolete = -1,
122 .error = -ENETUNREACH,
123 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
124 .input = ip6_pkt_discard,
125 .output = ip6_pkt_discard_out,
126 .ops = &ip6_dst_ops,
127 .path = (struct dst_entry*)&ip6_null_entry,
128 }
129 },
130 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
131 .rt6i_metric = ~(u32) 0,
132 .rt6i_ref = ATOMIC_INIT(1),
133};
134
135struct fib6_node ip6_routing_table = {
136 .leaf = &ip6_null_entry,
137 .fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
138};
139
140/* Protects all the ip6 fib */
141
142DEFINE_RWLOCK(rt6_lock);
143
144
145/* allocate dst with ip6_dst_ops */
146static __inline__ struct rt6_info *ip6_dst_alloc(void)
147{
148 return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
149}
150
151static void ip6_dst_destroy(struct dst_entry *dst)
152{
153 struct rt6_info *rt = (struct rt6_info *)dst;
154 struct inet6_dev *idev = rt->rt6i_idev;
155
156 if (idev != NULL) {
157 rt->rt6i_idev = NULL;
158 in6_dev_put(idev);
159 }
160}
161
162static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
163 int how)
164{
165 struct rt6_info *rt = (struct rt6_info *)dst;
166 struct inet6_dev *idev = rt->rt6i_idev;
167
168 if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
169 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
170 if (loopback_idev != NULL) {
171 rt->rt6i_idev = loopback_idev;
172 in6_dev_put(idev);
173 }
174 }
175}
176
177static __inline__ int rt6_check_expired(const struct rt6_info *rt)
178{
179 return (rt->rt6i_flags & RTF_EXPIRES &&
180 time_after(jiffies, rt->rt6i_expires));
181}
182
183/*
184 * Route lookup. Any rt6_lock is implied.
185 */
186
187static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
188 int oif,
189 int strict)
190{
191 struct rt6_info *local = NULL;
192 struct rt6_info *sprt;
193
194 if (oif) {
195 for (sprt = rt; sprt; sprt = sprt->u.next) {
196 struct net_device *dev = sprt->rt6i_dev;
197 if (dev->ifindex == oif)
198 return sprt;
199 if (dev->flags & IFF_LOOPBACK) {
200 if (sprt->rt6i_idev == NULL ||
201 sprt->rt6i_idev->dev->ifindex != oif) {
202 if (strict && oif)
203 continue;
204 if (local && (!oif ||
205 local->rt6i_idev->dev->ifindex == oif))
206 continue;
207 }
208 local = sprt;
209 }
210 }
211
212 if (local)
213 return local;
214
215 if (strict)
216 return &ip6_null_entry;
217 }
218 return rt;
219}
220
221/*
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800222 * Default Router Selection (RFC 2461 6.3.6)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700223 */
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800224static int inline rt6_check_dev(struct rt6_info *rt, int oif)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700225{
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800226 struct net_device *dev = rt->rt6i_dev;
227 if (!oif || dev->ifindex == oif)
228 return 2;
229 if ((dev->flags & IFF_LOOPBACK) &&
230 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
231 return 1;
232 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700233}
234
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800235static int inline rt6_check_neigh(struct rt6_info *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700236{
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800237 struct neighbour *neigh = rt->rt6i_nexthop;
238 int m = 0;
239 if (neigh) {
240 read_lock_bh(&neigh->lock);
241 if (neigh->nud_state & NUD_VALID)
242 m = 1;
243 read_unlock_bh(&neigh->lock);
244 }
245 return m;
246}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700247
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800248static int rt6_score_route(struct rt6_info *rt, int oif,
249 int strict)
250{
251 int m = rt6_check_dev(rt, oif);
252 if (!m && (strict & RT6_SELECT_F_IFACE))
253 return -1;
254 if (rt6_check_neigh(rt))
255 m |= 4;
256 else if (strict & RT6_SELECT_F_REACHABLE)
257 return -1;
258 return m;
259}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700260
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800261static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
262 int strict)
263{
264 struct rt6_info *match = NULL, *last = NULL;
265 struct rt6_info *rt, *rt0 = *head;
266 u32 metric;
267 int mpri = -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700268
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800269 RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
270 __FUNCTION__, head, head ? *head : NULL, oif);
271
272 for (rt = rt0, metric = rt0->rt6i_metric;
273 rt && rt->rt6i_metric == metric;
274 rt = rt->u.next) {
275 int m;
276
277 if (rt6_check_expired(rt))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700278 continue;
279
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800280 last = rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700281
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800282 m = rt6_score_route(rt, oif, strict);
283 if (m < 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700284 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700285
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800286 if (m > mpri) {
287 match = rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700288 mpri = m;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700289 }
290 }
291
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800292 if (!match &&
293 (strict & RT6_SELECT_F_REACHABLE) &&
294 last && last != rt0) {
295 /* no entries matched; do round-robin */
296 *head = rt0->u.next;
297 rt0->u.next = last->u.next;
298 last->u.next = rt0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700299 }
300
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800301 RT6_TRACE("%s() => %p, score=%d\n",
302 __FUNCTION__, match, mpri);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700303
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800304 return (match ? match : &ip6_null_entry);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700305}
306
307struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
308 int oif, int strict)
309{
310 struct fib6_node *fn;
311 struct rt6_info *rt;
312
313 read_lock_bh(&rt6_lock);
314 fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
315 rt = rt6_device_match(fn->leaf, oif, strict);
316 dst_hold(&rt->u.dst);
317 rt->u.dst.__use++;
318 read_unlock_bh(&rt6_lock);
319
320 rt->u.dst.lastuse = jiffies;
321 if (rt->u.dst.error == 0)
322 return rt;
323 dst_release(&rt->u.dst);
324 return NULL;
325}
326
327/* ip6_ins_rt is called with FREE rt6_lock.
328 It takes new route entry, the addition fails by any reason the
329 route is freed. In any case, if caller does not hold it, it may
330 be destroyed.
331 */
332
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700333int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
334 void *_rtattr, struct netlink_skb_parms *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700335{
336 int err;
337
338 write_lock_bh(&rt6_lock);
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700339 err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700340 write_unlock_bh(&rt6_lock);
341
342 return err;
343}
344
YOSHIFUJI Hideaki95a9a5b2006-03-20 16:55:51 -0800345static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
346 struct in6_addr *saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700347{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700348 struct rt6_info *rt;
349
350 /*
351 * Clone the route.
352 */
353
354 rt = ip6_rt_copy(ort);
355
356 if (rt) {
YOSHIFUJI Hideaki58c4fb82005-12-21 22:56:42 +0900357 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
358 if (rt->rt6i_dst.plen != 128 &&
359 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
360 rt->rt6i_flags |= RTF_ANYCAST;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700361 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
YOSHIFUJI Hideaki58c4fb82005-12-21 22:56:42 +0900362 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700363
YOSHIFUJI Hideaki58c4fb82005-12-21 22:56:42 +0900364 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700365 rt->rt6i_dst.plen = 128;
366 rt->rt6i_flags |= RTF_CACHE;
367 rt->u.dst.flags |= DST_HOST;
368
369#ifdef CONFIG_IPV6_SUBTREES
370 if (rt->rt6i_src.plen && saddr) {
371 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
372 rt->rt6i_src.plen = 128;
373 }
374#endif
375
376 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
377
YOSHIFUJI Hideaki95a9a5b2006-03-20 16:55:51 -0800378 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700379
YOSHIFUJI Hideaki95a9a5b2006-03-20 16:55:51 -0800380 return rt;
381}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700382
YOSHIFUJI Hideaki299d9932006-03-20 16:58:32 -0800383static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
384{
385 struct rt6_info *rt = ip6_rt_copy(ort);
386 if (rt) {
387 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
388 rt->rt6i_dst.plen = 128;
389 rt->rt6i_flags |= RTF_CACHE;
390 if (rt->rt6i_flags & RTF_REJECT)
391 rt->u.dst.error = ort->u.dst.error;
392 rt->u.dst.flags |= DST_HOST;
393 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
394 }
395 return rt;
396}
397
/*
 * When the selection at the current node produced the null entry, walk
 * up through the parents of "fn": reaching a root node jumps to "out",
 * the first ancestor carrying route info jumps to "restart".
 * Relies on the enclosing function's "rt" and "fn" variables and its
 * "out" and "restart" labels.
 */
#define BACKTRACK() \
if (rt == &ip6_null_entry) { \
	for (fn = fn->parent; fn != NULL; fn = fn->parent) { \
		if (fn->fn_flags & RTN_ROOT) \
			goto out; \
		if (fn->fn_flags & RTN_RTINFO) \
			goto restart; \
	} \
}
408
409
410void ip6_route_input(struct sk_buff *skb)
411{
412 struct fib6_node *fn;
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800413 struct rt6_info *rt, *nrt;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700414 int strict;
415 int attempts = 3;
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800416 int err;
YOSHIFUJI Hideaki8238dd02006-03-20 17:04:35 -0800417 int reachable = RT6_SELECT_F_REACHABLE;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700418
YOSHIFUJI Hideaki118f8c12006-03-20 17:01:06 -0800419 strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700420
421relookup:
422 read_lock_bh(&rt6_lock);
423
YOSHIFUJI Hideaki8238dd02006-03-20 17:04:35 -0800424restart_2:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700425 fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
426 &skb->nh.ipv6h->saddr);
427
428restart:
YOSHIFUJI Hideaki8238dd02006-03-20 17:04:35 -0800429 rt = rt6_select(&fn->leaf, skb->dev->ifindex, strict | reachable);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700430 BACKTRACK();
YOSHIFUJI Hideaki8238dd02006-03-20 17:04:35 -0800431 if (rt == &ip6_null_entry ||
432 rt->rt6i_flags & RTF_CACHE)
YOSHIFUJI Hideaki1ddef0442006-03-20 17:01:24 -0800433 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700434
YOSHIFUJI Hideakifb9de912006-03-20 16:59:08 -0800435 dst_hold(&rt->u.dst);
436 read_unlock_bh(&rt6_lock);
437
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800438 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
439 nrt = rt6_alloc_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr);
440 else {
441#if CLONE_OFFLINK_ROUTE
442 nrt = rt6_alloc_clone(rt, &skb->nh.ipv6h->daddr);
443#else
444 goto out2;
445#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700446 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700447
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800448 dst_release(&rt->u.dst);
449 rt = nrt ? : &ip6_null_entry;
450
451 dst_hold(&rt->u.dst);
452 if (nrt) {
453 err = ip6_ins_rt(nrt, NULL, NULL, &NETLINK_CB(skb));
454 if (!err)
455 goto out2;
456 }
457
458 if (--attempts <= 0)
459 goto out2;
460
461 /*
462 * Race condition! In the gap, when rt6_lock was
463 * released someone could insert this route. Relookup.
464 */
465 dst_release(&rt->u.dst);
466 goto relookup;
467
468out:
YOSHIFUJI Hideaki8238dd02006-03-20 17:04:35 -0800469 if (reachable) {
470 reachable = 0;
471 goto restart_2;
472 }
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800473 dst_hold(&rt->u.dst);
474 read_unlock_bh(&rt6_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700475out2:
476 rt->u.dst.lastuse = jiffies;
477 rt->u.dst.__use++;
478 skb->dst = (struct dst_entry *) rt;
YOSHIFUJI Hideakifb9de912006-03-20 16:59:08 -0800479 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700480}
481
482struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
483{
484 struct fib6_node *fn;
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800485 struct rt6_info *rt, *nrt;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700486 int strict;
487 int attempts = 3;
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800488 int err;
YOSHIFUJI Hideaki8238dd02006-03-20 17:04:35 -0800489 int reachable = RT6_SELECT_F_REACHABLE;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700490
YOSHIFUJI Hideaki554cfb72006-03-20 17:00:26 -0800491 strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700492
493relookup:
494 read_lock_bh(&rt6_lock);
495
YOSHIFUJI Hideaki8238dd02006-03-20 17:04:35 -0800496restart_2:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700497 fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);
498
499restart:
YOSHIFUJI Hideaki8238dd02006-03-20 17:04:35 -0800500 rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
YOSHIFUJI Hideaki1ddef0442006-03-20 17:01:24 -0800501 BACKTRACK();
YOSHIFUJI Hideaki8238dd02006-03-20 17:04:35 -0800502 if (rt == &ip6_null_entry ||
503 rt->rt6i_flags & RTF_CACHE)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700504 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700505
YOSHIFUJI Hideakifb9de912006-03-20 16:59:08 -0800506 dst_hold(&rt->u.dst);
507 read_unlock_bh(&rt6_lock);
508
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800509 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
YOSHIFUJI Hideakie40cf352006-03-20 16:59:27 -0800510 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800511 else {
512#if CLONE_OFFLINK_ROUTE
513 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
514#else
515 goto out2;
516#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700517 }
YOSHIFUJI Hideakie40cf352006-03-20 16:59:27 -0800518
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800519 dst_release(&rt->u.dst);
520 rt = nrt ? : &ip6_null_entry;
521
522 dst_hold(&rt->u.dst);
523 if (nrt) {
524 err = ip6_ins_rt(nrt, NULL, NULL, NULL);
525 if (!err)
526 goto out2;
527 }
528
529 if (--attempts <= 0)
530 goto out2;
531
532 /*
533 * Race condition! In the gap, when rt6_lock was
534 * released someone could insert this route. Relookup.
535 */
536 dst_release(&rt->u.dst);
537 goto relookup;
538
539out:
YOSHIFUJI Hideaki8238dd02006-03-20 17:04:35 -0800540 if (reachable) {
541 reachable = 0;
542 goto restart_2;
543 }
YOSHIFUJI Hideaki519fbd82006-03-20 17:00:05 -0800544 dst_hold(&rt->u.dst);
545 read_unlock_bh(&rt6_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700546out2:
547 rt->u.dst.lastuse = jiffies;
548 rt->u.dst.__use++;
549 return &rt->u.dst;
550}
551
552
553/*
554 * Destination cache support functions
555 */
556
557static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
558{
559 struct rt6_info *rt;
560
561 rt = (struct rt6_info *) dst;
562
563 if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
564 return dst;
565
566 return NULL;
567}
568
569static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
570{
571 struct rt6_info *rt = (struct rt6_info *) dst;
572
573 if (rt) {
574 if (rt->rt6i_flags & RTF_CACHE)
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700575 ip6_del_rt(rt, NULL, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700576 else
577 dst_release(dst);
578 }
579 return NULL;
580}
581
582static void ip6_link_failure(struct sk_buff *skb)
583{
584 struct rt6_info *rt;
585
586 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
587
588 rt = (struct rt6_info *) skb->dst;
589 if (rt) {
590 if (rt->rt6i_flags&RTF_CACHE) {
591 dst_set_expires(&rt->u.dst, 0);
592 rt->rt6i_flags |= RTF_EXPIRES;
593 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
594 rt->rt6i_node->fn_sernum = -1;
595 }
596}
597
598static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
599{
600 struct rt6_info *rt6 = (struct rt6_info*)dst;
601
602 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
603 rt6->rt6i_flags |= RTF_MODIFIED;
604 if (mtu < IPV6_MIN_MTU) {
605 mtu = IPV6_MIN_MTU;
606 dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
607 }
608 dst->metrics[RTAX_MTU-1] = mtu;
609 }
610}
611
static int ipv6_get_mtu(struct net_device *dev);

/*
 * Singly linked list (via dst->next) of the entries created by
 * ndisc_dst_alloc().  Protected by rt6_lock.
 */
static struct dst_entry *ndisc_dst_gc_list;
615
616static inline unsigned int ipv6_advmss(unsigned int mtu)
617{
618 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
619
620 if (mtu < ip6_rt_min_advmss)
621 mtu = ip6_rt_min_advmss;
622
623 /*
624 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
625 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
626 * IPV6_MAXPLEN is also valid and means: "any MSS,
627 * rely only on pmtu discovery"
628 */
629 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
630 mtu = IPV6_MAXPLEN;
631 return mtu;
632}
633
634struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
635 struct neighbour *neigh,
636 struct in6_addr *addr,
637 int (*output)(struct sk_buff *))
638{
639 struct rt6_info *rt;
640 struct inet6_dev *idev = in6_dev_get(dev);
641
642 if (unlikely(idev == NULL))
643 return NULL;
644
645 rt = ip6_dst_alloc();
646 if (unlikely(rt == NULL)) {
647 in6_dev_put(idev);
648 goto out;
649 }
650
651 dev_hold(dev);
652 if (neigh)
653 neigh_hold(neigh);
654 else
655 neigh = ndisc_get_neigh(dev, addr);
656
657 rt->rt6i_dev = dev;
658 rt->rt6i_idev = idev;
659 rt->rt6i_nexthop = neigh;
660 atomic_set(&rt->u.dst.__refcnt, 1);
661 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
662 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
663 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
664 rt->u.dst.output = output;
665
666#if 0 /* there's no chance to use these for ndisc */
667 rt->u.dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
668 ? DST_HOST
669 : 0;
670 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
671 rt->rt6i_dst.plen = 128;
672#endif
673
674 write_lock_bh(&rt6_lock);
675 rt->u.dst.next = ndisc_dst_gc_list;
676 ndisc_dst_gc_list = &rt->u.dst;
677 write_unlock_bh(&rt6_lock);
678
679 fib6_force_start_gc();
680
681out:
682 return (struct dst_entry *)rt;
683}
684
685int ndisc_dst_gc(int *more)
686{
687 struct dst_entry *dst, *next, **pprev;
688 int freed;
689
690 next = NULL;
691 pprev = &ndisc_dst_gc_list;
692 freed = 0;
693 while ((dst = *pprev) != NULL) {
694 if (!atomic_read(&dst->__refcnt)) {
695 *pprev = dst->next;
696 dst_free(dst);
697 freed++;
698 } else {
699 pprev = &dst->next;
700 (*more)++;
701 }
702 }
703
704 return freed;
705}
706
707static int ip6_dst_gc(void)
708{
709 static unsigned expire = 30*HZ;
710 static unsigned long last_gc;
711 unsigned long now = jiffies;
712
713 if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
714 atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
715 goto out;
716
717 expire++;
718 fib6_run_gc(expire);
719 last_gc = now;
720 if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
721 expire = ip6_rt_gc_timeout>>1;
722
723out:
724 expire -= expire>>ip6_rt_gc_elasticity;
725 return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
726}
727
728/* Clean host part of a prefix. Not necessary in radix tree,
729 but results in cleaner routing tables.
730
731 Remove it only when all the things will work!
732 */
733
734static int ipv6_get_mtu(struct net_device *dev)
735{
736 int mtu = IPV6_MIN_MTU;
737 struct inet6_dev *idev;
738
739 idev = in6_dev_get(dev);
740 if (idev) {
741 mtu = idev->cnf.mtu6;
742 in6_dev_put(idev);
743 }
744 return mtu;
745}
746
747int ipv6_get_hoplimit(struct net_device *dev)
748{
749 int hoplimit = ipv6_devconf.hop_limit;
750 struct inet6_dev *idev;
751
752 idev = in6_dev_get(dev);
753 if (idev) {
754 hoplimit = idev->cnf.hop_limit;
755 in6_dev_put(idev);
756 }
757 return hoplimit;
758}
759
760/*
761 *
762 */
763
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700764int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
765 void *_rtattr, struct netlink_skb_parms *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700766{
767 int err;
768 struct rtmsg *r;
769 struct rtattr **rta;
770 struct rt6_info *rt = NULL;
771 struct net_device *dev = NULL;
772 struct inet6_dev *idev = NULL;
773 int addr_type;
774
775 rta = (struct rtattr **) _rtattr;
776
777 if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
778 return -EINVAL;
779#ifndef CONFIG_IPV6_SUBTREES
780 if (rtmsg->rtmsg_src_len)
781 return -EINVAL;
782#endif
783 if (rtmsg->rtmsg_ifindex) {
784 err = -ENODEV;
785 dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
786 if (!dev)
787 goto out;
788 idev = in6_dev_get(dev);
789 if (!idev)
790 goto out;
791 }
792
793 if (rtmsg->rtmsg_metric == 0)
794 rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
795
796 rt = ip6_dst_alloc();
797
798 if (rt == NULL) {
799 err = -ENOMEM;
800 goto out;
801 }
802
803 rt->u.dst.obsolete = -1;
YOSHIFUJI Hideaki3dd4bc62005-12-19 14:02:45 -0800804 rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700805 if (nlh && (r = NLMSG_DATA(nlh))) {
806 rt->rt6i_protocol = r->rtm_protocol;
807 } else {
808 rt->rt6i_protocol = RTPROT_BOOT;
809 }
810
811 addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
812
813 if (addr_type & IPV6_ADDR_MULTICAST)
814 rt->u.dst.input = ip6_mc_input;
815 else
816 rt->u.dst.input = ip6_forward;
817
818 rt->u.dst.output = ip6_output;
819
820 ipv6_addr_prefix(&rt->rt6i_dst.addr,
821 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
822 rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
823 if (rt->rt6i_dst.plen == 128)
824 rt->u.dst.flags = DST_HOST;
825
826#ifdef CONFIG_IPV6_SUBTREES
827 ipv6_addr_prefix(&rt->rt6i_src.addr,
828 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
829 rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
830#endif
831
832 rt->rt6i_metric = rtmsg->rtmsg_metric;
833
834 /* We cannot add true routes via loopback here,
835 they would result in kernel looping; promote them to reject routes
836 */
837 if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
838 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
839 /* hold loopback dev/idev if we haven't done so. */
840 if (dev != &loopback_dev) {
841 if (dev) {
842 dev_put(dev);
843 in6_dev_put(idev);
844 }
845 dev = &loopback_dev;
846 dev_hold(dev);
847 idev = in6_dev_get(dev);
848 if (!idev) {
849 err = -ENODEV;
850 goto out;
851 }
852 }
853 rt->u.dst.output = ip6_pkt_discard_out;
854 rt->u.dst.input = ip6_pkt_discard;
855 rt->u.dst.error = -ENETUNREACH;
856 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
857 goto install_route;
858 }
859
860 if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
861 struct in6_addr *gw_addr;
862 int gwa_type;
863
864 gw_addr = &rtmsg->rtmsg_gateway;
865 ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
866 gwa_type = ipv6_addr_type(gw_addr);
867
868 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
869 struct rt6_info *grt;
870
871 /* IPv6 strictly inhibits using not link-local
872 addresses as nexthop address.
873 Otherwise, router will not able to send redirects.
874 It is very good, but in some (rare!) circumstances
875 (SIT, PtP, NBMA NOARP links) it is handy to allow
876 some exceptions. --ANK
877 */
878 err = -EINVAL;
879 if (!(gwa_type&IPV6_ADDR_UNICAST))
880 goto out;
881
882 grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
883
884 err = -EHOSTUNREACH;
885 if (grt == NULL)
886 goto out;
887 if (dev) {
888 if (dev != grt->rt6i_dev) {
889 dst_release(&grt->u.dst);
890 goto out;
891 }
892 } else {
893 dev = grt->rt6i_dev;
894 idev = grt->rt6i_idev;
895 dev_hold(dev);
896 in6_dev_hold(grt->rt6i_idev);
897 }
898 if (!(grt->rt6i_flags&RTF_GATEWAY))
899 err = 0;
900 dst_release(&grt->u.dst);
901
902 if (err)
903 goto out;
904 }
905 err = -EINVAL;
906 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
907 goto out;
908 }
909
910 err = -ENODEV;
911 if (dev == NULL)
912 goto out;
913
914 if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
915 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
916 if (IS_ERR(rt->rt6i_nexthop)) {
917 err = PTR_ERR(rt->rt6i_nexthop);
918 rt->rt6i_nexthop = NULL;
919 goto out;
920 }
921 }
922
923 rt->rt6i_flags = rtmsg->rtmsg_flags;
924
925install_route:
926 if (rta && rta[RTA_METRICS-1]) {
927 int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
928 struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
929
930 while (RTA_OK(attr, attrlen)) {
931 unsigned flavor = attr->rta_type;
932 if (flavor) {
933 if (flavor > RTAX_MAX) {
934 err = -EINVAL;
935 goto out;
936 }
937 rt->u.dst.metrics[flavor-1] =
938 *(u32 *)RTA_DATA(attr);
939 }
940 attr = RTA_NEXT(attr, attrlen);
941 }
942 }
943
944 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
945 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
946 if (!rt->u.dst.metrics[RTAX_MTU-1])
947 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
948 if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
949 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
950 rt->u.dst.dev = dev;
951 rt->rt6i_idev = idev;
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700952 return ip6_ins_rt(rt, nlh, _rtattr, req);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700953
954out:
955 if (dev)
956 dev_put(dev);
957 if (idev)
958 in6_dev_put(idev);
959 if (rt)
960 dst_free((struct dst_entry *) rt);
961 return err;
962}
963
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700964int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700965{
966 int err;
967
968 write_lock_bh(&rt6_lock);
969
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700970 err = fib6_del(rt, nlh, _rtattr, req);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700971 dst_release(&rt->u.dst);
972
973 write_unlock_bh(&rt6_lock);
974
975 return err;
976}
977
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700978static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700979{
980 struct fib6_node *fn;
981 struct rt6_info *rt;
982 int err = -ESRCH;
983
984 read_lock_bh(&rt6_lock);
985
986 fn = fib6_locate(&ip6_routing_table,
987 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
988 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
989
990 if (fn) {
991 for (rt = fn->leaf; rt; rt = rt->u.next) {
992 if (rtmsg->rtmsg_ifindex &&
993 (rt->rt6i_dev == NULL ||
994 rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
995 continue;
996 if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
997 !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
998 continue;
999 if (rtmsg->rtmsg_metric &&
1000 rtmsg->rtmsg_metric != rt->rt6i_metric)
1001 continue;
1002 dst_hold(&rt->u.dst);
1003 read_unlock_bh(&rt6_lock);
1004
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001005 return ip6_del_rt(rt, nlh, _rtattr, req);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001006 }
1007 }
1008 read_unlock_bh(&rt6_lock);
1009
1010 return err;
1011}
1012
1013/*
1014 * Handle redirects
1015 */
1016void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1017 struct neighbour *neigh, u8 *lladdr, int on_link)
1018{
1019 struct rt6_info *rt, *nrt;
1020
1021 /* Locate old route to this destination. */
1022 rt = rt6_lookup(dest, NULL, neigh->dev->ifindex, 1);
1023
1024 if (rt == NULL)
1025 return;
1026
1027 if (neigh->dev != rt->rt6i_dev)
1028 goto out;
1029
1030 /*
1031 * Current route is on-link; redirect is always invalid.
1032 *
1033 * Seems, previous statement is not true. It could
1034 * be node, which looks for us as on-link (f.e. proxy ndisc)
1035 * But then router serving it might decide, that we should
1036 * know truth 8)8) --ANK (980726).
1037 */
1038 if (!(rt->rt6i_flags&RTF_GATEWAY))
1039 goto out;
1040
1041 /*
1042 * RFC 2461 specifies that redirects should only be
1043 * accepted if they come from the nexthop to the target.
1044 * Due to the way default routers are chosen, this notion
1045 * is a bit fuzzy and one might need to check all default
1046 * routers.
1047 */
1048 if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway)) {
1049 if (rt->rt6i_flags & RTF_DEFAULT) {
1050 struct rt6_info *rt1;
1051
1052 read_lock(&rt6_lock);
1053 for (rt1 = ip6_routing_table.leaf; rt1; rt1 = rt1->u.next) {
1054 if (ipv6_addr_equal(saddr, &rt1->rt6i_gateway)) {
1055 dst_hold(&rt1->u.dst);
1056 dst_release(&rt->u.dst);
1057 read_unlock(&rt6_lock);
1058 rt = rt1;
1059 goto source_ok;
1060 }
1061 }
1062 read_unlock(&rt6_lock);
1063 }
1064 if (net_ratelimit())
1065 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1066 "for redirect target\n");
1067 goto out;
1068 }
1069
1070source_ok:
1071
1072 /*
1073 * We have finally decided to accept it.
1074 */
1075
1076 neigh_update(neigh, lladdr, NUD_STALE,
1077 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1078 NEIGH_UPDATE_F_OVERRIDE|
1079 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1080 NEIGH_UPDATE_F_ISROUTER))
1081 );
1082
1083 /*
1084 * Redirect received -> path was valid.
1085 * Look, redirects are sent only in response to data packets,
1086 * so that this nexthop apparently is reachable. --ANK
1087 */
1088 dst_confirm(&rt->u.dst);
1089
1090 /* Duplicate redirect: silently ignore. */
1091 if (neigh == rt->u.dst.neighbour)
1092 goto out;
1093
1094 nrt = ip6_rt_copy(rt);
1095 if (nrt == NULL)
1096 goto out;
1097
1098 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1099 if (on_link)
1100 nrt->rt6i_flags &= ~RTF_GATEWAY;
1101
1102 ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1103 nrt->rt6i_dst.plen = 128;
1104 nrt->u.dst.flags |= DST_HOST;
1105
1106 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1107 nrt->rt6i_nexthop = neigh_clone(neigh);
1108 /* Reset pmtu, it may be better */
1109 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1110 nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1111
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001112 if (ip6_ins_rt(nrt, NULL, NULL, NULL))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001113 goto out;
1114
1115 if (rt->rt6i_flags&RTF_CACHE) {
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001116 ip6_del_rt(rt, NULL, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001117 return;
1118 }
1119
1120out:
1121 dst_release(&rt->u.dst);
1122 return;
1123}
1124
1125/*
1126 * Handle ICMP "packet too big" messages
1127 * i.e. Path MTU discovery
1128 */
1129
1130void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1131 struct net_device *dev, u32 pmtu)
1132{
1133 struct rt6_info *rt, *nrt;
1134 int allfrag = 0;
1135
1136 rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1137 if (rt == NULL)
1138 return;
1139
1140 if (pmtu >= dst_mtu(&rt->u.dst))
1141 goto out;
1142
1143 if (pmtu < IPV6_MIN_MTU) {
1144 /*
1145 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1146 * MTU (1280) and a fragment header should always be included
1147 * after a node receiving Too Big message reporting PMTU is
1148 * less than the IPv6 Minimum Link MTU.
1149 */
1150 pmtu = IPV6_MIN_MTU;
1151 allfrag = 1;
1152 }
1153
1154 /* New mtu received -> path was valid.
1155 They are sent only in response to data packets,
1156 so that this nexthop apparently is reachable. --ANK
1157 */
1158 dst_confirm(&rt->u.dst);
1159
1160 /* Host route. If it is static, it would be better
1161 not to override it, but add new one, so that
1162 when cache entry will expire old pmtu
1163 would return automatically.
1164 */
1165 if (rt->rt6i_flags & RTF_CACHE) {
1166 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1167 if (allfrag)
1168 rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1169 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1170 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1171 goto out;
1172 }
1173
1174 /* Network route.
1175 Two cases are possible:
1176 1. It is connected route. Action: COW
1177 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1178 */
YOSHIFUJI Hideakid5315b52006-03-20 16:58:48 -08001179 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
YOSHIFUJI Hideakia1e78362006-03-20 16:56:32 -08001180 nrt = rt6_alloc_cow(rt, daddr, saddr);
YOSHIFUJI Hideakid5315b52006-03-20 16:58:48 -08001181 else
1182 nrt = rt6_alloc_clone(rt, daddr);
YOSHIFUJI Hideakia1e78362006-03-20 16:56:32 -08001183
YOSHIFUJI Hideakid5315b52006-03-20 16:58:48 -08001184 if (nrt) {
YOSHIFUJI Hideakia1e78362006-03-20 16:56:32 -08001185 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1186 if (allfrag)
1187 nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1188
1189 /* According to RFC 1981, detecting PMTU increase shouldn't be
1190 * happened within 5 mins, the recommended timer is 10 mins.
1191 * Here this route expiration time is set to ip6_rt_mtu_expires
1192 * which is 10 mins. After 10 mins the decreased pmtu is expired
1193 * and detecting PMTU increase will be automatically happened.
1194 */
1195 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1196 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1197
1198 ip6_ins_rt(nrt, NULL, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001199 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001200out:
1201 dst_release(&rt->u.dst);
1202}
1203
1204/*
1205 * Misc support functions
1206 */
1207
1208static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1209{
1210 struct rt6_info *rt = ip6_dst_alloc();
1211
1212 if (rt) {
1213 rt->u.dst.input = ort->u.dst.input;
1214 rt->u.dst.output = ort->u.dst.output;
1215
1216 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1217 rt->u.dst.dev = ort->u.dst.dev;
1218 if (rt->u.dst.dev)
1219 dev_hold(rt->u.dst.dev);
1220 rt->rt6i_idev = ort->rt6i_idev;
1221 if (rt->rt6i_idev)
1222 in6_dev_hold(rt->rt6i_idev);
1223 rt->u.dst.lastuse = jiffies;
1224 rt->rt6i_expires = 0;
1225
1226 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1227 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1228 rt->rt6i_metric = 0;
1229
1230 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1231#ifdef CONFIG_IPV6_SUBTREES
1232 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1233#endif
1234 }
1235 return rt;
1236}
1237
1238struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1239{
1240 struct rt6_info *rt;
1241 struct fib6_node *fn;
1242
1243 fn = &ip6_routing_table;
1244
1245 write_lock_bh(&rt6_lock);
1246 for (rt = fn->leaf; rt; rt=rt->u.next) {
1247 if (dev == rt->rt6i_dev &&
YOSHIFUJI Hideaki045927f2006-03-20 17:00:48 -08001248 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001249 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1250 break;
1251 }
1252 if (rt)
1253 dst_hold(&rt->u.dst);
1254 write_unlock_bh(&rt6_lock);
1255 return rt;
1256}
1257
1258struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1259 struct net_device *dev)
1260{
1261 struct in6_rtmsg rtmsg;
1262
1263 memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1264 rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1265 ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1266 rtmsg.rtmsg_metric = 1024;
1267 rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES;
1268
1269 rtmsg.rtmsg_ifindex = dev->ifindex;
1270
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001271 ip6_route_add(&rtmsg, NULL, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001272 return rt6_get_dflt_router(gwaddr, dev);
1273}
1274
1275void rt6_purge_dflt_routers(void)
1276{
1277 struct rt6_info *rt;
1278
1279restart:
1280 read_lock_bh(&rt6_lock);
1281 for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1282 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1283 dst_hold(&rt->u.dst);
1284
Linus Torvalds1da177e2005-04-16 15:20:36 -07001285 read_unlock_bh(&rt6_lock);
1286
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001287 ip6_del_rt(rt, NULL, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001288
1289 goto restart;
1290 }
1291 }
1292 read_unlock_bh(&rt6_lock);
1293}
1294
1295int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1296{
1297 struct in6_rtmsg rtmsg;
1298 int err;
1299
1300 switch(cmd) {
1301 case SIOCADDRT: /* Add a route */
1302 case SIOCDELRT: /* Delete a route */
1303 if (!capable(CAP_NET_ADMIN))
1304 return -EPERM;
1305 err = copy_from_user(&rtmsg, arg,
1306 sizeof(struct in6_rtmsg));
1307 if (err)
1308 return -EFAULT;
1309
1310 rtnl_lock();
1311 switch (cmd) {
1312 case SIOCADDRT:
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001313 err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001314 break;
1315 case SIOCDELRT:
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001316 err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001317 break;
1318 default:
1319 err = -EINVAL;
1320 }
1321 rtnl_unlock();
1322
1323 return err;
1324 };
1325
1326 return -EINVAL;
1327}
1328
1329/*
1330 * Drop the packet on the floor
1331 */
1332
Arnaldo Carvalho de Melo20380732005-08-16 02:18:02 -03001333static int ip6_pkt_discard(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001334{
1335 IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1336 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1337 kfree_skb(skb);
1338 return 0;
1339}
1340
Arnaldo Carvalho de Melo20380732005-08-16 02:18:02 -03001341static int ip6_pkt_discard_out(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001342{
1343 skb->dev = skb->dst->dev;
1344 return ip6_pkt_discard(skb);
1345}
1346
1347/*
1348 * Allocate a dst for local (unicast / anycast) address.
1349 */
1350
1351struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1352 const struct in6_addr *addr,
1353 int anycast)
1354{
1355 struct rt6_info *rt = ip6_dst_alloc();
1356
1357 if (rt == NULL)
1358 return ERR_PTR(-ENOMEM);
1359
1360 dev_hold(&loopback_dev);
1361 in6_dev_hold(idev);
1362
1363 rt->u.dst.flags = DST_HOST;
1364 rt->u.dst.input = ip6_input;
1365 rt->u.dst.output = ip6_output;
1366 rt->rt6i_dev = &loopback_dev;
1367 rt->rt6i_idev = idev;
1368 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1369 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1370 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1371 rt->u.dst.obsolete = -1;
1372
1373 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
YOSHIFUJI Hideaki58c4fb82005-12-21 22:56:42 +09001374 if (anycast)
1375 rt->rt6i_flags |= RTF_ANYCAST;
1376 else
Linus Torvalds1da177e2005-04-16 15:20:36 -07001377 rt->rt6i_flags |= RTF_LOCAL;
1378 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1379 if (rt->rt6i_nexthop == NULL) {
1380 dst_free((struct dst_entry *) rt);
1381 return ERR_PTR(-ENOMEM);
1382 }
1383
1384 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1385 rt->rt6i_dst.plen = 128;
1386
1387 atomic_set(&rt->u.dst.__refcnt, 1);
1388
1389 return rt;
1390}
1391
1392static int fib6_ifdown(struct rt6_info *rt, void *arg)
1393{
1394 if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1395 rt != &ip6_null_entry) {
1396 RT6_TRACE("deleted by ifdown %p\n", rt);
1397 return -1;
1398 }
1399 return 0;
1400}
1401
1402void rt6_ifdown(struct net_device *dev)
1403{
1404 write_lock_bh(&rt6_lock);
1405 fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1406 write_unlock_bh(&rt6_lock);
1407}
1408
/* Argument passed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
1414
1415static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1416{
1417 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1418 struct inet6_dev *idev;
1419
1420 /* In IPv6 pmtu discovery is not optional,
1421 so that RTAX_MTU lock cannot disable it.
1422 We still use this lock to block changes
1423 caused by addrconf/ndisc.
1424 */
1425
1426 idev = __in6_dev_get(arg->dev);
1427 if (idev == NULL)
1428 return 0;
1429
1430 /* For administrative MTU increase, there is no way to discover
1431 IPv6 PMTU increase, so PMTU increase should be updated here.
1432 Since RFC 1981 doesn't include administrative MTU increase
1433 update PMTU increase is a MUST. (i.e. jumbo frame)
1434 */
1435 /*
1436 If new MTU is less than route PMTU, this new MTU will be the
1437 lowest MTU in the path, update the route PMTU to reflect PMTU
1438 decreases; if new MTU is greater than route PMTU, and the
1439 old MTU is the lowest MTU in the path, update the route PMTU
1440 to reflect the increase. In this case if the other nodes' MTU
1441 also have the lowest MTU, TOO BIG MESSAGE will be lead to
1442 PMTU discouvery.
1443 */
1444 if (rt->rt6i_dev == arg->dev &&
1445 !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1446 (dst_mtu(&rt->u.dst) > arg->mtu ||
1447 (dst_mtu(&rt->u.dst) < arg->mtu &&
1448 dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1449 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1450 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1451 return 0;
1452}
1453
1454void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1455{
1456 struct rt6_mtu_change_arg arg;
1457
1458 arg.dev = dev;
1459 arg.mtu = mtu;
1460 read_lock_bh(&rt6_lock);
1461 fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1462 read_unlock_bh(&rt6_lock);
1463}
1464
1465static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1466 struct in6_rtmsg *rtmsg)
1467{
1468 memset(rtmsg, 0, sizeof(*rtmsg));
1469
1470 rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1471 rtmsg->rtmsg_src_len = r->rtm_src_len;
1472 rtmsg->rtmsg_flags = RTF_UP;
1473 if (r->rtm_type == RTN_UNREACHABLE)
1474 rtmsg->rtmsg_flags |= RTF_REJECT;
1475
1476 if (rta[RTA_GATEWAY-1]) {
1477 if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1478 return -EINVAL;
1479 memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1480 rtmsg->rtmsg_flags |= RTF_GATEWAY;
1481 }
1482 if (rta[RTA_DST-1]) {
1483 if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1484 return -EINVAL;
1485 memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1486 }
1487 if (rta[RTA_SRC-1]) {
1488 if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1489 return -EINVAL;
1490 memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1491 }
1492 if (rta[RTA_OIF-1]) {
1493 if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1494 return -EINVAL;
1495 memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1496 }
1497 if (rta[RTA_PRIORITY-1]) {
1498 if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1499 return -EINVAL;
1500 memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1501 }
1502 return 0;
1503}
1504
1505int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1506{
1507 struct rtmsg *r = NLMSG_DATA(nlh);
1508 struct in6_rtmsg rtmsg;
1509
1510 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1511 return -EINVAL;
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001512 return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001513}
1514
1515int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1516{
1517 struct rtmsg *r = NLMSG_DATA(nlh);
1518 struct in6_rtmsg rtmsg;
1519
1520 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1521 return -EINVAL;
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001522 return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001523}
1524
/* Per-call context handed to rt6_dump_route() during a table dump. */
struct rt6_rtnl_dump_arg {
	struct sk_buff *skb;		/* buffer the dump is written to */
	struct netlink_callback *cb;	/* netlink dump callback state */
};
1530
1531static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001532 struct in6_addr *dst, struct in6_addr *src,
1533 int iif, int type, u32 pid, u32 seq,
1534 int prefix, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001535{
1536 struct rtmsg *rtm;
1537 struct nlmsghdr *nlh;
1538 unsigned char *b = skb->tail;
1539 struct rta_cacheinfo ci;
1540
1541 if (prefix) { /* user wants prefix routes only */
1542 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1543 /* success since this is not a prefix route */
1544 return 1;
1545 }
1546 }
1547
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07001548 nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001549 rtm = NLMSG_DATA(nlh);
1550 rtm->rtm_family = AF_INET6;
1551 rtm->rtm_dst_len = rt->rt6i_dst.plen;
1552 rtm->rtm_src_len = rt->rt6i_src.plen;
1553 rtm->rtm_tos = 0;
1554 rtm->rtm_table = RT_TABLE_MAIN;
1555 if (rt->rt6i_flags&RTF_REJECT)
1556 rtm->rtm_type = RTN_UNREACHABLE;
1557 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1558 rtm->rtm_type = RTN_LOCAL;
1559 else
1560 rtm->rtm_type = RTN_UNICAST;
1561 rtm->rtm_flags = 0;
1562 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1563 rtm->rtm_protocol = rt->rt6i_protocol;
1564 if (rt->rt6i_flags&RTF_DYNAMIC)
1565 rtm->rtm_protocol = RTPROT_REDIRECT;
1566 else if (rt->rt6i_flags & RTF_ADDRCONF)
1567 rtm->rtm_protocol = RTPROT_KERNEL;
1568 else if (rt->rt6i_flags&RTF_DEFAULT)
1569 rtm->rtm_protocol = RTPROT_RA;
1570
1571 if (rt->rt6i_flags&RTF_CACHE)
1572 rtm->rtm_flags |= RTM_F_CLONED;
1573
1574 if (dst) {
1575 RTA_PUT(skb, RTA_DST, 16, dst);
1576 rtm->rtm_dst_len = 128;
1577 } else if (rtm->rtm_dst_len)
1578 RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1579#ifdef CONFIG_IPV6_SUBTREES
1580 if (src) {
1581 RTA_PUT(skb, RTA_SRC, 16, src);
1582 rtm->rtm_src_len = 128;
1583 } else if (rtm->rtm_src_len)
1584 RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1585#endif
1586 if (iif)
1587 RTA_PUT(skb, RTA_IIF, 4, &iif);
1588 else if (dst) {
1589 struct in6_addr saddr_buf;
1590 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1591 RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1592 }
1593 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1594 goto rtattr_failure;
1595 if (rt->u.dst.neighbour)
1596 RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1597 if (rt->u.dst.dev)
1598 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1599 RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1600 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1601 if (rt->rt6i_expires)
1602 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1603 else
1604 ci.rta_expires = 0;
1605 ci.rta_used = rt->u.dst.__use;
1606 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1607 ci.rta_error = rt->u.dst.error;
1608 ci.rta_id = 0;
1609 ci.rta_ts = 0;
1610 ci.rta_tsage = 0;
1611 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1612 nlh->nlmsg_len = skb->tail - b;
1613 return skb->len;
1614
1615nlmsg_failure:
1616rtattr_failure:
1617 skb_trim(skb, b - skb->data);
1618 return -1;
1619}
1620
1621static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1622{
1623 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1624 int prefix;
1625
1626 if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1627 struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1628 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1629 } else
1630 prefix = 0;
1631
1632 return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1633 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001634 prefix, NLM_F_MULTI);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001635}
1636
1637static int fib6_dump_node(struct fib6_walker_t *w)
1638{
1639 int res;
1640 struct rt6_info *rt;
1641
1642 for (rt = w->leaf; rt; rt = rt->u.next) {
1643 res = rt6_dump_route(rt, w->args);
1644 if (res < 0) {
1645 /* Frame is full, suspend walking */
1646 w->leaf = rt;
1647 return 1;
1648 }
1649 BUG_TRAP(res!=0);
1650 }
1651 w->leaf = NULL;
1652 return 0;
1653}
1654
1655static void fib6_dump_end(struct netlink_callback *cb)
1656{
1657 struct fib6_walker_t *w = (void*)cb->args[0];
1658
1659 if (w) {
1660 cb->args[0] = 0;
1661 fib6_walker_unlink(w);
1662 kfree(w);
1663 }
Herbert Xuefacfbc2005-11-12 12:12:05 -08001664 cb->done = (void*)cb->args[1];
1665 cb->args[1] = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001666}
1667
1668static int fib6_dump_done(struct netlink_callback *cb)
1669{
1670 fib6_dump_end(cb);
Thomas Grafa8f74b22005-11-10 02:25:52 +01001671 return cb->done ? cb->done(cb) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001672}
1673
1674int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1675{
1676 struct rt6_rtnl_dump_arg arg;
1677 struct fib6_walker_t *w;
1678 int res;
1679
1680 arg.skb = skb;
1681 arg.cb = cb;
1682
1683 w = (void*)cb->args[0];
1684 if (w == NULL) {
1685 /* New dump:
1686 *
1687 * 1. hook callback destructor.
1688 */
1689 cb->args[1] = (long)cb->done;
1690 cb->done = fib6_dump_done;
1691
1692 /*
1693 * 2. allocate and initialize walker.
1694 */
David S. Miller9e147a12005-11-17 16:52:51 -08001695 w = kmalloc(sizeof(*w), GFP_ATOMIC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001696 if (w == NULL)
1697 return -ENOMEM;
1698 RT6_TRACE("dump<%p", w);
1699 memset(w, 0, sizeof(*w));
1700 w->root = &ip6_routing_table;
1701 w->func = fib6_dump_node;
1702 w->args = &arg;
1703 cb->args[0] = (long)w;
1704 read_lock_bh(&rt6_lock);
1705 res = fib6_walk(w);
1706 read_unlock_bh(&rt6_lock);
1707 } else {
1708 w->args = &arg;
1709 read_lock_bh(&rt6_lock);
1710 res = fib6_walk_continue(w);
1711 read_unlock_bh(&rt6_lock);
1712 }
1713#if RT6_DEBUG >= 3
1714 if (res <= 0 && skb->len == 0)
1715 RT6_TRACE("%p>dump end\n", w);
1716#endif
1717 res = res < 0 ? res : skb->len;
1718 /* res < 0 is an error. (really, impossible)
1719 res == 0 means that dump is complete, but skb still can contain data.
1720 res > 0 dump is not complete, but frame is full.
1721 */
1722 /* Destroy walker, if dump of this table is complete. */
1723 if (res <= 0)
1724 fib6_dump_end(cb);
1725 return res;
1726}
1727
1728int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1729{
1730 struct rtattr **rta = arg;
1731 int iif = 0;
1732 int err = -ENOBUFS;
1733 struct sk_buff *skb;
1734 struct flowi fl;
1735 struct rt6_info *rt;
1736
1737 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1738 if (skb == NULL)
1739 goto out;
1740
1741 /* Reserve room for dummy headers, this skb can pass
1742 through good chunk of routing engine.
1743 */
1744 skb->mac.raw = skb->data;
1745 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1746
1747 memset(&fl, 0, sizeof(fl));
1748 if (rta[RTA_SRC-1])
1749 ipv6_addr_copy(&fl.fl6_src,
1750 (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
1751 if (rta[RTA_DST-1])
1752 ipv6_addr_copy(&fl.fl6_dst,
1753 (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
1754
1755 if (rta[RTA_IIF-1])
1756 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1757
1758 if (iif) {
1759 struct net_device *dev;
1760 dev = __dev_get_by_index(iif);
1761 if (!dev) {
1762 err = -ENODEV;
1763 goto out_free;
1764 }
1765 }
1766
1767 fl.oif = 0;
1768 if (rta[RTA_OIF-1])
1769 memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1770
1771 rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1772
1773 skb->dst = &rt->u.dst;
1774
1775 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1776 err = rt6_fill_node(skb, rt,
1777 &fl.fl6_dst, &fl.fl6_src,
1778 iif,
1779 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001780 nlh->nlmsg_seq, 0, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001781 if (err < 0) {
1782 err = -EMSGSIZE;
1783 goto out_free;
1784 }
1785
1786 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1787 if (err > 0)
1788 err = 0;
1789out:
1790 return err;
1791out_free:
1792 kfree_skb(skb);
1793 goto out;
1794}
1795
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001796void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh,
1797 struct netlink_skb_parms *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001798{
1799 struct sk_buff *skb;
1800 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001801 u32 pid = current->pid;
1802 u32 seq = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001803
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001804 if (req)
1805 pid = req->pid;
1806 if (nlh)
1807 seq = nlh->nlmsg_seq;
1808
Linus Torvalds1da177e2005-04-16 15:20:36 -07001809 skb = alloc_skb(size, gfp_any());
1810 if (!skb) {
Patrick McHardyac6d4392005-08-14 19:29:52 -07001811 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001812 return;
1813 }
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001814 if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001815 kfree_skb(skb);
Patrick McHardyac6d4392005-08-14 19:29:52 -07001816 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001817 return;
1818 }
Patrick McHardyac6d4392005-08-14 19:29:52 -07001819 NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
1820 netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
Linus Torvalds1da177e2005-04-16 15:20:36 -07001821}
1822
1823/*
1824 * /proc
1825 */
1826
1827#ifdef CONFIG_PROC_FS
1828
/*
 * Length of one record as written by rt6_info_route():
 * three 32-digit hex addresses, two 4-character prefix-length fields
 * and the trailing metric/refcnt/use/flags/device columns with newline.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* State shared between rt6_proc_info() and rt6_info_route(). */
struct rt6_proc_arg {
	char *buffer;	/* output buffer */
	int offset;	/* read offset requested by the caller */
	int length;	/* number of bytes requested */
	int skip;	/* records skipped so far to reach offset */
	int len;	/* bytes written to buffer so far */
};
1839
1840static int rt6_info_route(struct rt6_info *rt, void *p_arg)
1841{
1842 struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
1843 int i;
1844
1845 if (arg->skip < arg->offset / RT6_INFO_LEN) {
1846 arg->skip++;
1847 return 0;
1848 }
1849
1850 if (arg->len >= arg->length)
1851 return 0;
1852
1853 for (i=0; i<16; i++) {
1854 sprintf(arg->buffer + arg->len, "%02x",
1855 rt->rt6i_dst.addr.s6_addr[i]);
1856 arg->len += 2;
1857 }
1858 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
1859 rt->rt6i_dst.plen);
1860
1861#ifdef CONFIG_IPV6_SUBTREES
1862 for (i=0; i<16; i++) {
1863 sprintf(arg->buffer + arg->len, "%02x",
1864 rt->rt6i_src.addr.s6_addr[i]);
1865 arg->len += 2;
1866 }
1867 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
1868 rt->rt6i_src.plen);
1869#else
1870 sprintf(arg->buffer + arg->len,
1871 "00000000000000000000000000000000 00 ");
1872 arg->len += 36;
1873#endif
1874
1875 if (rt->rt6i_nexthop) {
1876 for (i=0; i<16; i++) {
1877 sprintf(arg->buffer + arg->len, "%02x",
1878 rt->rt6i_nexthop->primary_key[i]);
1879 arg->len += 2;
1880 }
1881 } else {
1882 sprintf(arg->buffer + arg->len,
1883 "00000000000000000000000000000000");
1884 arg->len += 32;
1885 }
1886 arg->len += sprintf(arg->buffer + arg->len,
1887 " %08x %08x %08x %08x %8s\n",
1888 rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
1889 rt->u.dst.__use, rt->rt6i_flags,
1890 rt->rt6i_dev ? rt->rt6i_dev->name : "");
1891 return 0;
1892}
1893
1894static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
1895{
1896 struct rt6_proc_arg arg;
1897 arg.buffer = buffer;
1898 arg.offset = offset;
1899 arg.length = length;
1900 arg.skip = 0;
1901 arg.len = 0;
1902
1903 read_lock_bh(&rt6_lock);
1904 fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
1905 read_unlock_bh(&rt6_lock);
1906
1907 *start = buffer;
1908 if (offset)
1909 *start += offset % RT6_INFO_LEN;
1910
1911 arg.len -= offset % RT6_INFO_LEN;
1912
1913 if (arg.len > length)
1914 arg.len = length;
1915 if (arg.len < 0)
1916 arg.len = 0;
1917
1918 return arg.len;
1919}
1920
Linus Torvalds1da177e2005-04-16 15:20:36 -07001921static int rt6_stats_seq_show(struct seq_file *seq, void *v)
1922{
1923 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
1924 rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
1925 rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
1926 rt6_stats.fib_rt_cache,
1927 atomic_read(&ip6_dst_ops.entries),
1928 rt6_stats.fib_discarded_routes);
1929
1930 return 0;
1931}
1932
1933static int rt6_stats_seq_open(struct inode *inode, struct file *file)
1934{
1935 return single_open(file, rt6_stats_seq_show, NULL);
1936}
1937
1938static struct file_operations rt6_stats_seq_fops = {
1939 .owner = THIS_MODULE,
1940 .open = rt6_stats_seq_open,
1941 .read = seq_read,
1942 .llseek = seq_lseek,
1943 .release = single_release,
1944};
1945#endif /* CONFIG_PROC_FS */
1946
1947#ifdef CONFIG_SYSCTL
1948
1949static int flush_delay;
1950
1951static
1952int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
1953 void __user *buffer, size_t *lenp, loff_t *ppos)
1954{
1955 if (write) {
1956 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
1957 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
1958 return 0;
1959 } else
1960 return -EINVAL;
1961}
1962
1963ctl_table ipv6_route_table[] = {
1964 {
1965 .ctl_name = NET_IPV6_ROUTE_FLUSH,
1966 .procname = "flush",
1967 .data = &flush_delay,
1968 .maxlen = sizeof(int),
Dave Jones89c8b3a12005-04-28 12:11:49 -07001969 .mode = 0200,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001970 .proc_handler = &ipv6_sysctl_rtcache_flush
1971 },
1972 {
1973 .ctl_name = NET_IPV6_ROUTE_GC_THRESH,
1974 .procname = "gc_thresh",
1975 .data = &ip6_dst_ops.gc_thresh,
1976 .maxlen = sizeof(int),
1977 .mode = 0644,
1978 .proc_handler = &proc_dointvec,
1979 },
1980 {
1981 .ctl_name = NET_IPV6_ROUTE_MAX_SIZE,
1982 .procname = "max_size",
1983 .data = &ip6_rt_max_size,
1984 .maxlen = sizeof(int),
1985 .mode = 0644,
1986 .proc_handler = &proc_dointvec,
1987 },
1988 {
1989 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL,
1990 .procname = "gc_min_interval",
1991 .data = &ip6_rt_gc_min_interval,
1992 .maxlen = sizeof(int),
1993 .mode = 0644,
1994 .proc_handler = &proc_dointvec_jiffies,
1995 .strategy = &sysctl_jiffies,
1996 },
1997 {
1998 .ctl_name = NET_IPV6_ROUTE_GC_TIMEOUT,
1999 .procname = "gc_timeout",
2000 .data = &ip6_rt_gc_timeout,
2001 .maxlen = sizeof(int),
2002 .mode = 0644,
2003 .proc_handler = &proc_dointvec_jiffies,
2004 .strategy = &sysctl_jiffies,
2005 },
2006 {
2007 .ctl_name = NET_IPV6_ROUTE_GC_INTERVAL,
2008 .procname = "gc_interval",
2009 .data = &ip6_rt_gc_interval,
2010 .maxlen = sizeof(int),
2011 .mode = 0644,
2012 .proc_handler = &proc_dointvec_jiffies,
2013 .strategy = &sysctl_jiffies,
2014 },
2015 {
2016 .ctl_name = NET_IPV6_ROUTE_GC_ELASTICITY,
2017 .procname = "gc_elasticity",
2018 .data = &ip6_rt_gc_elasticity,
2019 .maxlen = sizeof(int),
2020 .mode = 0644,
2021 .proc_handler = &proc_dointvec_jiffies,
2022 .strategy = &sysctl_jiffies,
2023 },
2024 {
2025 .ctl_name = NET_IPV6_ROUTE_MTU_EXPIRES,
2026 .procname = "mtu_expires",
2027 .data = &ip6_rt_mtu_expires,
2028 .maxlen = sizeof(int),
2029 .mode = 0644,
2030 .proc_handler = &proc_dointvec_jiffies,
2031 .strategy = &sysctl_jiffies,
2032 },
2033 {
2034 .ctl_name = NET_IPV6_ROUTE_MIN_ADVMSS,
2035 .procname = "min_adv_mss",
2036 .data = &ip6_rt_min_advmss,
2037 .maxlen = sizeof(int),
2038 .mode = 0644,
2039 .proc_handler = &proc_dointvec_jiffies,
2040 .strategy = &sysctl_jiffies,
2041 },
2042 {
2043 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2044 .procname = "gc_min_interval_ms",
2045 .data = &ip6_rt_gc_min_interval,
2046 .maxlen = sizeof(int),
2047 .mode = 0644,
2048 .proc_handler = &proc_dointvec_ms_jiffies,
2049 .strategy = &sysctl_ms_jiffies,
2050 },
2051 { .ctl_name = 0 }
2052};
2053
2054#endif
2055
2056void __init ip6_route_init(void)
2057{
2058 struct proc_dir_entry *p;
2059
2060 ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2061 sizeof(struct rt6_info),
2062 0, SLAB_HWCACHE_ALIGN,
2063 NULL, NULL);
2064 if (!ip6_dst_ops.kmem_cachep)
2065 panic("cannot create ip6_dst_cache");
2066
2067 fib6_init();
2068#ifdef CONFIG_PROC_FS
2069 p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2070 if (p)
2071 p->owner = THIS_MODULE;
2072
2073 proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2074#endif
2075#ifdef CONFIG_XFRM
2076 xfrm6_init();
2077#endif
2078}
2079
2080void ip6_route_cleanup(void)
2081{
2082#ifdef CONFIG_PROC_FS
2083 proc_net_remove("ipv6_route");
2084 proc_net_remove("rt6_stats");
2085#endif
2086#ifdef CONFIG_XFRM
2087 xfrm6_fini();
2088#endif
2089 rt6_ifdown(NULL);
2090 fib6_gc_cleanup();
2091 kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2092}