blob: fcd7da830aca813b02bcebba9ec4ce2763c6bdf5 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * Linux INET6 implementation
3 * Forwarding Information Database
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * $Id: ip6_fib.c,v 1.25 2001/10/31 21:55:55 davem Exp $
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 */
15
16/*
17 * Changes:
18 * Yuji SEKIYA @USAGI: Support default route on router node;
19 * remove ip6_null_entry from the top of
20 * routing table.
21 */
Linus Torvalds1da177e2005-04-16 15:20:36 -070022#include <linux/errno.h>
23#include <linux/types.h>
24#include <linux/net.h>
25#include <linux/route.h>
26#include <linux/netdevice.h>
27#include <linux/in6.h>
28#include <linux/init.h>
Thomas Grafc71099a2006-08-04 23:20:06 -070029#include <linux/list.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070030
31#ifdef CONFIG_PROC_FS
32#include <linux/proc_fs.h>
33#endif
34
35#include <net/ipv6.h>
36#include <net/ndisc.h>
37#include <net/addrconf.h>
38
39#include <net/ip6_fib.h>
40#include <net/ip6_route.h>
41
42#define RT6_DEBUG 2
43
44#if RT6_DEBUG >= 3
45#define RT6_TRACE(x...) printk(KERN_DEBUG x)
46#else
47#define RT6_TRACE(x...) do { ; } while (0)
48#endif
49
50struct rt6_statistics rt6_stats;
51
Eric Dumazetba899662005-08-26 12:05:31 -070052static kmem_cache_t * fib6_node_kmem __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070053
54enum fib_walk_state_t
55{
56#ifdef CONFIG_IPV6_SUBTREES
57 FWS_S,
58#endif
59 FWS_L,
60 FWS_R,
61 FWS_C,
62 FWS_U
63};
64
65struct fib6_cleaner_t
66{
67 struct fib6_walker_t w;
68 int (*func)(struct rt6_info *, void *arg);
69 void *arg;
70};
71
72DEFINE_RWLOCK(fib6_walker_lock);
73
74
75#ifdef CONFIG_IPV6_SUBTREES
76#define FWS_INIT FWS_S
77#define SUBTREE(fn) ((fn)->subtree)
78#else
79#define FWS_INIT FWS_L
80#define SUBTREE(fn) NULL
81#endif
82
83static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt);
84static struct fib6_node * fib6_repair_tree(struct fib6_node *fn);
85
86/*
87 * A routing update causes an increase of the serial number on the
88 * affected subtree. This allows for cached routes to be asynchronously
89 * tested when modifications are made to the destination cache as a
90 * result of redirects, path MTU changes, etc.
91 */
92
93static __u32 rt_sernum;
94
Ingo Molnar8d06afa2005-09-09 13:10:40 -070095static DEFINE_TIMER(ip6_fib_timer, fib6_run_gc, 0, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -070096
97struct fib6_walker_t fib6_walker_list = {
98 .prev = &fib6_walker_list,
99 .next = &fib6_walker_list,
100};
101
102#define FOR_WALKERS(w) for ((w)=fib6_walker_list.next; (w) != &fib6_walker_list; (w)=(w)->next)
103
104static __inline__ u32 fib6_new_sernum(void)
105{
106 u32 n = ++rt_sernum;
107 if ((__s32)n <= 0)
108 rt_sernum = n = 1;
109 return n;
110}
111
112/*
113 * Auxiliary address test functions for the radix tree.
114 *
115 * These assume a 32bit processor (although it will work on
116 * 64bit processors)
117 */
118
119/*
120 * test bit
121 */
122
123static __inline__ int addr_bit_set(void *token, int fn_bit)
124{
125 __u32 *addr = token;
126
127 return htonl(1 << ((~fn_bit)&0x1F)) & addr[fn_bit>>5];
128}
129
Linus Torvalds1da177e2005-04-16 15:20:36 -0700130static __inline__ struct fib6_node * node_alloc(void)
131{
132 struct fib6_node *fn;
133
134 if ((fn = kmem_cache_alloc(fib6_node_kmem, SLAB_ATOMIC)) != NULL)
135 memset(fn, 0, sizeof(struct fib6_node));
136
137 return fn;
138}
139
140static __inline__ void node_free(struct fib6_node * fn)
141{
142 kmem_cache_free(fib6_node_kmem, fn);
143}
144
145static __inline__ void rt6_release(struct rt6_info *rt)
146{
147 if (atomic_dec_and_test(&rt->rt6i_ref))
148 dst_free(&rt->u.dst);
149}
150
Thomas Grafc71099a2006-08-04 23:20:06 -0700151static struct fib6_table fib6_main_tbl = {
152 .tb6_id = RT6_TABLE_MAIN,
153 .tb6_lock = RW_LOCK_UNLOCKED,
154 .tb6_root = {
155 .leaf = &ip6_null_entry,
156 .fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
157 },
158};
159
160#ifdef CONFIG_IPV6_MULTIPLE_TABLES
161
162#define FIB_TABLE_HASHSZ 256
163static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ];
164
165static struct fib6_table *fib6_alloc_table(u32 id)
166{
167 struct fib6_table *table;
168
169 table = kzalloc(sizeof(*table), GFP_ATOMIC);
170 if (table != NULL) {
171 table->tb6_id = id;
172 table->tb6_lock = RW_LOCK_UNLOCKED;
173 table->tb6_root.leaf = &ip6_null_entry;
174 table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
175 }
176
177 return table;
178}
179
180static void fib6_link_table(struct fib6_table *tb)
181{
182 unsigned int h;
183
184 h = tb->tb6_id & (FIB_TABLE_HASHSZ - 1);
185
186 /*
187 * No protection necessary, this is the only list mutatation
188 * operation, tables never disappear once they exist.
189 */
190 hlist_add_head_rcu(&tb->tb6_hlist, &fib_table_hash[h]);
191}
192
193struct fib6_table *fib6_new_table(u32 id)
194{
195 struct fib6_table *tb;
196
197 if (id == 0)
198 id = RT6_TABLE_MAIN;
199 tb = fib6_get_table(id);
200 if (tb)
201 return tb;
202
203 tb = fib6_alloc_table(id);
204 if (tb != NULL)
205 fib6_link_table(tb);
206
207 return tb;
208}
209
210struct fib6_table *fib6_get_table(u32 id)
211{
212 struct fib6_table *tb;
213 struct hlist_node *node;
214 unsigned int h;
215
216 if (id == 0)
217 id = RT6_TABLE_MAIN;
218 h = id & (FIB_TABLE_HASHSZ - 1);
219 rcu_read_lock();
220 hlist_for_each_entry_rcu(tb, node, &fib_table_hash[h], tb6_hlist) {
221 if (tb->tb6_id == id) {
222 rcu_read_unlock();
223 return tb;
224 }
225 }
226 rcu_read_unlock();
227
228 return NULL;
229}
230
231struct dst_entry *fib6_rule_lookup(struct flowi *fl, int flags,
232 pol_lookup_t lookup)
233{
234 /*
235 * TODO: Add rule lookup
236 */
237 struct fib6_table *table = fib6_get_table(RT6_TABLE_MAIN);
238
239 return (struct dst_entry *) lookup(table, fl, flags);
240}
241
242static void __init fib6_tables_init(void)
243{
244 fib6_link_table(&fib6_main_tbl);
245}
246
247#else
248
249struct fib6_table *fib6_new_table(u32 id)
250{
251 return fib6_get_table(id);
252}
253
254struct fib6_table *fib6_get_table(u32 id)
255{
256 return &fib6_main_tbl;
257}
258
259struct dst_entry *fib6_rule_lookup(struct flowi *fl, int flags,
260 pol_lookup_t lookup)
261{
262 return (struct dst_entry *) lookup(&fib6_main_tbl, fl, flags);
263}
264
265static void __init fib6_tables_init(void)
266{
267}
268
269#endif
270
Linus Torvalds1da177e2005-04-16 15:20:36 -0700271
272/*
273 * Routing Table
274 *
275 * return the appropriate node for a routing tree "add" operation
276 * by either creating and inserting or by returning an existing
277 * node.
278 */
279
280static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr,
281 int addrlen, int plen,
282 int offset)
283{
284 struct fib6_node *fn, *in, *ln;
285 struct fib6_node *pn = NULL;
286 struct rt6key *key;
287 int bit;
288 int dir = 0;
289 __u32 sernum = fib6_new_sernum();
290
291 RT6_TRACE("fib6_add_1\n");
292
293 /* insert node in tree */
294
295 fn = root;
296
297 do {
298 key = (struct rt6key *)((u8 *)fn->leaf + offset);
299
300 /*
301 * Prefix match
302 */
303 if (plen < fn->fn_bit ||
304 !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
305 goto insert_above;
306
307 /*
308 * Exact match ?
309 */
310
311 if (plen == fn->fn_bit) {
312 /* clean up an intermediate node */
313 if ((fn->fn_flags & RTN_RTINFO) == 0) {
314 rt6_release(fn->leaf);
315 fn->leaf = NULL;
316 }
317
318 fn->fn_sernum = sernum;
319
320 return fn;
321 }
322
323 /*
324 * We have more bits to go
325 */
326
327 /* Try to walk down on tree. */
328 fn->fn_sernum = sernum;
329 dir = addr_bit_set(addr, fn->fn_bit);
330 pn = fn;
331 fn = dir ? fn->right: fn->left;
332 } while (fn);
333
334 /*
335 * We walked to the bottom of tree.
336 * Create new leaf node without children.
337 */
338
339 ln = node_alloc();
340
341 if (ln == NULL)
342 return NULL;
343 ln->fn_bit = plen;
344
345 ln->parent = pn;
346 ln->fn_sernum = sernum;
347
348 if (dir)
349 pn->right = ln;
350 else
351 pn->left = ln;
352
353 return ln;
354
355
356insert_above:
357 /*
358 * split since we don't have a common prefix anymore or
359 * we have a less significant route.
360 * we've to insert an intermediate node on the list
361 * this new node will point to the one we need to create
362 * and the current
363 */
364
365 pn = fn->parent;
366
367 /* find 1st bit in difference between the 2 addrs.
368
YOSHIFUJI Hideaki971f3592005-11-08 09:37:56 -0800369 See comment in __ipv6_addr_diff: bit may be an invalid value,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700370 but if it is >= plen, the value is ignored in any case.
371 */
372
YOSHIFUJI Hideaki971f3592005-11-08 09:37:56 -0800373 bit = __ipv6_addr_diff(addr, &key->addr, addrlen);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700374
375 /*
376 * (intermediate)[in]
377 * / \
378 * (new leaf node)[ln] (old node)[fn]
379 */
380 if (plen > bit) {
381 in = node_alloc();
382 ln = node_alloc();
383
384 if (in == NULL || ln == NULL) {
385 if (in)
386 node_free(in);
387 if (ln)
388 node_free(ln);
389 return NULL;
390 }
391
392 /*
393 * new intermediate node.
394 * RTN_RTINFO will
395 * be off since that an address that chooses one of
396 * the branches would not match less specific routes
397 * in the other branch
398 */
399
400 in->fn_bit = bit;
401
402 in->parent = pn;
403 in->leaf = fn->leaf;
404 atomic_inc(&in->leaf->rt6i_ref);
405
406 in->fn_sernum = sernum;
407
408 /* update parent pointer */
409 if (dir)
410 pn->right = in;
411 else
412 pn->left = in;
413
414 ln->fn_bit = plen;
415
416 ln->parent = in;
417 fn->parent = in;
418
419 ln->fn_sernum = sernum;
420
421 if (addr_bit_set(addr, bit)) {
422 in->right = ln;
423 in->left = fn;
424 } else {
425 in->left = ln;
426 in->right = fn;
427 }
428 } else { /* plen <= bit */
429
430 /*
431 * (new leaf node)[ln]
432 * / \
433 * (old node)[fn] NULL
434 */
435
436 ln = node_alloc();
437
438 if (ln == NULL)
439 return NULL;
440
441 ln->fn_bit = plen;
442
443 ln->parent = pn;
444
445 ln->fn_sernum = sernum;
446
447 if (dir)
448 pn->right = ln;
449 else
450 pn->left = ln;
451
452 if (addr_bit_set(&key->addr, plen))
453 ln->right = fn;
454 else
455 ln->left = fn;
456
457 fn->parent = ln;
458 }
459 return ln;
460}
461
462/*
463 * Insert routing information in a node.
464 */
465
466static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700467 struct nlmsghdr *nlh, struct netlink_skb_parms *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700468{
469 struct rt6_info *iter = NULL;
470 struct rt6_info **ins;
471
472 ins = &fn->leaf;
473
474 if (fn->fn_flags&RTN_TL_ROOT &&
475 fn->leaf == &ip6_null_entry &&
476 !(rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ){
477 fn->leaf = rt;
478 rt->u.next = NULL;
479 goto out;
480 }
481
482 for (iter = fn->leaf; iter; iter=iter->u.next) {
483 /*
484 * Search for duplicates
485 */
486
487 if (iter->rt6i_metric == rt->rt6i_metric) {
488 /*
489 * Same priority level
490 */
491
492 if (iter->rt6i_dev == rt->rt6i_dev &&
493 iter->rt6i_idev == rt->rt6i_idev &&
494 ipv6_addr_equal(&iter->rt6i_gateway,
495 &rt->rt6i_gateway)) {
496 if (!(iter->rt6i_flags&RTF_EXPIRES))
497 return -EEXIST;
498 iter->rt6i_expires = rt->rt6i_expires;
499 if (!(rt->rt6i_flags&RTF_EXPIRES)) {
500 iter->rt6i_flags &= ~RTF_EXPIRES;
501 iter->rt6i_expires = 0;
502 }
503 return -EEXIST;
504 }
505 }
506
507 if (iter->rt6i_metric > rt->rt6i_metric)
508 break;
509
510 ins = &iter->u.next;
511 }
512
513 /*
514 * insert node
515 */
516
517out:
518 rt->u.next = iter;
519 *ins = rt;
520 rt->rt6i_node = fn;
521 atomic_inc(&rt->rt6i_ref);
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700522 inet6_rt_notify(RTM_NEWROUTE, rt, nlh, req);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700523 rt6_stats.fib_rt_entries++;
524
525 if ((fn->fn_flags & RTN_RTINFO) == 0) {
526 rt6_stats.fib_route_nodes++;
527 fn->fn_flags |= RTN_RTINFO;
528 }
529
530 return 0;
531}
532
533static __inline__ void fib6_start_gc(struct rt6_info *rt)
534{
535 if (ip6_fib_timer.expires == 0 &&
536 (rt->rt6i_flags & (RTF_EXPIRES|RTF_CACHE)))
537 mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval);
538}
539
540void fib6_force_start_gc(void)
541{
542 if (ip6_fib_timer.expires == 0)
543 mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval);
544}
545
546/*
547 * Add routing information to the routing tree.
548 * <destination addr>/<source addr>
549 * with source addr info in sub-trees
550 */
551
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700552int fib6_add(struct fib6_node *root, struct rt6_info *rt,
553 struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700554{
555 struct fib6_node *fn;
556 int err = -ENOMEM;
557
558 fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr),
559 rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst));
560
561 if (fn == NULL)
562 goto out;
563
564#ifdef CONFIG_IPV6_SUBTREES
565 if (rt->rt6i_src.plen) {
566 struct fib6_node *sn;
567
568 if (fn->subtree == NULL) {
569 struct fib6_node *sfn;
570
571 /*
572 * Create subtree.
573 *
574 * fn[main tree]
575 * |
576 * sfn[subtree root]
577 * \
578 * sn[new leaf node]
579 */
580
581 /* Create subtree root node */
582 sfn = node_alloc();
583 if (sfn == NULL)
584 goto st_failure;
585
586 sfn->leaf = &ip6_null_entry;
587 atomic_inc(&ip6_null_entry.rt6i_ref);
588 sfn->fn_flags = RTN_ROOT;
589 sfn->fn_sernum = fib6_new_sernum();
590
591 /* Now add the first leaf node to new subtree */
592
593 sn = fib6_add_1(sfn, &rt->rt6i_src.addr,
594 sizeof(struct in6_addr), rt->rt6i_src.plen,
595 offsetof(struct rt6_info, rt6i_src));
596
597 if (sn == NULL) {
598 /* If it is failed, discard just allocated
599 root, and then (in st_failure) stale node
600 in main tree.
601 */
602 node_free(sfn);
603 goto st_failure;
604 }
605
606 /* Now link new subtree to main tree */
607 sfn->parent = fn;
608 fn->subtree = sfn;
609 if (fn->leaf == NULL) {
610 fn->leaf = rt;
611 atomic_inc(&rt->rt6i_ref);
612 }
613 } else {
614 sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr,
615 sizeof(struct in6_addr), rt->rt6i_src.plen,
616 offsetof(struct rt6_info, rt6i_src));
617
618 if (sn == NULL)
619 goto st_failure;
620 }
621
622 fn = sn;
623 }
624#endif
625
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700626 err = fib6_add_rt2node(fn, rt, nlh, req);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700627
628 if (err == 0) {
629 fib6_start_gc(rt);
630 if (!(rt->rt6i_flags&RTF_CACHE))
631 fib6_prune_clones(fn, rt);
632 }
633
634out:
635 if (err)
636 dst_free(&rt->u.dst);
637 return err;
638
639#ifdef CONFIG_IPV6_SUBTREES
640 /* Subtree creation failed, probably main tree node
641 is orphan. If it is, shoot it.
642 */
643st_failure:
644 if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
645 fib6_repair_tree(fn);
646 dst_free(&rt->u.dst);
647 return err;
648#endif
649}
650
651/*
652 * Routing tree lookup
653 *
654 */
655
656struct lookup_args {
657 int offset; /* key offset on rt6_info */
658 struct in6_addr *addr; /* search key */
659};
660
661static struct fib6_node * fib6_lookup_1(struct fib6_node *root,
662 struct lookup_args *args)
663{
664 struct fib6_node *fn;
665 int dir;
666
667 /*
668 * Descend on a tree
669 */
670
671 fn = root;
672
673 for (;;) {
674 struct fib6_node *next;
675
676 dir = addr_bit_set(args->addr, fn->fn_bit);
677
678 next = dir ? fn->right : fn->left;
679
680 if (next) {
681 fn = next;
682 continue;
683 }
684
685 break;
686 }
687
688 while ((fn->fn_flags & RTN_ROOT) == 0) {
689#ifdef CONFIG_IPV6_SUBTREES
690 if (fn->subtree) {
691 struct fib6_node *st;
692 struct lookup_args *narg;
693
694 narg = args + 1;
695
696 if (narg->addr) {
697 st = fib6_lookup_1(fn->subtree, narg);
698
699 if (st && !(st->fn_flags & RTN_ROOT))
700 return st;
701 }
702 }
703#endif
704
705 if (fn->fn_flags & RTN_RTINFO) {
706 struct rt6key *key;
707
708 key = (struct rt6key *) ((u8 *) fn->leaf +
709 args->offset);
710
711 if (ipv6_prefix_equal(&key->addr, args->addr, key->plen))
712 return fn;
713 }
714
715 fn = fn->parent;
716 }
717
718 return NULL;
719}
720
721struct fib6_node * fib6_lookup(struct fib6_node *root, struct in6_addr *daddr,
722 struct in6_addr *saddr)
723{
724 struct lookup_args args[2];
725 struct fib6_node *fn;
726
727 args[0].offset = offsetof(struct rt6_info, rt6i_dst);
728 args[0].addr = daddr;
729
730#ifdef CONFIG_IPV6_SUBTREES
731 args[1].offset = offsetof(struct rt6_info, rt6i_src);
732 args[1].addr = saddr;
733#endif
734
735 fn = fib6_lookup_1(root, args);
736
737 if (fn == NULL || fn->fn_flags & RTN_TL_ROOT)
738 fn = root;
739
740 return fn;
741}
742
743/*
744 * Get node with specified destination prefix (and source prefix,
745 * if subtrees are used)
746 */
747
748
749static struct fib6_node * fib6_locate_1(struct fib6_node *root,
750 struct in6_addr *addr,
751 int plen, int offset)
752{
753 struct fib6_node *fn;
754
755 for (fn = root; fn ; ) {
756 struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset);
757
758 /*
759 * Prefix match
760 */
761 if (plen < fn->fn_bit ||
762 !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
763 return NULL;
764
765 if (plen == fn->fn_bit)
766 return fn;
767
768 /*
769 * We have more bits to go
770 */
771 if (addr_bit_set(addr, fn->fn_bit))
772 fn = fn->right;
773 else
774 fn = fn->left;
775 }
776 return NULL;
777}
778
779struct fib6_node * fib6_locate(struct fib6_node *root,
780 struct in6_addr *daddr, int dst_len,
781 struct in6_addr *saddr, int src_len)
782{
783 struct fib6_node *fn;
784
785 fn = fib6_locate_1(root, daddr, dst_len,
786 offsetof(struct rt6_info, rt6i_dst));
787
788#ifdef CONFIG_IPV6_SUBTREES
789 if (src_len) {
790 BUG_TRAP(saddr!=NULL);
791 if (fn == NULL)
792 fn = fn->subtree;
793 if (fn)
794 fn = fib6_locate_1(fn, saddr, src_len,
795 offsetof(struct rt6_info, rt6i_src));
796 }
797#endif
798
799 if (fn && fn->fn_flags&RTN_RTINFO)
800 return fn;
801
802 return NULL;
803}
804
805
806/*
807 * Deletion
808 *
809 */
810
811static struct rt6_info * fib6_find_prefix(struct fib6_node *fn)
812{
813 if (fn->fn_flags&RTN_ROOT)
814 return &ip6_null_entry;
815
816 while(fn) {
817 if(fn->left)
818 return fn->left->leaf;
819
820 if(fn->right)
821 return fn->right->leaf;
822
823 fn = SUBTREE(fn);
824 }
825 return NULL;
826}
827
828/*
829 * Called to trim the tree of intermediate nodes when possible. "fn"
830 * is the node we want to try and remove.
831 */
832
833static struct fib6_node * fib6_repair_tree(struct fib6_node *fn)
834{
835 int children;
836 int nstate;
837 struct fib6_node *child, *pn;
838 struct fib6_walker_t *w;
839 int iter = 0;
840
841 for (;;) {
842 RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
843 iter++;
844
845 BUG_TRAP(!(fn->fn_flags&RTN_RTINFO));
846 BUG_TRAP(!(fn->fn_flags&RTN_TL_ROOT));
847 BUG_TRAP(fn->leaf==NULL);
848
849 children = 0;
850 child = NULL;
851 if (fn->right) child = fn->right, children |= 1;
852 if (fn->left) child = fn->left, children |= 2;
853
854 if (children == 3 || SUBTREE(fn)
855#ifdef CONFIG_IPV6_SUBTREES
856 /* Subtree root (i.e. fn) may have one child */
857 || (children && fn->fn_flags&RTN_ROOT)
858#endif
859 ) {
860 fn->leaf = fib6_find_prefix(fn);
861#if RT6_DEBUG >= 2
862 if (fn->leaf==NULL) {
863 BUG_TRAP(fn->leaf);
864 fn->leaf = &ip6_null_entry;
865 }
866#endif
867 atomic_inc(&fn->leaf->rt6i_ref);
868 return fn->parent;
869 }
870
871 pn = fn->parent;
872#ifdef CONFIG_IPV6_SUBTREES
873 if (SUBTREE(pn) == fn) {
874 BUG_TRAP(fn->fn_flags&RTN_ROOT);
875 SUBTREE(pn) = NULL;
876 nstate = FWS_L;
877 } else {
878 BUG_TRAP(!(fn->fn_flags&RTN_ROOT));
879#endif
880 if (pn->right == fn) pn->right = child;
881 else if (pn->left == fn) pn->left = child;
882#if RT6_DEBUG >= 2
883 else BUG_TRAP(0);
884#endif
885 if (child)
886 child->parent = pn;
887 nstate = FWS_R;
888#ifdef CONFIG_IPV6_SUBTREES
889 }
890#endif
891
892 read_lock(&fib6_walker_lock);
893 FOR_WALKERS(w) {
894 if (child == NULL) {
895 if (w->root == fn) {
896 w->root = w->node = NULL;
897 RT6_TRACE("W %p adjusted by delroot 1\n", w);
898 } else if (w->node == fn) {
899 RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate);
900 w->node = pn;
901 w->state = nstate;
902 }
903 } else {
904 if (w->root == fn) {
905 w->root = child;
906 RT6_TRACE("W %p adjusted by delroot 2\n", w);
907 }
908 if (w->node == fn) {
909 w->node = child;
910 if (children&2) {
911 RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
912 w->state = w->state>=FWS_R ? FWS_U : FWS_INIT;
913 } else {
914 RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
915 w->state = w->state>=FWS_C ? FWS_U : FWS_INIT;
916 }
917 }
918 }
919 }
920 read_unlock(&fib6_walker_lock);
921
922 node_free(fn);
923 if (pn->fn_flags&RTN_RTINFO || SUBTREE(pn))
924 return pn;
925
926 rt6_release(pn->leaf);
927 pn->leaf = NULL;
928 fn = pn;
929 }
930}
931
932static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700933 struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700934{
935 struct fib6_walker_t *w;
936 struct rt6_info *rt = *rtp;
937
938 RT6_TRACE("fib6_del_route\n");
939
940 /* Unlink it */
941 *rtp = rt->u.next;
942 rt->rt6i_node = NULL;
943 rt6_stats.fib_rt_entries--;
944 rt6_stats.fib_discarded_routes++;
945
946 /* Adjust walkers */
947 read_lock(&fib6_walker_lock);
948 FOR_WALKERS(w) {
949 if (w->state == FWS_C && w->leaf == rt) {
950 RT6_TRACE("walker %p adjusted by delroute\n", w);
951 w->leaf = rt->u.next;
952 if (w->leaf == NULL)
953 w->state = FWS_U;
954 }
955 }
956 read_unlock(&fib6_walker_lock);
957
958 rt->u.next = NULL;
959
960 if (fn->leaf == NULL && fn->fn_flags&RTN_TL_ROOT)
961 fn->leaf = &ip6_null_entry;
962
963 /* If it was last route, expunge its radix tree node */
964 if (fn->leaf == NULL) {
965 fn->fn_flags &= ~RTN_RTINFO;
966 rt6_stats.fib_route_nodes--;
967 fn = fib6_repair_tree(fn);
968 }
969
970 if (atomic_read(&rt->rt6i_ref) != 1) {
971 /* This route is used as dummy address holder in some split
972 * nodes. It is not leaked, but it still holds other resources,
973 * which must be released in time. So, scan ascendant nodes
974 * and replace dummy references to this route with references
975 * to still alive ones.
976 */
977 while (fn) {
978 if (!(fn->fn_flags&RTN_RTINFO) && fn->leaf == rt) {
979 fn->leaf = fib6_find_prefix(fn);
980 atomic_inc(&fn->leaf->rt6i_ref);
981 rt6_release(rt);
982 }
983 fn = fn->parent;
984 }
985 /* No more references are possible at this point. */
986 if (atomic_read(&rt->rt6i_ref) != 1) BUG();
987 }
988
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700989 inet6_rt_notify(RTM_DELROUTE, rt, nlh, req);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700990 rt6_release(rt);
991}
992
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700993int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700994{
995 struct fib6_node *fn = rt->rt6i_node;
996 struct rt6_info **rtp;
997
998#if RT6_DEBUG >= 2
999 if (rt->u.dst.obsolete>0) {
1000 BUG_TRAP(fn==NULL);
1001 return -ENOENT;
1002 }
1003#endif
1004 if (fn == NULL || rt == &ip6_null_entry)
1005 return -ENOENT;
1006
1007 BUG_TRAP(fn->fn_flags&RTN_RTINFO);
1008
1009 if (!(rt->rt6i_flags&RTF_CACHE))
1010 fib6_prune_clones(fn, rt);
1011
1012 /*
1013 * Walk the leaf entries looking for ourself
1014 */
1015
1016 for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.next) {
1017 if (*rtp == rt) {
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001018 fib6_del_route(fn, rtp, nlh, _rtattr, req);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001019 return 0;
1020 }
1021 }
1022 return -ENOENT;
1023}
1024
1025/*
1026 * Tree traversal function.
1027 *
1028 * Certainly, it is not interrupt safe.
1029 * However, it is internally reenterable wrt itself and fib6_add/fib6_del.
1030 * It means, that we can modify tree during walking
1031 * and use this function for garbage collection, clone pruning,
1032 * cleaning tree when a device goes down etc. etc.
1033 *
1034 * It guarantees that every node will be traversed,
1035 * and that it will be traversed only once.
1036 *
1037 * Callback function w->func may return:
1038 * 0 -> continue walking.
1039 * positive value -> walking is suspended (used by tree dumps,
1040 * and probably by gc, if it will be split to several slices)
1041 * negative value -> terminate walking.
1042 *
1043 * The function itself returns:
1044 * 0 -> walk is complete.
1045 * >0 -> walk is incomplete (i.e. suspended)
1046 * <0 -> walk is terminated by an error.
1047 */
1048
1049int fib6_walk_continue(struct fib6_walker_t *w)
1050{
1051 struct fib6_node *fn, *pn;
1052
1053 for (;;) {
1054 fn = w->node;
1055 if (fn == NULL)
1056 return 0;
1057
1058 if (w->prune && fn != w->root &&
1059 fn->fn_flags&RTN_RTINFO && w->state < FWS_C) {
1060 w->state = FWS_C;
1061 w->leaf = fn->leaf;
1062 }
1063 switch (w->state) {
1064#ifdef CONFIG_IPV6_SUBTREES
1065 case FWS_S:
1066 if (SUBTREE(fn)) {
1067 w->node = SUBTREE(fn);
1068 continue;
1069 }
1070 w->state = FWS_L;
1071#endif
1072 case FWS_L:
1073 if (fn->left) {
1074 w->node = fn->left;
1075 w->state = FWS_INIT;
1076 continue;
1077 }
1078 w->state = FWS_R;
1079 case FWS_R:
1080 if (fn->right) {
1081 w->node = fn->right;
1082 w->state = FWS_INIT;
1083 continue;
1084 }
1085 w->state = FWS_C;
1086 w->leaf = fn->leaf;
1087 case FWS_C:
1088 if (w->leaf && fn->fn_flags&RTN_RTINFO) {
1089 int err = w->func(w);
1090 if (err)
1091 return err;
1092 continue;
1093 }
1094 w->state = FWS_U;
1095 case FWS_U:
1096 if (fn == w->root)
1097 return 0;
1098 pn = fn->parent;
1099 w->node = pn;
1100#ifdef CONFIG_IPV6_SUBTREES
1101 if (SUBTREE(pn) == fn) {
1102 BUG_TRAP(fn->fn_flags&RTN_ROOT);
1103 w->state = FWS_L;
1104 continue;
1105 }
1106#endif
1107 if (pn->left == fn) {
1108 w->state = FWS_R;
1109 continue;
1110 }
1111 if (pn->right == fn) {
1112 w->state = FWS_C;
1113 w->leaf = w->node->leaf;
1114 continue;
1115 }
1116#if RT6_DEBUG >= 2
1117 BUG_TRAP(0);
1118#endif
1119 }
1120 }
1121}
1122
1123int fib6_walk(struct fib6_walker_t *w)
1124{
1125 int res;
1126
1127 w->state = FWS_INIT;
1128 w->node = w->root;
1129
1130 fib6_walker_link(w);
1131 res = fib6_walk_continue(w);
1132 if (res <= 0)
1133 fib6_walker_unlink(w);
1134 return res;
1135}
1136
1137static int fib6_clean_node(struct fib6_walker_t *w)
1138{
1139 int res;
1140 struct rt6_info *rt;
1141 struct fib6_cleaner_t *c = (struct fib6_cleaner_t*)w;
1142
1143 for (rt = w->leaf; rt; rt = rt->u.next) {
1144 res = c->func(rt, c->arg);
1145 if (res < 0) {
1146 w->leaf = rt;
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001147 res = fib6_del(rt, NULL, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001148 if (res) {
1149#if RT6_DEBUG >= 2
1150 printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res);
1151#endif
1152 continue;
1153 }
1154 return 0;
1155 }
1156 BUG_TRAP(res==0);
1157 }
1158 w->leaf = rt;
1159 return 0;
1160}
1161
1162/*
1163 * Convenient frontend to tree walker.
1164 *
1165 * func is called on each route.
1166 * It may return -1 -> delete this route.
1167 * 0 -> continue walking
1168 *
1169 * prune==1 -> only immediate children of node (certainly,
1170 * ignoring pure split nodes) will be scanned.
1171 */
1172
1173void fib6_clean_tree(struct fib6_node *root,
1174 int (*func)(struct rt6_info *, void *arg),
1175 int prune, void *arg)
1176{
1177 struct fib6_cleaner_t c;
1178
1179 c.w.root = root;
1180 c.w.func = fib6_clean_node;
1181 c.w.prune = prune;
1182 c.func = func;
1183 c.arg = arg;
1184
1185 fib6_walk(&c.w);
1186}
1187
Thomas Grafc71099a2006-08-04 23:20:06 -07001188void fib6_clean_all(int (*func)(struct rt6_info *, void *arg),
1189 int prune, void *arg)
1190{
1191 int i;
1192 struct fib6_table *table;
1193
1194 for (i = FIB6_TABLE_MIN; i <= FIB6_TABLE_MAX; i++) {
1195 table = fib6_get_table(i);
1196 if (table != NULL) {
1197 write_lock_bh(&table->tb6_lock);
1198 fib6_clean_tree(&table->tb6_root, func, prune, arg);
1199 write_unlock_bh(&table->tb6_lock);
1200 }
1201 }
1202}
1203
Linus Torvalds1da177e2005-04-16 15:20:36 -07001204static int fib6_prune_clone(struct rt6_info *rt, void *arg)
1205{
1206 if (rt->rt6i_flags & RTF_CACHE) {
1207 RT6_TRACE("pruning clone %p\n", rt);
1208 return -1;
1209 }
1210
1211 return 0;
1212}
1213
1214static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt)
1215{
1216 fib6_clean_tree(fn, fib6_prune_clone, 1, rt);
1217}
1218
1219/*
1220 * Garbage collection
1221 */
1222
1223static struct fib6_gc_args
1224{
1225 int timeout;
1226 int more;
1227} gc_args;
1228
1229static int fib6_age(struct rt6_info *rt, void *arg)
1230{
1231 unsigned long now = jiffies;
1232
1233 /*
1234 * check addrconf expiration here.
1235 * Routes are expired even if they are in use.
1236 *
1237 * Also age clones. Note, that clones are aged out
1238 * only if they are not in use now.
1239 */
1240
1241 if (rt->rt6i_flags&RTF_EXPIRES && rt->rt6i_expires) {
1242 if (time_after(now, rt->rt6i_expires)) {
1243 RT6_TRACE("expiring %p\n", rt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001244 return -1;
1245 }
1246 gc_args.more++;
1247 } else if (rt->rt6i_flags & RTF_CACHE) {
1248 if (atomic_read(&rt->u.dst.__refcnt) == 0 &&
1249 time_after_eq(now, rt->u.dst.lastuse + gc_args.timeout)) {
1250 RT6_TRACE("aging clone %p\n", rt);
1251 return -1;
1252 } else if ((rt->rt6i_flags & RTF_GATEWAY) &&
1253 (!(rt->rt6i_nexthop->flags & NTF_ROUTER))) {
1254 RT6_TRACE("purging route %p via non-router but gateway\n",
1255 rt);
1256 return -1;
1257 }
1258 gc_args.more++;
1259 }
1260
1261 return 0;
1262}
1263
1264static DEFINE_SPINLOCK(fib6_gc_lock);
1265
1266void fib6_run_gc(unsigned long dummy)
1267{
1268 if (dummy != ~0UL) {
1269 spin_lock_bh(&fib6_gc_lock);
1270 gc_args.timeout = dummy ? (int)dummy : ip6_rt_gc_interval;
1271 } else {
1272 local_bh_disable();
1273 if (!spin_trylock(&fib6_gc_lock)) {
1274 mod_timer(&ip6_fib_timer, jiffies + HZ);
1275 local_bh_enable();
1276 return;
1277 }
1278 gc_args.timeout = ip6_rt_gc_interval;
1279 }
1280 gc_args.more = 0;
1281
Linus Torvalds1da177e2005-04-16 15:20:36 -07001282 ndisc_dst_gc(&gc_args.more);
Thomas Grafc71099a2006-08-04 23:20:06 -07001283 fib6_clean_all(fib6_age, 0, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001284
1285 if (gc_args.more)
1286 mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval);
1287 else {
1288 del_timer(&ip6_fib_timer);
1289 ip6_fib_timer.expires = 0;
1290 }
1291 spin_unlock_bh(&fib6_gc_lock);
1292}
1293
1294void __init fib6_init(void)
1295{
1296 fib6_node_kmem = kmem_cache_create("fib6_nodes",
1297 sizeof(struct fib6_node),
1298 0, SLAB_HWCACHE_ALIGN,
1299 NULL, NULL);
1300 if (!fib6_node_kmem)
1301 panic("cannot create fib6_nodes cache");
Thomas Grafc71099a2006-08-04 23:20:06 -07001302
1303 fib6_tables_init();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001304}
1305
1306void fib6_gc_cleanup(void)
1307{
1308 del_timer(&ip6_fib_timer);
1309 kmem_cache_destroy(fib6_node_kmem);
1310}