blob: 4be234c7d8c3c7b9838691c2c40e41379925e27d [file] [log] [blame]
Robert Olsson19baf832005-06-21 12:43:18 -07001/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally descibed in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
34 *
35 * IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
44 */
45
Robert Olsson2f368952005-07-05 15:02:40 -070046#define VERSION "0.325"
Robert Olsson19baf832005-06-21 12:43:18 -070047
48#include <linux/config.h>
49#include <asm/uaccess.h>
50#include <asm/system.h>
51#include <asm/bitops.h>
52#include <linux/types.h>
53#include <linux/kernel.h>
54#include <linux/sched.h>
55#include <linux/mm.h>
56#include <linux/string.h>
57#include <linux/socket.h>
58#include <linux/sockios.h>
59#include <linux/errno.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_arp.h>
64#include <linux/proc_fs.h>
65#include <linux/skbuff.h>
66#include <linux/netlink.h>
67#include <linux/init.h>
68#include <linux/list.h>
69#include <net/ip.h>
70#include <net/protocol.h>
71#include <net/route.h>
72#include <net/tcp.h>
73#include <net/sock.h>
74#include <net/ip_fib.h>
75#include "fib_lookup.h"
76
77#undef CONFIG_IP_FIB_TRIE_STATS
78#define MAX_CHILDS 16384
79
80#define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
81#define KEYLENGTH (8*sizeof(t_key))
82#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
83#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
84
85static DEFINE_RWLOCK(fib_lock);
86
87typedef unsigned int t_key;
88
89#define T_TNODE 0
90#define T_LEAF 1
91#define NODE_TYPE_MASK 0x1UL
92#define NODE_PARENT(_node) \
93((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
94#define NODE_SET_PARENT(_node, _ptr) \
95((_node)->_parent = (((unsigned long)(_ptr)) | \
96 ((_node)->_parent & NODE_TYPE_MASK)))
97#define NODE_INIT_PARENT(_node, _type) \
98((_node)->_parent = (_type))
99#define NODE_TYPE(_node) \
100((_node)->_parent & NODE_TYPE_MASK)
101
102#define IS_TNODE(n) (!(n->_parent & T_LEAF))
103#define IS_LEAF(n) (n->_parent & T_LEAF)
104
105struct node {
106 t_key key;
107 unsigned long _parent;
108};
109
110struct leaf {
111 t_key key;
112 unsigned long _parent;
113 struct hlist_head list;
114};
115
116struct leaf_info {
117 struct hlist_node hlist;
118 int plen;
119 struct list_head falh;
120};
121
122struct tnode {
123 t_key key;
124 unsigned long _parent;
125 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
126 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
127 unsigned short full_children; /* KEYLENGTH bits needed */
128 unsigned short empty_children; /* KEYLENGTH bits needed */
129 struct node *child[0];
130};
131
132#ifdef CONFIG_IP_FIB_TRIE_STATS
133struct trie_use_stats {
134 unsigned int gets;
135 unsigned int backtrack;
136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit;
Robert Olsson2f368952005-07-05 15:02:40 -0700139 unsigned int resize_node_skipped;
Robert Olsson19baf832005-06-21 12:43:18 -0700140};
141#endif
142
143struct trie_stat {
144 unsigned int totdepth;
145 unsigned int maxdepth;
146 unsigned int tnodes;
147 unsigned int leaves;
148 unsigned int nullpointers;
149 unsigned int nodesizes[MAX_CHILDS];
150};
151
152struct trie {
153 struct node *trie;
154#ifdef CONFIG_IP_FIB_TRIE_STATS
155 struct trie_use_stats stats;
156#endif
157 int size;
158 unsigned int revision;
159};
160
161static int trie_debug = 0;
162
163static int tnode_full(struct tnode *tn, struct node *n);
164static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
165static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
166static int tnode_child_length(struct tnode *tn);
167static struct node *resize(struct trie *t, struct tnode *tn);
Robert Olsson2f368952005-07-05 15:02:40 -0700168static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
169static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
Robert Olsson19baf832005-06-21 12:43:18 -0700170static void tnode_free(struct tnode *tn);
171static void trie_dump_seq(struct seq_file *seq, struct trie *t);
172extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
173extern int fib_detect_death(struct fib_info *fi, int order,
174 struct fib_info **last_resort, int *last_idx, int *dflt);
175
176extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
177 struct nlmsghdr *n, struct netlink_skb_parms *req);
178
179static kmem_cache_t *fn_alias_kmem;
180static struct trie *trie_local = NULL, *trie_main = NULL;
181
182static void trie_bug(char *err)
183{
184 printk("Trie Bug: %s\n", err);
185 BUG();
186}
187
188static inline struct node *tnode_get_child(struct tnode *tn, int i)
189{
190 if (i >= 1<<tn->bits)
191 trie_bug("tnode_get_child");
192
193 return tn->child[i];
194}
195
196static inline int tnode_child_length(struct tnode *tn)
197{
198 return 1<<tn->bits;
199}
200
201/*
202 _________________________________________________________________
203 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
204 ----------------------------------------------------------------
205 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
206
207 _________________________________________________________________
208 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
209 -----------------------------------------------------------------
210 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
211
212 tp->pos = 7
213 tp->bits = 3
214 n->pos = 15
215 n->bits=4
216 KEYLENGTH=32
217*/
218
219static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
220{
221 if (offset < KEYLENGTH)
222 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
223 else
224 return 0;
225}
226
227static inline int tkey_equals(t_key a, t_key b)
228{
229 return a == b;
230}
231
232static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
233{
234 if (bits == 0 || offset >= KEYLENGTH)
235 return 1;
236 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
237 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
238}
239
240static inline int tkey_mismatch(t_key a, int offset, t_key b)
241{
242 t_key diff = a ^ b;
243 int i = offset;
244
245 if(!diff)
246 return 0;
247 while((diff << i) >> (KEYLENGTH-1) == 0)
248 i++;
249 return i;
250}
251
252/* Candiate for fib_semantics */
253
254static void fn_free_alias(struct fib_alias *fa)
255{
256 fib_release_info(fa->fa_info);
257 kmem_cache_free(fn_alias_kmem, fa);
258}
259
260/*
261 To understand this stuff, an understanding of keys and all their bits is
262 necessary. Every node in the trie has a key associated with it, but not
263 all of the bits in that key are significant.
264
265 Consider a node 'n' and its parent 'tp'.
266
267 If n is a leaf, every bit in its key is significant. Its presence is
268 necessitaded by path compression, since during a tree traversal (when
269 searching for a leaf - unless we are doing an insertion) we will completely
270 ignore all skipped bits we encounter. Thus we need to verify, at the end of
271 a potentially successful search, that we have indeed been walking the
272 correct key path.
273
274 Note that we can never "miss" the correct key in the tree if present by
275 following the wrong path. Path compression ensures that segments of the key
276 that are the same for all keys with a given prefix are skipped, but the
277 skipped part *is* identical for each node in the subtrie below the skipped
278 bit! trie_insert() in this implementation takes care of that - note the
279 call to tkey_sub_equals() in trie_insert().
280
281 if n is an internal node - a 'tnode' here, the various parts of its key
282 have many different meanings.
283
284 Example:
285 _________________________________________________________________
286 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
287 -----------------------------------------------------------------
288 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
289
290 _________________________________________________________________
291 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
292 -----------------------------------------------------------------
293 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
294
295 tp->pos = 7
296 tp->bits = 3
297 n->pos = 15
298 n->bits=4
299
300 First, let's just ignore the bits that come before the parent tp, that is
301 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
302 not use them for anything.
303
304 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
305 index into the parent's child array. That is, they will be used to find
306 'n' among tp's children.
307
308 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
309 for the node n.
310
311 All the bits we have seen so far are significant to the node n. The rest
312 of the bits are really not needed or indeed known in n->key.
313
314 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
315 n's child array, and will of course be different for each child.
316
317 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
318 at this point.
319
320*/
321
322static void check_tnode(struct tnode *tn)
323{
324 if(tn && tn->pos+tn->bits > 32) {
325 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
326 }
327}
328
329static int halve_threshold = 25;
330static int inflate_threshold = 50;
331
332static struct leaf *leaf_new(void)
333{
334 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
335 if(l) {
336 NODE_INIT_PARENT(l, T_LEAF);
337 INIT_HLIST_HEAD(&l->list);
338 }
339 return l;
340}
341
342static struct leaf_info *leaf_info_new(int plen)
343{
344 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
Robert Olssonf835e472005-06-28 15:00:39 -0700345 if(li) {
346 li->plen = plen;
347 INIT_LIST_HEAD(&li->falh);
348 }
Robert Olsson19baf832005-06-21 12:43:18 -0700349 return li;
350}
351
352static inline void free_leaf(struct leaf *l)
353{
354 kfree(l);
355}
356
357static inline void free_leaf_info(struct leaf_info *li)
358{
359 kfree(li);
360}
361
Patrick McHardyf0e36f82005-07-05 14:44:55 -0700362static struct tnode *tnode_alloc(unsigned int size)
363{
364 if (size <= PAGE_SIZE) {
365 return kmalloc(size, GFP_KERNEL);
366 } else {
367 return (struct tnode *)
368 __get_free_pages(GFP_KERNEL, get_order(size));
369 }
370}
371
372static void __tnode_free(struct tnode *tn)
373{
374 unsigned int size = sizeof(struct tnode) +
375 (1<<tn->bits) * sizeof(struct node *);
376
377 if (size <= PAGE_SIZE)
378 kfree(tn);
379 else
380 free_pages((unsigned long)tn, get_order(size));
381}
382
Robert Olsson19baf832005-06-21 12:43:18 -0700383static struct tnode* tnode_new(t_key key, int pos, int bits)
384{
385 int nchildren = 1<<bits;
386 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
Patrick McHardyf0e36f82005-07-05 14:44:55 -0700387 struct tnode *tn = tnode_alloc(sz);
Robert Olsson19baf832005-06-21 12:43:18 -0700388
389 if(tn) {
390 memset(tn, 0, sz);
391 NODE_INIT_PARENT(tn, T_TNODE);
392 tn->pos = pos;
393 tn->bits = bits;
394 tn->key = key;
395 tn->full_children = 0;
396 tn->empty_children = 1<<bits;
397 }
398 if(trie_debug > 0)
399 printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
400 (unsigned int) (sizeof(struct node) * 1<<bits));
401 return tn;
402}
403
404static void tnode_free(struct tnode *tn)
405{
406 if(!tn) {
407 trie_bug("tnode_free\n");
408 }
409 if(IS_LEAF(tn)) {
410 free_leaf((struct leaf *)tn);
411 if(trie_debug > 0 )
412 printk("FL %p \n", tn);
413 }
414 else if(IS_TNODE(tn)) {
Patrick McHardyf0e36f82005-07-05 14:44:55 -0700415 __tnode_free(tn);
Robert Olsson19baf832005-06-21 12:43:18 -0700416 if(trie_debug > 0 )
417 printk("FT %p \n", tn);
418 }
419 else {
420 trie_bug("tnode_free\n");
421 }
422}
423
424/*
425 * Check whether a tnode 'n' is "full", i.e. it is an internal node
426 * and no bits are skipped. See discussion in dyntree paper p. 6
427 */
428
429static inline int tnode_full(struct tnode *tn, struct node *n)
430{
431 if(n == NULL || IS_LEAF(n))
432 return 0;
433
434 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
435}
436
437static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
438{
439 tnode_put_child_reorg(tn, i, n, -1);
440}
441
442 /*
443 * Add a child at position i overwriting the old value.
444 * Update the value of full_children and empty_children.
445 */
446
447static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
448{
449 struct node *chi;
450 int isfull;
451
452 if(i >= 1<<tn->bits) {
453 printk("bits=%d, i=%d\n", tn->bits, i);
454 trie_bug("tnode_put_child_reorg bits");
455 }
456 write_lock_bh(&fib_lock);
457 chi = tn->child[i];
458
459 /* update emptyChildren */
460 if (n == NULL && chi != NULL)
461 tn->empty_children++;
462 else if (n != NULL && chi == NULL)
463 tn->empty_children--;
464
465 /* update fullChildren */
466 if (wasfull == -1)
467 wasfull = tnode_full(tn, chi);
468
469 isfull = tnode_full(tn, n);
470 if (wasfull && !isfull)
471 tn->full_children--;
472
473 else if (!wasfull && isfull)
474 tn->full_children++;
475 if(n)
476 NODE_SET_PARENT(n, tn);
477
478 tn->child[i] = n;
479 write_unlock_bh(&fib_lock);
480}
481
482static struct node *resize(struct trie *t, struct tnode *tn)
483{
484 int i;
Robert Olsson2f368952005-07-05 15:02:40 -0700485 int err = 0;
Robert Olsson19baf832005-06-21 12:43:18 -0700486
487 if (!tn)
488 return NULL;
489
490 if(trie_debug)
491 printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
492 tn, inflate_threshold, halve_threshold);
493
494 /* No children */
495 if (tn->empty_children == tnode_child_length(tn)) {
496 tnode_free(tn);
497 return NULL;
498 }
499 /* One child */
500 if (tn->empty_children == tnode_child_length(tn) - 1)
501 for (i = 0; i < tnode_child_length(tn); i++) {
502
503 write_lock_bh(&fib_lock);
504 if (tn->child[i] != NULL) {
505
506 /* compress one level */
507 struct node *n = tn->child[i];
508 if(n)
509 NODE_INIT_PARENT(n, NODE_TYPE(n));
510
511 write_unlock_bh(&fib_lock);
512 tnode_free(tn);
513 return n;
514 }
515 write_unlock_bh(&fib_lock);
516 }
517 /*
518 * Double as long as the resulting node has a number of
519 * nonempty nodes that are above the threshold.
520 */
521
522 /*
523 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
524 * the Helsinki University of Technology and Matti Tikkanen of Nokia
525 * Telecommunications, page 6:
526 * "A node is doubled if the ratio of non-empty children to all
527 * children in the *doubled* node is at least 'high'."
528 *
529 * 'high' in this instance is the variable 'inflate_threshold'. It
530 * is expressed as a percentage, so we multiply it with
531 * tnode_child_length() and instead of multiplying by 2 (since the
532 * child array will be doubled by inflate()) and multiplying
533 * the left-hand side by 100 (to handle the percentage thing) we
534 * multiply the left-hand side by 50.
535 *
536 * The left-hand side may look a bit weird: tnode_child_length(tn)
537 * - tn->empty_children is of course the number of non-null children
538 * in the current node. tn->full_children is the number of "full"
539 * children, that is non-null tnodes with a skip value of 0.
540 * All of those will be doubled in the resulting inflated tnode, so
541 * we just count them one extra time here.
542 *
543 * A clearer way to write this would be:
544 *
545 * to_be_doubled = tn->full_children;
546 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
547 * tn->full_children;
548 *
549 * new_child_length = tnode_child_length(tn) * 2;
550 *
551 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
552 * new_child_length;
553 * if (new_fill_factor >= inflate_threshold)
554 *
555 * ...and so on, tho it would mess up the while() loop.
556 *
557 * anyway,
558 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
559 * inflate_threshold
560 *
561 * avoid a division:
562 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
563 * inflate_threshold * new_child_length
564 *
565 * expand not_to_be_doubled and to_be_doubled, and shorten:
566 * 100 * (tnode_child_length(tn) - tn->empty_children +
567 * tn->full_children ) >= inflate_threshold * new_child_length
568 *
569 * expand new_child_length:
570 * 100 * (tnode_child_length(tn) - tn->empty_children +
571 * tn->full_children ) >=
572 * inflate_threshold * tnode_child_length(tn) * 2
573 *
574 * shorten again:
575 * 50 * (tn->full_children + tnode_child_length(tn) -
576 * tn->empty_children ) >= inflate_threshold *
577 * tnode_child_length(tn)
578 *
579 */
580
581 check_tnode(tn);
Robert Olsson2f368952005-07-05 15:02:40 -0700582
583 err = 0;
Robert Olsson19baf832005-06-21 12:43:18 -0700584 while ((tn->full_children > 0 &&
585 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
586 inflate_threshold * tnode_child_length(tn))) {
587
Robert Olsson2f368952005-07-05 15:02:40 -0700588 tn = inflate(t, tn, &err);
589
590 if(err) {
591#ifdef CONFIG_IP_FIB_TRIE_STATS
592 t->stats.resize_node_skipped++;
593#endif
594 break;
595 }
Robert Olsson19baf832005-06-21 12:43:18 -0700596 }
597
598 check_tnode(tn);
599
600 /*
601 * Halve as long as the number of empty children in this
602 * node is above threshold.
603 */
Robert Olsson2f368952005-07-05 15:02:40 -0700604
605 err = 0;
Robert Olsson19baf832005-06-21 12:43:18 -0700606 while (tn->bits > 1 &&
607 100 * (tnode_child_length(tn) - tn->empty_children) <
Robert Olsson2f368952005-07-05 15:02:40 -0700608 halve_threshold * tnode_child_length(tn)) {
Robert Olsson19baf832005-06-21 12:43:18 -0700609
Robert Olsson2f368952005-07-05 15:02:40 -0700610 tn = halve(t, tn, &err);
611
612 if(err) {
613#ifdef CONFIG_IP_FIB_TRIE_STATS
614 t->stats.resize_node_skipped++;
615#endif
616 break;
617 }
618 }
619
Robert Olsson19baf832005-06-21 12:43:18 -0700620
621 /* Only one child remains */
622
623 if (tn->empty_children == tnode_child_length(tn) - 1)
624 for (i = 0; i < tnode_child_length(tn); i++) {
625
626 write_lock_bh(&fib_lock);
627 if (tn->child[i] != NULL) {
628 /* compress one level */
629 struct node *n = tn->child[i];
630
631 if(n)
632 NODE_INIT_PARENT(n, NODE_TYPE(n));
633
634 write_unlock_bh(&fib_lock);
635 tnode_free(tn);
636 return n;
637 }
638 write_unlock_bh(&fib_lock);
639 }
640
641 return (struct node *) tn;
642}
643
Robert Olsson2f368952005-07-05 15:02:40 -0700644static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
Robert Olsson19baf832005-06-21 12:43:18 -0700645{
646 struct tnode *inode;
647 struct tnode *oldtnode = tn;
648 int olen = tnode_child_length(tn);
649 int i;
650
651 if(trie_debug)
652 printk("In inflate\n");
653
654 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
655
Robert Olsson2f368952005-07-05 15:02:40 -0700656 if (!tn) {
657 *err = -ENOMEM;
658 return oldtnode;
659 }
660
661 /*
662 * Preallocate and store tnodes before the actual work so we
663 * don't get into an inconsistent state if memory allocation
664 * fails. In case of failure we return the oldnode and inflate
665 * of tnode is ignored.
666 */
667
668 for(i = 0; i < olen; i++) {
669 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
670
671 if (inode &&
672 IS_TNODE(inode) &&
673 inode->pos == oldtnode->pos + oldtnode->bits &&
674 inode->bits > 1) {
675 struct tnode *left, *right;
676
677 t_key m = TKEY_GET_MASK(inode->pos, 1);
678
679 left = tnode_new(inode->key&(~m), inode->pos + 1,
680 inode->bits - 1);
681
682 if(!left) {
683 *err = -ENOMEM;
684 break;
685 }
686
687 right = tnode_new(inode->key|m, inode->pos + 1,
688 inode->bits - 1);
689
690 if(!right) {
691 *err = -ENOMEM;
692 break;
693 }
694
695 put_child(t, tn, 2*i, (struct node *) left);
696 put_child(t, tn, 2*i+1, (struct node *) right);
697 }
698 }
699
700 if(*err) {
701 int size = tnode_child_length(tn);
702 int j;
703
704 for(j = 0; j < size; j++)
705 if( tn->child[j])
706 tnode_free((struct tnode *)tn->child[j]);
707
708 tnode_free(tn);
709
710 *err = -ENOMEM;
711 return oldtnode;
712 }
Robert Olsson19baf832005-06-21 12:43:18 -0700713
714 for(i = 0; i < olen; i++) {
715 struct node *node = tnode_get_child(oldtnode, i);
716
717 /* An empty child */
718 if (node == NULL)
719 continue;
720
721 /* A leaf or an internal node with skipped bits */
722
723 if(IS_LEAF(node) || ((struct tnode *) node)->pos >
724 tn->pos + tn->bits - 1) {
Robert Olsson2f368952005-07-05 15:02:40 -0700725 if(tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
Robert Olsson19baf832005-06-21 12:43:18 -0700726 1) == 0)
727 put_child(t, tn, 2*i, node);
728 else
729 put_child(t, tn, 2*i+1, node);
730 continue;
731 }
732
733 /* An internal node with two children */
734 inode = (struct tnode *) node;
735
736 if (inode->bits == 1) {
737 put_child(t, tn, 2*i, inode->child[0]);
738 put_child(t, tn, 2*i+1, inode->child[1]);
739
740 tnode_free(inode);
741 }
742
743 /* An internal node with more than two children */
744 else {
745 struct tnode *left, *right;
746 int size, j;
747
748 /* We will replace this node 'inode' with two new
749 * ones, 'left' and 'right', each with half of the
750 * original children. The two new nodes will have
751 * a position one bit further down the key and this
752 * means that the "significant" part of their keys
753 * (see the discussion near the top of this file)
754 * will differ by one bit, which will be "0" in
755 * left's key and "1" in right's key. Since we are
756 * moving the key position by one step, the bit that
757 * we are moving away from - the bit at position
758 * (inode->pos) - is the one that will differ between
759 * left and right. So... we synthesize that bit in the
760 * two new keys.
761 * The mask 'm' below will be a single "one" bit at
762 * the position (inode->pos)
763 */
764
Robert Olsson19baf832005-06-21 12:43:18 -0700765 /* Use the old key, but set the new significant
766 * bit to zero.
767 */
Robert Olsson19baf832005-06-21 12:43:18 -0700768
Robert Olsson2f368952005-07-05 15:02:40 -0700769 left = (struct tnode *) tnode_get_child(tn, 2*i);
770 put_child(t, tn, 2*i, NULL);
Robert Olsson19baf832005-06-21 12:43:18 -0700771
Robert Olsson2f368952005-07-05 15:02:40 -0700772 if(!left)
773 BUG();
774
775 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
776 put_child(t, tn, 2*i+1, NULL);
777
778 if(!right)
779 BUG();
780
Robert Olsson19baf832005-06-21 12:43:18 -0700781 size = tnode_child_length(left);
782 for(j = 0; j < size; j++) {
783 put_child(t, left, j, inode->child[j]);
784 put_child(t, right, j, inode->child[j + size]);
785 }
786 put_child(t, tn, 2*i, resize(t, left));
787 put_child(t, tn, 2*i+1, resize(t, right));
788
789 tnode_free(inode);
790 }
791 }
792 tnode_free(oldtnode);
793 return tn;
794}
795
Robert Olsson2f368952005-07-05 15:02:40 -0700796static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
Robert Olsson19baf832005-06-21 12:43:18 -0700797{
798 struct tnode *oldtnode = tn;
799 struct node *left, *right;
800 int i;
801 int olen = tnode_child_length(tn);
802
803 if(trie_debug) printk("In halve\n");
804
805 tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
806
Robert Olsson2f368952005-07-05 15:02:40 -0700807 if (!tn) {
808 *err = -ENOMEM;
809 return oldtnode;
810 }
811
812 /*
813 * Preallocate and store tnodes before the actual work so we
814 * don't get into an inconsistent state if memory allocation
815 * fails. In case of failure we return the oldnode and halve
816 * of tnode is ignored.
817 */
818
819 for(i = 0; i < olen; i += 2) {
820 left = tnode_get_child(oldtnode, i);
821 right = tnode_get_child(oldtnode, i+1);
822
823 /* Two nonempty children */
824 if( left && right) {
825 struct tnode *newBinNode =
826 tnode_new(left->key, tn->pos + tn->bits, 1);
827
828 if(!newBinNode) {
829 *err = -ENOMEM;
830 break;
831 }
832 put_child(t, tn, i/2, (struct node *)newBinNode);
833 }
834 }
835
836 if(*err) {
837 int size = tnode_child_length(tn);
838 int j;
839
840 for(j = 0; j < size; j++)
841 if( tn->child[j])
842 tnode_free((struct tnode *)tn->child[j]);
843
844 tnode_free(tn);
845
846 *err = -ENOMEM;
847 return oldtnode;
848 }
Robert Olsson19baf832005-06-21 12:43:18 -0700849
850 for(i = 0; i < olen; i += 2) {
851 left = tnode_get_child(oldtnode, i);
852 right = tnode_get_child(oldtnode, i+1);
853
854 /* At least one of the children is empty */
855 if (left == NULL) {
856 if (right == NULL) /* Both are empty */
857 continue;
858 put_child(t, tn, i/2, right);
859 } else if (right == NULL)
860 put_child(t, tn, i/2, left);
861
862 /* Two nonempty children */
863 else {
864 struct tnode *newBinNode =
Robert Olsson2f368952005-07-05 15:02:40 -0700865 (struct tnode *) tnode_get_child(tn, i/2);
866 put_child(t, tn, i/2, NULL);
Robert Olsson19baf832005-06-21 12:43:18 -0700867
868 if(!newBinNode)
Robert Olsson2f368952005-07-05 15:02:40 -0700869 BUG();
Robert Olsson19baf832005-06-21 12:43:18 -0700870
871 put_child(t, newBinNode, 0, left);
872 put_child(t, newBinNode, 1, right);
873 put_child(t, tn, i/2, resize(t, newBinNode));
874 }
875 }
876 tnode_free(oldtnode);
877 return tn;
878}
879
880static void *trie_init(struct trie *t)
881{
882 if(t) {
883 t->size = 0;
884 t->trie = NULL;
885 t->revision = 0;
886#ifdef CONFIG_IP_FIB_TRIE_STATS
887 memset(&t->stats, 0, sizeof(struct trie_use_stats));
888#endif
889 }
890 return t;
891}
892
893static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
894{
895 struct hlist_node *node;
896 struct leaf_info *li;
897
898 hlist_for_each_entry(li, node, head, hlist) {
899
900 if ( li->plen == plen )
901 return li;
902 }
903 return NULL;
904}
905
906static inline struct list_head * get_fa_head(struct leaf *l, int plen)
907{
908 struct list_head *fa_head=NULL;
909 struct leaf_info *li = find_leaf_info(&l->list, plen);
910
911 if(li)
912 fa_head = &li->falh;
913
914 return fa_head;
915}
916
917static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
918{
919 struct leaf_info *li=NULL, *last=NULL;
920 struct hlist_node *node, *tmp;
921
922 write_lock_bh(&fib_lock);
923
924 if(hlist_empty(head))
925 hlist_add_head(&new->hlist, head);
926 else {
927 hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
928
929 if (new->plen > li->plen)
930 break;
931
932 last = li;
933 }
934 if(last)
935 hlist_add_after(&last->hlist, &new->hlist);
936 else
937 hlist_add_before(&new->hlist, &li->hlist);
938 }
939 write_unlock_bh(&fib_lock);
940}
941
942static struct leaf *
943fib_find_node(struct trie *t, u32 key)
944{
945 int pos;
946 struct tnode *tn;
947 struct node *n;
948
949 pos = 0;
950 n=t->trie;
951
952 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
953 tn = (struct tnode *) n;
954
955 check_tnode(tn);
956
957 if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
958 pos=tn->pos + tn->bits;
959 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
960 }
961 else
962 break;
963 }
964 /* Case we have found a leaf. Compare prefixes */
965
966 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
967 struct leaf *l = (struct leaf *) n;
968 return l;
969 }
970 return NULL;
971}
972
973static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
974{
975 int i = 0;
976 int wasfull;
977 t_key cindex, key;
978 struct tnode *tp = NULL;
979
980 if(!tn)
981 BUG();
982
983 key = tn->key;
984 i = 0;
985
986 while (tn != NULL && NODE_PARENT(tn) != NULL) {
987
988 if( i > 10 ) {
989 printk("Rebalance tn=%p \n", tn);
990 if(tn) printk("tn->parent=%p \n", NODE_PARENT(tn));
991
992 printk("Rebalance tp=%p \n", tp);
993 if(tp) printk("tp->parent=%p \n", NODE_PARENT(tp));
994 }
995
996 if( i > 12 ) BUG();
997 i++;
998
999 tp = NODE_PARENT(tn);
1000 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1001 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1002 tn = (struct tnode *) resize (t, (struct tnode *)tn);
1003 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
1004
1005 if(!NODE_PARENT(tn))
1006 break;
1007
1008 tn = NODE_PARENT(tn);
1009 }
1010 /* Handle last (top) tnode */
1011 if (IS_TNODE(tn))
1012 tn = (struct tnode*) resize(t, (struct tnode *)tn);
1013
1014 return (struct node*) tn;
1015}
1016
Robert Olssonf835e472005-06-28 15:00:39 -07001017static struct list_head *
1018fib_insert_node(struct trie *t, int *err, u32 key, int plen)
Robert Olsson19baf832005-06-21 12:43:18 -07001019{
1020 int pos, newpos;
1021 struct tnode *tp = NULL, *tn = NULL;
1022 struct node *n;
1023 struct leaf *l;
1024 int missbit;
1025 struct list_head *fa_head=NULL;
1026 struct leaf_info *li;
1027 t_key cindex;
1028
1029 pos = 0;
1030 n=t->trie;
1031
1032 /* If we point to NULL, stop. Either the tree is empty and we should
1033 * just put a new leaf in if, or we have reached an empty child slot,
1034 * and we should just put our new leaf in that.
1035 * If we point to a T_TNODE, check if it matches our key. Note that
1036 * a T_TNODE might be skipping any number of bits - its 'pos' need
1037 * not be the parent's 'pos'+'bits'!
1038 *
1039 * If it does match the current key, get pos/bits from it, extract
1040 * the index from our key, push the T_TNODE and walk the tree.
1041 *
1042 * If it doesn't, we have to replace it with a new T_TNODE.
1043 *
1044 * If we point to a T_LEAF, it might or might not have the same key
1045 * as we do. If it does, just change the value, update the T_LEAF's
1046 * value, and return it.
1047 * If it doesn't, we need to replace it with a T_TNODE.
1048 */
1049
1050 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1051 tn = (struct tnode *) n;
1052
1053 check_tnode(tn);
1054
1055 if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1056 tp = tn;
1057 pos=tn->pos + tn->bits;
1058 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1059
1060 if(n && NODE_PARENT(n) != tn) {
1061 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1062 BUG();
1063 }
1064 }
1065 else
1066 break;
1067 }
1068
1069 /*
1070 * n ----> NULL, LEAF or TNODE
1071 *
1072 * tp is n's (parent) ----> NULL or TNODE
1073 */
1074
1075 if(tp && IS_LEAF(tp))
1076 BUG();
1077
Robert Olsson19baf832005-06-21 12:43:18 -07001078
1079 /* Case 1: n is a leaf. Compare prefixes */
1080
1081 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1082 struct leaf *l = ( struct leaf *) n;
1083
1084 li = leaf_info_new(plen);
1085
Robert Olssonf835e472005-06-28 15:00:39 -07001086 if(! li) {
1087 *err = -ENOMEM;
1088 goto err;
1089 }
Robert Olsson19baf832005-06-21 12:43:18 -07001090
1091 fa_head = &li->falh;
1092 insert_leaf_info(&l->list, li);
1093 goto done;
1094 }
1095 t->size++;
1096 l = leaf_new();
1097
Robert Olssonf835e472005-06-28 15:00:39 -07001098 if(! l) {
1099 *err = -ENOMEM;
1100 goto err;
1101 }
Robert Olsson19baf832005-06-21 12:43:18 -07001102
1103 l->key = key;
1104 li = leaf_info_new(plen);
1105
Robert Olssonf835e472005-06-28 15:00:39 -07001106 if(! li) {
1107 tnode_free((struct tnode *) l);
1108 *err = -ENOMEM;
1109 goto err;
1110 }
Robert Olsson19baf832005-06-21 12:43:18 -07001111
1112 fa_head = &li->falh;
1113 insert_leaf_info(&l->list, li);
1114
1115 /* Case 2: n is NULL, and will just insert a new leaf */
1116 if (t->trie && n == NULL) {
1117
1118 NODE_SET_PARENT(l, tp);
1119
1120 if (!tp)
1121 BUG();
1122
1123 else {
1124 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1125 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1126 }
1127 }
1128 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1129 else {
1130 /*
1131 * Add a new tnode here
1132 * first tnode need some special handling
1133 */
1134
1135 if (tp)
1136 pos=tp->pos+tp->bits;
1137 else
1138 pos=0;
1139 if(n) {
1140 newpos = tkey_mismatch(key, pos, n->key);
1141 tn = tnode_new(n->key, newpos, 1);
1142 }
1143 else {
1144 newpos = 0;
1145 tn = tnode_new(key, newpos, 1); /* First tnode */
1146 }
Robert Olsson19baf832005-06-21 12:43:18 -07001147
Robert Olssonf835e472005-06-28 15:00:39 -07001148 if(!tn) {
1149 free_leaf_info(li);
1150 tnode_free((struct tnode *) l);
1151 *err = -ENOMEM;
1152 goto err;
1153 }
1154
Robert Olsson19baf832005-06-21 12:43:18 -07001155 NODE_SET_PARENT(tn, tp);
1156
1157 missbit=tkey_extract_bits(key, newpos, 1);
1158 put_child(t, tn, missbit, (struct node *)l);
1159 put_child(t, tn, 1-missbit, n);
1160
1161 if(tp) {
1162 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1163 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1164 }
1165 else {
1166 t->trie = (struct node*) tn; /* First tnode */
1167 tp = tn;
1168 }
1169 }
1170 if(tp && tp->pos+tp->bits > 32) {
1171 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1172 tp, tp->pos, tp->bits, key, plen);
1173 }
1174 /* Rebalance the trie */
1175 t->trie = trie_rebalance(t, tp);
Robert Olssonf835e472005-06-28 15:00:39 -07001176done:
1177 t->revision++;
1178err:;
Robert Olsson19baf832005-06-21 12:43:18 -07001179 return fa_head;
1180}
1181
1182static int
1183fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1184 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1185{
1186 struct trie *t = (struct trie *) tb->tb_data;
1187 struct fib_alias *fa, *new_fa;
1188 struct list_head *fa_head=NULL;
1189 struct fib_info *fi;
1190 int plen = r->rtm_dst_len;
1191 int type = r->rtm_type;
1192 u8 tos = r->rtm_tos;
1193 u32 key, mask;
1194 int err;
1195 struct leaf *l;
1196
1197 if (plen > 32)
1198 return -EINVAL;
1199
1200 key = 0;
1201 if (rta->rta_dst)
1202 memcpy(&key, rta->rta_dst, 4);
1203
1204 key = ntohl(key);
1205
1206 if(trie_debug)
1207 printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1208
1209 mask = ntohl( inet_make_mask(plen) );
1210
1211 if(key & ~mask)
1212 return -EINVAL;
1213
1214 key = key & mask;
1215
1216 if ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL)
1217 goto err;
1218
1219 l = fib_find_node(t, key);
1220 fa = NULL;
1221
1222 if(l) {
1223 fa_head = get_fa_head(l, plen);
1224 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1225 }
1226
1227 /* Now fa, if non-NULL, points to the first fib alias
1228 * with the same keys [prefix,tos,priority], if such key already
1229 * exists or to the node before which we will insert new one.
1230 *
1231 * If fa is NULL, we will need to allocate a new one and
1232 * insert to the head of f.
1233 *
1234 * If f is NULL, no fib node matched the destination key
1235 * and we need to allocate a new one of those as well.
1236 */
1237
1238 if (fa &&
1239 fa->fa_info->fib_priority == fi->fib_priority) {
1240 struct fib_alias *fa_orig;
1241
1242 err = -EEXIST;
1243 if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1244 goto out;
1245
1246 if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1247 struct fib_info *fi_drop;
1248 u8 state;
1249
1250 write_lock_bh(&fib_lock);
1251
1252 fi_drop = fa->fa_info;
1253 fa->fa_info = fi;
1254 fa->fa_type = type;
1255 fa->fa_scope = r->rtm_scope;
1256 state = fa->fa_state;
1257 fa->fa_state &= ~FA_S_ACCESSED;
1258
1259 write_unlock_bh(&fib_lock);
1260
1261 fib_release_info(fi_drop);
1262 if (state & FA_S_ACCESSED)
1263 rt_cache_flush(-1);
1264
1265 goto succeeded;
1266 }
1267 /* Error if we find a perfect match which
1268 * uses the same scope, type, and nexthop
1269 * information.
1270 */
1271 fa_orig = fa;
1272 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1273 if (fa->fa_tos != tos)
1274 break;
1275 if (fa->fa_info->fib_priority != fi->fib_priority)
1276 break;
1277 if (fa->fa_type == type &&
1278 fa->fa_scope == r->rtm_scope &&
1279 fa->fa_info == fi) {
1280 goto out;
1281 }
1282 }
1283 if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1284 fa = fa_orig;
1285 }
1286 err = -ENOENT;
1287 if (!(nlhdr->nlmsg_flags&NLM_F_CREATE))
1288 goto out;
1289
1290 err = -ENOBUFS;
1291 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1292 if (new_fa == NULL)
1293 goto out;
1294
1295 new_fa->fa_info = fi;
1296 new_fa->fa_tos = tos;
1297 new_fa->fa_type = type;
1298 new_fa->fa_scope = r->rtm_scope;
1299 new_fa->fa_state = 0;
1300#if 0
1301 new_fa->dst = NULL;
1302#endif
1303 /*
1304 * Insert new entry to the list.
1305 */
1306
Robert Olssonf835e472005-06-28 15:00:39 -07001307 if(!fa_head) {
1308 fa_head = fib_insert_node(t, &err, key, plen);
1309 err = 0;
1310 if(err)
1311 goto out_free_new_fa;
1312 }
Robert Olsson19baf832005-06-21 12:43:18 -07001313
1314 write_lock_bh(&fib_lock);
1315
1316 list_add_tail(&new_fa->fa_list,
1317 (fa ? &fa->fa_list : fa_head));
1318
1319 write_unlock_bh(&fib_lock);
1320
1321 rt_cache_flush(-1);
1322 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1323succeeded:
1324 return 0;
Robert Olssonf835e472005-06-28 15:00:39 -07001325
1326out_free_new_fa:
1327 kmem_cache_free(fn_alias_kmem, new_fa);
Robert Olsson19baf832005-06-21 12:43:18 -07001328out:
1329 fib_release_info(fi);
1330err:;
1331 return err;
1332}
1333
1334static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp,
1335 struct fib_result *res, int *err)
1336{
1337 int i;
1338 t_key mask;
1339 struct leaf_info *li;
1340 struct hlist_head *hhead = &l->list;
1341 struct hlist_node *node;
1342
1343 hlist_for_each_entry(li, node, hhead, hlist) {
1344
1345 i = li->plen;
1346 mask = ntohl(inet_make_mask(i));
1347 if (l->key != (key & mask))
1348 continue;
1349
1350 if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) {
1351 *plen = i;
1352#ifdef CONFIG_IP_FIB_TRIE_STATS
1353 t->stats.semantic_match_passed++;
1354#endif
1355 return 1;
1356 }
1357#ifdef CONFIG_IP_FIB_TRIE_STATS
1358 t->stats.semantic_match_miss++;
1359#endif
1360 }
1361 return 0;
1362}
1363
1364static int
1365fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1366{
1367 struct trie *t = (struct trie *) tb->tb_data;
1368 int plen, ret = 0;
1369 struct node *n;
1370 struct tnode *pn;
1371 int pos, bits;
1372 t_key key=ntohl(flp->fl4_dst);
1373 int chopped_off;
1374 t_key cindex = 0;
1375 int current_prefix_length = KEYLENGTH;
1376 n = t->trie;
1377
1378 read_lock(&fib_lock);
1379 if(!n)
1380 goto failed;
1381
1382#ifdef CONFIG_IP_FIB_TRIE_STATS
1383 t->stats.gets++;
1384#endif
1385
1386 /* Just a leaf? */
1387 if (IS_LEAF(n)) {
1388 if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret) )
1389 goto found;
1390 goto failed;
1391 }
1392 pn = (struct tnode *) n;
1393 chopped_off = 0;
1394
1395 while (pn) {
1396
1397 pos = pn->pos;
1398 bits = pn->bits;
1399
1400 if(!chopped_off)
1401 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1402
1403 n = tnode_get_child(pn, cindex);
1404
1405 if (n == NULL) {
1406#ifdef CONFIG_IP_FIB_TRIE_STATS
1407 t->stats.null_node_hit++;
1408#endif
1409 goto backtrace;
1410 }
1411
1412 if (IS_TNODE(n)) {
1413#define HL_OPTIMIZE
1414#ifdef HL_OPTIMIZE
1415 struct tnode *cn = (struct tnode *)n;
1416 t_key node_prefix, key_prefix, pref_mismatch;
1417 int mp;
1418
1419 /*
1420 * It's a tnode, and we can do some extra checks here if we
1421 * like, to avoid descending into a dead-end branch.
1422 * This tnode is in the parent's child array at index
1423 * key[p_pos..p_pos+p_bits] but potentially with some bits
1424 * chopped off, so in reality the index may be just a
1425 * subprefix, padded with zero at the end.
1426 * We can also take a look at any skipped bits in this
1427 * tnode - everything up to p_pos is supposed to be ok,
1428 * and the non-chopped bits of the index (se previous
1429 * paragraph) are also guaranteed ok, but the rest is
1430 * considered unknown.
1431 *
1432 * The skipped bits are key[pos+bits..cn->pos].
1433 */
1434
1435 /* If current_prefix_length < pos+bits, we are already doing
1436 * actual prefix matching, which means everything from
1437 * pos+(bits-chopped_off) onward must be zero along some
1438 * branch of this subtree - otherwise there is *no* valid
1439 * prefix present. Here we can only check the skipped
1440 * bits. Remember, since we have already indexed into the
1441 * parent's child array, we know that the bits we chopped of
1442 * *are* zero.
1443 */
1444
1445 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1446
1447 if (current_prefix_length < pos+bits) {
1448 if (tkey_extract_bits(cn->key, current_prefix_length,
1449 cn->pos - current_prefix_length) != 0 ||
1450 !(cn->child[0]))
1451 goto backtrace;
1452 }
1453
1454 /*
1455 * If chopped_off=0, the index is fully validated and we
1456 * only need to look at the skipped bits for this, the new,
1457 * tnode. What we actually want to do is to find out if
1458 * these skipped bits match our key perfectly, or if we will
1459 * have to count on finding a matching prefix further down,
1460 * because if we do, we would like to have some way of
1461 * verifying the existence of such a prefix at this point.
1462 */
1463
1464 /* The only thing we can do at this point is to verify that
1465 * any such matching prefix can indeed be a prefix to our
1466 * key, and if the bits in the node we are inspecting that
1467 * do not match our key are not ZERO, this cannot be true.
1468 * Thus, find out where there is a mismatch (before cn->pos)
1469 * and verify that all the mismatching bits are zero in the
1470 * new tnode's key.
1471 */
1472
1473 /* Note: We aren't very concerned about the piece of the key
1474 * that precede pn->pos+pn->bits, since these have already been
1475 * checked. The bits after cn->pos aren't checked since these are
1476 * by definition "unknown" at this point. Thus, what we want to
1477 * see is if we are about to enter the "prefix matching" state,
1478 * and in that case verify that the skipped bits that will prevail
1479 * throughout this subtree are zero, as they have to be if we are
1480 * to find a matching prefix.
1481 */
1482
1483 node_prefix = MASK_PFX(cn->key, cn->pos);
1484 key_prefix = MASK_PFX(key, cn->pos);
1485 pref_mismatch = key_prefix^node_prefix;
1486 mp = 0;
1487
1488 /* In short: If skipped bits in this node do not match the search
1489 * key, enter the "prefix matching" state.directly.
1490 */
1491 if (pref_mismatch) {
1492 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1493 mp++;
1494 pref_mismatch = pref_mismatch <<1;
1495 }
1496 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1497
1498 if (key_prefix != 0)
1499 goto backtrace;
1500
1501 if (current_prefix_length >= cn->pos)
1502 current_prefix_length=mp;
1503 }
1504#endif
1505 pn = (struct tnode *)n; /* Descend */
1506 chopped_off = 0;
1507 continue;
1508 }
1509 if (IS_LEAF(n)) {
1510 if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret))
1511 goto found;
1512 }
1513backtrace:
1514 chopped_off++;
1515
1516 /* As zero don't change the child key (cindex) */
1517 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) {
1518 chopped_off++;
1519 }
1520
1521 /* Decrease current_... with bits chopped off */
1522 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1523 current_prefix_length = pn->pos + pn->bits - chopped_off;
1524
1525 /*
1526 * Either we do the actual chop off according or if we have
1527 * chopped off all bits in this tnode walk up to our parent.
1528 */
1529
1530 if(chopped_off <= pn->bits)
1531 cindex &= ~(1 << (chopped_off-1));
1532 else {
1533 if( NODE_PARENT(pn) == NULL)
1534 goto failed;
1535
1536 /* Get Child's index */
1537 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1538 pn = NODE_PARENT(pn);
1539 chopped_off = 0;
1540
1541#ifdef CONFIG_IP_FIB_TRIE_STATS
1542 t->stats.backtrack++;
1543#endif
1544 goto backtrace;
1545 }
1546 }
1547failed:
1548 ret = 1;
1549found:
1550 read_unlock(&fib_lock);
1551 return ret;
1552}
1553
1554static int trie_leaf_remove(struct trie *t, t_key key)
1555{
1556 t_key cindex;
1557 struct tnode *tp = NULL;
1558 struct node *n = t->trie;
1559 struct leaf *l;
1560
1561 if(trie_debug)
1562 printk("entering trie_leaf_remove(%p)\n", n);
1563
1564 /* Note that in the case skipped bits, those bits are *not* checked!
1565 * When we finish this, we will have NULL or a T_LEAF, and the
1566 * T_LEAF may or may not match our key.
1567 */
1568
1569 while (n != NULL && IS_TNODE(n)) {
1570 struct tnode *tn = (struct tnode *) n;
1571 check_tnode(tn);
1572 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1573
1574 if(n && NODE_PARENT(n) != tn) {
1575 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1576 BUG();
1577 }
1578 }
1579 l = (struct leaf *) n;
1580
1581 if(!n || !tkey_equals(l->key, key))
1582 return 0;
1583
1584 /*
1585 * Key found.
1586 * Remove the leaf and rebalance the tree
1587 */
1588
1589 t->revision++;
1590 t->size--;
1591
1592 tp = NODE_PARENT(n);
1593 tnode_free((struct tnode *) n);
1594
1595 if(tp) {
1596 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1597 put_child(t, (struct tnode *)tp, cindex, NULL);
1598 t->trie = trie_rebalance(t, tp);
1599 }
1600 else
1601 t->trie = NULL;
1602
1603 return 1;
1604}
1605
1606static int
1607fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1608 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1609{
1610 struct trie *t = (struct trie *) tb->tb_data;
1611 u32 key, mask;
1612 int plen = r->rtm_dst_len;
1613 u8 tos = r->rtm_tos;
1614 struct fib_alias *fa, *fa_to_delete;
1615 struct list_head *fa_head;
1616 struct leaf *l;
1617
1618 if (plen > 32)
1619 return -EINVAL;
1620
1621 key = 0;
1622 if (rta->rta_dst)
1623 memcpy(&key, rta->rta_dst, 4);
1624
1625 key = ntohl(key);
1626 mask = ntohl( inet_make_mask(plen) );
1627
1628 if(key & ~mask)
1629 return -EINVAL;
1630
1631 key = key & mask;
1632 l = fib_find_node(t, key);
1633
1634 if(!l)
1635 return -ESRCH;
1636
1637 fa_head = get_fa_head(l, plen);
1638 fa = fib_find_alias(fa_head, tos, 0);
1639
1640 if (!fa)
1641 return -ESRCH;
1642
1643 if (trie_debug)
1644 printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1645
1646 fa_to_delete = NULL;
1647 fa_head = fa->fa_list.prev;
1648 list_for_each_entry(fa, fa_head, fa_list) {
1649 struct fib_info *fi = fa->fa_info;
1650
1651 if (fa->fa_tos != tos)
1652 break;
1653
1654 if ((!r->rtm_type ||
1655 fa->fa_type == r->rtm_type) &&
1656 (r->rtm_scope == RT_SCOPE_NOWHERE ||
1657 fa->fa_scope == r->rtm_scope) &&
1658 (!r->rtm_protocol ||
1659 fi->fib_protocol == r->rtm_protocol) &&
1660 fib_nh_match(r, nlhdr, rta, fi) == 0) {
1661 fa_to_delete = fa;
1662 break;
1663 }
1664 }
1665
1666 if (fa_to_delete) {
1667 int kill_li = 0;
1668 struct leaf_info *li;
1669
1670 fa = fa_to_delete;
1671 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1672
1673 l = fib_find_node(t, key);
1674 li = find_leaf_info(&l->list, plen);
1675
1676 write_lock_bh(&fib_lock);
1677
1678 list_del(&fa->fa_list);
1679
1680 if(list_empty(fa_head)) {
1681 hlist_del(&li->hlist);
1682 kill_li = 1;
1683 }
1684 write_unlock_bh(&fib_lock);
1685
1686 if(kill_li)
1687 free_leaf_info(li);
1688
1689 if(hlist_empty(&l->list))
1690 trie_leaf_remove(t, key);
1691
1692 if (fa->fa_state & FA_S_ACCESSED)
1693 rt_cache_flush(-1);
1694
1695 fn_free_alias(fa);
1696 return 0;
1697 }
1698 return -ESRCH;
1699}
1700
1701static int trie_flush_list(struct trie *t, struct list_head *head)
1702{
1703 struct fib_alias *fa, *fa_node;
1704 int found = 0;
1705
1706 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1707 struct fib_info *fi = fa->fa_info;
1708
1709 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
1710
1711 write_lock_bh(&fib_lock);
1712 list_del(&fa->fa_list);
1713 write_unlock_bh(&fib_lock);
1714
1715 fn_free_alias(fa);
1716 found++;
1717 }
1718 }
1719 return found;
1720}
1721
1722static int trie_flush_leaf(struct trie *t, struct leaf *l)
1723{
1724 int found = 0;
1725 struct hlist_head *lih = &l->list;
1726 struct hlist_node *node, *tmp;
1727 struct leaf_info *li = NULL;
1728
1729 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1730
1731 found += trie_flush_list(t, &li->falh);
1732
1733 if (list_empty(&li->falh)) {
1734
1735 write_lock_bh(&fib_lock);
1736 hlist_del(&li->hlist);
1737 write_unlock_bh(&fib_lock);
1738
1739 free_leaf_info(li);
1740 }
1741 }
1742 return found;
1743}
1744
1745static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1746{
1747 struct node *c = (struct node *) thisleaf;
1748 struct tnode *p;
1749 int idx;
1750
1751 if(c == NULL) {
1752 if(t->trie == NULL)
1753 return NULL;
1754
1755 if (IS_LEAF(t->trie)) /* trie w. just a leaf */
1756 return (struct leaf *) t->trie;
1757
1758 p = (struct tnode*) t->trie; /* Start */
1759 }
1760 else
1761 p = (struct tnode *) NODE_PARENT(c);
1762 while (p) {
1763 int pos, last;
1764
1765 /* Find the next child of the parent */
1766 if(c)
1767 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1768 else
1769 pos = 0;
1770
1771 last = 1 << p->bits;
1772 for(idx = pos; idx < last ; idx++) {
1773 if( p->child[idx]) {
1774
1775 /* Decend if tnode */
1776
1777 while (IS_TNODE(p->child[idx])) {
1778 p = (struct tnode*) p->child[idx];
1779 idx = 0;
1780
1781 /* Rightmost non-NULL branch */
1782 if( p && IS_TNODE(p) )
1783 while ( p->child[idx] == NULL && idx < (1 << p->bits) ) idx++;
1784
1785 /* Done with this tnode? */
1786 if( idx >= (1 << p->bits) || p->child[idx] == NULL )
1787 goto up;
1788 }
1789 return (struct leaf*) p->child[idx];
1790 }
1791 }
1792up:
1793 /* No more children go up one step */
1794 c = (struct node*) p;
1795 p = (struct tnode *) NODE_PARENT(p);
1796 }
1797 return NULL; /* Ready. Root of trie */
1798}
1799
1800static int fn_trie_flush(struct fib_table *tb)
1801{
1802 struct trie *t = (struct trie *) tb->tb_data;
1803 struct leaf *ll = NULL, *l = NULL;
1804 int found = 0, h;
1805
1806 t->revision++;
1807
1808 for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1809 found += trie_flush_leaf(t, l);
1810
1811 if (ll && hlist_empty(&ll->list))
1812 trie_leaf_remove(t, ll->key);
1813 ll = l;
1814 }
1815
1816 if (ll && hlist_empty(&ll->list))
1817 trie_leaf_remove(t, ll->key);
1818
1819 if(trie_debug)
1820 printk("trie_flush found=%d\n", found);
1821 return found;
1822}
1823
1824static int trie_last_dflt=-1;
1825
1826static void
1827fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1828{
1829 struct trie *t = (struct trie *) tb->tb_data;
1830 int order, last_idx;
1831 struct fib_info *fi = NULL;
1832 struct fib_info *last_resort;
1833 struct fib_alias *fa = NULL;
1834 struct list_head *fa_head;
1835 struct leaf *l;
1836
1837 last_idx = -1;
1838 last_resort = NULL;
1839 order = -1;
1840
1841 read_lock(&fib_lock);
1842
1843 l = fib_find_node(t, 0);
1844 if(!l)
1845 goto out;
1846
1847 fa_head = get_fa_head(l, 0);
1848 if(!fa_head)
1849 goto out;
1850
1851 if (list_empty(fa_head))
1852 goto out;
1853
1854 list_for_each_entry(fa, fa_head, fa_list) {
1855 struct fib_info *next_fi = fa->fa_info;
1856
1857 if (fa->fa_scope != res->scope ||
1858 fa->fa_type != RTN_UNICAST)
1859 continue;
1860
1861 if (next_fi->fib_priority > res->fi->fib_priority)
1862 break;
1863 if (!next_fi->fib_nh[0].nh_gw ||
1864 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1865 continue;
1866 fa->fa_state |= FA_S_ACCESSED;
1867
1868 if (fi == NULL) {
1869 if (next_fi != res->fi)
1870 break;
1871 } else if (!fib_detect_death(fi, order, &last_resort,
1872 &last_idx, &trie_last_dflt)) {
1873 if (res->fi)
1874 fib_info_put(res->fi);
1875 res->fi = fi;
1876 atomic_inc(&fi->fib_clntref);
1877 trie_last_dflt = order;
1878 goto out;
1879 }
1880 fi = next_fi;
1881 order++;
1882 }
1883 if (order <= 0 || fi == NULL) {
1884 trie_last_dflt = -1;
1885 goto out;
1886 }
1887
1888 if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1889 if (res->fi)
1890 fib_info_put(res->fi);
1891 res->fi = fi;
1892 atomic_inc(&fi->fib_clntref);
1893 trie_last_dflt = order;
1894 goto out;
1895 }
1896 if (last_idx >= 0) {
1897 if (res->fi)
1898 fib_info_put(res->fi);
1899 res->fi = last_resort;
1900 if (last_resort)
1901 atomic_inc(&last_resort->fib_clntref);
1902 }
1903 trie_last_dflt = last_idx;
1904 out:;
1905 read_unlock(&fib_lock);
1906}
1907
1908static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1909 struct sk_buff *skb, struct netlink_callback *cb)
1910{
1911 int i, s_i;
1912 struct fib_alias *fa;
1913
1914 u32 xkey=htonl(key);
1915
1916 s_i=cb->args[3];
1917 i = 0;
1918
1919 list_for_each_entry(fa, fah, fa_list) {
1920 if (i < s_i) {
1921 i++;
1922 continue;
1923 }
1924 if (fa->fa_info->fib_nh == NULL) {
1925 printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1926 i++;
1927 continue;
1928 }
1929 if (fa->fa_info == NULL) {
1930 printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1931 i++;
1932 continue;
1933 }
1934
1935 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1936 cb->nlh->nlmsg_seq,
1937 RTM_NEWROUTE,
1938 tb->tb_id,
1939 fa->fa_type,
1940 fa->fa_scope,
1941 &xkey,
1942 plen,
1943 fa->fa_tos,
David S. Miller90f66912005-06-21 14:43:28 -07001944 fa->fa_info, 0) < 0) {
Robert Olsson19baf832005-06-21 12:43:18 -07001945 cb->args[3] = i;
1946 return -1;
1947 }
1948 i++;
1949 }
1950 cb->args[3]=i;
1951 return skb->len;
1952}
1953
1954static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1955 struct netlink_callback *cb)
1956{
1957 int h, s_h;
1958 struct list_head *fa_head;
1959 struct leaf *l = NULL;
1960 s_h=cb->args[2];
1961
1962 for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1963
1964 if (h < s_h)
1965 continue;
1966 if (h > s_h)
1967 memset(&cb->args[3], 0,
1968 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1969
1970 fa_head = get_fa_head(l, plen);
1971
1972 if(!fa_head)
1973 continue;
1974
1975 if(list_empty(fa_head))
1976 continue;
1977
1978 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1979 cb->args[2]=h;
1980 return -1;
1981 }
1982 }
1983 cb->args[2]=h;
1984 return skb->len;
1985}
1986
1987static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1988{
1989 int m, s_m;
1990 struct trie *t = (struct trie *) tb->tb_data;
1991
1992 s_m = cb->args[1];
1993
1994 read_lock(&fib_lock);
1995 for (m=0; m<=32; m++) {
1996
1997 if (m < s_m)
1998 continue;
1999 if (m > s_m)
2000 memset(&cb->args[2], 0,
2001 sizeof(cb->args) - 2*sizeof(cb->args[0]));
2002
2003 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
2004 cb->args[1] = m;
2005 goto out;
2006 }
2007 }
2008 read_unlock(&fib_lock);
2009 cb->args[1] = m;
2010 return skb->len;
2011 out:
2012 read_unlock(&fib_lock);
2013 return -1;
2014}
2015
2016/* Fix more generic FIB names for init later */
2017
2018#ifdef CONFIG_IP_MULTIPLE_TABLES
2019struct fib_table * fib_hash_init(int id)
2020#else
2021struct fib_table * __init fib_hash_init(int id)
2022#endif
2023{
2024 struct fib_table *tb;
2025 struct trie *t;
2026
2027 if (fn_alias_kmem == NULL)
2028 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2029 sizeof(struct fib_alias),
2030 0, SLAB_HWCACHE_ALIGN,
2031 NULL, NULL);
2032
2033 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2034 GFP_KERNEL);
2035 if (tb == NULL)
2036 return NULL;
2037
2038 tb->tb_id = id;
2039 tb->tb_lookup = fn_trie_lookup;
2040 tb->tb_insert = fn_trie_insert;
2041 tb->tb_delete = fn_trie_delete;
2042 tb->tb_flush = fn_trie_flush;
2043 tb->tb_select_default = fn_trie_select_default;
2044 tb->tb_dump = fn_trie_dump;
2045 memset(tb->tb_data, 0, sizeof(struct trie));
2046
2047 t = (struct trie *) tb->tb_data;
2048
2049 trie_init(t);
2050
2051 if (id == RT_TABLE_LOCAL)
2052 trie_local=t;
2053 else if (id == RT_TABLE_MAIN)
2054 trie_main=t;
2055
2056 if (id == RT_TABLE_LOCAL)
2057 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2058
2059 return tb;
2060}
2061
2062/* Trie dump functions */
2063
2064static void putspace_seq(struct seq_file *seq, int n)
2065{
2066 while (n--) seq_printf(seq, " ");
2067}
2068
2069static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
2070{
2071 while (bits--)
2072 seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
2073}
2074
2075static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
2076 int pend, int cindex, int bits)
2077{
2078 putspace_seq(seq, indent);
2079 if (IS_LEAF(n))
2080 seq_printf(seq, "|");
2081 else
2082 seq_printf(seq, "+");
2083 if (bits) {
2084 seq_printf(seq, "%d/", cindex);
2085 printbin_seq(seq, cindex, bits);
2086 seq_printf(seq, ": ");
2087 }
2088 else
2089 seq_printf(seq, "<root>: ");
2090 seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
2091
2092 if (IS_LEAF(n))
2093 seq_printf(seq, "key=%d.%d.%d.%d\n",
2094 n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
2095 else {
2096 int plen=((struct tnode *)n)->pos;
2097 t_key prf=MASK_PFX(n->key, plen);
2098 seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
2099 prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
2100 }
2101 if (IS_LEAF(n)) {
2102 struct leaf *l=(struct leaf *)n;
2103 struct fib_alias *fa;
2104 int i;
2105 for (i=32; i>=0; i--)
2106 if(find_leaf_info(&l->list, i)) {
2107
2108 struct list_head *fa_head = get_fa_head(l, i);
2109
2110 if(!fa_head)
2111 continue;
2112
2113 if(list_empty(fa_head))
2114 continue;
2115
2116 putspace_seq(seq, indent+2);
2117 seq_printf(seq, "{/%d...dumping}\n", i);
2118
2119
2120 list_for_each_entry(fa, fa_head, fa_list) {
2121 putspace_seq(seq, indent+2);
2122 if (fa->fa_info->fib_nh == NULL) {
2123 seq_printf(seq, "Error _fib_nh=NULL\n");
2124 continue;
2125 }
2126 if (fa->fa_info == NULL) {
2127 seq_printf(seq, "Error fa_info=NULL\n");
2128 continue;
2129 }
2130
2131 seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
2132 fa->fa_type,
2133 fa->fa_scope,
2134 fa->fa_tos);
2135 }
2136 }
2137 }
2138 else if (IS_TNODE(n)) {
2139 struct tnode *tn=(struct tnode *)n;
2140 putspace_seq(seq, indent); seq_printf(seq, "| ");
2141 seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos));
2142 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
2143 seq_printf(seq, "}\n");
2144 putspace_seq(seq, indent); seq_printf(seq, "| ");
2145 seq_printf(seq, "{pos=%d", tn->pos);
2146 seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
2147 seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
2148 putspace_seq(seq, indent); seq_printf(seq, "| ");
2149 seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
2150 }
2151}
2152
2153static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2154{
2155 struct node *n=t->trie;
2156 int cindex=0;
2157 int indent=1;
2158 int pend=0;
2159 int depth = 0;
2160
2161 read_lock(&fib_lock);
2162
2163 seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
2164 if (n) {
2165 printnode_seq(seq, indent, n, pend, cindex, 0);
2166 if (IS_TNODE(n)) {
2167 struct tnode *tn=(struct tnode *)n;
2168 pend = tn->pos+tn->bits;
2169 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2170 indent += 3;
2171 depth++;
2172
2173 while (tn && cindex < (1 << tn->bits)) {
2174 if (tn->child[cindex]) {
2175
2176 /* Got a child */
2177
2178 printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2179 if (IS_LEAF(tn->child[cindex])) {
2180 cindex++;
2181
2182 }
2183 else {
2184 /*
2185 * New tnode. Decend one level
2186 */
2187
2188 depth++;
2189 n=tn->child[cindex];
2190 tn=(struct tnode *)n;
2191 pend=tn->pos+tn->bits;
2192 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2193 indent+=3;
2194 cindex=0;
2195 }
2196 }
2197 else
2198 cindex++;
2199
2200 /*
2201 * Test if we are done
2202 */
2203
2204 while (cindex >= (1 << tn->bits)) {
2205
2206 /*
2207 * Move upwards and test for root
2208 * pop off all traversed nodes
2209 */
2210
2211 if (NODE_PARENT(tn) == NULL) {
2212 tn = NULL;
2213 n = NULL;
2214 break;
2215 }
2216 else {
2217 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2218 tn = NODE_PARENT(tn);
2219 cindex++;
2220 n=(struct node *)tn;
2221 pend=tn->pos+tn->bits;
2222 indent-=3;
2223 depth--;
2224 }
2225 }
2226 }
2227 }
2228 else n = NULL;
2229 }
2230 else seq_printf(seq, "------ trie is empty\n");
2231
2232 read_unlock(&fib_lock);
2233}
2234
2235static struct trie_stat *trie_stat_new(void)
2236{
2237 struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2238 int i;
2239
2240 if(s) {
2241 s->totdepth = 0;
2242 s->maxdepth = 0;
2243 s->tnodes = 0;
2244 s->leaves = 0;
2245 s->nullpointers = 0;
2246
2247 for(i=0; i< MAX_CHILDS; i++)
2248 s->nodesizes[i] = 0;
2249 }
2250 return s;
2251}
2252
2253static struct trie_stat *trie_collect_stats(struct trie *t)
2254{
2255 struct node *n=t->trie;
2256 struct trie_stat *s = trie_stat_new();
2257 int cindex = 0;
2258 int indent = 1;
2259 int pend = 0;
2260 int depth = 0;
2261
2262 read_lock(&fib_lock);
2263
2264 if (s) {
2265 if (n) {
2266 if (IS_TNODE(n)) {
2267 struct tnode *tn = (struct tnode *)n;
2268 pend=tn->pos+tn->bits;
2269 indent += 3;
2270 s->nodesizes[tn->bits]++;
2271 depth++;
2272
2273 while (tn && cindex < (1 << tn->bits)) {
2274 if (tn->child[cindex]) {
2275 /* Got a child */
2276
2277 if (IS_LEAF(tn->child[cindex])) {
2278 cindex++;
2279
2280 /* stats */
2281 if (depth > s->maxdepth)
2282 s->maxdepth = depth;
2283 s->totdepth += depth;
2284 s->leaves++;
2285 }
2286
2287 else {
2288 /*
2289 * New tnode. Decend one level
2290 */
2291
2292 s->tnodes++;
2293 s->nodesizes[tn->bits]++;
2294 depth++;
2295
2296 n = tn->child[cindex];
2297 tn = (struct tnode *)n;
2298 pend = tn->pos+tn->bits;
2299
2300 indent += 3;
2301 cindex = 0;
2302 }
2303 }
2304 else {
2305 cindex++;
2306 s->nullpointers++;
2307 }
2308
2309 /*
2310 * Test if we are done
2311 */
2312
2313 while (cindex >= (1 << tn->bits)) {
2314
2315 /*
2316 * Move upwards and test for root
2317 * pop off all traversed nodes
2318 */
2319
2320
2321 if (NODE_PARENT(tn) == NULL) {
2322 tn = NULL;
2323 n = NULL;
2324 break;
2325 }
2326 else {
2327 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2328 tn = NODE_PARENT(tn);
2329 cindex++;
2330 n = (struct node *)tn;
2331 pend=tn->pos+tn->bits;
2332 indent -= 3;
2333 depth--;
2334 }
2335 }
2336 }
2337 }
2338 else n = NULL;
2339 }
2340 }
2341
2342 read_unlock(&fib_lock);
2343 return s;
2344}
2345
2346#ifdef CONFIG_PROC_FS
2347
2348static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
2349{
2350 return NULL;
2351}
2352
2353static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
2354{
2355 return NULL;
2356}
2357
2358static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2359{
2360 void *v = NULL;
2361
2362 if (ip_fib_main_table)
2363 v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN;
2364 return v;
2365}
2366
2367static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2368{
2369 ++*pos;
2370 return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq);
2371}
2372
2373static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
2374{
2375
2376}
2377
2378/*
2379 * This outputs /proc/net/fib_triestats
2380 *
2381 * It always works in backward compatibility mode.
2382 * The format of the file is not supposed to be changed.
2383 */
2384
2385static void collect_and_show(struct trie *t, struct seq_file *seq)
2386{
2387 int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
2388 int i, max, pointers;
2389 struct trie_stat *stat;
2390 int avdepth;
2391
2392 stat = trie_collect_stats(t);
2393
2394 bytes=0;
2395 seq_printf(seq, "trie=%p\n", t);
2396
2397 if (stat) {
2398 if (stat->leaves)
2399 avdepth=stat->totdepth*100 / stat->leaves;
2400 else
2401 avdepth=0;
2402 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
2403 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
2404
2405 seq_printf(seq, "Leaves: %d\n", stat->leaves);
2406 bytes += sizeof(struct leaf) * stat->leaves;
2407 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
2408 bytes += sizeof(struct tnode) * stat->tnodes;
2409
2410 max = MAX_CHILDS-1;
2411
2412 while (max >= 0 && stat->nodesizes[max] == 0)
2413 max--;
2414 pointers = 0;
2415
2416 for (i = 1; i <= max; i++)
2417 if (stat->nodesizes[i] != 0) {
2418 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2419 pointers += (1<<i) * stat->nodesizes[i];
2420 }
2421 seq_printf(seq, "\n");
2422 seq_printf(seq, "Pointers: %d\n", pointers);
2423 bytes += sizeof(struct node *) * pointers;
2424 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2425 seq_printf(seq, "Total size: %d kB\n", bytes / 1024);
2426
2427 kfree(stat);
2428 }
2429
2430#ifdef CONFIG_IP_FIB_TRIE_STATS
2431 seq_printf(seq, "Counters:\n---------\n");
2432 seq_printf(seq,"gets = %d\n", t->stats.gets);
2433 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2434 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2435 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2436 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
Robert Olsson2f368952005-07-05 15:02:40 -07002437 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
Robert Olsson19baf832005-06-21 12:43:18 -07002438#ifdef CLEAR_STATS
2439 memset(&(t->stats), 0, sizeof(t->stats));
2440#endif
2441#endif /* CONFIG_IP_FIB_TRIE_STATS */
2442}
2443
2444static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2445{
2446 char bf[128];
2447
2448 if (v == SEQ_START_TOKEN) {
2449 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2450 sizeof(struct leaf), sizeof(struct tnode));
2451 if (trie_local)
2452 collect_and_show(trie_local, seq);
2453
2454 if (trie_main)
2455 collect_and_show(trie_main, seq);
2456 }
2457 else {
2458 snprintf(bf, sizeof(bf),
2459 "*\t%08X\t%08X", 200, 400);
2460
2461 seq_printf(seq, "%-127s\n", bf);
2462 }
2463 return 0;
2464}
2465
2466static struct seq_operations fib_triestat_seq_ops = {
2467 .start = fib_triestat_seq_start,
2468 .next = fib_triestat_seq_next,
2469 .stop = fib_triestat_seq_stop,
2470 .show = fib_triestat_seq_show,
2471};
2472
2473static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2474{
2475 struct seq_file *seq;
2476 int rc = -ENOMEM;
2477
2478 rc = seq_open(file, &fib_triestat_seq_ops);
2479 if (rc)
2480 goto out_kfree;
2481
2482 seq = file->private_data;
2483out:
2484 return rc;
2485out_kfree:
2486 goto out;
2487}
2488
2489static struct file_operations fib_triestat_seq_fops = {
2490 .owner = THIS_MODULE,
2491 .open = fib_triestat_seq_open,
2492 .read = seq_read,
2493 .llseek = seq_lseek,
2494 .release = seq_release_private,
2495};
2496
2497int __init fib_stat_proc_init(void)
2498{
2499 if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
2500 return -ENOMEM;
2501 return 0;
2502}
2503
2504void __init fib_stat_proc_exit(void)
2505{
2506 proc_net_remove("fib_triestat");
2507}
2508
2509static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
2510{
2511 return NULL;
2512}
2513
2514static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
2515{
2516 return NULL;
2517}
2518
2519static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2520{
2521 void *v = NULL;
2522
2523 if (ip_fib_main_table)
2524 v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN;
2525 return v;
2526}
2527
2528static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2529{
2530 ++*pos;
2531 return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq);
2532}
2533
2534static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2535{
2536
2537}
2538
2539/*
2540 * This outputs /proc/net/fib_trie.
2541 *
2542 * It always works in backward compatibility mode.
2543 * The format of the file is not supposed to be changed.
2544 */
2545
2546static int fib_trie_seq_show(struct seq_file *seq, void *v)
2547{
2548 char bf[128];
2549
2550 if (v == SEQ_START_TOKEN) {
2551 if (trie_local)
2552 trie_dump_seq(seq, trie_local);
2553
2554 if (trie_main)
2555 trie_dump_seq(seq, trie_main);
2556 }
2557
2558 else {
2559 snprintf(bf, sizeof(bf),
2560 "*\t%08X\t%08X", 200, 400);
2561 seq_printf(seq, "%-127s\n", bf);
2562 }
2563
2564 return 0;
2565}
2566
2567static struct seq_operations fib_trie_seq_ops = {
2568 .start = fib_trie_seq_start,
2569 .next = fib_trie_seq_next,
2570 .stop = fib_trie_seq_stop,
2571 .show = fib_trie_seq_show,
2572};
2573
2574static int fib_trie_seq_open(struct inode *inode, struct file *file)
2575{
2576 struct seq_file *seq;
2577 int rc = -ENOMEM;
2578
2579 rc = seq_open(file, &fib_trie_seq_ops);
2580 if (rc)
2581 goto out_kfree;
2582
2583 seq = file->private_data;
2584out:
2585 return rc;
2586out_kfree:
2587 goto out;
2588}
2589
2590static struct file_operations fib_trie_seq_fops = {
2591 .owner = THIS_MODULE,
2592 .open = fib_trie_seq_open,
2593 .read = seq_read,
2594 .llseek = seq_lseek,
2595 .release = seq_release_private,
2596};
2597
2598int __init fib_proc_init(void)
2599{
2600 if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
2601 return -ENOMEM;
2602 return 0;
2603}
2604
2605void __init fib_proc_exit(void)
2606{
2607 proc_net_remove("fib_trie");
2608}
2609
2610#endif /* CONFIG_PROC_FS */