blob: 2dc6dbb284678916db25257405da673cfea06f72 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9 *
Jesper Juhl02c30a82005-05-05 16:16:16 -070010 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -070011 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 *
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
41 *
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070057 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080058 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070060 *
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
65 */
66
Linus Torvalds1da177e2005-04-16 15:20:36 -070067#include <linux/module.h>
68#include <asm/uaccess.h>
69#include <asm/system.h>
70#include <linux/bitops.h>
71#include <linux/types.h>
72#include <linux/kernel.h>
73#include <linux/sched.h>
74#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070075#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070076#include <linux/string.h>
77#include <linux/socket.h>
78#include <linux/sockios.h>
79#include <linux/errno.h>
80#include <linux/in.h>
81#include <linux/inet.h>
82#include <linux/netdevice.h>
83#include <linux/proc_fs.h>
84#include <linux/init.h>
85#include <linux/skbuff.h>
86#include <linux/rtnetlink.h>
87#include <linux/inetdevice.h>
88#include <linux/igmp.h>
89#include <linux/pkt_sched.h>
90#include <linux/mroute.h>
91#include <linux/netfilter_ipv4.h>
92#include <linux/random.h>
93#include <linux/jhash.h>
94#include <linux/rcupdate.h>
95#include <linux/times.h>
96#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
106#include <net/ip_mp_alg.h>
107#ifdef CONFIG_SYSCTL
108#include <linux/sysctl.h>
109#endif
110
/*
 * Extract the TOS bits selected by IPTOS_RT_MASK, plus the RTO_ONLINK
 * flag, from the flow key pointed to by @oldflp.  The argument is
 * parenthesized so that any pointer-valued expression can be passed.
 */
#define RT_FL_TOS(oldflp) \
	((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113
114#define IP_MAX_MTU 0xFFF0
115
116#define RT_GC_TIMEOUT (300*HZ)
117
118static int ip_rt_min_delay = 2 * HZ;
119static int ip_rt_max_delay = 10 * HZ;
120static int ip_rt_max_size;
121static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
122static int ip_rt_gc_interval = 60 * HZ;
123static int ip_rt_gc_min_interval = HZ / 2;
124static int ip_rt_redirect_number = 9;
125static int ip_rt_redirect_load = HZ / 50;
126static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
127static int ip_rt_error_cost = HZ;
128static int ip_rt_error_burst = 5 * HZ;
129static int ip_rt_gc_elasticity = 8;
130static int ip_rt_mtu_expires = 10 * 60 * HZ;
131static int ip_rt_min_pmtu = 512 + 20 + 20;
132static int ip_rt_min_advmss = 256;
133static int ip_rt_secret_interval = 10 * 60 * HZ;
134static unsigned long rt_deadline;
135
136#define RTprint(a...) printk(KERN_DEBUG a)
137
138static struct timer_list rt_flush_timer;
139static struct timer_list rt_periodic_timer;
140static struct timer_list rt_secret_timer;
141
142/*
143 * Interface to generic destination cache.
144 */
145
146static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
147static void ipv4_dst_destroy(struct dst_entry *dst);
148static void ipv4_dst_ifdown(struct dst_entry *dst,
149 struct net_device *dev, int how);
150static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
151static void ipv4_link_failure(struct sk_buff *skb);
152static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
153static int rt_garbage_collect(void);
154
155
156static struct dst_ops ipv4_dst_ops = {
157 .family = AF_INET,
158 .protocol = __constant_htons(ETH_P_IP),
159 .gc = rt_garbage_collect,
160 .check = ipv4_dst_check,
161 .destroy = ipv4_dst_destroy,
162 .ifdown = ipv4_dst_ifdown,
163 .negative_advice = ipv4_negative_advice,
164 .link_failure = ipv4_link_failure,
165 .update_pmtu = ip_rt_update_pmtu,
166 .entry_size = sizeof(struct rtable),
167};
168
169#define ECN_OR_COST(class) TC_PRIO_##class
170
171__u8 ip_tos2prio[16] = {
172 TC_PRIO_BESTEFFORT,
173 ECN_OR_COST(FILLER),
174 TC_PRIO_BESTEFFORT,
175 ECN_OR_COST(BESTEFFORT),
176 TC_PRIO_BULK,
177 ECN_OR_COST(BULK),
178 TC_PRIO_BULK,
179 ECN_OR_COST(BULK),
180 TC_PRIO_INTERACTIVE,
181 ECN_OR_COST(INTERACTIVE),
182 TC_PRIO_INTERACTIVE,
183 ECN_OR_COST(INTERACTIVE),
184 TC_PRIO_INTERACTIVE_BULK,
185 ECN_OR_COST(INTERACTIVE_BULK),
186 TC_PRIO_INTERACTIVE_BULK,
187 ECN_OR_COST(INTERACTIVE_BULK)
188};
189
190
191/*
192 * Route cache.
193 */
194
/* The locking scheme is fairly straightforward:
 *
 * 1) The buckets of the central route hash are protected by
 *    Read-Copy Update.
 * 2) Entries are removed only by writers, which hold the lock
 *    while they inspect rtable reference counts.
 * 3) References to rtable entries are acquired only by readers,
 *    using atomic increments and with the lock held.
 */

/* One hash bucket: the head of a singly linked chain of cached routes. */
struct rt_hash_bucket {
	struct rtable	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
 * (With lockdep spinlock_t is quite big, so keep the table small there.)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;

/*
 * Map a hash slot to the spinlock guarding it.  The expansion is fully
 * parenthesized so it composes safely inside larger expressions.
 */
# define rt_hash_lock_addr(slot) \
	(&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/*
 * Allocate and initialise the lock table; panics if allocation fails.
 * Wrapped in do { } while (0) so that it behaves as a single statement
 * (e.g. when used as the body of an unbraced if/else).
 */
# define rt_hash_lock_init()	do { \
		int i; \
		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, \
					GFP_KERNEL); \
		if (!rt_hash_locks) \
			panic("IP: failed to allocate rt_hash_locks\n"); \
		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
			spin_lock_init(&rt_hash_locks[i]); \
	} while (0)
#else
/* No lock table is built in this configuration. */
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()	do { } while (0)
#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700244
245static struct rt_hash_bucket *rt_hash_table;
246static unsigned rt_hash_mask;
247static int rt_hash_log;
248static unsigned int rt_hash_rnd;
249
Eric Dumazet2f970d82006-01-17 02:54:36 -0800250static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
Andrew Mortondbd29152006-01-17 21:58:01 -0800251#define RT_CACHE_STAT_INC(field) \
Paul Mackerrasbfe5d832006-06-25 05:47:14 -0700252 (__raw_get_cpu_var(rt_cache_stat).field++)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700253
254static int rt_intern_hash(unsigned hash, struct rtable *rth,
255 struct rtable **res);
256
Ilia Sotnikovcef26852006-03-25 01:38:55 -0800257static unsigned int rt_hash_code(u32 daddr, u32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700258{
Ilia Sotnikovcef26852006-03-25 01:38:55 -0800259 return (jhash_2words(daddr, saddr, rt_hash_rnd)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700260 & rt_hash_mask);
261}
262
263#ifdef CONFIG_PROC_FS
/* Per-open iterator state for the route cache seq_file. */
struct rt_cache_iter_state {
	int bucket;	/* hash bucket currently being walked */
};
267
268static struct rtable *rt_cache_get_first(struct seq_file *seq)
269{
270 struct rtable *r = NULL;
271 struct rt_cache_iter_state *st = seq->private;
272
273 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
274 rcu_read_lock_bh();
275 r = rt_hash_table[st->bucket].chain;
276 if (r)
277 break;
278 rcu_read_unlock_bh();
279 }
280 return r;
281}
282
283static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
284{
285 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
286
287 r = r->u.rt_next;
288 while (!r) {
289 rcu_read_unlock_bh();
290 if (--st->bucket < 0)
291 break;
292 rcu_read_lock_bh();
293 r = rt_hash_table[st->bucket].chain;
294 }
295 return r;
296}
297
298static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
299{
300 struct rtable *r = rt_cache_get_first(seq);
301
302 if (r)
303 while (pos && (r = rt_cache_get_next(seq, r)))
304 --pos;
305 return pos ? NULL : r;
306}
307
308static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
309{
310 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
311}
312
313static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
314{
315 struct rtable *r = NULL;
316
317 if (v == SEQ_START_TOKEN)
318 r = rt_cache_get_first(seq);
319 else
320 r = rt_cache_get_next(seq, v);
321 ++*pos;
322 return r;
323}
324
325static void rt_cache_seq_stop(struct seq_file *seq, void *v)
326{
327 if (v && v != SEQ_START_TOKEN)
328 rcu_read_unlock_bh();
329}
330
331static int rt_cache_seq_show(struct seq_file *seq, void *v)
332{
333 if (v == SEQ_START_TOKEN)
334 seq_printf(seq, "%-127s\n",
335 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
336 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
337 "HHUptod\tSpecDst");
338 else {
339 struct rtable *r = v;
340 char temp[256];
341
342 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
343 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
344 r->u.dst.dev ? r->u.dst.dev->name : "*",
345 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
346 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
347 r->u.dst.__use, 0, (unsigned long)r->rt_src,
348 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
349 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
350 dst_metric(&r->u.dst, RTAX_WINDOW),
351 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
352 dst_metric(&r->u.dst, RTAX_RTTVAR)),
353 r->fl.fl4_tos,
354 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
355 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
356 dev_queue_xmit) : 0,
357 r->rt_spec_dst);
358 seq_printf(seq, "%-127s\n", temp);
359 }
360 return 0;
361}
362
363static struct seq_operations rt_cache_seq_ops = {
364 .start = rt_cache_seq_start,
365 .next = rt_cache_seq_next,
366 .stop = rt_cache_seq_stop,
367 .show = rt_cache_seq_show,
368};
369
370static int rt_cache_seq_open(struct inode *inode, struct file *file)
371{
372 struct seq_file *seq;
373 int rc = -ENOMEM;
374 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
375
376 if (!s)
377 goto out;
378 rc = seq_open(file, &rt_cache_seq_ops);
379 if (rc)
380 goto out_kfree;
381 seq = file->private_data;
382 seq->private = s;
383 memset(s, 0, sizeof(*s));
384out:
385 return rc;
386out_kfree:
387 kfree(s);
388 goto out;
389}
390
391static struct file_operations rt_cache_seq_fops = {
392 .owner = THIS_MODULE,
393 .open = rt_cache_seq_open,
394 .read = seq_read,
395 .llseek = seq_lseek,
396 .release = seq_release_private,
397};
398
399
400static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
401{
402 int cpu;
403
404 if (*pos == 0)
405 return SEQ_START_TOKEN;
406
407 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
408 if (!cpu_possible(cpu))
409 continue;
410 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800411 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700412 }
413 return NULL;
414}
415
416static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
417{
418 int cpu;
419
420 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
421 if (!cpu_possible(cpu))
422 continue;
423 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800424 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700425 }
426 return NULL;
427
428}
429
/* seq_file ->stop for the per-CPU statistics: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
434
435static int rt_cpu_seq_show(struct seq_file *seq, void *v)
436{
437 struct rt_cache_stat *st = v;
438
439 if (v == SEQ_START_TOKEN) {
Olaf Rempel5bec0032005-04-28 12:16:08 -0700440 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700441 return 0;
442 }
443
444 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
445 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
446 atomic_read(&ipv4_dst_ops.entries),
447 st->in_hit,
448 st->in_slow_tot,
449 st->in_slow_mc,
450 st->in_no_route,
451 st->in_brd,
452 st->in_martian_dst,
453 st->in_martian_src,
454
455 st->out_hit,
456 st->out_slow_tot,
457 st->out_slow_mc,
458
459 st->gc_total,
460 st->gc_ignored,
461 st->gc_goal_miss,
462 st->gc_dst_overflow,
463 st->in_hlist_search,
464 st->out_hlist_search
465 );
466 return 0;
467}
468
469static struct seq_operations rt_cpu_seq_ops = {
470 .start = rt_cpu_seq_start,
471 .next = rt_cpu_seq_next,
472 .stop = rt_cpu_seq_stop,
473 .show = rt_cpu_seq_show,
474};
475
476
477static int rt_cpu_seq_open(struct inode *inode, struct file *file)
478{
479 return seq_open(file, &rt_cpu_seq_ops);
480}
481
482static struct file_operations rt_cpu_seq_fops = {
483 .owner = THIS_MODULE,
484 .open = rt_cpu_seq_open,
485 .read = seq_read,
486 .llseek = seq_lseek,
487 .release = seq_release,
488};
489
490#endif /* CONFIG_PROC_FS */
491
492static __inline__ void rt_free(struct rtable *rt)
493{
494 multipath_remove(rt);
495 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
496}
497
498static __inline__ void rt_drop(struct rtable *rt)
499{
500 multipath_remove(rt);
501 ip_rt_put(rt);
502 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
503}
504
505static __inline__ int rt_fast_clean(struct rtable *rth)
506{
507 /* Kill broadcast/multicast entries very aggresively, if they
508 collide in hash table with more useful entries */
509 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
510 rth->fl.iif && rth->u.rt_next;
511}
512
513static __inline__ int rt_valuable(struct rtable *rth)
514{
515 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
516 rth->u.dst.expires;
517}
518
519static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
520{
521 unsigned long age;
522 int ret = 0;
523
524 if (atomic_read(&rth->u.dst.__refcnt))
525 goto out;
526
527 ret = 1;
528 if (rth->u.dst.expires &&
529 time_after_eq(jiffies, rth->u.dst.expires))
530 goto out;
531
532 age = jiffies - rth->u.dst.lastuse;
533 ret = 0;
534 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
535 (age <= tmo2 && rt_valuable(rth)))
536 goto out;
537 ret = 1;
538out: return ret;
539}
540
541/* Bits of score are:
542 * 31: very valuable
543 * 30: not quite useless
544 * 29..0: usage counter
545 */
546static inline u32 rt_score(struct rtable *rt)
547{
548 u32 score = jiffies - rt->u.dst.lastuse;
549
550 score = ~score & ~(3<<30);
551
552 if (rt_valuable(rt))
553 score |= (1<<31);
554
555 if (!rt->fl.iif ||
556 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
557 score |= (1<<30);
558
559 return score;
560}
561
562static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
563{
564 return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
565 fl1->oif == fl2->oif &&
566 fl1->iif == fl2->iif;
567}
568
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
/*
 * Remove @expentry and every other DST_BALANCED entry with the same flow
 * key from the chain starting at @chain_head, freeing all of them.
 *
 * @removed_count: if non-NULL, receives the number of entries freed
 *                 (including @expentry itself).
 *
 * Returns the address of the rt_next link of the first surviving
 * non-balanced entry found at or after @expentry's position, so the
 * caller can resume its walk there, or NULL if there is none.
 */
static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
						struct rtable *expentry,
						int *removed_count)
{
	struct rtable **link = chain_head;
	struct rtable **resume = NULL;
	struct rtable *cur;
	int seen_expired = 0;

	if (removed_count)
		*removed_count = 0;

	while ((cur = *link) != NULL) {
		int balanced = (cur->u.dst.flags & DST_BALANCED) != 0;

		if (cur == expentry)
			seen_expired = 1;

		if (balanced && compare_keys(&cur->fl, &expentry->fl)) {
			/* Unlink; expentry itself is freed after the loop. */
			*link = cur->u.rt_next;
			if (cur != expentry) {
				rt_free(cur);
				if (removed_count)
					++(*removed_count);
			}
			continue;
		}

		if (!balanced && seen_expired && !resume)
			resume = &cur->u.rt_next;

		link = &cur->u.rt_next;
	}

	rt_free(expentry);
	if (removed_count)
		++(*removed_count);

	return resume;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
613
614
615/* This runs via a timer and thus is always in BH context. */
616static void rt_check_expire(unsigned long dummy)
617{
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700618 static unsigned int rover;
619 unsigned int i = rover, goal;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700620 struct rtable *rth, **rthp;
621 unsigned long now = jiffies;
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700622 u64 mult;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700623
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700624 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
625 if (ip_rt_gc_timeout > 1)
626 do_div(mult, ip_rt_gc_timeout);
627 goal = (unsigned int)mult;
628 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
629 for (; goal > 0; goal--) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700630 unsigned long tmo = ip_rt_gc_timeout;
631
632 i = (i + 1) & rt_hash_mask;
633 rthp = &rt_hash_table[i].chain;
634
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700635 if (*rthp == 0)
636 continue;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700637 spin_lock(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700638 while ((rth = *rthp) != NULL) {
639 if (rth->u.dst.expires) {
640 /* Entry is expired even if it is in use */
641 if (time_before_eq(now, rth->u.dst.expires)) {
642 tmo >>= 1;
643 rthp = &rth->u.rt_next;
644 continue;
645 }
646 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
647 tmo >>= 1;
648 rthp = &rth->u.rt_next;
649 continue;
650 }
651
652 /* Cleanup aged off entries. */
653#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
654 /* remove all related balanced entries if necessary */
655 if (rth->u.dst.flags & DST_BALANCED) {
656 rthp = rt_remove_balanced_route(
657 &rt_hash_table[i].chain,
658 rth, NULL);
659 if (!rthp)
660 break;
661 } else {
662 *rthp = rth->u.rt_next;
663 rt_free(rth);
664 }
665#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
666 *rthp = rth->u.rt_next;
667 rt_free(rth);
668#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
669 }
Eric Dumazet22c047c2005-07-05 14:55:24 -0700670 spin_unlock(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700671
672 /* Fallback loop breaker. */
673 if (time_after(jiffies, now))
674 break;
675 }
676 rover = i;
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700677 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700678}
679
680/* This can run from both BH and non-BH contexts, the latter
681 * in the case of a forced flush event.
682 */
683static void rt_run_flush(unsigned long dummy)
684{
685 int i;
686 struct rtable *rth, *next;
687
688 rt_deadline = 0;
689
690 get_random_bytes(&rt_hash_rnd, 4);
691
692 for (i = rt_hash_mask; i >= 0; i--) {
Eric Dumazet22c047c2005-07-05 14:55:24 -0700693 spin_lock_bh(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700694 rth = rt_hash_table[i].chain;
695 if (rth)
696 rt_hash_table[i].chain = NULL;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700697 spin_unlock_bh(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700698
699 for (; rth; rth = next) {
700 next = rth->u.rt_next;
701 rt_free(rth);
702 }
703 }
704}
705
706static DEFINE_SPINLOCK(rt_flush_lock);
707
708void rt_cache_flush(int delay)
709{
710 unsigned long now = jiffies;
711 int user_mode = !in_softirq();
712
713 if (delay < 0)
714 delay = ip_rt_min_delay;
715
716 /* flush existing multipath state*/
717 multipath_flush();
718
719 spin_lock_bh(&rt_flush_lock);
720
721 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
722 long tmo = (long)(rt_deadline - now);
723
724 /* If flush timer is already running
725 and flush request is not immediate (delay > 0):
726
727 if deadline is not achieved, prolongate timer to "delay",
728 otherwise fire it at deadline time.
729 */
730
731 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
732 tmo = 0;
733
734 if (delay > tmo)
735 delay = tmo;
736 }
737
738 if (delay <= 0) {
739 spin_unlock_bh(&rt_flush_lock);
740 rt_run_flush(0);
741 return;
742 }
743
744 if (rt_deadline == 0)
745 rt_deadline = now + ip_rt_max_delay;
746
747 mod_timer(&rt_flush_timer, now+delay);
748 spin_unlock_bh(&rt_flush_lock);
749}
750
751static void rt_secret_rebuild(unsigned long dummy)
752{
753 unsigned long now = jiffies;
754
755 rt_cache_flush(0);
756 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
757}
758
759/*
760 Short description of GC goals.
761
762 We want to build algorithm, which will keep routing cache
763 at some equilibrium point, when number of aged off entries
764 is kept approximately equal to newly generated ones.
765
766 Current expiration strength is variable "expire".
767 We try to adjust it dynamically, so that if networking
768 is idle expires is large enough to keep enough of warm entries,
769 and when load increases it reduces to limit cache size.
770 */
771
772static int rt_garbage_collect(void)
773{
774 static unsigned long expire = RT_GC_TIMEOUT;
775 static unsigned long last_gc;
776 static int rover;
777 static int equilibrium;
778 struct rtable *rth, **rthp;
779 unsigned long now = jiffies;
780 int goal;
781
782 /*
783 * Garbage collection is pretty expensive,
784 * do not make it too frequently.
785 */
786
787 RT_CACHE_STAT_INC(gc_total);
788
789 if (now - last_gc < ip_rt_gc_min_interval &&
790 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
791 RT_CACHE_STAT_INC(gc_ignored);
792 goto out;
793 }
794
795 /* Calculate number of entries, which we want to expire now. */
796 goal = atomic_read(&ipv4_dst_ops.entries) -
797 (ip_rt_gc_elasticity << rt_hash_log);
798 if (goal <= 0) {
799 if (equilibrium < ipv4_dst_ops.gc_thresh)
800 equilibrium = ipv4_dst_ops.gc_thresh;
801 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
802 if (goal > 0) {
803 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
804 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
805 }
806 } else {
807 /* We are in dangerous area. Try to reduce cache really
808 * aggressively.
809 */
810 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
811 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
812 }
813
814 if (now - last_gc >= ip_rt_gc_min_interval)
815 last_gc = now;
816
817 if (goal <= 0) {
818 equilibrium += goal;
819 goto work_done;
820 }
821
822 do {
823 int i, k;
824
825 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
826 unsigned long tmo = expire;
827
828 k = (k + 1) & rt_hash_mask;
829 rthp = &rt_hash_table[k].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700830 spin_lock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700831 while ((rth = *rthp) != NULL) {
832 if (!rt_may_expire(rth, tmo, expire)) {
833 tmo >>= 1;
834 rthp = &rth->u.rt_next;
835 continue;
836 }
837#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
838 /* remove all related balanced entries
839 * if necessary
840 */
841 if (rth->u.dst.flags & DST_BALANCED) {
842 int r;
843
844 rthp = rt_remove_balanced_route(
Suresh Bhogavilli85259872006-02-21 13:42:22 -0800845 &rt_hash_table[k].chain,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700846 rth,
847 &r);
848 goal -= r;
849 if (!rthp)
850 break;
851 } else {
852 *rthp = rth->u.rt_next;
853 rt_free(rth);
854 goal--;
855 }
856#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
857 *rthp = rth->u.rt_next;
858 rt_free(rth);
859 goal--;
860#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
861 }
Eric Dumazet22c047c2005-07-05 14:55:24 -0700862 spin_unlock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700863 if (goal <= 0)
864 break;
865 }
866 rover = k;
867
868 if (goal <= 0)
869 goto work_done;
870
871 /* Goal is not achieved. We stop process if:
872
873 - if expire reduced to zero. Otherwise, expire is halfed.
874 - if table is not full.
875 - if we are called from interrupt.
876 - jiffies check is just fallback/debug loop breaker.
877 We will not spin here for long time in any case.
878 */
879
880 RT_CACHE_STAT_INC(gc_goal_miss);
881
882 if (expire == 0)
883 break;
884
885 expire >>= 1;
886#if RT_CACHE_DEBUG >= 2
887 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
888 atomic_read(&ipv4_dst_ops.entries), goal, i);
889#endif
890
891 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
892 goto out;
893 } while (!in_softirq() && time_before_eq(jiffies, now));
894
895 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
896 goto out;
897 if (net_ratelimit())
898 printk(KERN_WARNING "dst cache overflow\n");
899 RT_CACHE_STAT_INC(gc_dst_overflow);
900 return 1;
901
902work_done:
903 expire += ip_rt_gc_min_interval;
904 if (expire > ip_rt_gc_timeout ||
905 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
906 expire = ip_rt_gc_timeout;
907#if RT_CACHE_DEBUG >= 2
908 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
909 atomic_read(&ipv4_dst_ops.entries), goal, rover);
910#endif
911out: return 0;
912}
913
914static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
915{
916 struct rtable *rth, **rthp;
917 unsigned long now;
918 struct rtable *cand, **candp;
919 u32 min_score;
920 int chain_length;
921 int attempts = !in_softirq();
922
923restart:
924 chain_length = 0;
925 min_score = ~(u32)0;
926 cand = NULL;
927 candp = NULL;
928 now = jiffies;
929
930 rthp = &rt_hash_table[hash].chain;
931
Eric Dumazet22c047c2005-07-05 14:55:24 -0700932 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700933 while ((rth = *rthp) != NULL) {
934#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
935 if (!(rth->u.dst.flags & DST_BALANCED) &&
936 compare_keys(&rth->fl, &rt->fl)) {
937#else
938 if (compare_keys(&rth->fl, &rt->fl)) {
939#endif
940 /* Put it first */
941 *rthp = rth->u.rt_next;
942 /*
943 * Since lookup is lockfree, the deletion
944 * must be visible to another weakly ordered CPU before
945 * the insertion at the start of the hash chain.
946 */
947 rcu_assign_pointer(rth->u.rt_next,
948 rt_hash_table[hash].chain);
949 /*
950 * Since lookup is lockfree, the update writes
951 * must be ordered for consistency on SMP.
952 */
953 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
954
955 rth->u.dst.__use++;
956 dst_hold(&rth->u.dst);
957 rth->u.dst.lastuse = now;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700958 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700959
960 rt_drop(rt);
961 *rp = rth;
962 return 0;
963 }
964
965 if (!atomic_read(&rth->u.dst.__refcnt)) {
966 u32 score = rt_score(rth);
967
968 if (score <= min_score) {
969 cand = rth;
970 candp = rthp;
971 min_score = score;
972 }
973 }
974
975 chain_length++;
976
977 rthp = &rth->u.rt_next;
978 }
979
980 if (cand) {
981 /* ip_rt_gc_elasticity used to be average length of chain
982 * length, when exceeded gc becomes really aggressive.
983 *
984 * The second limit is less certain. At the moment it allows
985 * only 2 entries per bucket. We will see.
986 */
987 if (chain_length > ip_rt_gc_elasticity) {
988 *candp = cand->u.rt_next;
989 rt_free(cand);
990 }
991 }
992
993 /* Try to bind route to arp only if it is output
994 route or unicast forwarding path.
995 */
996 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
997 int err = arp_bind_neighbour(&rt->u.dst);
998 if (err) {
Eric Dumazet22c047c2005-07-05 14:55:24 -0700999 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001000
1001 if (err != -ENOBUFS) {
1002 rt_drop(rt);
1003 return err;
1004 }
1005
1006 /* Neighbour tables are full and nothing
1007 can be released. Try to shrink route cache,
1008 it is most likely it holds some neighbour records.
1009 */
1010 if (attempts-- > 0) {
1011 int saved_elasticity = ip_rt_gc_elasticity;
1012 int saved_int = ip_rt_gc_min_interval;
1013 ip_rt_gc_elasticity = 1;
1014 ip_rt_gc_min_interval = 0;
1015 rt_garbage_collect();
1016 ip_rt_gc_min_interval = saved_int;
1017 ip_rt_gc_elasticity = saved_elasticity;
1018 goto restart;
1019 }
1020
1021 if (net_ratelimit())
1022 printk(KERN_WARNING "Neighbour table overflow.\n");
1023 rt_drop(rt);
1024 return -ENOBUFS;
1025 }
1026 }
1027
1028 rt->u.rt_next = rt_hash_table[hash].chain;
1029#if RT_CACHE_DEBUG >= 2
1030 if (rt->u.rt_next) {
1031 struct rtable *trt;
1032 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1033 NIPQUAD(rt->rt_dst));
1034 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1035 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1036 printk("\n");
1037 }
1038#endif
1039 rt_hash_table[hash].chain = rt;
Eric Dumazet22c047c2005-07-05 14:55:24 -07001040 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001041 *rp = rt;
1042 return 0;
1043}
1044
1045void rt_bind_peer(struct rtable *rt, int create)
1046{
1047 static DEFINE_SPINLOCK(rt_peer_lock);
1048 struct inet_peer *peer;
1049
1050 peer = inet_getpeer(rt->rt_dst, create);
1051
1052 spin_lock_bh(&rt_peer_lock);
1053 if (rt->peer == NULL) {
1054 rt->peer = peer;
1055 peer = NULL;
1056 }
1057 spin_unlock_bh(&rt_peer_lock);
1058 if (peer)
1059 inet_putpeer(peer);
1060}
1061
1062/*
1063 * Peer allocation may fail only in serious out-of-memory conditions. However
1064 * we still can generate some output.
1065 * Random ID selection looks a bit dangerous because we have no chances to
1066 * select ID being unique in a reasonable period of time.
1067 * But broken packet identifier may be better than no packet at all.
1068 */
1069static void ip_select_fb_ident(struct iphdr *iph)
1070{
1071 static DEFINE_SPINLOCK(ip_fb_id_lock);
1072 static u32 ip_fallback_id;
1073 u32 salt;
1074
1075 spin_lock_bh(&ip_fb_id_lock);
1076 salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1077 iph->id = htons(salt & 0xFFFF);
1078 ip_fallback_id = salt;
1079 spin_unlock_bh(&ip_fb_id_lock);
1080}
1081
1082void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1083{
1084 struct rtable *rt = (struct rtable *) dst;
1085
1086 if (rt) {
1087 if (rt->peer == NULL)
1088 rt_bind_peer(rt, 1);
1089
1090 /* If peer is attached to destination, it is never detached,
1091 so that we need not to grab a lock to dereference it.
1092 */
1093 if (rt->peer) {
1094 iph->id = htons(inet_getid(rt->peer, more));
1095 return;
1096 }
1097 } else
Stephen Hemminger9c2b3322005-04-19 22:39:42 -07001098 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1099 __builtin_return_address(0));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001100
1101 ip_select_fb_ident(iph);
1102}
1103
1104static void rt_del(unsigned hash, struct rtable *rt)
1105{
1106 struct rtable **rthp;
1107
Eric Dumazet22c047c2005-07-05 14:55:24 -07001108 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001109 ip_rt_put(rt);
1110 for (rthp = &rt_hash_table[hash].chain; *rthp;
1111 rthp = &(*rthp)->u.rt_next)
1112 if (*rthp == rt) {
1113 *rthp = rt->u.rt_next;
1114 rt_free(rt);
1115 break;
1116 }
Eric Dumazet22c047c2005-07-05 14:55:24 -07001117 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001118}
1119
1120void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
Ilia Sotnikovcef26852006-03-25 01:38:55 -08001121 u32 saddr, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001122{
1123 int i, k;
1124 struct in_device *in_dev = in_dev_get(dev);
1125 struct rtable *rth, **rthp;
1126 u32 skeys[2] = { saddr, 0 };
1127 int ikeys[2] = { dev->ifindex, 0 };
1128
Linus Torvalds1da177e2005-04-16 15:20:36 -07001129 if (!in_dev)
1130 return;
1131
1132 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1133 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1134 goto reject_redirect;
1135
1136 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1137 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1138 goto reject_redirect;
1139 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1140 goto reject_redirect;
1141 } else {
1142 if (inet_addr_type(new_gw) != RTN_UNICAST)
1143 goto reject_redirect;
1144 }
1145
1146 for (i = 0; i < 2; i++) {
1147 for (k = 0; k < 2; k++) {
1148 unsigned hash = rt_hash_code(daddr,
Ilia Sotnikovcef26852006-03-25 01:38:55 -08001149 skeys[i] ^ (ikeys[k] << 5));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001150
1151 rthp=&rt_hash_table[hash].chain;
1152
1153 rcu_read_lock();
1154 while ((rth = rcu_dereference(*rthp)) != NULL) {
1155 struct rtable *rt;
1156
1157 if (rth->fl.fl4_dst != daddr ||
1158 rth->fl.fl4_src != skeys[i] ||
Linus Torvalds1da177e2005-04-16 15:20:36 -07001159 rth->fl.oif != ikeys[k] ||
1160 rth->fl.iif != 0) {
1161 rthp = &rth->u.rt_next;
1162 continue;
1163 }
1164
1165 if (rth->rt_dst != daddr ||
1166 rth->rt_src != saddr ||
1167 rth->u.dst.error ||
1168 rth->rt_gateway != old_gw ||
1169 rth->u.dst.dev != dev)
1170 break;
1171
1172 dst_hold(&rth->u.dst);
1173 rcu_read_unlock();
1174
1175 rt = dst_alloc(&ipv4_dst_ops);
1176 if (rt == NULL) {
1177 ip_rt_put(rth);
1178 in_dev_put(in_dev);
1179 return;
1180 }
1181
1182 /* Copy all the information. */
1183 *rt = *rth;
1184 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1185 rt->u.dst.__use = 1;
1186 atomic_set(&rt->u.dst.__refcnt, 1);
1187 rt->u.dst.child = NULL;
1188 if (rt->u.dst.dev)
1189 dev_hold(rt->u.dst.dev);
1190 if (rt->idev)
1191 in_dev_hold(rt->idev);
1192 rt->u.dst.obsolete = 0;
1193 rt->u.dst.lastuse = jiffies;
1194 rt->u.dst.path = &rt->u.dst;
1195 rt->u.dst.neighbour = NULL;
1196 rt->u.dst.hh = NULL;
1197 rt->u.dst.xfrm = NULL;
1198
1199 rt->rt_flags |= RTCF_REDIRECTED;
1200
1201 /* Gateway is different ... */
1202 rt->rt_gateway = new_gw;
1203
1204 /* Redirect received -> path was valid */
1205 dst_confirm(&rth->u.dst);
1206
1207 if (rt->peer)
1208 atomic_inc(&rt->peer->refcnt);
1209
1210 if (arp_bind_neighbour(&rt->u.dst) ||
1211 !(rt->u.dst.neighbour->nud_state &
1212 NUD_VALID)) {
1213 if (rt->u.dst.neighbour)
1214 neigh_event_send(rt->u.dst.neighbour, NULL);
1215 ip_rt_put(rth);
1216 rt_drop(rt);
1217 goto do_next;
1218 }
1219
1220 rt_del(hash, rth);
1221 if (!rt_intern_hash(hash, rt, &rt))
1222 ip_rt_put(rt);
1223 goto do_next;
1224 }
1225 rcu_read_unlock();
1226 do_next:
1227 ;
1228 }
1229 }
1230 in_dev_put(in_dev);
1231 return;
1232
1233reject_redirect:
1234#ifdef CONFIG_IP_ROUTE_VERBOSE
1235 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1236 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1237 "%u.%u.%u.%u ignored.\n"
Ilia Sotnikovcef26852006-03-25 01:38:55 -08001238 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07001239 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
Ilia Sotnikovcef26852006-03-25 01:38:55 -08001240 NIPQUAD(saddr), NIPQUAD(daddr));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001241#endif
1242 in_dev_put(in_dev);
1243}
1244
1245static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1246{
1247 struct rtable *rt = (struct rtable*)dst;
1248 struct dst_entry *ret = dst;
1249
1250 if (rt) {
1251 if (dst->obsolete) {
1252 ip_rt_put(rt);
1253 ret = NULL;
1254 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1255 rt->u.dst.expires) {
1256 unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1257 rt->fl.fl4_src ^
Ilia Sotnikovcef26852006-03-25 01:38:55 -08001258 (rt->fl.oif << 5));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001259#if RT_CACHE_DEBUG >= 1
1260 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1261 "%u.%u.%u.%u/%02x dropped\n",
1262 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1263#endif
1264 rt_del(hash, rt);
1265 ret = NULL;
1266 }
1267 }
1268 return ret;
1269}
1270
1271/*
1272 * Algorithm:
1273 * 1. The first ip_rt_redirect_number redirects are sent
1274 * with exponential backoff, then we stop sending them at all,
1275 * assuming that the host ignores our redirects.
1276 * 2. If we did not see packets requiring redirects
1277 * during ip_rt_redirect_silence, we assume that the host
1278 * forgot redirected route and start to send redirects again.
1279 *
1280 * This algorithm is much cheaper and more intelligent than dumb load limiting
1281 * in icmp.c.
1282 *
1283 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1284 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1285 */
1286
1287void ip_rt_send_redirect(struct sk_buff *skb)
1288{
1289 struct rtable *rt = (struct rtable*)skb->dst;
1290 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1291
1292 if (!in_dev)
1293 return;
1294
1295 if (!IN_DEV_TX_REDIRECTS(in_dev))
1296 goto out;
1297
1298 /* No redirected packets during ip_rt_redirect_silence;
1299 * reset the algorithm.
1300 */
1301 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1302 rt->u.dst.rate_tokens = 0;
1303
1304 /* Too many ignored redirects; do not send anything
1305 * set u.dst.rate_last to the last seen redirected packet.
1306 */
1307 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1308 rt->u.dst.rate_last = jiffies;
1309 goto out;
1310 }
1311
1312 /* Check for load limit; set rate_last to the latest sent
1313 * redirect.
1314 */
1315 if (time_after(jiffies,
1316 (rt->u.dst.rate_last +
1317 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1318 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1319 rt->u.dst.rate_last = jiffies;
1320 ++rt->u.dst.rate_tokens;
1321#ifdef CONFIG_IP_ROUTE_VERBOSE
1322 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1323 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1324 net_ratelimit())
1325 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1326 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1327 NIPQUAD(rt->rt_src), rt->rt_iif,
1328 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1329#endif
1330 }
1331out:
1332 in_dev_put(in_dev);
1333}
1334
1335static int ip_error(struct sk_buff *skb)
1336{
1337 struct rtable *rt = (struct rtable*)skb->dst;
1338 unsigned long now;
1339 int code;
1340
1341 switch (rt->u.dst.error) {
1342 case EINVAL:
1343 default:
1344 goto out;
1345 case EHOSTUNREACH:
1346 code = ICMP_HOST_UNREACH;
1347 break;
1348 case ENETUNREACH:
1349 code = ICMP_NET_UNREACH;
1350 break;
1351 case EACCES:
1352 code = ICMP_PKT_FILTERED;
1353 break;
1354 }
1355
1356 now = jiffies;
1357 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1358 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1359 rt->u.dst.rate_tokens = ip_rt_error_burst;
1360 rt->u.dst.rate_last = now;
1361 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1362 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1363 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1364 }
1365
1366out: kfree_skb(skb);
1367 return 0;
1368}
1369
/*
 * MTU plateau table, in strictly descending order.
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau value strictly below old_mtu, or 68 when
 * old_mtu is not above the smallest plateau.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end =
		mtu_plateau + sizeof(mtu_plateau) / sizeof(mtu_plateau[0]);

	while (plateau != end) {
		if (*plateau < old_mtu)
			return *plateau;
		plateau++;
	}
	return 68;
}
1387
1388unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1389{
1390 int i;
1391 unsigned short old_mtu = ntohs(iph->tot_len);
1392 struct rtable *rth;
1393 u32 skeys[2] = { iph->saddr, 0, };
1394 u32 daddr = iph->daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001395 unsigned short est_mtu = 0;
1396
1397 if (ipv4_config.no_pmtu_disc)
1398 return 0;
1399
1400 for (i = 0; i < 2; i++) {
Ilia Sotnikovcef26852006-03-25 01:38:55 -08001401 unsigned hash = rt_hash_code(daddr, skeys[i]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001402
1403 rcu_read_lock();
1404 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1405 rth = rcu_dereference(rth->u.rt_next)) {
1406 if (rth->fl.fl4_dst == daddr &&
1407 rth->fl.fl4_src == skeys[i] &&
1408 rth->rt_dst == daddr &&
1409 rth->rt_src == iph->saddr &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001410 rth->fl.iif == 0 &&
1411 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1412 unsigned short mtu = new_mtu;
1413
1414 if (new_mtu < 68 || new_mtu >= old_mtu) {
1415
1416 /* BSD 4.2 compatibility hack :-( */
1417 if (mtu == 0 &&
1418 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1419 old_mtu >= 68 + (iph->ihl << 2))
1420 old_mtu -= iph->ihl << 2;
1421
1422 mtu = guess_mtu(old_mtu);
1423 }
1424 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1425 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1426 dst_confirm(&rth->u.dst);
1427 if (mtu < ip_rt_min_pmtu) {
1428 mtu = ip_rt_min_pmtu;
1429 rth->u.dst.metrics[RTAX_LOCK-1] |=
1430 (1 << RTAX_MTU);
1431 }
1432 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1433 dst_set_expires(&rth->u.dst,
1434 ip_rt_mtu_expires);
1435 }
1436 est_mtu = mtu;
1437 }
1438 }
1439 }
1440 rcu_read_unlock();
1441 }
1442 return est_mtu ? : new_mtu;
1443}
1444
1445static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1446{
1447 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1448 !(dst_metric_locked(dst, RTAX_MTU))) {
1449 if (mtu < ip_rt_min_pmtu) {
1450 mtu = ip_rt_min_pmtu;
1451 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1452 }
1453 dst->metrics[RTAX_MTU-1] = mtu;
1454 dst_set_expires(dst, ip_rt_mtu_expires);
1455 }
1456}
1457
1458static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1459{
1460 return NULL;
1461}
1462
1463static void ipv4_dst_destroy(struct dst_entry *dst)
1464{
1465 struct rtable *rt = (struct rtable *) dst;
1466 struct inet_peer *peer = rt->peer;
1467 struct in_device *idev = rt->idev;
1468
1469 if (peer) {
1470 rt->peer = NULL;
1471 inet_putpeer(peer);
1472 }
1473
1474 if (idev) {
1475 rt->idev = NULL;
1476 in_dev_put(idev);
1477 }
1478}
1479
1480static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1481 int how)
1482{
1483 struct rtable *rt = (struct rtable *) dst;
1484 struct in_device *idev = rt->idev;
1485 if (dev != &loopback_dev && idev && idev->dev == dev) {
1486 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1487 if (loopback_idev) {
1488 rt->idev = loopback_idev;
1489 in_dev_put(idev);
1490 }
1491 }
1492}
1493
1494static void ipv4_link_failure(struct sk_buff *skb)
1495{
1496 struct rtable *rt;
1497
1498 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1499
1500 rt = (struct rtable *) skb->dst;
1501 if (rt)
1502 dst_set_expires(&rt->u.dst, 0);
1503}
1504
1505static int ip_rt_bug(struct sk_buff *skb)
1506{
1507 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1508 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1509 skb->dev ? skb->dev->name : "?");
1510 kfree_skb(skb);
1511 return 0;
1512}
1513
1514/*
1515 We do not cache source address of outgoing interface,
1516 because it is used only by IP RR, TS and SRR options,
1517 so that it out of fast path.
1518
1519 BTW remember: "addr" is allowed to be not aligned
1520 in IP options!
1521 */
1522
1523void ip_rt_get_source(u8 *addr, struct rtable *rt)
1524{
1525 u32 src;
1526 struct fib_result res;
1527
1528 if (rt->fl.iif == 0)
1529 src = rt->rt_src;
1530 else if (fib_lookup(&rt->fl, &res) == 0) {
1531 src = FIB_RES_PREFSRC(res);
1532 fib_res_put(&res);
1533 } else
1534 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1535 RT_SCOPE_UNIVERSE);
1536 memcpy(addr, &src, 4);
1537}
1538
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge tag into the route's tclassid, one 16-bit half at a time: a half of
 * tclassid is filled from the same half of tag only if it is currently zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	struct dst_entry *dst = &rt->u.dst;

	if ((dst->tclassid & 0xFFFF) == 0)
		dst->tclassid |= tag & 0xFFFF;
	if ((dst->tclassid & 0xFFFF0000) == 0)
		dst->tclassid |= tag & 0xFFFF0000;
}
#endif
1548
1549static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1550{
1551 struct fib_info *fi = res->fi;
1552
1553 if (fi) {
1554 if (FIB_RES_GW(*res) &&
1555 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1556 rt->rt_gateway = FIB_RES_GW(*res);
1557 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1558 sizeof(rt->u.dst.metrics));
1559 if (fi->fib_mtu == 0) {
1560 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1561 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1562 rt->rt_gateway != rt->rt_dst &&
1563 rt->u.dst.dev->mtu > 576)
1564 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1565 }
1566#ifdef CONFIG_NET_CLS_ROUTE
1567 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1568#endif
1569 } else
1570 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1571
1572 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1573 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1574 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1575 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1576 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1577 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1578 ip_rt_min_advmss);
1579 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1580 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1581
1582#ifdef CONFIG_NET_CLS_ROUTE
1583#ifdef CONFIG_IP_MULTIPLE_TABLES
1584 set_class_tag(rt, fib_rules_tclass(res));
1585#endif
1586 set_class_tag(rt, itag);
1587#endif
1588 rt->rt_type = res->type;
1589}
1590
1591static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1592 u8 tos, struct net_device *dev, int our)
1593{
1594 unsigned hash;
1595 struct rtable *rth;
1596 u32 spec_dst;
1597 struct in_device *in_dev = in_dev_get(dev);
1598 u32 itag = 0;
1599
1600 /* Primary sanity checks. */
1601
1602 if (in_dev == NULL)
1603 return -EINVAL;
1604
1605 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1606 skb->protocol != htons(ETH_P_IP))
1607 goto e_inval;
1608
1609 if (ZERONET(saddr)) {
1610 if (!LOCAL_MCAST(daddr))
1611 goto e_inval;
1612 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1613 } else if (fib_validate_source(saddr, 0, tos, 0,
1614 dev, &spec_dst, &itag) < 0)
1615 goto e_inval;
1616
1617 rth = dst_alloc(&ipv4_dst_ops);
1618 if (!rth)
1619 goto e_nobufs;
1620
1621 rth->u.dst.output= ip_rt_bug;
1622
1623 atomic_set(&rth->u.dst.__refcnt, 1);
1624 rth->u.dst.flags= DST_HOST;
1625 if (in_dev->cnf.no_policy)
1626 rth->u.dst.flags |= DST_NOPOLICY;
1627 rth->fl.fl4_dst = daddr;
1628 rth->rt_dst = daddr;
1629 rth->fl.fl4_tos = tos;
1630#ifdef CONFIG_IP_ROUTE_FWMARK
1631 rth->fl.fl4_fwmark= skb->nfmark;
1632#endif
1633 rth->fl.fl4_src = saddr;
1634 rth->rt_src = saddr;
1635#ifdef CONFIG_NET_CLS_ROUTE
1636 rth->u.dst.tclassid = itag;
1637#endif
1638 rth->rt_iif =
1639 rth->fl.iif = dev->ifindex;
1640 rth->u.dst.dev = &loopback_dev;
1641 dev_hold(rth->u.dst.dev);
1642 rth->idev = in_dev_get(rth->u.dst.dev);
1643 rth->fl.oif = 0;
1644 rth->rt_gateway = daddr;
1645 rth->rt_spec_dst= spec_dst;
1646 rth->rt_type = RTN_MULTICAST;
1647 rth->rt_flags = RTCF_MULTICAST;
1648 if (our) {
1649 rth->u.dst.input= ip_local_deliver;
1650 rth->rt_flags |= RTCF_LOCAL;
1651 }
1652
1653#ifdef CONFIG_IP_MROUTE
1654 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1655 rth->u.dst.input = ip_mr_input;
1656#endif
1657 RT_CACHE_STAT_INC(in_slow_mc);
1658
1659 in_dev_put(in_dev);
Ilia Sotnikovcef26852006-03-25 01:38:55 -08001660 hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001661 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1662
1663e_nobufs:
1664 in_dev_put(in_dev);
1665 return -ENOBUFS;
1666
1667e_inval:
1668 in_dev_put(in_dev);
1669 return -EINVAL;
1670}
1671
1672
1673static void ip_handle_martian_source(struct net_device *dev,
1674 struct in_device *in_dev,
1675 struct sk_buff *skb,
1676 u32 daddr,
1677 u32 saddr)
1678{
1679 RT_CACHE_STAT_INC(in_martian_src);
1680#ifdef CONFIG_IP_ROUTE_VERBOSE
1681 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1682 /*
1683 * RFC1812 recommendation, if source is martian,
1684 * the only hint is MAC header.
1685 */
1686 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1687 "%u.%u.%u.%u, on dev %s\n",
1688 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
Olaf Kirch0b7f22a2005-07-11 21:01:42 -07001689 if (dev->hard_header_len && skb->mac.raw) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001690 int i;
1691 unsigned char *p = skb->mac.raw;
1692 printk(KERN_WARNING "ll header: ");
1693 for (i = 0; i < dev->hard_header_len; i++, p++) {
1694 printk("%02x", *p);
1695 if (i < (dev->hard_header_len - 1))
1696 printk(":");
1697 }
1698 printk("\n");
1699 }
1700 }
1701#endif
1702}
1703
1704static inline int __mkroute_input(struct sk_buff *skb,
1705 struct fib_result* res,
1706 struct in_device *in_dev,
1707 u32 daddr, u32 saddr, u32 tos,
1708 struct rtable **result)
1709{
1710
1711 struct rtable *rth;
1712 int err;
1713 struct in_device *out_dev;
1714 unsigned flags = 0;
1715 u32 spec_dst, itag;
1716
1717 /* get a working reference to the output device */
1718 out_dev = in_dev_get(FIB_RES_DEV(*res));
1719 if (out_dev == NULL) {
1720 if (net_ratelimit())
1721 printk(KERN_CRIT "Bug in ip_route_input" \
1722 "_slow(). Please, report\n");
1723 return -EINVAL;
1724 }
1725
1726
1727 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1728 in_dev->dev, &spec_dst, &itag);
1729 if (err < 0) {
1730 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1731 saddr);
1732
1733 err = -EINVAL;
1734 goto cleanup;
1735 }
1736
1737 if (err)
1738 flags |= RTCF_DIRECTSRC;
1739
1740 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1741 (IN_DEV_SHARED_MEDIA(out_dev) ||
1742 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1743 flags |= RTCF_DOREDIRECT;
1744
1745 if (skb->protocol != htons(ETH_P_IP)) {
1746 /* Not IP (i.e. ARP). Do not create route, if it is
1747 * invalid for proxy arp. DNAT routes are always valid.
1748 */
1749 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1750 err = -EINVAL;
1751 goto cleanup;
1752 }
1753 }
1754
1755
1756 rth = dst_alloc(&ipv4_dst_ops);
1757 if (!rth) {
1758 err = -ENOBUFS;
1759 goto cleanup;
1760 }
1761
Julian Anastasovce723d82005-09-08 13:34:47 -07001762 atomic_set(&rth->u.dst.__refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001763 rth->u.dst.flags= DST_HOST;
1764#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1765 if (res->fi->fib_nhs > 1)
1766 rth->u.dst.flags |= DST_BALANCED;
1767#endif
1768 if (in_dev->cnf.no_policy)
1769 rth->u.dst.flags |= DST_NOPOLICY;
1770 if (in_dev->cnf.no_xfrm)
1771 rth->u.dst.flags |= DST_NOXFRM;
1772 rth->fl.fl4_dst = daddr;
1773 rth->rt_dst = daddr;
1774 rth->fl.fl4_tos = tos;
1775#ifdef CONFIG_IP_ROUTE_FWMARK
1776 rth->fl.fl4_fwmark= skb->nfmark;
1777#endif
1778 rth->fl.fl4_src = saddr;
1779 rth->rt_src = saddr;
1780 rth->rt_gateway = daddr;
1781 rth->rt_iif =
1782 rth->fl.iif = in_dev->dev->ifindex;
1783 rth->u.dst.dev = (out_dev)->dev;
1784 dev_hold(rth->u.dst.dev);
1785 rth->idev = in_dev_get(rth->u.dst.dev);
1786 rth->fl.oif = 0;
1787 rth->rt_spec_dst= spec_dst;
1788
1789 rth->u.dst.input = ip_forward;
1790 rth->u.dst.output = ip_output;
1791
1792 rt_set_nexthop(rth, res, itag);
1793
1794 rth->rt_flags = flags;
1795
1796 *result = rth;
1797 err = 0;
1798 cleanup:
1799 /* release the working reference to the output device */
1800 in_dev_put(out_dev);
1801 return err;
1802}
1803
1804static inline int ip_mkroute_input_def(struct sk_buff *skb,
1805 struct fib_result* res,
1806 const struct flowi *fl,
1807 struct in_device *in_dev,
1808 u32 daddr, u32 saddr, u32 tos)
1809{
Chuck Short7abaa272005-06-22 22:10:23 -07001810 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001811 int err;
1812 unsigned hash;
1813
1814#ifdef CONFIG_IP_ROUTE_MULTIPATH
1815 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1816 fib_select_multipath(fl, res);
1817#endif
1818
1819 /* create a routing cache entry */
1820 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1821 if (err)
1822 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001823
1824 /* put it into the cache */
Ilia Sotnikovcef26852006-03-25 01:38:55 -08001825 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001826 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1827}
1828
1829static inline int ip_mkroute_input(struct sk_buff *skb,
1830 struct fib_result* res,
1831 const struct flowi *fl,
1832 struct in_device *in_dev,
1833 u32 daddr, u32 saddr, u32 tos)
1834{
1835#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
Julian Anastasovce723d82005-09-08 13:34:47 -07001836 struct rtable* rth = NULL, *rtres;
1837 unsigned char hop, hopcount;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001838 int err = -EINVAL;
1839 unsigned int hash;
1840
1841 if (res->fi)
1842 hopcount = res->fi->fib_nhs;
1843 else
1844 hopcount = 1;
1845
Linus Torvalds1da177e2005-04-16 15:20:36 -07001846 /* distinguish between multipath and singlepath */
1847 if (hopcount < 2)
1848 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1849 saddr, tos);
1850
1851 /* add all alternatives to the routing cache */
1852 for (hop = 0; hop < hopcount; hop++) {
1853 res->nh_sel = hop;
1854
Julian Anastasovce723d82005-09-08 13:34:47 -07001855 /* put reference to previous result */
1856 if (hop)
1857 ip_rt_put(rtres);
1858
Linus Torvalds1da177e2005-04-16 15:20:36 -07001859 /* create a routing cache entry */
1860 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1861 &rth);
1862 if (err)
1863 return err;
1864
1865 /* put it into the cache */
Ilia Sotnikovcef26852006-03-25 01:38:55 -08001866 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
Julian Anastasovce723d82005-09-08 13:34:47 -07001867 err = rt_intern_hash(hash, rth, &rtres);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001868 if (err)
1869 return err;
1870
1871 /* forward hop information to multipath impl. */
1872 multipath_set_nhinfo(rth,
1873 FIB_RES_NETWORK(*res),
1874 FIB_RES_NETMASK(*res),
1875 res->prefixlen,
1876 &FIB_RES_NH(*res));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001877 }
Julian Anastasovce723d82005-09-08 13:34:47 -07001878 skb->dst = &rtres->u.dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001879 return err;
1880#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1881 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1882#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1883}
1884
1885
1886/*
1887 * NOTE. We drop all the packets that has local source
1888 * addresses, because every properly looped back packet
1889 * must have correct destination already attached by output routine.
1890 *
1891 * Such approach solves two big problems:
1892 * 1. Not simplex devices are handled properly.
1893 * 2. IP spoofing attempts are filtered with 100% of guarantee.
1894 */
1895
1896static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1897 u8 tos, struct net_device *dev)
1898{
1899 struct fib_result res;
1900 struct in_device *in_dev = in_dev_get(dev);
1901 struct flowi fl = { .nl_u = { .ip4_u =
1902 { .daddr = daddr,
1903 .saddr = saddr,
1904 .tos = tos,
1905 .scope = RT_SCOPE_UNIVERSE,
1906#ifdef CONFIG_IP_ROUTE_FWMARK
1907 .fwmark = skb->nfmark
1908#endif
1909 } },
1910 .iif = dev->ifindex };
1911 unsigned flags = 0;
1912 u32 itag = 0;
1913 struct rtable * rth;
1914 unsigned hash;
1915 u32 spec_dst;
1916 int err = -EINVAL;
1917 int free_res = 0;
1918
1919 /* IP on this device is disabled. */
1920
1921 if (!in_dev)
1922 goto out;
1923
1924 /* Check for the most weird martians, which can be not detected
1925 by fib_lookup.
1926 */
1927
1928 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1929 goto martian_source;
1930
1931 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1932 goto brd_input;
1933
1934 /* Accept zero addresses only to limited broadcast;
1935 * I even do not know to fix it or not. Waiting for complains :-)
1936 */
1937 if (ZERONET(saddr))
1938 goto martian_source;
1939
1940 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1941 goto martian_destination;
1942
1943 /*
1944 * Now we are ready to route packet.
1945 */
1946 if ((err = fib_lookup(&fl, &res)) != 0) {
1947 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07001948 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001949 goto no_route;
1950 }
1951 free_res = 1;
1952
1953 RT_CACHE_STAT_INC(in_slow_tot);
1954
1955 if (res.type == RTN_BROADCAST)
1956 goto brd_input;
1957
1958 if (res.type == RTN_LOCAL) {
1959 int result;
1960 result = fib_validate_source(saddr, daddr, tos,
1961 loopback_dev.ifindex,
1962 dev, &spec_dst, &itag);
1963 if (result < 0)
1964 goto martian_source;
1965 if (result)
1966 flags |= RTCF_DIRECTSRC;
1967 spec_dst = daddr;
1968 goto local_input;
1969 }
1970
1971 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07001972 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001973 if (res.type != RTN_UNICAST)
1974 goto martian_destination;
1975
1976 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1977 if (err == -ENOBUFS)
1978 goto e_nobufs;
1979 if (err == -EINVAL)
1980 goto e_inval;
1981
1982done:
1983 in_dev_put(in_dev);
1984 if (free_res)
1985 fib_res_put(&res);
1986out: return err;
1987
1988brd_input:
1989 if (skb->protocol != htons(ETH_P_IP))
1990 goto e_inval;
1991
1992 if (ZERONET(saddr))
1993 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1994 else {
1995 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1996 &itag);
1997 if (err < 0)
1998 goto martian_source;
1999 if (err)
2000 flags |= RTCF_DIRECTSRC;
2001 }
2002 flags |= RTCF_BROADCAST;
2003 res.type = RTN_BROADCAST;
2004 RT_CACHE_STAT_INC(in_brd);
2005
2006local_input:
2007 rth = dst_alloc(&ipv4_dst_ops);
2008 if (!rth)
2009 goto e_nobufs;
2010
2011 rth->u.dst.output= ip_rt_bug;
2012
2013 atomic_set(&rth->u.dst.__refcnt, 1);
2014 rth->u.dst.flags= DST_HOST;
2015 if (in_dev->cnf.no_policy)
2016 rth->u.dst.flags |= DST_NOPOLICY;
2017 rth->fl.fl4_dst = daddr;
2018 rth->rt_dst = daddr;
2019 rth->fl.fl4_tos = tos;
2020#ifdef CONFIG_IP_ROUTE_FWMARK
2021 rth->fl.fl4_fwmark= skb->nfmark;
2022#endif
2023 rth->fl.fl4_src = saddr;
2024 rth->rt_src = saddr;
2025#ifdef CONFIG_NET_CLS_ROUTE
2026 rth->u.dst.tclassid = itag;
2027#endif
2028 rth->rt_iif =
2029 rth->fl.iif = dev->ifindex;
2030 rth->u.dst.dev = &loopback_dev;
2031 dev_hold(rth->u.dst.dev);
2032 rth->idev = in_dev_get(rth->u.dst.dev);
2033 rth->rt_gateway = daddr;
2034 rth->rt_spec_dst= spec_dst;
2035 rth->u.dst.input= ip_local_deliver;
2036 rth->rt_flags = flags|RTCF_LOCAL;
2037 if (res.type == RTN_UNREACHABLE) {
2038 rth->u.dst.input= ip_error;
2039 rth->u.dst.error= -err;
2040 rth->rt_flags &= ~RTCF_LOCAL;
2041 }
2042 rth->rt_type = res.type;
Ilia Sotnikovcef26852006-03-25 01:38:55 -08002043 hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002044 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2045 goto done;
2046
2047no_route:
2048 RT_CACHE_STAT_INC(in_no_route);
2049 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2050 res.type = RTN_UNREACHABLE;
2051 goto local_input;
2052
2053 /*
2054 * Do not cache martian addresses: they should be logged (RFC1812)
2055 */
2056martian_destination:
2057 RT_CACHE_STAT_INC(in_martian_dst);
2058#ifdef CONFIG_IP_ROUTE_VERBOSE
2059 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2060 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2061 "%u.%u.%u.%u, dev %s\n",
2062 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2063#endif
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002064
2065e_hostunreach:
2066 err = -EHOSTUNREACH;
2067 goto done;
2068
Linus Torvalds1da177e2005-04-16 15:20:36 -07002069e_inval:
2070 err = -EINVAL;
2071 goto done;
2072
2073e_nobufs:
2074 err = -ENOBUFS;
2075 goto done;
2076
2077martian_source:
2078 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2079 goto e_inval;
2080}
2081
2082int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2083 u8 tos, struct net_device *dev)
2084{
2085 struct rtable * rth;
2086 unsigned hash;
2087 int iif = dev->ifindex;
2088
2089 tos &= IPTOS_RT_MASK;
Ilia Sotnikovcef26852006-03-25 01:38:55 -08002090 hash = rt_hash_code(daddr, saddr ^ (iif << 5));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002091
2092 rcu_read_lock();
2093 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2094 rth = rcu_dereference(rth->u.rt_next)) {
2095 if (rth->fl.fl4_dst == daddr &&
2096 rth->fl.fl4_src == saddr &&
2097 rth->fl.iif == iif &&
2098 rth->fl.oif == 0 &&
2099#ifdef CONFIG_IP_ROUTE_FWMARK
2100 rth->fl.fl4_fwmark == skb->nfmark &&
2101#endif
2102 rth->fl.fl4_tos == tos) {
2103 rth->u.dst.lastuse = jiffies;
2104 dst_hold(&rth->u.dst);
2105 rth->u.dst.__use++;
2106 RT_CACHE_STAT_INC(in_hit);
2107 rcu_read_unlock();
2108 skb->dst = (struct dst_entry*)rth;
2109 return 0;
2110 }
2111 RT_CACHE_STAT_INC(in_hlist_search);
2112 }
2113 rcu_read_unlock();
2114
2115 /* Multicast recognition logic is moved from route cache to here.
2116 The problem was that too many Ethernet cards have broken/missing
2117 hardware multicast filters :-( As result the host on multicasting
2118 network acquires a lot of useless route cache entries, sort of
2119 SDR messages from all the world. Now we try to get rid of them.
2120 Really, provided software IP multicast filter is organized
2121 reasonably (at least, hashed), it does not result in a slowdown
2122 comparing with route cache reject entries.
2123 Note, that multicast routers are not affected, because
2124 route cache entry is created eventually.
2125 */
2126 if (MULTICAST(daddr)) {
2127 struct in_device *in_dev;
2128
2129 rcu_read_lock();
Herbert Xue5ed6392005-10-03 14:35:55 -07002130 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002131 int our = ip_check_mc(in_dev, daddr, saddr,
2132 skb->nh.iph->protocol);
2133 if (our
2134#ifdef CONFIG_IP_MROUTE
2135 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2136#endif
2137 ) {
2138 rcu_read_unlock();
2139 return ip_route_input_mc(skb, daddr, saddr,
2140 tos, dev, our);
2141 }
2142 }
2143 rcu_read_unlock();
2144 return -EINVAL;
2145 }
2146 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2147}
2148
2149static inline int __mkroute_output(struct rtable **result,
2150 struct fib_result* res,
2151 const struct flowi *fl,
2152 const struct flowi *oldflp,
2153 struct net_device *dev_out,
2154 unsigned flags)
2155{
2156 struct rtable *rth;
2157 struct in_device *in_dev;
2158 u32 tos = RT_FL_TOS(oldflp);
2159 int err = 0;
2160
2161 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2162 return -EINVAL;
2163
2164 if (fl->fl4_dst == 0xFFFFFFFF)
2165 res->type = RTN_BROADCAST;
2166 else if (MULTICAST(fl->fl4_dst))
2167 res->type = RTN_MULTICAST;
2168 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2169 return -EINVAL;
2170
2171 if (dev_out->flags & IFF_LOOPBACK)
2172 flags |= RTCF_LOCAL;
2173
2174 /* get work reference to inet device */
2175 in_dev = in_dev_get(dev_out);
2176 if (!in_dev)
2177 return -EINVAL;
2178
2179 if (res->type == RTN_BROADCAST) {
2180 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2181 if (res->fi) {
2182 fib_info_put(res->fi);
2183 res->fi = NULL;
2184 }
2185 } else if (res->type == RTN_MULTICAST) {
2186 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2187 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2188 oldflp->proto))
2189 flags &= ~RTCF_LOCAL;
2190 /* If multicast route do not exist use
2191 default one, but do not gateway in this case.
2192 Yes, it is hack.
2193 */
2194 if (res->fi && res->prefixlen < 4) {
2195 fib_info_put(res->fi);
2196 res->fi = NULL;
2197 }
2198 }
2199
2200
2201 rth = dst_alloc(&ipv4_dst_ops);
2202 if (!rth) {
2203 err = -ENOBUFS;
2204 goto cleanup;
2205 }
2206
Julian Anastasovce723d82005-09-08 13:34:47 -07002207 atomic_set(&rth->u.dst.__refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002208 rth->u.dst.flags= DST_HOST;
2209#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2210 if (res->fi) {
2211 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2212 if (res->fi->fib_nhs > 1)
2213 rth->u.dst.flags |= DST_BALANCED;
2214 }
2215#endif
2216 if (in_dev->cnf.no_xfrm)
2217 rth->u.dst.flags |= DST_NOXFRM;
2218 if (in_dev->cnf.no_policy)
2219 rth->u.dst.flags |= DST_NOPOLICY;
2220
2221 rth->fl.fl4_dst = oldflp->fl4_dst;
2222 rth->fl.fl4_tos = tos;
2223 rth->fl.fl4_src = oldflp->fl4_src;
2224 rth->fl.oif = oldflp->oif;
2225#ifdef CONFIG_IP_ROUTE_FWMARK
2226 rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2227#endif
2228 rth->rt_dst = fl->fl4_dst;
2229 rth->rt_src = fl->fl4_src;
2230 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2231 /* get references to the devices that are to be hold by the routing
2232 cache entry */
2233 rth->u.dst.dev = dev_out;
2234 dev_hold(dev_out);
2235 rth->idev = in_dev_get(dev_out);
2236 rth->rt_gateway = fl->fl4_dst;
2237 rth->rt_spec_dst= fl->fl4_src;
2238
2239 rth->u.dst.output=ip_output;
2240
2241 RT_CACHE_STAT_INC(out_slow_tot);
2242
2243 if (flags & RTCF_LOCAL) {
2244 rth->u.dst.input = ip_local_deliver;
2245 rth->rt_spec_dst = fl->fl4_dst;
2246 }
2247 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2248 rth->rt_spec_dst = fl->fl4_src;
2249 if (flags & RTCF_LOCAL &&
2250 !(dev_out->flags & IFF_LOOPBACK)) {
2251 rth->u.dst.output = ip_mc_output;
2252 RT_CACHE_STAT_INC(out_slow_mc);
2253 }
2254#ifdef CONFIG_IP_MROUTE
2255 if (res->type == RTN_MULTICAST) {
2256 if (IN_DEV_MFORWARD(in_dev) &&
2257 !LOCAL_MCAST(oldflp->fl4_dst)) {
2258 rth->u.dst.input = ip_mr_input;
2259 rth->u.dst.output = ip_mc_output;
2260 }
2261 }
2262#endif
2263 }
2264
2265 rt_set_nexthop(rth, res, 0);
2266
2267 rth->rt_flags = flags;
2268
2269 *result = rth;
2270 cleanup:
2271 /* release work reference to inet device */
2272 in_dev_put(in_dev);
2273
2274 return err;
2275}
2276
2277static inline int ip_mkroute_output_def(struct rtable **rp,
2278 struct fib_result* res,
2279 const struct flowi *fl,
2280 const struct flowi *oldflp,
2281 struct net_device *dev_out,
2282 unsigned flags)
2283{
Chuck Short7abaa272005-06-22 22:10:23 -07002284 struct rtable *rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002285 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2286 unsigned hash;
2287 if (err == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002288 hash = rt_hash_code(oldflp->fl4_dst,
Ilia Sotnikovcef26852006-03-25 01:38:55 -08002289 oldflp->fl4_src ^ (oldflp->oif << 5));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002290 err = rt_intern_hash(hash, rth, rp);
2291 }
2292
2293 return err;
2294}
2295
/*
 * Create the output route cache entry (or entries) for a resolved flow.
 *
 * With CONFIG_IP_ROUTE_MULTIPATH_CACHED and a FIB result that has more
 * than one next hop, one cache entry is created per next hop, all under
 * the same hash key, and each successfully interned entry is announced to
 * the multipath implementation.  *rp ends up referring to the entry of the
 * last hop; the reference on each earlier result is dropped before the
 * next hop is processed.  In every other case the work is delegated to
 * ip_mkroute_output_def().
 *
 * Returns 0 on success or the first error encountered.
 */
static inline int ip_mkroute_output(struct rtable **rp,
				    struct fib_result *res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			/* put reference to previous result */
			if (hop)
				ip_rt_put(*rp);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);

			if (err != 0)
				goto cleanup;

			hash = rt_hash_code(oldflp->fl4_dst,
					    oldflp->fl4_src ^
					    (oldflp->oif << 5));
			err = rt_intern_hash(hash, rth, rp);

			/*
			 * Do not announce a route that failed to be
			 * interned to the multipath implementation.
			 * NOTE(review): rt_intern_hash() is defined outside
			 * this view; it appears to drop the route it was
			 * given when it fails - confirm.
			 */
			if (err != 0)
				goto cleanup;

			/* forward hop information to multipath impl. */
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));
		cleanup:
			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		return err;
	} else {
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
2358
2359/*
2360 * Major route resolver routine.
2361 */
2362
2363static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2364{
2365 u32 tos = RT_FL_TOS(oldflp);
2366 struct flowi fl = { .nl_u = { .ip4_u =
2367 { .daddr = oldflp->fl4_dst,
2368 .saddr = oldflp->fl4_src,
2369 .tos = tos & IPTOS_RT_MASK,
2370 .scope = ((tos & RTO_ONLINK) ?
2371 RT_SCOPE_LINK :
2372 RT_SCOPE_UNIVERSE),
2373#ifdef CONFIG_IP_ROUTE_FWMARK
2374 .fwmark = oldflp->fl4_fwmark
2375#endif
2376 } },
2377 .iif = loopback_dev.ifindex,
2378 .oif = oldflp->oif };
2379 struct fib_result res;
2380 unsigned flags = 0;
2381 struct net_device *dev_out = NULL;
2382 int free_res = 0;
2383 int err;
2384
2385
2386 res.fi = NULL;
2387#ifdef CONFIG_IP_MULTIPLE_TABLES
2388 res.r = NULL;
2389#endif
2390
2391 if (oldflp->fl4_src) {
2392 err = -EINVAL;
2393 if (MULTICAST(oldflp->fl4_src) ||
2394 BADCLASS(oldflp->fl4_src) ||
2395 ZERONET(oldflp->fl4_src))
2396 goto out;
2397
2398 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2399 dev_out = ip_dev_find(oldflp->fl4_src);
2400 if (dev_out == NULL)
2401 goto out;
2402
2403 /* I removed check for oif == dev_out->oif here.
2404 It was wrong for two reasons:
2405 1. ip_dev_find(saddr) can return wrong iface, if saddr is
2406 assigned to multiple interfaces.
2407 2. Moreover, we are allowed to send packets with saddr
2408 of another iface. --ANK
2409 */
2410
2411 if (oldflp->oif == 0
2412 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2413 /* Special hack: user can direct multicasts
2414 and limited broadcast via necessary interface
2415 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2416 This hack is not just for fun, it allows
2417 vic,vat and friends to work.
2418 They bind socket to loopback, set ttl to zero
2419 and expect that it will work.
2420 From the viewpoint of routing cache they are broken,
2421 because we are not allowed to build multicast path
2422 with loopback source addr (look, routing cache
2423 cannot know, that ttl is zero, so that packet
2424 will not leave this host and route is valid).
2425 Luckily, this hack is good workaround.
2426 */
2427
2428 fl.oif = dev_out->ifindex;
2429 goto make_route;
2430 }
2431 if (dev_out)
2432 dev_put(dev_out);
2433 dev_out = NULL;
2434 }
2435
2436
2437 if (oldflp->oif) {
2438 dev_out = dev_get_by_index(oldflp->oif);
2439 err = -ENODEV;
2440 if (dev_out == NULL)
2441 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002442
2443 /* RACE: Check return value of inet_select_addr instead. */
2444 if (__in_dev_get_rtnl(dev_out) == NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002445 dev_put(dev_out);
2446 goto out; /* Wrong error code */
2447 }
2448
2449 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2450 if (!fl.fl4_src)
2451 fl.fl4_src = inet_select_addr(dev_out, 0,
2452 RT_SCOPE_LINK);
2453 goto make_route;
2454 }
2455 if (!fl.fl4_src) {
2456 if (MULTICAST(oldflp->fl4_dst))
2457 fl.fl4_src = inet_select_addr(dev_out, 0,
2458 fl.fl4_scope);
2459 else if (!oldflp->fl4_dst)
2460 fl.fl4_src = inet_select_addr(dev_out, 0,
2461 RT_SCOPE_HOST);
2462 }
2463 }
2464
2465 if (!fl.fl4_dst) {
2466 fl.fl4_dst = fl.fl4_src;
2467 if (!fl.fl4_dst)
2468 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2469 if (dev_out)
2470 dev_put(dev_out);
2471 dev_out = &loopback_dev;
2472 dev_hold(dev_out);
2473 fl.oif = loopback_dev.ifindex;
2474 res.type = RTN_LOCAL;
2475 flags |= RTCF_LOCAL;
2476 goto make_route;
2477 }
2478
2479 if (fib_lookup(&fl, &res)) {
2480 res.fi = NULL;
2481 if (oldflp->oif) {
2482 /* Apparently, routing tables are wrong. Assume,
2483 that the destination is on link.
2484
2485 WHY? DW.
2486 Because we are allowed to send to iface
2487 even if it has NO routes and NO assigned
2488 addresses. When oif is specified, routing
2489 tables are looked up with only one purpose:
2490 to catch if destination is gatewayed, rather than
2491 direct. Moreover, if MSG_DONTROUTE is set,
2492 we send packet, ignoring both routing tables
2493 and ifaddr state. --ANK
2494
2495
2496 We could make it even if oif is unknown,
2497 likely IPv6, but we do not.
2498 */
2499
2500 if (fl.fl4_src == 0)
2501 fl.fl4_src = inet_select_addr(dev_out, 0,
2502 RT_SCOPE_LINK);
2503 res.type = RTN_UNICAST;
2504 goto make_route;
2505 }
2506 if (dev_out)
2507 dev_put(dev_out);
2508 err = -ENETUNREACH;
2509 goto out;
2510 }
2511 free_res = 1;
2512
2513 if (res.type == RTN_LOCAL) {
2514 if (!fl.fl4_src)
2515 fl.fl4_src = fl.fl4_dst;
2516 if (dev_out)
2517 dev_put(dev_out);
2518 dev_out = &loopback_dev;
2519 dev_hold(dev_out);
2520 fl.oif = dev_out->ifindex;
2521 if (res.fi)
2522 fib_info_put(res.fi);
2523 res.fi = NULL;
2524 flags |= RTCF_LOCAL;
2525 goto make_route;
2526 }
2527
2528#ifdef CONFIG_IP_ROUTE_MULTIPATH
2529 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2530 fib_select_multipath(&fl, &res);
2531 else
2532#endif
2533 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2534 fib_select_default(&fl, &res);
2535
2536 if (!fl.fl4_src)
2537 fl.fl4_src = FIB_RES_PREFSRC(res);
2538
2539 if (dev_out)
2540 dev_put(dev_out);
2541 dev_out = FIB_RES_DEV(res);
2542 dev_hold(dev_out);
2543 fl.oif = dev_out->ifindex;
2544
2545
2546make_route:
2547 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2548
2549
2550 if (free_res)
2551 fib_res_put(&res);
2552 if (dev_out)
2553 dev_put(dev_out);
2554out: return err;
2555}
2556
2557int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2558{
2559 unsigned hash;
2560 struct rtable *rth;
2561
Ilia Sotnikovcef26852006-03-25 01:38:55 -08002562 hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002563
2564 rcu_read_lock_bh();
2565 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2566 rth = rcu_dereference(rth->u.rt_next)) {
2567 if (rth->fl.fl4_dst == flp->fl4_dst &&
2568 rth->fl.fl4_src == flp->fl4_src &&
2569 rth->fl.iif == 0 &&
2570 rth->fl.oif == flp->oif &&
2571#ifdef CONFIG_IP_ROUTE_FWMARK
2572 rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2573#endif
2574 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2575 (IPTOS_RT_MASK | RTO_ONLINK))) {
2576
2577 /* check for multipath routes and choose one if
2578 * necessary
2579 */
2580 if (multipath_select_route(flp, rth, rp)) {
2581 dst_hold(&(*rp)->u.dst);
2582 RT_CACHE_STAT_INC(out_hit);
2583 rcu_read_unlock_bh();
2584 return 0;
2585 }
2586
2587 rth->u.dst.lastuse = jiffies;
2588 dst_hold(&rth->u.dst);
2589 rth->u.dst.__use++;
2590 RT_CACHE_STAT_INC(out_hit);
2591 rcu_read_unlock_bh();
2592 *rp = rth;
2593 return 0;
2594 }
2595 RT_CACHE_STAT_INC(out_hlist_search);
2596 }
2597 rcu_read_unlock_bh();
2598
2599 return ip_route_output_slow(rp, flp);
2600}
2601
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002602EXPORT_SYMBOL_GPL(__ip_route_output_key);
2603
Linus Torvalds1da177e2005-04-16 15:20:36 -07002604int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2605{
2606 int err;
2607
2608 if ((err = __ip_route_output_key(rp, flp)) != 0)
2609 return err;
2610
2611 if (flp->proto) {
2612 if (!flp->fl4_src)
2613 flp->fl4_src = (*rp)->rt_src;
2614 if (!flp->fl4_dst)
2615 flp->fl4_dst = (*rp)->rt_dst;
2616 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2617 }
2618
2619 return 0;
2620}
2621
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002622EXPORT_SYMBOL_GPL(ip_route_output_flow);
2623
Linus Torvalds1da177e2005-04-16 15:20:36 -07002624int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2625{
2626 return ip_route_output_flow(rp, flp, NULL, 0);
2627}
2628
2629static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002630 int nowait, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002631{
2632 struct rtable *rt = (struct rtable*)skb->dst;
2633 struct rtmsg *r;
2634 struct nlmsghdr *nlh;
2635 unsigned char *b = skb->tail;
2636 struct rta_cacheinfo ci;
2637#ifdef CONFIG_IP_MROUTE
2638 struct rtattr *eptr;
2639#endif
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002640 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002641 r = NLMSG_DATA(nlh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002642 r->rtm_family = AF_INET;
2643 r->rtm_dst_len = 32;
2644 r->rtm_src_len = 0;
2645 r->rtm_tos = rt->fl.fl4_tos;
2646 r->rtm_table = RT_TABLE_MAIN;
2647 r->rtm_type = rt->rt_type;
2648 r->rtm_scope = RT_SCOPE_UNIVERSE;
2649 r->rtm_protocol = RTPROT_UNSPEC;
2650 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2651 if (rt->rt_flags & RTCF_NOTIFY)
2652 r->rtm_flags |= RTM_F_NOTIFY;
2653 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2654 if (rt->fl.fl4_src) {
2655 r->rtm_src_len = 32;
2656 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2657 }
2658 if (rt->u.dst.dev)
2659 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2660#ifdef CONFIG_NET_CLS_ROUTE
2661 if (rt->u.dst.tclassid)
2662 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2663#endif
2664#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2665 if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2666 __u32 alg = rt->rt_multipath_alg;
2667
2668 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2669 }
2670#endif
2671 if (rt->fl.iif)
2672 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2673 else if (rt->rt_src != rt->fl.fl4_src)
2674 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2675 if (rt->rt_dst != rt->rt_gateway)
2676 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2677 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2678 goto rtattr_failure;
2679 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2680 ci.rta_used = rt->u.dst.__use;
2681 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2682 if (rt->u.dst.expires)
2683 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2684 else
2685 ci.rta_expires = 0;
2686 ci.rta_error = rt->u.dst.error;
2687 ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
2688 if (rt->peer) {
2689 ci.rta_id = rt->peer->ip_id_count;
2690 if (rt->peer->tcp_ts_stamp) {
2691 ci.rta_ts = rt->peer->tcp_ts;
2692 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2693 }
2694 }
2695#ifdef CONFIG_IP_MROUTE
2696 eptr = (struct rtattr*)skb->tail;
2697#endif
2698 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2699 if (rt->fl.iif) {
2700#ifdef CONFIG_IP_MROUTE
2701 u32 dst = rt->rt_dst;
2702
2703 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2704 ipv4_devconf.mc_forwarding) {
2705 int err = ipmr_get_route(skb, r, nowait);
2706 if (err <= 0) {
2707 if (!nowait) {
2708 if (err == 0)
2709 return 0;
2710 goto nlmsg_failure;
2711 } else {
2712 if (err == -EMSGSIZE)
2713 goto nlmsg_failure;
2714 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2715 }
2716 }
2717 } else
2718#endif
2719 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2720 }
2721
2722 nlh->nlmsg_len = skb->tail - b;
2723 return skb->len;
2724
2725nlmsg_failure:
2726rtattr_failure:
2727 skb_trim(skb, b - skb->data);
2728 return -1;
2729}
2730
2731int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2732{
2733 struct rtattr **rta = arg;
2734 struct rtmsg *rtm = NLMSG_DATA(nlh);
2735 struct rtable *rt = NULL;
2736 u32 dst = 0;
2737 u32 src = 0;
2738 int iif = 0;
2739 int err = -ENOBUFS;
2740 struct sk_buff *skb;
2741
2742 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2743 if (!skb)
2744 goto out;
2745
2746 /* Reserve room for dummy headers, this skb can pass
2747 through good chunk of routing engine.
2748 */
Stephen Hemmingerd2c962b2006-04-17 17:27:11 -07002749 skb->mac.raw = skb->nh.raw = skb->data;
2750
2751 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2752 skb->nh.iph->protocol = IPPROTO_ICMP;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002753 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2754
2755 if (rta[RTA_SRC - 1])
2756 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2757 if (rta[RTA_DST - 1])
2758 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2759 if (rta[RTA_IIF - 1])
2760 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2761
2762 if (iif) {
2763 struct net_device *dev = __dev_get_by_index(iif);
2764 err = -ENODEV;
2765 if (!dev)
2766 goto out_free;
2767 skb->protocol = htons(ETH_P_IP);
2768 skb->dev = dev;
2769 local_bh_disable();
2770 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2771 local_bh_enable();
2772 rt = (struct rtable*)skb->dst;
2773 if (!err && rt->u.dst.error)
2774 err = -rt->u.dst.error;
2775 } else {
2776 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2777 .saddr = src,
2778 .tos = rtm->rtm_tos } } };
2779 int oif = 0;
2780 if (rta[RTA_OIF - 1])
2781 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2782 fl.oif = oif;
2783 err = ip_route_output_key(&rt, &fl);
2784 }
2785 if (err)
2786 goto out_free;
2787
2788 skb->dst = &rt->u.dst;
2789 if (rtm->rtm_flags & RTM_F_NOTIFY)
2790 rt->rt_flags |= RTCF_NOTIFY;
2791
2792 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2793
2794 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002795 RTM_NEWROUTE, 0, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002796 if (!err)
2797 goto out_free;
2798 if (err < 0) {
2799 err = -EMSGSIZE;
2800 goto out_free;
2801 }
2802
2803 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2804 if (err > 0)
2805 err = 0;
2806out: return err;
2807
2808out_free:
2809 kfree_skb(skb);
2810 goto out;
2811}
2812
2813int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2814{
2815 struct rtable *rt;
2816 int h, s_h;
2817 int idx, s_idx;
2818
2819 s_h = cb->args[0];
2820 s_idx = idx = cb->args[1];
2821 for (h = 0; h <= rt_hash_mask; h++) {
2822 if (h < s_h) continue;
2823 if (h > s_h)
2824 s_idx = 0;
2825 rcu_read_lock_bh();
2826 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2827 rt = rcu_dereference(rt->u.rt_next), idx++) {
2828 if (idx < s_idx)
2829 continue;
2830 skb->dst = dst_clone(&rt->u.dst);
2831 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002832 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2833 1, NLM_F_MULTI) <= 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002834 dst_release(xchg(&skb->dst, NULL));
2835 rcu_read_unlock_bh();
2836 goto done;
2837 }
2838 dst_release(xchg(&skb->dst, NULL));
2839 }
2840 rcu_read_unlock_bh();
2841 }
2842
2843done:
2844 cb->args[0] = h;
2845 cb->args[1] = idx;
2846 return skb->len;
2847}
2848
/*
 * Multicast event hook: flush the route cache with a delay argument of 0.
 * @in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	const int delay = 0;

	rt_cache_flush(delay);
}
2853
2854#ifdef CONFIG_SYSCTL
2855static int flush_delay;
2856
2857static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2858 struct file *filp, void __user *buffer,
2859 size_t *lenp, loff_t *ppos)
2860{
2861 if (write) {
2862 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2863 rt_cache_flush(flush_delay);
2864 return 0;
2865 }
2866
2867 return -EINVAL;
2868}
2869
2870static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2871 int __user *name,
2872 int nlen,
2873 void __user *oldval,
2874 size_t __user *oldlenp,
2875 void __user *newval,
2876 size_t newlen,
2877 void **context)
2878{
2879 int delay;
2880 if (newlen != sizeof(int))
2881 return -EINVAL;
2882 if (get_user(delay, (int __user *)newval))
2883 return -EFAULT;
2884 rt_cache_flush(delay);
2885 return 0;
2886}
2887
2888ctl_table ipv4_route_table[] = {
2889 {
2890 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2891 .procname = "flush",
2892 .data = &flush_delay,
2893 .maxlen = sizeof(int),
Dave Jones7e3e0362005-04-28 12:11:03 -07002894 .mode = 0200,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002895 .proc_handler = &ipv4_sysctl_rtcache_flush,
2896 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2897 },
2898 {
2899 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2900 .procname = "min_delay",
2901 .data = &ip_rt_min_delay,
2902 .maxlen = sizeof(int),
2903 .mode = 0644,
2904 .proc_handler = &proc_dointvec_jiffies,
2905 .strategy = &sysctl_jiffies,
2906 },
2907 {
2908 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2909 .procname = "max_delay",
2910 .data = &ip_rt_max_delay,
2911 .maxlen = sizeof(int),
2912 .mode = 0644,
2913 .proc_handler = &proc_dointvec_jiffies,
2914 .strategy = &sysctl_jiffies,
2915 },
2916 {
2917 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2918 .procname = "gc_thresh",
2919 .data = &ipv4_dst_ops.gc_thresh,
2920 .maxlen = sizeof(int),
2921 .mode = 0644,
2922 .proc_handler = &proc_dointvec,
2923 },
2924 {
2925 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2926 .procname = "max_size",
2927 .data = &ip_rt_max_size,
2928 .maxlen = sizeof(int),
2929 .mode = 0644,
2930 .proc_handler = &proc_dointvec,
2931 },
2932 {
2933 /* Deprecated. Use gc_min_interval_ms */
2934
2935 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2936 .procname = "gc_min_interval",
2937 .data = &ip_rt_gc_min_interval,
2938 .maxlen = sizeof(int),
2939 .mode = 0644,
2940 .proc_handler = &proc_dointvec_jiffies,
2941 .strategy = &sysctl_jiffies,
2942 },
2943 {
2944 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2945 .procname = "gc_min_interval_ms",
2946 .data = &ip_rt_gc_min_interval,
2947 .maxlen = sizeof(int),
2948 .mode = 0644,
2949 .proc_handler = &proc_dointvec_ms_jiffies,
2950 .strategy = &sysctl_ms_jiffies,
2951 },
2952 {
2953 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2954 .procname = "gc_timeout",
2955 .data = &ip_rt_gc_timeout,
2956 .maxlen = sizeof(int),
2957 .mode = 0644,
2958 .proc_handler = &proc_dointvec_jiffies,
2959 .strategy = &sysctl_jiffies,
2960 },
2961 {
2962 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2963 .procname = "gc_interval",
2964 .data = &ip_rt_gc_interval,
2965 .maxlen = sizeof(int),
2966 .mode = 0644,
2967 .proc_handler = &proc_dointvec_jiffies,
2968 .strategy = &sysctl_jiffies,
2969 },
2970 {
2971 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2972 .procname = "redirect_load",
2973 .data = &ip_rt_redirect_load,
2974 .maxlen = sizeof(int),
2975 .mode = 0644,
2976 .proc_handler = &proc_dointvec,
2977 },
2978 {
2979 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2980 .procname = "redirect_number",
2981 .data = &ip_rt_redirect_number,
2982 .maxlen = sizeof(int),
2983 .mode = 0644,
2984 .proc_handler = &proc_dointvec,
2985 },
2986 {
2987 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2988 .procname = "redirect_silence",
2989 .data = &ip_rt_redirect_silence,
2990 .maxlen = sizeof(int),
2991 .mode = 0644,
2992 .proc_handler = &proc_dointvec,
2993 },
2994 {
2995 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2996 .procname = "error_cost",
2997 .data = &ip_rt_error_cost,
2998 .maxlen = sizeof(int),
2999 .mode = 0644,
3000 .proc_handler = &proc_dointvec,
3001 },
3002 {
3003 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
3004 .procname = "error_burst",
3005 .data = &ip_rt_error_burst,
3006 .maxlen = sizeof(int),
3007 .mode = 0644,
3008 .proc_handler = &proc_dointvec,
3009 },
3010 {
3011 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3012 .procname = "gc_elasticity",
3013 .data = &ip_rt_gc_elasticity,
3014 .maxlen = sizeof(int),
3015 .mode = 0644,
3016 .proc_handler = &proc_dointvec,
3017 },
3018 {
3019 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3020 .procname = "mtu_expires",
3021 .data = &ip_rt_mtu_expires,
3022 .maxlen = sizeof(int),
3023 .mode = 0644,
3024 .proc_handler = &proc_dointvec_jiffies,
3025 .strategy = &sysctl_jiffies,
3026 },
3027 {
3028 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3029 .procname = "min_pmtu",
3030 .data = &ip_rt_min_pmtu,
3031 .maxlen = sizeof(int),
3032 .mode = 0644,
3033 .proc_handler = &proc_dointvec,
3034 },
3035 {
3036 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3037 .procname = "min_adv_mss",
3038 .data = &ip_rt_min_advmss,
3039 .maxlen = sizeof(int),
3040 .mode = 0644,
3041 .proc_handler = &proc_dointvec,
3042 },
3043 {
3044 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3045 .procname = "secret_interval",
3046 .data = &ip_rt_secret_interval,
3047 .maxlen = sizeof(int),
3048 .mode = 0644,
3049 .proc_handler = &proc_dointvec_jiffies,
3050 .strategy = &sysctl_jiffies,
3051 },
3052 { .ctl_name = 0 }
3053};
3054#endif
3055
3056#ifdef CONFIG_NET_CLS_ROUTE
/* Route accounting counters: one run of 256 entries per logical cpu. */
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/*
 * IP route accounting ptr for this logical cpu number.  The argument is
 * parenthesized so that an expression such as "cpu + 1" indexes correctly.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3063
3064#ifdef CONFIG_PROC_FS
3065static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3066 int length, int *eof, void *data)
3067{
3068 unsigned int i;
3069
3070 if ((offset & 3) || (length & 3))
3071 return -EIO;
3072
3073 if (offset >= sizeof(struct ip_rt_acct) * 256) {
3074 *eof = 1;
3075 return 0;
3076 }
3077
3078 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3079 length = sizeof(struct ip_rt_acct) * 256 - offset;
3080 *eof = 1;
3081 }
3082
3083 offset /= sizeof(u32);
3084
3085 if (length > 0) {
3086 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3087 u32 *dst = (u32 *) buffer;
3088
3089 /* Copy first cpu. */
3090 *start = buffer;
3091 memcpy(dst, src, length);
3092
3093 /* Add the other cpus in, one int at a time */
KAMEZAWA Hiroyuki6f912042006-04-10 22:52:50 -07003094 for_each_possible_cpu(i) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003095 unsigned int j;
3096
3097 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3098
3099 for (j = 0; j < length/4; j++)
3100 dst[j] += src[j];
3101 }
3102 }
3103 return length;
3104}
3105#endif /* CONFIG_PROC_FS */
3106#endif /* CONFIG_NET_CLS_ROUTE */
3107
3108static __initdata unsigned long rhash_entries;
3109static int __init set_rhash_entries(char *str)
3110{
3111 if (!str)
3112 return 0;
3113 rhash_entries = simple_strtoul(str, &str, 0);
3114 return 1;
3115}
3116__setup("rhash_entries=", set_rhash_entries);
3117
3118int __init ip_rt_init(void)
3119{
Eric Dumazet424c4b72005-07-05 14:58:19 -07003120 int rc = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003121
3122 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3123 (jiffies ^ (jiffies >> 7)));
3124
3125#ifdef CONFIG_NET_CLS_ROUTE
Eric Dumazet424c4b72005-07-05 14:58:19 -07003126 {
3127 int order;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003128 for (order = 0;
3129 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3130 /* NOTHING */;
3131 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3132 if (!ip_rt_acct)
3133 panic("IP: failed to allocate ip_rt_acct\n");
3134 memset(ip_rt_acct, 0, PAGE_SIZE << order);
Eric Dumazet424c4b72005-07-05 14:58:19 -07003135 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003136#endif
3137
3138 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3139 sizeof(struct rtable),
3140 0, SLAB_HWCACHE_ALIGN,
3141 NULL, NULL);
3142
3143 if (!ipv4_dst_ops.kmem_cachep)
3144 panic("IP: failed to allocate ip_dst_cache\n");
3145
Eric Dumazet424c4b72005-07-05 14:58:19 -07003146 rt_hash_table = (struct rt_hash_bucket *)
3147 alloc_large_system_hash("IP route cache",
3148 sizeof(struct rt_hash_bucket),
3149 rhash_entries,
3150 (num_physpages >= 128 * 1024) ?
Mike Stroyan18955cf2005-11-29 16:12:55 -08003151 15 : 17,
Eric Dumazet424c4b72005-07-05 14:58:19 -07003152 HASH_HIGHMEM,
3153 &rt_hash_log,
3154 &rt_hash_mask,
3155 0);
Eric Dumazet22c047c2005-07-05 14:55:24 -07003156 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3157 rt_hash_lock_init();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003158
3159 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3160 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3161
Linus Torvalds1da177e2005-04-16 15:20:36 -07003162 devinet_init();
3163 ip_fib_init();
3164
3165 init_timer(&rt_flush_timer);
3166 rt_flush_timer.function = rt_run_flush;
3167 init_timer(&rt_periodic_timer);
3168 rt_periodic_timer.function = rt_check_expire;
3169 init_timer(&rt_secret_timer);
3170 rt_secret_timer.function = rt_secret_rebuild;
3171
3172 /* All the timers, started at system startup tend
3173 to synchronize. Perturb it a bit.
3174 */
3175 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3176 ip_rt_gc_interval;
3177 add_timer(&rt_periodic_timer);
3178
3179 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3180 ip_rt_secret_interval;
3181 add_timer(&rt_secret_timer);
3182
3183#ifdef CONFIG_PROC_FS
3184 {
3185 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3186 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3187 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3188 proc_net_stat))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003189 return -ENOMEM;
3190 }
3191 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3192 }
3193#ifdef CONFIG_NET_CLS_ROUTE
3194 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3195#endif
3196#endif
3197#ifdef CONFIG_XFRM
3198 xfrm_init();
3199 xfrm4_init();
3200#endif
3201 return rc;
3202}
3203
3204EXPORT_SYMBOL(__ip_select_ident);
3205EXPORT_SYMBOL(ip_route_input);
3206EXPORT_SYMBOL(ip_route_output_key);