blob: 8549f26e2495089c8987fd4c241b9ae002a74393 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9 *
Jesper Juhl02c30a82005-05-05 16:16:16 -070010 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -070011 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 *
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
41 *
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070057 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Linus Torvalds1da177e2005-04-16 15:20:36 -070058 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
65#include <linux/config.h>
66#include <linux/module.h>
67#include <asm/uaccess.h>
68#include <asm/system.h>
69#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
72#include <linux/sched.h>
73#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070074#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070075#include <linux/string.h>
76#include <linux/socket.h>
77#include <linux/sockios.h>
78#include <linux/errno.h>
79#include <linux/in.h>
80#include <linux/inet.h>
81#include <linux/netdevice.h>
82#include <linux/proc_fs.h>
83#include <linux/init.h>
84#include <linux/skbuff.h>
85#include <linux/rtnetlink.h>
86#include <linux/inetdevice.h>
87#include <linux/igmp.h>
88#include <linux/pkt_sched.h>
89#include <linux/mroute.h>
90#include <linux/netfilter_ipv4.h>
91#include <linux/random.h>
92#include <linux/jhash.h>
93#include <linux/rcupdate.h>
94#include <linux/times.h>
95#include <net/protocol.h>
96#include <net/ip.h>
97#include <net/route.h>
98#include <net/inetpeer.h>
99#include <net/sock.h>
100#include <net/ip_fib.h>
101#include <net/arp.h>
102#include <net/tcp.h>
103#include <net/icmp.h>
104#include <net/xfrm.h>
105#include <net/ip_mp_alg.h>
106#ifdef CONFIG_SYSCTL
107#include <linux/sysctl.h>
108#endif
109
/*
 * Extract the TOS bits of a flow key that are kept for routing
 * (IPTOS_RT_MASK plus the RTO_ONLINK flag), widened to u32.
 * The argument is parenthesized so that any pointer expression
 * may be passed safely.
 */
#define RT_FL_TOS(oldflp) \
	((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

/* Initial value of ip_rt_gc_timeout and of the GC "expire" strength. */
#define RT_GC_TIMEOUT (300*HZ)
116
117static int ip_rt_min_delay = 2 * HZ;
118static int ip_rt_max_delay = 10 * HZ;
119static int ip_rt_max_size;
120static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
121static int ip_rt_gc_interval = 60 * HZ;
122static int ip_rt_gc_min_interval = HZ / 2;
123static int ip_rt_redirect_number = 9;
124static int ip_rt_redirect_load = HZ / 50;
125static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
126static int ip_rt_error_cost = HZ;
127static int ip_rt_error_burst = 5 * HZ;
128static int ip_rt_gc_elasticity = 8;
129static int ip_rt_mtu_expires = 10 * 60 * HZ;
130static int ip_rt_min_pmtu = 512 + 20 + 20;
131static int ip_rt_min_advmss = 256;
132static int ip_rt_secret_interval = 10 * 60 * HZ;
133static unsigned long rt_deadline;
134
135#define RTprint(a...) printk(KERN_DEBUG a)
136
137static struct timer_list rt_flush_timer;
138static struct timer_list rt_periodic_timer;
139static struct timer_list rt_secret_timer;
140
141/*
142 * Interface to generic destination cache.
143 */
144
145static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146static void ipv4_dst_destroy(struct dst_entry *dst);
147static void ipv4_dst_ifdown(struct dst_entry *dst,
148 struct net_device *dev, int how);
149static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150static void ipv4_link_failure(struct sk_buff *skb);
151static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152static int rt_garbage_collect(void);
153
154
155static struct dst_ops ipv4_dst_ops = {
156 .family = AF_INET,
157 .protocol = __constant_htons(ETH_P_IP),
158 .gc = rt_garbage_collect,
159 .check = ipv4_dst_check,
160 .destroy = ipv4_dst_destroy,
161 .ifdown = ipv4_dst_ifdown,
162 .negative_advice = ipv4_negative_advice,
163 .link_failure = ipv4_link_failure,
164 .update_pmtu = ip_rt_update_pmtu,
165 .entry_size = sizeof(struct rtable),
166};
167
168#define ECN_OR_COST(class) TC_PRIO_##class
169
170__u8 ip_tos2prio[16] = {
171 TC_PRIO_BESTEFFORT,
172 ECN_OR_COST(FILLER),
173 TC_PRIO_BESTEFFORT,
174 ECN_OR_COST(BESTEFFORT),
175 TC_PRIO_BULK,
176 ECN_OR_COST(BULK),
177 TC_PRIO_BULK,
178 ECN_OR_COST(BULK),
179 TC_PRIO_INTERACTIVE,
180 ECN_OR_COST(INTERACTIVE),
181 TC_PRIO_INTERACTIVE,
182 ECN_OR_COST(INTERACTIVE),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK),
185 TC_PRIO_INTERACTIVE_BULK,
186 ECN_OR_COST(INTERACTIVE_BULK)
187};
188
189
190/*
191 * Route cache.
192 */
193
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
203
/* One slot of the route cache hash table: head of a singly linked
 * chain of rtable entries (linked through u.rt_next).
 */
struct rt_hash_bucket {
	struct rtable	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
 */
#if NR_CPUS >= 32
#define RT_HASH_LOCK_SZ	4096
#elif NR_CPUS >= 16
#define RT_HASH_LOCK_SZ	2048
#elif NR_CPUS >= 8
#define RT_HASH_LOCK_SZ	1024
#elif NR_CPUS >= 4
#define RT_HASH_LOCK_SZ	512
#else
#define RT_HASH_LOCK_SZ	256
#endif

static spinlock_t	*rt_hash_locks;

/* Address of the spinlock covering hash slot @slot.  Several slots share
 * one lock; the whole expansion is parenthesized so it can be used inside
 * any larger expression.
 */
# define rt_hash_lock_addr(slot) \
	(&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/* Allocate and initialize the lock table; panics if allocation fails.
 * Wrapped in do { } while (0) so that "rt_hash_lock_init();" is a single
 * statement even in an unbraced if/else.
 */
# define rt_hash_lock_init() \
	do { \
		int i; \
		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, \
					GFP_KERNEL); \
		if (!rt_hash_locks) \
			panic("IP: failed to allocate rt_hash_locks\n"); \
		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
			spin_lock_init(&rt_hash_locks[i]); \
	} while (0)
#else
/* No lock table is allocated in this configuration. */
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init() do { } while (0)
#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700237
/* The route cache hash table itself. */
static struct rt_hash_bucket	*rt_hash_table;
/* ANDed with a hash value to obtain a bucket index. */
static unsigned			 rt_hash_mask;
/* Used as a shift count when scaling by the table size.
 * NOTE(review): presumably log2 of the bucket count -- confirm at init.
 */
static int			 rt_hash_log;
/* Random seed mixed into rt_hash_code(); regenerated by rt_run_flush(). */
static unsigned int		 rt_hash_rnd;

/* Per-CPU statistics; RT_CACHE_STAT_INC bumps a field on the current CPU. */
static struct rt_cache_stat *rt_cache_stat;
#define RT_CACHE_STAT_INC(field) \
	(per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++)

static int rt_intern_hash(unsigned hash, struct rtable *rth,
			  struct rtable **res);
249
250static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
251{
252 return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
253 & rt_hash_mask);
254}
255
256#ifdef CONFIG_PROC_FS
/* Per-open iterator state for the route cache seq_file: the hash bucket
 * currently being walked (counts down from rt_hash_mask to 0).
 */
struct rt_cache_iter_state {
	int	bucket;
};
260
261static struct rtable *rt_cache_get_first(struct seq_file *seq)
262{
263 struct rtable *r = NULL;
264 struct rt_cache_iter_state *st = seq->private;
265
266 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
267 rcu_read_lock_bh();
268 r = rt_hash_table[st->bucket].chain;
269 if (r)
270 break;
271 rcu_read_unlock_bh();
272 }
273 return r;
274}
275
276static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
277{
278 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
279
280 r = r->u.rt_next;
281 while (!r) {
282 rcu_read_unlock_bh();
283 if (--st->bucket < 0)
284 break;
285 rcu_read_lock_bh();
286 r = rt_hash_table[st->bucket].chain;
287 }
288 return r;
289}
290
291static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
292{
293 struct rtable *r = rt_cache_get_first(seq);
294
295 if (r)
296 while (pos && (r = rt_cache_get_next(seq, r)))
297 --pos;
298 return pos ? NULL : r;
299}
300
301static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
302{
303 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
304}
305
306static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
307{
308 struct rtable *r = NULL;
309
310 if (v == SEQ_START_TOKEN)
311 r = rt_cache_get_first(seq);
312 else
313 r = rt_cache_get_next(seq, v);
314 ++*pos;
315 return r;
316}
317
318static void rt_cache_seq_stop(struct seq_file *seq, void *v)
319{
320 if (v && v != SEQ_START_TOKEN)
321 rcu_read_unlock_bh();
322}
323
324static int rt_cache_seq_show(struct seq_file *seq, void *v)
325{
326 if (v == SEQ_START_TOKEN)
327 seq_printf(seq, "%-127s\n",
328 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
329 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
330 "HHUptod\tSpecDst");
331 else {
332 struct rtable *r = v;
333 char temp[256];
334
335 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
336 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
337 r->u.dst.dev ? r->u.dst.dev->name : "*",
338 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
339 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
340 r->u.dst.__use, 0, (unsigned long)r->rt_src,
341 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
342 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
343 dst_metric(&r->u.dst, RTAX_WINDOW),
344 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
345 dst_metric(&r->u.dst, RTAX_RTTVAR)),
346 r->fl.fl4_tos,
347 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
348 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
349 dev_queue_xmit) : 0,
350 r->rt_spec_dst);
351 seq_printf(seq, "%-127s\n", temp);
352 }
353 return 0;
354}
355
356static struct seq_operations rt_cache_seq_ops = {
357 .start = rt_cache_seq_start,
358 .next = rt_cache_seq_next,
359 .stop = rt_cache_seq_stop,
360 .show = rt_cache_seq_show,
361};
362
363static int rt_cache_seq_open(struct inode *inode, struct file *file)
364{
365 struct seq_file *seq;
366 int rc = -ENOMEM;
367 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
368
369 if (!s)
370 goto out;
371 rc = seq_open(file, &rt_cache_seq_ops);
372 if (rc)
373 goto out_kfree;
374 seq = file->private_data;
375 seq->private = s;
376 memset(s, 0, sizeof(*s));
377out:
378 return rc;
379out_kfree:
380 kfree(s);
381 goto out;
382}
383
384static struct file_operations rt_cache_seq_fops = {
385 .owner = THIS_MODULE,
386 .open = rt_cache_seq_open,
387 .read = seq_read,
388 .llseek = seq_lseek,
389 .release = seq_release_private,
390};
391
392
393static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
394{
395 int cpu;
396
397 if (*pos == 0)
398 return SEQ_START_TOKEN;
399
400 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
401 if (!cpu_possible(cpu))
402 continue;
403 *pos = cpu+1;
404 return per_cpu_ptr(rt_cache_stat, cpu);
405 }
406 return NULL;
407}
408
409static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
410{
411 int cpu;
412
413 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
414 if (!cpu_possible(cpu))
415 continue;
416 *pos = cpu+1;
417 return per_cpu_ptr(rt_cache_stat, cpu);
418 }
419 return NULL;
420
421}
422
/* seq_file ->stop for the per-CPU statistics: start/next take no locks
 * or references, so there is nothing to undo.
 */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
427
428static int rt_cpu_seq_show(struct seq_file *seq, void *v)
429{
430 struct rt_cache_stat *st = v;
431
432 if (v == SEQ_START_TOKEN) {
Olaf Rempel5bec0032005-04-28 12:16:08 -0700433 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700434 return 0;
435 }
436
437 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
438 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
439 atomic_read(&ipv4_dst_ops.entries),
440 st->in_hit,
441 st->in_slow_tot,
442 st->in_slow_mc,
443 st->in_no_route,
444 st->in_brd,
445 st->in_martian_dst,
446 st->in_martian_src,
447
448 st->out_hit,
449 st->out_slow_tot,
450 st->out_slow_mc,
451
452 st->gc_total,
453 st->gc_ignored,
454 st->gc_goal_miss,
455 st->gc_dst_overflow,
456 st->in_hlist_search,
457 st->out_hlist_search
458 );
459 return 0;
460}
461
462static struct seq_operations rt_cpu_seq_ops = {
463 .start = rt_cpu_seq_start,
464 .next = rt_cpu_seq_next,
465 .stop = rt_cpu_seq_stop,
466 .show = rt_cpu_seq_show,
467};
468
469
470static int rt_cpu_seq_open(struct inode *inode, struct file *file)
471{
472 return seq_open(file, &rt_cpu_seq_ops);
473}
474
475static struct file_operations rt_cpu_seq_fops = {
476 .owner = THIS_MODULE,
477 .open = rt_cpu_seq_open,
478 .read = seq_read,
479 .llseek = seq_lseek,
480 .release = seq_release,
481};
482
483#endif /* CONFIG_PROC_FS */
484
485static __inline__ void rt_free(struct rtable *rt)
486{
487 multipath_remove(rt);
488 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
489}
490
491static __inline__ void rt_drop(struct rtable *rt)
492{
493 multipath_remove(rt);
494 ip_rt_put(rt);
495 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
496}
497
498static __inline__ int rt_fast_clean(struct rtable *rth)
499{
500 /* Kill broadcast/multicast entries very aggresively, if they
501 collide in hash table with more useful entries */
502 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
503 rth->fl.iif && rth->u.rt_next;
504}
505
506static __inline__ int rt_valuable(struct rtable *rth)
507{
508 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
509 rth->u.dst.expires;
510}
511
512static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
513{
514 unsigned long age;
515 int ret = 0;
516
517 if (atomic_read(&rth->u.dst.__refcnt))
518 goto out;
519
520 ret = 1;
521 if (rth->u.dst.expires &&
522 time_after_eq(jiffies, rth->u.dst.expires))
523 goto out;
524
525 age = jiffies - rth->u.dst.lastuse;
526 ret = 0;
527 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
528 (age <= tmo2 && rt_valuable(rth)))
529 goto out;
530 ret = 1;
531out: return ret;
532}
533
534/* Bits of score are:
535 * 31: very valuable
536 * 30: not quite useless
537 * 29..0: usage counter
538 */
539static inline u32 rt_score(struct rtable *rt)
540{
541 u32 score = jiffies - rt->u.dst.lastuse;
542
543 score = ~score & ~(3<<30);
544
545 if (rt_valuable(rt))
546 score |= (1<<31);
547
548 if (!rt->fl.iif ||
549 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
550 score |= (1<<30);
551
552 return score;
553}
554
555static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
556{
557 return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
558 fl1->oif == fl2->oif &&
559 fl1->iif == fl2->iif;
560}
561
562#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
563static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
564 struct rtable *expentry,
565 int *removed_count)
566{
567 int passedexpired = 0;
568 struct rtable **nextstep = NULL;
569 struct rtable **rthp = chain_head;
570 struct rtable *rth;
571
572 if (removed_count)
573 *removed_count = 0;
574
575 while ((rth = *rthp) != NULL) {
576 if (rth == expentry)
577 passedexpired = 1;
578
579 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
580 compare_keys(&(*rthp)->fl, &expentry->fl)) {
581 if (*rthp == expentry) {
582 *rthp = rth->u.rt_next;
583 continue;
584 } else {
585 *rthp = rth->u.rt_next;
586 rt_free(rth);
587 if (removed_count)
588 ++(*removed_count);
589 }
590 } else {
591 if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
592 passedexpired && !nextstep)
593 nextstep = &rth->u.rt_next;
594
595 rthp = &rth->u.rt_next;
596 }
597 }
598
599 rt_free(expentry);
600 if (removed_count)
601 ++(*removed_count);
602
603 return nextstep;
604}
605#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
606
607
608/* This runs via a timer and thus is always in BH context. */
609static void rt_check_expire(unsigned long dummy)
610{
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700611 static unsigned int rover;
612 unsigned int i = rover, goal;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700613 struct rtable *rth, **rthp;
614 unsigned long now = jiffies;
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700615 u64 mult;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700616
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700617 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
618 if (ip_rt_gc_timeout > 1)
619 do_div(mult, ip_rt_gc_timeout);
620 goal = (unsigned int)mult;
621 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
622 for (; goal > 0; goal--) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700623 unsigned long tmo = ip_rt_gc_timeout;
624
625 i = (i + 1) & rt_hash_mask;
626 rthp = &rt_hash_table[i].chain;
627
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700628 if (*rthp == 0)
629 continue;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700630 spin_lock(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700631 while ((rth = *rthp) != NULL) {
632 if (rth->u.dst.expires) {
633 /* Entry is expired even if it is in use */
634 if (time_before_eq(now, rth->u.dst.expires)) {
635 tmo >>= 1;
636 rthp = &rth->u.rt_next;
637 continue;
638 }
639 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
640 tmo >>= 1;
641 rthp = &rth->u.rt_next;
642 continue;
643 }
644
645 /* Cleanup aged off entries. */
646#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
647 /* remove all related balanced entries if necessary */
648 if (rth->u.dst.flags & DST_BALANCED) {
649 rthp = rt_remove_balanced_route(
650 &rt_hash_table[i].chain,
651 rth, NULL);
652 if (!rthp)
653 break;
654 } else {
655 *rthp = rth->u.rt_next;
656 rt_free(rth);
657 }
658#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
659 *rthp = rth->u.rt_next;
660 rt_free(rth);
661#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
662 }
Eric Dumazet22c047c2005-07-05 14:55:24 -0700663 spin_unlock(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700664
665 /* Fallback loop breaker. */
666 if (time_after(jiffies, now))
667 break;
668 }
669 rover = i;
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700670 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700671}
672
673/* This can run from both BH and non-BH contexts, the latter
674 * in the case of a forced flush event.
675 */
676static void rt_run_flush(unsigned long dummy)
677{
678 int i;
679 struct rtable *rth, *next;
680
681 rt_deadline = 0;
682
683 get_random_bytes(&rt_hash_rnd, 4);
684
685 for (i = rt_hash_mask; i >= 0; i--) {
Eric Dumazet22c047c2005-07-05 14:55:24 -0700686 spin_lock_bh(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700687 rth = rt_hash_table[i].chain;
688 if (rth)
689 rt_hash_table[i].chain = NULL;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700690 spin_unlock_bh(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700691
692 for (; rth; rth = next) {
693 next = rth->u.rt_next;
694 rt_free(rth);
695 }
696 }
697}
698
699static DEFINE_SPINLOCK(rt_flush_lock);
700
701void rt_cache_flush(int delay)
702{
703 unsigned long now = jiffies;
704 int user_mode = !in_softirq();
705
706 if (delay < 0)
707 delay = ip_rt_min_delay;
708
709 /* flush existing multipath state*/
710 multipath_flush();
711
712 spin_lock_bh(&rt_flush_lock);
713
714 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
715 long tmo = (long)(rt_deadline - now);
716
717 /* If flush timer is already running
718 and flush request is not immediate (delay > 0):
719
720 if deadline is not achieved, prolongate timer to "delay",
721 otherwise fire it at deadline time.
722 */
723
724 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
725 tmo = 0;
726
727 if (delay > tmo)
728 delay = tmo;
729 }
730
731 if (delay <= 0) {
732 spin_unlock_bh(&rt_flush_lock);
733 rt_run_flush(0);
734 return;
735 }
736
737 if (rt_deadline == 0)
738 rt_deadline = now + ip_rt_max_delay;
739
740 mod_timer(&rt_flush_timer, now+delay);
741 spin_unlock_bh(&rt_flush_lock);
742}
743
744static void rt_secret_rebuild(unsigned long dummy)
745{
746 unsigned long now = jiffies;
747
748 rt_cache_flush(0);
749 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
750}
751
752/*
753 Short description of GC goals.
754
755 We want to build algorithm, which will keep routing cache
756 at some equilibrium point, when number of aged off entries
757 is kept approximately equal to newly generated ones.
758
759 Current expiration strength is variable "expire".
760 We try to adjust it dynamically, so that if networking
761 is idle expires is large enough to keep enough of warm entries,
762 and when load increases it reduces to limit cache size.
763 */
764
765static int rt_garbage_collect(void)
766{
767 static unsigned long expire = RT_GC_TIMEOUT;
768 static unsigned long last_gc;
769 static int rover;
770 static int equilibrium;
771 struct rtable *rth, **rthp;
772 unsigned long now = jiffies;
773 int goal;
774
775 /*
776 * Garbage collection is pretty expensive,
777 * do not make it too frequently.
778 */
779
780 RT_CACHE_STAT_INC(gc_total);
781
782 if (now - last_gc < ip_rt_gc_min_interval &&
783 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
784 RT_CACHE_STAT_INC(gc_ignored);
785 goto out;
786 }
787
788 /* Calculate number of entries, which we want to expire now. */
789 goal = atomic_read(&ipv4_dst_ops.entries) -
790 (ip_rt_gc_elasticity << rt_hash_log);
791 if (goal <= 0) {
792 if (equilibrium < ipv4_dst_ops.gc_thresh)
793 equilibrium = ipv4_dst_ops.gc_thresh;
794 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
795 if (goal > 0) {
796 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
797 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
798 }
799 } else {
800 /* We are in dangerous area. Try to reduce cache really
801 * aggressively.
802 */
803 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
804 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
805 }
806
807 if (now - last_gc >= ip_rt_gc_min_interval)
808 last_gc = now;
809
810 if (goal <= 0) {
811 equilibrium += goal;
812 goto work_done;
813 }
814
815 do {
816 int i, k;
817
818 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
819 unsigned long tmo = expire;
820
821 k = (k + 1) & rt_hash_mask;
822 rthp = &rt_hash_table[k].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700823 spin_lock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700824 while ((rth = *rthp) != NULL) {
825 if (!rt_may_expire(rth, tmo, expire)) {
826 tmo >>= 1;
827 rthp = &rth->u.rt_next;
828 continue;
829 }
830#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
831 /* remove all related balanced entries
832 * if necessary
833 */
834 if (rth->u.dst.flags & DST_BALANCED) {
835 int r;
836
837 rthp = rt_remove_balanced_route(
838 &rt_hash_table[i].chain,
839 rth,
840 &r);
841 goal -= r;
842 if (!rthp)
843 break;
844 } else {
845 *rthp = rth->u.rt_next;
846 rt_free(rth);
847 goal--;
848 }
849#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
850 *rthp = rth->u.rt_next;
851 rt_free(rth);
852 goal--;
853#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
854 }
Eric Dumazet22c047c2005-07-05 14:55:24 -0700855 spin_unlock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700856 if (goal <= 0)
857 break;
858 }
859 rover = k;
860
861 if (goal <= 0)
862 goto work_done;
863
864 /* Goal is not achieved. We stop process if:
865
866 - if expire reduced to zero. Otherwise, expire is halfed.
867 - if table is not full.
868 - if we are called from interrupt.
869 - jiffies check is just fallback/debug loop breaker.
870 We will not spin here for long time in any case.
871 */
872
873 RT_CACHE_STAT_INC(gc_goal_miss);
874
875 if (expire == 0)
876 break;
877
878 expire >>= 1;
879#if RT_CACHE_DEBUG >= 2
880 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
881 atomic_read(&ipv4_dst_ops.entries), goal, i);
882#endif
883
884 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
885 goto out;
886 } while (!in_softirq() && time_before_eq(jiffies, now));
887
888 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
889 goto out;
890 if (net_ratelimit())
891 printk(KERN_WARNING "dst cache overflow\n");
892 RT_CACHE_STAT_INC(gc_dst_overflow);
893 return 1;
894
895work_done:
896 expire += ip_rt_gc_min_interval;
897 if (expire > ip_rt_gc_timeout ||
898 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
899 expire = ip_rt_gc_timeout;
900#if RT_CACHE_DEBUG >= 2
901 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
902 atomic_read(&ipv4_dst_ops.entries), goal, rover);
903#endif
904out: return 0;
905}
906
907static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
908{
909 struct rtable *rth, **rthp;
910 unsigned long now;
911 struct rtable *cand, **candp;
912 u32 min_score;
913 int chain_length;
914 int attempts = !in_softirq();
915
916restart:
917 chain_length = 0;
918 min_score = ~(u32)0;
919 cand = NULL;
920 candp = NULL;
921 now = jiffies;
922
923 rthp = &rt_hash_table[hash].chain;
924
Eric Dumazet22c047c2005-07-05 14:55:24 -0700925 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700926 while ((rth = *rthp) != NULL) {
927#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
928 if (!(rth->u.dst.flags & DST_BALANCED) &&
929 compare_keys(&rth->fl, &rt->fl)) {
930#else
931 if (compare_keys(&rth->fl, &rt->fl)) {
932#endif
933 /* Put it first */
934 *rthp = rth->u.rt_next;
935 /*
936 * Since lookup is lockfree, the deletion
937 * must be visible to another weakly ordered CPU before
938 * the insertion at the start of the hash chain.
939 */
940 rcu_assign_pointer(rth->u.rt_next,
941 rt_hash_table[hash].chain);
942 /*
943 * Since lookup is lockfree, the update writes
944 * must be ordered for consistency on SMP.
945 */
946 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
947
948 rth->u.dst.__use++;
949 dst_hold(&rth->u.dst);
950 rth->u.dst.lastuse = now;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700951 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700952
953 rt_drop(rt);
954 *rp = rth;
955 return 0;
956 }
957
958 if (!atomic_read(&rth->u.dst.__refcnt)) {
959 u32 score = rt_score(rth);
960
961 if (score <= min_score) {
962 cand = rth;
963 candp = rthp;
964 min_score = score;
965 }
966 }
967
968 chain_length++;
969
970 rthp = &rth->u.rt_next;
971 }
972
973 if (cand) {
974 /* ip_rt_gc_elasticity used to be average length of chain
975 * length, when exceeded gc becomes really aggressive.
976 *
977 * The second limit is less certain. At the moment it allows
978 * only 2 entries per bucket. We will see.
979 */
980 if (chain_length > ip_rt_gc_elasticity) {
981 *candp = cand->u.rt_next;
982 rt_free(cand);
983 }
984 }
985
986 /* Try to bind route to arp only if it is output
987 route or unicast forwarding path.
988 */
989 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
990 int err = arp_bind_neighbour(&rt->u.dst);
991 if (err) {
Eric Dumazet22c047c2005-07-05 14:55:24 -0700992 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700993
994 if (err != -ENOBUFS) {
995 rt_drop(rt);
996 return err;
997 }
998
999 /* Neighbour tables are full and nothing
1000 can be released. Try to shrink route cache,
1001 it is most likely it holds some neighbour records.
1002 */
1003 if (attempts-- > 0) {
1004 int saved_elasticity = ip_rt_gc_elasticity;
1005 int saved_int = ip_rt_gc_min_interval;
1006 ip_rt_gc_elasticity = 1;
1007 ip_rt_gc_min_interval = 0;
1008 rt_garbage_collect();
1009 ip_rt_gc_min_interval = saved_int;
1010 ip_rt_gc_elasticity = saved_elasticity;
1011 goto restart;
1012 }
1013
1014 if (net_ratelimit())
1015 printk(KERN_WARNING "Neighbour table overflow.\n");
1016 rt_drop(rt);
1017 return -ENOBUFS;
1018 }
1019 }
1020
1021 rt->u.rt_next = rt_hash_table[hash].chain;
1022#if RT_CACHE_DEBUG >= 2
1023 if (rt->u.rt_next) {
1024 struct rtable *trt;
1025 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1026 NIPQUAD(rt->rt_dst));
1027 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1028 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1029 printk("\n");
1030 }
1031#endif
1032 rt_hash_table[hash].chain = rt;
Eric Dumazet22c047c2005-07-05 14:55:24 -07001033 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001034 *rp = rt;
1035 return 0;
1036}
1037
1038void rt_bind_peer(struct rtable *rt, int create)
1039{
1040 static DEFINE_SPINLOCK(rt_peer_lock);
1041 struct inet_peer *peer;
1042
1043 peer = inet_getpeer(rt->rt_dst, create);
1044
1045 spin_lock_bh(&rt_peer_lock);
1046 if (rt->peer == NULL) {
1047 rt->peer = peer;
1048 peer = NULL;
1049 }
1050 spin_unlock_bh(&rt_peer_lock);
1051 if (peer)
1052 inet_putpeer(peer);
1053}
1054
1055/*
1056 * Peer allocation may fail only in serious out-of-memory conditions. However
1057 * we still can generate some output.
1058 * Random ID selection looks a bit dangerous because we have no chances to
1059 * select ID being unique in a reasonable period of time.
1060 * But broken packet identifier may be better than no packet at all.
1061 */
1062static void ip_select_fb_ident(struct iphdr *iph)
1063{
1064 static DEFINE_SPINLOCK(ip_fb_id_lock);
1065 static u32 ip_fallback_id;
1066 u32 salt;
1067
1068 spin_lock_bh(&ip_fb_id_lock);
1069 salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1070 iph->id = htons(salt & 0xFFFF);
1071 ip_fallback_id = salt;
1072 spin_unlock_bh(&ip_fb_id_lock);
1073}
1074
1075void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1076{
1077 struct rtable *rt = (struct rtable *) dst;
1078
1079 if (rt) {
1080 if (rt->peer == NULL)
1081 rt_bind_peer(rt, 1);
1082
1083 /* If peer is attached to destination, it is never detached,
1084 so that we need not to grab a lock to dereference it.
1085 */
1086 if (rt->peer) {
1087 iph->id = htons(inet_getid(rt->peer, more));
1088 return;
1089 }
1090 } else
Stephen Hemminger9c2b3322005-04-19 22:39:42 -07001091 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1092 __builtin_return_address(0));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001093
1094 ip_select_fb_ident(iph);
1095}
1096
1097static void rt_del(unsigned hash, struct rtable *rt)
1098{
1099 struct rtable **rthp;
1100
Eric Dumazet22c047c2005-07-05 14:55:24 -07001101 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001102 ip_rt_put(rt);
1103 for (rthp = &rt_hash_table[hash].chain; *rthp;
1104 rthp = &(*rthp)->u.rt_next)
1105 if (*rthp == rt) {
1106 *rthp = rt->u.rt_next;
1107 rt_free(rt);
1108 break;
1109 }
Eric Dumazet22c047c2005-07-05 14:55:24 -07001110 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001111}
1112
/*
 * Process a redirect from @old_gw seen on @dev, advising that traffic
 * from @saddr to @daddr (with @tos) should use @new_gw instead.
 *
 * If the advice passes the sanity checks below, every cache entry with
 * fl.iif == 0 that matches one of the probed keys and currently points
 * at @old_gw through @dev is replaced by a copy whose gateway is @new_gw
 * and which is flagged RTCF_REDIRECTED.  Otherwise the redirect is
 * ignored and, with CONFIG_IP_ROUTE_VERBOSE, may be logged.
 *
 * The in_dev reference taken at entry is released on every exit path
 * reached after it was obtained.
 */
void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
		    u32 saddr, u8 tos, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	/* Keys to probe: the concrete value and the wildcard 0, for both
	 * the source address and the output interface.
	 */
	u32 skeys[2] = { saddr, 0 };
	int ikeys[2] = { dev->ifindex, 0 };

	tos &= IPTOS_RT_MASK;

	if (!in_dev)
		return;

	/* A redirect to the same gateway, redirects disabled on this
	 * device, or a new gateway that is multicast/bad-class/zeronet
	 * is never accepted.
	 */
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	/* Visit all four (source key, oif key) combinations. */
	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash_code(daddr,
						     skeys[i] ^ (ikeys[k] << 5),
						     tos);

			rthp=&rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				/* Not the entry for this key: keep walking. */
				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.fl4_tos != tos ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0) {
					rthp = &rth->u.rt_next;
					continue;
				}

				/* Key matched but the entry is not the route
				 * the redirect talks about: stop scanning
				 * this chain.
				 */
				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				/* Take a reference before leaving the RCU
				 * read-side section so rth stays usable.
				 */
				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				/* Then reset the per-entry state that must
				 * not be shared with the original, and take
				 * the copy's own device references.
				 */
				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
				rt->u.dst.__use = 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child = NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete = 0;
				rt->u.dst.lastuse = jiffies;
				rt->u.dst.path = &rt->u.dst;
				rt->u.dst.neighbour = NULL;
				rt->u.dst.hh = NULL;
				rt->u.dst.xfrm = NULL;

				rt->rt_flags |= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway = new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				/* The peer pointer was copied too; account
				 * for the additional user.
				 */
				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				/* Give up on this entry if no neighbour can
				 * be bound for the new gateway or it is not
				 * in a NUD_VALID state.
				 */
				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				/* Swap the old entry for the redirected one;
				 * rt_del() also drops the reference taken by
				 * dst_hold() above.
				 */
				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
			"%u.%u.%u.%u ignored.\n"
			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
			"tos %02x\n",
		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
		       NIPQUAD(saddr), NIPQUAD(daddr), tos);
#endif
	in_dev_put(in_dev);
}
1242
1243static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1244{
1245 struct rtable *rt = (struct rtable*)dst;
1246 struct dst_entry *ret = dst;
1247
1248 if (rt) {
1249 if (dst->obsolete) {
1250 ip_rt_put(rt);
1251 ret = NULL;
1252 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1253 rt->u.dst.expires) {
1254 unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1255 rt->fl.fl4_src ^
1256 (rt->fl.oif << 5),
1257 rt->fl.fl4_tos);
1258#if RT_CACHE_DEBUG >= 1
1259 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1260 "%u.%u.%u.%u/%02x dropped\n",
1261 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1262#endif
1263 rt_del(hash, rt);
1264 ret = NULL;
1265 }
1266 }
1267 return ret;
1268}
1269
1270/*
1271 * Algorithm:
1272 * 1. The first ip_rt_redirect_number redirects are sent
1273 * with exponential backoff, then we stop sending them at all,
1274 * assuming that the host ignores our redirects.
1275 * 2. If we did not see packets requiring redirects
1276 * during ip_rt_redirect_silence, we assume that the host
1277 * forgot redirected route and start to send redirects again.
1278 *
1279 * This algorithm is much cheaper and more intelligent than dumb load limiting
1280 * in icmp.c.
1281 *
1282 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1283 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1284 */
1285
1286void ip_rt_send_redirect(struct sk_buff *skb)
1287{
1288 struct rtable *rt = (struct rtable*)skb->dst;
1289 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1290
1291 if (!in_dev)
1292 return;
1293
1294 if (!IN_DEV_TX_REDIRECTS(in_dev))
1295 goto out;
1296
1297 /* No redirected packets during ip_rt_redirect_silence;
1298 * reset the algorithm.
1299 */
1300 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1301 rt->u.dst.rate_tokens = 0;
1302
1303 /* Too many ignored redirects; do not send anything
1304 * set u.dst.rate_last to the last seen redirected packet.
1305 */
1306 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1307 rt->u.dst.rate_last = jiffies;
1308 goto out;
1309 }
1310
1311 /* Check for load limit; set rate_last to the latest sent
1312 * redirect.
1313 */
1314 if (time_after(jiffies,
1315 (rt->u.dst.rate_last +
1316 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1317 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1318 rt->u.dst.rate_last = jiffies;
1319 ++rt->u.dst.rate_tokens;
1320#ifdef CONFIG_IP_ROUTE_VERBOSE
1321 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1322 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1323 net_ratelimit())
1324 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1325 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1326 NIPQUAD(rt->rt_src), rt->rt_iif,
1327 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1328#endif
1329 }
1330out:
1331 in_dev_put(in_dev);
1332}
1333
1334static int ip_error(struct sk_buff *skb)
1335{
1336 struct rtable *rt = (struct rtable*)skb->dst;
1337 unsigned long now;
1338 int code;
1339
1340 switch (rt->u.dst.error) {
1341 case EINVAL:
1342 default:
1343 goto out;
1344 case EHOSTUNREACH:
1345 code = ICMP_HOST_UNREACH;
1346 break;
1347 case ENETUNREACH:
1348 code = ICMP_NET_UNREACH;
1349 break;
1350 case EACCES:
1351 code = ICMP_PKT_FILTERED;
1352 break;
1353 }
1354
1355 now = jiffies;
1356 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1357 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1358 rt->u.dst.rate_tokens = ip_rt_error_burst;
1359 rt->u.dst.rate_last = now;
1360 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1361 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1362 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1363 }
1364
1365out: kfree_skb(skb);
1366 return 0;
1367}
1368
1369/*
1370 * The last two values are not from the RFC but
1371 * are needed for AMPRnet AX.25 paths.
1372 */
1373
/* Read-only lookup table, sorted in descending order. */
static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau value that is strictly smaller than
 * @old_mtu.  Because the table is sorted in descending order, that is the
 * first entry below @old_mtu.  If no entry is smaller, 68 is returned.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
1386
/*
 * Lower the cached path MTU for the flow described by @iph after being
 * told that @new_mtu should be used.
 *
 * @iph:     header supplying saddr, daddr, tos and tot_len (tot_len is
 *           taken as the size that was too large, "old_mtu")
 * @new_mtu: advertised MTU; when it is below 68 or not smaller than
 *           old_mtu a value is guessed from the plateau table instead
 *
 * Cache entries with fl.iif == 0 are searched under both the real source
 * address and the wildcard source 0; entries whose MTU metric is locked
 * are left alone.
 *
 * Returns 0 when ipv4_config.no_pmtu_disc is set, otherwise the MTU
 * estimate from the last matching entry that accepted it, or @new_mtu if
 * there was none.
 */
unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
	int i;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	/* Source keys to probe: the packet's source and the wildcard 0. */
	u32  skeys[2] = { iph->saddr, 0, };
	u32  daddr = iph->daddr;
	u8   tos = iph->tos & IPTOS_RT_MASK;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (i = 0; i < 2; i++) {
		unsigned hash = rt_hash_code(daddr, skeys[i], tos);

		rcu_read_lock();
		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
		     rth = rcu_dereference(rth->u.rt_next)) {
			if (rth->fl.fl4_dst == daddr &&
			    rth->fl.fl4_src == skeys[i] &&
			    rth->rt_dst  == daddr &&
			    rth->rt_src  == iph->saddr &&
			    rth->fl.fl4_tos == tos &&
			    rth->fl.iif == 0 &&
			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
				unsigned short mtu = new_mtu;

				/* The advertised value is unusable: derive
				 * one from the size of the offending packet.
				 */
				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
					/* Only a strictly smaller value
					 * modifies the cache entry.
					 */
					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
						dst_confirm(&rth->u.dst);
						/* Clamp to the configured
						 * minimum and lock the metric
						 * so it is not lowered again.
						 */
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
		}
		rcu_read_unlock();
	}
	/* GNU "?:": est_mtu if non-zero, else new_mtu. */
	return est_mtu ? : new_mtu;
}
1445
1446static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1447{
1448 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1449 !(dst_metric_locked(dst, RTAX_MTU))) {
1450 if (mtu < ip_rt_min_pmtu) {
1451 mtu = ip_rt_min_pmtu;
1452 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1453 }
1454 dst->metrics[RTAX_MTU-1] = mtu;
1455 dst_set_expires(dst, ip_rt_mtu_expires);
1456 }
1457}
1458
/*
 * dst check hook for IPv4 routes: unconditionally returns NULL; neither
 * @dst nor @cookie is examined.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
1463
1464static void ipv4_dst_destroy(struct dst_entry *dst)
1465{
1466 struct rtable *rt = (struct rtable *) dst;
1467 struct inet_peer *peer = rt->peer;
1468 struct in_device *idev = rt->idev;
1469
1470 if (peer) {
1471 rt->peer = NULL;
1472 inet_putpeer(peer);
1473 }
1474
1475 if (idev) {
1476 rt->idev = NULL;
1477 in_dev_put(idev);
1478 }
1479}
1480
1481static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1482 int how)
1483{
1484 struct rtable *rt = (struct rtable *) dst;
1485 struct in_device *idev = rt->idev;
1486 if (dev != &loopback_dev && idev && idev->dev == dev) {
1487 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1488 if (loopback_idev) {
1489 rt->idev = loopback_idev;
1490 in_dev_put(idev);
1491 }
1492 }
1493}
1494
1495static void ipv4_link_failure(struct sk_buff *skb)
1496{
1497 struct rtable *rt;
1498
1499 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1500
1501 rt = (struct rtable *) skb->dst;
1502 if (rt)
1503 dst_set_expires(&rt->u.dst, 0);
1504}
1505
1506static int ip_rt_bug(struct sk_buff *skb)
1507{
1508 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1509 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1510 skb->dev ? skb->dev->name : "?");
1511 kfree_skb(skb);
1512 return 0;
1513}
1514
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1523
1524void ip_rt_get_source(u8 *addr, struct rtable *rt)
1525{
1526 u32 src;
1527 struct fib_result res;
1528
1529 if (rt->fl.iif == 0)
1530 src = rt->rt_src;
1531 else if (fib_lookup(&rt->fl, &res) == 0) {
1532 src = FIB_RES_PREFSRC(res);
1533 fib_res_put(&res);
1534 } else
1535 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1536 RT_SCOPE_UNIVERSE);
1537 memcpy(addr, &src, 4);
1538}
1539
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid, treating the low and the high
 * 16 bits independently: a half is filled in from @tag only while that
 * half of tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if ((rt->u.dst.tclassid & 0xFFFF) == 0)
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if ((rt->u.dst.tclassid & 0xFFFF0000) == 0)
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1549
1550static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1551{
1552 struct fib_info *fi = res->fi;
1553
1554 if (fi) {
1555 if (FIB_RES_GW(*res) &&
1556 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1557 rt->rt_gateway = FIB_RES_GW(*res);
1558 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1559 sizeof(rt->u.dst.metrics));
1560 if (fi->fib_mtu == 0) {
1561 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1562 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1563 rt->rt_gateway != rt->rt_dst &&
1564 rt->u.dst.dev->mtu > 576)
1565 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1566 }
1567#ifdef CONFIG_NET_CLS_ROUTE
1568 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1569#endif
1570 } else
1571 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1572
1573 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1574 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1575 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1576 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1577 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1578 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1579 ip_rt_min_advmss);
1580 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1581 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1582
1583#ifdef CONFIG_NET_CLS_ROUTE
1584#ifdef CONFIG_IP_MULTIPLE_TABLES
1585 set_class_tag(rt, fib_rules_tclass(res));
1586#endif
1587 set_class_tag(rt, itag);
1588#endif
1589 rt->rt_type = res->type;
1590}
1591
/*
 * Build an input route cache entry for a multicast packet and attach it
 * to @skb.
 *
 * @our: non-zero when the packet should be delivered locally; the entry
 *       then gets ip_local_deliver as input handler and RTCF_LOCAL.
 *
 * Returns -EINVAL when @dev has no in_device or the source address fails
 * validation, -ENOBUFS when no entry can be allocated, otherwise the
 * result of rt_intern_hash().
 */
static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	u32 spec_dst;
	struct in_device *in_dev = in_dev_get(dev);
	u32 itag = 0;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	/* A zeronet source is accepted only for LOCAL_MCAST destinations;
	 * any other source must pass fib_validate_source().
	 */
	if (ZERONET(saddr)) {
		if (!LOCAL_MCAST(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0,
					dev, &spec_dst, &itag) < 0)
		goto e_inval;

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	/* This entry must never be used for output. */
	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst = daddr;
	rth->fl.fl4_tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src = saddr;
	rth->rt_src = saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif =
	rth->fl.iif = dev->ifindex;
	/* The entry is bound to the loopback device, not to @dev. */
	rth->u.dst.dev = &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev = in_dev_get(rth->u.dst.dev);
	rth->fl.oif = 0;
	rth->rt_gateway = daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_type = RTN_MULTICAST;
	rth->rt_flags = RTCF_MULTICAST;
	if (our) {
		rth->u.dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	/* Overrides the input handler chosen above when forwarding of
	 * non-local multicast is enabled on the input device.
	 */
	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	in_dev_put(in_dev);
	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);

e_nobufs:
	in_dev_put(in_dev);
	return -ENOBUFS;

e_inval:
	in_dev_put(in_dev);
	return -EINVAL;
}
1672
1673
1674static void ip_handle_martian_source(struct net_device *dev,
1675 struct in_device *in_dev,
1676 struct sk_buff *skb,
1677 u32 daddr,
1678 u32 saddr)
1679{
1680 RT_CACHE_STAT_INC(in_martian_src);
1681#ifdef CONFIG_IP_ROUTE_VERBOSE
1682 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1683 /*
1684 * RFC1812 recommendation, if source is martian,
1685 * the only hint is MAC header.
1686 */
1687 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1688 "%u.%u.%u.%u, on dev %s\n",
1689 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
Olaf Kirch0b7f22a2005-07-11 21:01:42 -07001690 if (dev->hard_header_len && skb->mac.raw) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001691 int i;
1692 unsigned char *p = skb->mac.raw;
1693 printk(KERN_WARNING "ll header: ");
1694 for (i = 0; i < dev->hard_header_len; i++, p++) {
1695 printk("%02x", *p);
1696 if (i < (dev->hard_header_len - 1))
1697 printk(":");
1698 }
1699 printk("\n");
1700 }
1701 }
1702#endif
1703}
1704
/*
 * Allocate and initialise (but do not cache) a forwarding route entry
 * for an input packet, based on the FIB result @res.
 *
 * On success 0 is returned and the new entry, holding one reference, is
 * stored in *@result.  Errors: -EINVAL when the output device has no
 * in_device, the source fails validation, or a non-IP packet would leave
 * through the device it arrived on; -ENOBUFS when allocation fails.
 */
static inline int __mkroute_input(struct sk_buff *skb,
				  struct fib_result* res,
				  struct in_device *in_dev,
				  u32 daddr, u32 saddr, u32 tos,
				  struct rtable **result)
{

	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	u32 spec_dst, itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}


	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		err = -EINVAL;
		goto cleanup;
	}

	/* A positive result from fib_validate_source() marks the route
	 * with RTCF_DIRECTSRC.
	 */
	if (err)
		flags |= RTCF_DIRECTSRC;

	/* Packet would leave through the interface it came in on:
	 * flag the route so that a redirect can be generated.
	 */
	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 */
		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
			err = -EINVAL;
			goto cleanup;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	if (res->fi->fib_nhs > 1)
		rth->u.dst.flags |= DST_BALANCED;
#endif
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	if (in_dev->cnf.no_xfrm)
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst = daddr;
	rth->fl.fl4_tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src = saddr;
	rth->rt_src = saddr;
	rth->rt_gateway = daddr;
	rth->rt_iif =
		rth->fl.iif = in_dev->dev->ifindex;
	/* The entry takes its own references on the output device. */
	rth->u.dst.dev = (out_dev)->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev = in_dev_get(rth->u.dst.dev);
	rth->fl.oif = 0;
	rth->rt_spec_dst= spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}
1804
1805static inline int ip_mkroute_input_def(struct sk_buff *skb,
1806 struct fib_result* res,
1807 const struct flowi *fl,
1808 struct in_device *in_dev,
1809 u32 daddr, u32 saddr, u32 tos)
1810{
Chuck Short7abaa272005-06-22 22:10:23 -07001811 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001812 int err;
1813 unsigned hash;
1814
1815#ifdef CONFIG_IP_ROUTE_MULTIPATH
1816 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1817 fib_select_multipath(fl, res);
1818#endif
1819
1820 /* create a routing cache entry */
1821 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1822 if (err)
1823 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001824
1825 /* put it into the cache */
1826 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1827 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1828}
1829
/*
 * Create the input route cache entry (or entries) for a forwarded packet.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED, or when the FIB result has
 * fewer than two next hops, this is ip_mkroute_input_def().  Otherwise
 * one cache entry is created and inserted per next hop, and @skb ends up
 * attached to the entry returned for the last hop.
 *
 * Returns 0 on success or the first error reported by __mkroute_input()
 * or rt_intern_hash().
 */
static inline int ip_mkroute_input(struct sk_buff *skb,
				   struct fib_result* res,
				   const struct flowi *fl,
				   struct in_device *in_dev,
				   u32 daddr, u32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	struct rtable* rth = NULL, *rtres;
	unsigned char hop, hopcount;
	int err = -EINVAL;
	unsigned int hash;

	if (res->fi)
		hopcount = res->fi->fib_nhs;
	else
		hopcount = 1;

	/* distinguish between multipath and singlepath */
	if (hopcount < 2)
		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
					    saddr, tos);

	/* add all alternatives to the routing cache */
	for (hop = 0; hop < hopcount; hop++) {
		res->nh_sel = hop;

		/* put reference to previous result */
		if (hop)
			ip_rt_put(rtres);

		/* create a routing cache entry */
		err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
				      &rth);
		if (err)
			return err;

		/* put it into the cache */
		hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
		err = rt_intern_hash(hash, rth, &rtres);
		if (err)
			return err;

		/* forward hop information to multipath impl. */
		multipath_set_nhinfo(rth,
				     FIB_RES_NETWORK(*res),
				     FIB_RES_NETMASK(*res),
				     res->prefixlen,
				     &FIB_RES_NH(*res));
	}
	/* Only the reference to the last hop's entry is kept; it is
	 * handed over to the skb.
	 */
	skb->dst = &rtres->u.dst;
	return err;
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
}
1885
1886
/*
 *	NOTE. We drop all packets that have local source
 *	addresses, because every properly looped back packet
 *	must have the correct destination already attached by the output
 *	routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */
1896
/*
 * Slow path of input routing: classify the packet, consult the FIB and
 * create the matching route cache entry.
 *
 * Outcomes visible here:
 *  - forwarded unicast: delegated to ip_mkroute_input();
 *  - local delivery and broadcast: a loopback-bound entry built at
 *    local_input with ip_local_deliver as input handler;
 *  - no route (forwarding enabled): an entry built at local_input with
 *    ip_error as input handler;
 *  - martian source: logged through ip_handle_martian_source(), -EINVAL;
 *  - martian destination or forwarding disabled: -EHOSTUNREACH.
 *
 * All paths after the in_dev check leave through "done", which drops the
 * in_dev reference and, when fib_lookup() succeeded, the FIB result.
 */
static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
#ifdef CONFIG_IP_ROUTE_FWMARK
					.fwmark = skb->nfmark
#endif
				      } },
			    .iif = dev->ifindex };
	unsigned flags = 0;
	u32 itag = 0;
	struct rtable * rth;
	unsigned hash;
	u32 spec_dst;
	int err = -EINVAL;
	/* Set once fib_lookup() has succeeded and res must be released. */
	int free_res = 0;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
		goto martian_source;

	/* Limited broadcast, or both addresses zero: handle as broadcast
	 * without consulting the FIB.
	 */
	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ZERONET(saddr))
		goto martian_source;

	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	if ((err = fib_lookup(&fl, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos,
					     loopback_dev.ifindex,
					     dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		/* Overrides whatever fib_validate_source() stored. */
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
	if (err == -ENOBUFS)
		goto e_nobufs;
	if (err == -EINVAL)
		goto e_inval;

done:
	/* Common exit: release everything acquired above. */
	in_dev_put(in_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

	/* Falls through: broadcasts get a local entry as well. */
local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	/* This entry must never be used for output. */
	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst = daddr;
	rth->fl.fl4_tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src = saddr;
	rth->rt_src = saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif =
	rth->fl.iif = dev->ifindex;
	rth->u.dst.dev = &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev = in_dev_get(rth->u.dst.dev);
	rth->rt_gateway = daddr;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input= ip_local_deliver;
	rth->rt_flags = flags|RTCF_LOCAL;
	/* Within this function only the no_route path arrives here with
	 * RTN_UNREACHABLE; err then still holds fib_lookup()'s non-zero
	 * return value, which is stored negated as the route's error.
	 */
	if (res.type == RTN_UNREACHABLE) {
		rth->u.dst.input= ip_error;
		rth->u.dst.error= -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}
	rth->rt_type = res.type;
	hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
	goto done;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
			"%u.%u.%u.%u, dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
#endif

	/* Falls through: a martian destination is reported as
	 * -EHOSTUNREACH.
	 */
e_hostunreach:
	err = -EHOSTUNREACH;
	goto done;

e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}
2082
/*
 * Find or create the input route for @skb and attach it as skb->dst.
 *
 * The route cache is searched first under rcu_read_lock().  On a hit the
 * entry's lastuse and __use are updated, a reference is taken and 0 is
 * returned.  On a miss, multicast destinations are handled here (see the
 * comment below) and everything else goes to ip_route_input_slow().
 *
 * Returns 0 on success or a negative errno.
 */
int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable * rth;
	unsigned hash;
	int iif = dev->ifindex;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.rt_next)) {
		if (rth->fl.fl4_dst == daddr &&
		    rth->fl.fl4_src == saddr &&
		    rth->fl.iif == iif &&
		    rth->fl.oif == 0 &&
#ifdef CONFIG_IP_ROUTE_FWMARK
		    rth->fl.fl4_fwmark == skb->nfmark &&
#endif
		    rth->fl.fl4_tos == tos) {
			/* Cache hit: take a reference before leaving the
			 * RCU read-side section.
			 */
			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			skb->dst = (struct dst_entry*)rth;
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}
	rcu_read_unlock();

	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As result the host on multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   comparing with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */
	if (MULTICAST(daddr)) {
		struct in_device *in_dev;

		/* __in_dev_get() takes no reference here; the pointer is
		 * only used inside this RCU read-side section.
		 */
		rcu_read_lock();
		if ((in_dev = __in_dev_get(dev)) != NULL) {
			int our = ip_check_mc(in_dev, daddr, saddr,
				skb->nh.iph->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
#endif
			    ) {
				rcu_read_unlock();
				return ip_route_input_mc(skb, daddr, saddr,
							 tos, dev, our);
			}
		}
		rcu_read_unlock();
		/* Neither for us nor to be forwarded: reject. */
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
2149
/*
 * Allocate and initialise (but do not cache) an output route entry.
 *
 * @result:  receives the new entry (holding one reference) on success
 * @res:     FIB result; its type may be rewritten to RTN_BROADCAST or
 *           RTN_MULTICAST and its fi reference may be dropped here
 * @fl:      resolved flow, source of rt_dst/rt_src/rt_gateway
 * @oldflp:  flow as originally requested, source of the cache key fields
 * @dev_out: output device
 * @flags:   initial RTCF_* flags for the entry
 *
 * Returns 0 on success, -EINVAL for a loopback source on a non-loopback
 * device, a bad-class/zeronet destination or a device without in_device,
 * and -ENOBUFS when allocation fails.
 */
static inline int __mkroute_output(struct rtable **result,
				   struct fib_result* res,
				   const struct flowi *fl,
				   const struct flowi *oldflp,
				   struct net_device *dev_out,
				   unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);
	int err = 0;

	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
		return -EINVAL;

	/* Classify the destination; this overrides the FIB's type. */
	if (fl->fl4_dst == 0xFFFFFFFF)
		res->type = RTN_BROADCAST;
	else if (MULTICAST(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	/* get work reference to inet device */
	in_dev = in_dev_get(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		/* Broadcast routes do not use FIB info. */
		if (res->fi) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST|RTCF_LOCAL;
		/* Keep RTCF_LOCAL only if ip_check_mc() reports a match. */
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If multicast route do not exist use
		   default one, but do not gateway in this case.
		   Yes, it is hack.
		 */
		if (res->fi && res->prefixlen < 4) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	if (res->fi) {
		rth->rt_multipath_alg = res->fi->fib_mp_alg;
		if (res->fi->fib_nhs > 1)
			rth->u.dst.flags |= DST_BALANCED;
	}
#endif
	if (in_dev->cnf.no_xfrm)
		rth->u.dst.flags |= DST_NOXFRM;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;

	/* Key fields come from the original request (oldflp) ... */
	rth->fl.fl4_dst = oldflp->fl4_dst;
	rth->fl.fl4_tos = tos;
	rth->fl.fl4_src = oldflp->fl4_src;
	rth->fl.oif = oldflp->oif;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
#endif
	/* ... while the route data comes from the resolved flow (fl). */
	rth->rt_dst = fl->fl4_dst;
	rth->rt_src = fl->fl4_src;
	rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be hold by the routing
	   cache entry */
	rth->u.dst.dev = dev_out;
	dev_hold(dev_out);
	rth->idev = in_dev_get(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst= fl->fl4_src;

	rth->u.dst.output=ip_output;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		/* Local copy wanted and not already going to loopback:
		 * use the multicast-aware output routine.
		 */
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->u.dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !LOCAL_MCAST(oldflp->fl4_dst)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;

	*result = rth;
 cleanup:
	/* release work reference to inet device */
	in_dev_put(in_dev);

	return err;
}
2277
2278static inline int ip_mkroute_output_def(struct rtable **rp,
2279 struct fib_result* res,
2280 const struct flowi *fl,
2281 const struct flowi *oldflp,
2282 struct net_device *dev_out,
2283 unsigned flags)
2284{
Chuck Short7abaa272005-06-22 22:10:23 -07002285 struct rtable *rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002286 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2287 unsigned hash;
2288 if (err == 0) {
2289 u32 tos = RT_FL_TOS(oldflp);
2290
Linus Torvalds1da177e2005-04-16 15:20:36 -07002291 hash = rt_hash_code(oldflp->fl4_dst,
2292 oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2293 err = rt_intern_hash(hash, rth, rp);
2294 }
2295
2296 return err;
2297}
2298
/*
 * Create the output cache entry (or entries) for a resolved flow.
 *
 * With CONFIG_IP_ROUTE_MULTIPATH_CACHED and a fib_info carrying more than
 * one nexthop, one entry is built and hashed per nexthop and each is
 * announced to the multipath implementation; *rp ends up referring to the
 * entry of the last hop (the reference on each earlier one is dropped
 * before the next hop is processed).  Otherwise this is simply
 * ip_mkroute_output_def().
 *
 * Returns 0 on success or the first error encountered.
 */
static inline int ip_mkroute_output(struct rtable **rp,
				    struct fib_result *res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	u32 tos = RT_FL_TOS(oldflp);
	struct rtable *rth = NULL;
	unsigned char hopcount;
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;

	if (!res->fi || res->fi->fib_nhs <= 1)
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);

	hopcount = res->fi->fib_nhs;

	for (hop = 0; hop < hopcount; hop++) {
		struct net_device *dev2nexthop;

		res->nh_sel = hop;

		/* hold a work reference to the output device */
		dev2nexthop = FIB_RES_DEV(*res);
		dev_hold(dev2nexthop);

		/* put reference to previous result */
		if (hop)
			ip_rt_put(*rp);

		err = __mkroute_output(&rth, res, fl, oldflp,
				       dev2nexthop, flags);
		if (err == 0) {
			hash = rt_hash_code(oldflp->fl4_dst,
					    oldflp->fl4_src ^
					    (oldflp->oif << 5), tos);
			err = rt_intern_hash(hash, rth, rp);
		}

		/*
		 * Forward hop information to the multipath implementation
		 * only for an entry that was actually interned.  Previously
		 * rth was passed on even when rt_intern_hash() reported an
		 * error.  NOTE(review): rt_intern_hash() is defined outside
		 * this excerpt; it presumably disposes of the entry on
		 * failure, in which case rth must not be touched - confirm.
		 */
		if (err == 0)
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));

		/* release work reference to output device */
		dev_put(dev2nexthop);

		if (err != 0)
			return err;
	}
	return err;
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
2362
2363/*
2364 * Major route resolver routine.
2365 */
2366
2367static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2368{
2369 u32 tos = RT_FL_TOS(oldflp);
2370 struct flowi fl = { .nl_u = { .ip4_u =
2371 { .daddr = oldflp->fl4_dst,
2372 .saddr = oldflp->fl4_src,
2373 .tos = tos & IPTOS_RT_MASK,
2374 .scope = ((tos & RTO_ONLINK) ?
2375 RT_SCOPE_LINK :
2376 RT_SCOPE_UNIVERSE),
2377#ifdef CONFIG_IP_ROUTE_FWMARK
2378 .fwmark = oldflp->fl4_fwmark
2379#endif
2380 } },
2381 .iif = loopback_dev.ifindex,
2382 .oif = oldflp->oif };
2383 struct fib_result res;
2384 unsigned flags = 0;
2385 struct net_device *dev_out = NULL;
2386 int free_res = 0;
2387 int err;
2388
2389
2390 res.fi = NULL;
2391#ifdef CONFIG_IP_MULTIPLE_TABLES
2392 res.r = NULL;
2393#endif
2394
2395 if (oldflp->fl4_src) {
2396 err = -EINVAL;
2397 if (MULTICAST(oldflp->fl4_src) ||
2398 BADCLASS(oldflp->fl4_src) ||
2399 ZERONET(oldflp->fl4_src))
2400 goto out;
2401
2402 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2403 dev_out = ip_dev_find(oldflp->fl4_src);
2404 if (dev_out == NULL)
2405 goto out;
2406
2407 /* I removed check for oif == dev_out->oif here.
2408 It was wrong for two reasons:
2409 1. ip_dev_find(saddr) can return wrong iface, if saddr is
2410 assigned to multiple interfaces.
2411 2. Moreover, we are allowed to send packets with saddr
2412 of another iface. --ANK
2413 */
2414
2415 if (oldflp->oif == 0
2416 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2417 /* Special hack: user can direct multicasts
2418 and limited broadcast via necessary interface
2419 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2420 This hack is not just for fun, it allows
2421 vic,vat and friends to work.
2422 They bind socket to loopback, set ttl to zero
2423 and expect that it will work.
2424 From the viewpoint of routing cache they are broken,
2425 because we are not allowed to build multicast path
2426 with loopback source addr (look, routing cache
2427 cannot know, that ttl is zero, so that packet
2428 will not leave this host and route is valid).
2429 Luckily, this hack is good workaround.
2430 */
2431
2432 fl.oif = dev_out->ifindex;
2433 goto make_route;
2434 }
2435 if (dev_out)
2436 dev_put(dev_out);
2437 dev_out = NULL;
2438 }
2439
2440
2441 if (oldflp->oif) {
2442 dev_out = dev_get_by_index(oldflp->oif);
2443 err = -ENODEV;
2444 if (dev_out == NULL)
2445 goto out;
2446 if (__in_dev_get(dev_out) == NULL) {
2447 dev_put(dev_out);
2448 goto out; /* Wrong error code */
2449 }
2450
2451 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2452 if (!fl.fl4_src)
2453 fl.fl4_src = inet_select_addr(dev_out, 0,
2454 RT_SCOPE_LINK);
2455 goto make_route;
2456 }
2457 if (!fl.fl4_src) {
2458 if (MULTICAST(oldflp->fl4_dst))
2459 fl.fl4_src = inet_select_addr(dev_out, 0,
2460 fl.fl4_scope);
2461 else if (!oldflp->fl4_dst)
2462 fl.fl4_src = inet_select_addr(dev_out, 0,
2463 RT_SCOPE_HOST);
2464 }
2465 }
2466
2467 if (!fl.fl4_dst) {
2468 fl.fl4_dst = fl.fl4_src;
2469 if (!fl.fl4_dst)
2470 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2471 if (dev_out)
2472 dev_put(dev_out);
2473 dev_out = &loopback_dev;
2474 dev_hold(dev_out);
2475 fl.oif = loopback_dev.ifindex;
2476 res.type = RTN_LOCAL;
2477 flags |= RTCF_LOCAL;
2478 goto make_route;
2479 }
2480
2481 if (fib_lookup(&fl, &res)) {
2482 res.fi = NULL;
2483 if (oldflp->oif) {
2484 /* Apparently, routing tables are wrong. Assume,
2485 that the destination is on link.
2486
2487 WHY? DW.
2488 Because we are allowed to send to iface
2489 even if it has NO routes and NO assigned
2490 addresses. When oif is specified, routing
2491 tables are looked up with only one purpose:
2492 to catch if destination is gatewayed, rather than
2493 direct. Moreover, if MSG_DONTROUTE is set,
2494 we send packet, ignoring both routing tables
2495 and ifaddr state. --ANK
2496
2497
2498 We could make it even if oif is unknown,
2499 likely IPv6, but we do not.
2500 */
2501
2502 if (fl.fl4_src == 0)
2503 fl.fl4_src = inet_select_addr(dev_out, 0,
2504 RT_SCOPE_LINK);
2505 res.type = RTN_UNICAST;
2506 goto make_route;
2507 }
2508 if (dev_out)
2509 dev_put(dev_out);
2510 err = -ENETUNREACH;
2511 goto out;
2512 }
2513 free_res = 1;
2514
2515 if (res.type == RTN_LOCAL) {
2516 if (!fl.fl4_src)
2517 fl.fl4_src = fl.fl4_dst;
2518 if (dev_out)
2519 dev_put(dev_out);
2520 dev_out = &loopback_dev;
2521 dev_hold(dev_out);
2522 fl.oif = dev_out->ifindex;
2523 if (res.fi)
2524 fib_info_put(res.fi);
2525 res.fi = NULL;
2526 flags |= RTCF_LOCAL;
2527 goto make_route;
2528 }
2529
2530#ifdef CONFIG_IP_ROUTE_MULTIPATH
2531 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2532 fib_select_multipath(&fl, &res);
2533 else
2534#endif
2535 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2536 fib_select_default(&fl, &res);
2537
2538 if (!fl.fl4_src)
2539 fl.fl4_src = FIB_RES_PREFSRC(res);
2540
2541 if (dev_out)
2542 dev_put(dev_out);
2543 dev_out = FIB_RES_DEV(res);
2544 dev_hold(dev_out);
2545 fl.oif = dev_out->ifindex;
2546
2547
2548make_route:
2549 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2550
2551
2552 if (free_res)
2553 fib_res_put(&res);
2554 if (dev_out)
2555 dev_put(dev_out);
2556out: return err;
2557}
2558
2559int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2560{
2561 unsigned hash;
2562 struct rtable *rth;
2563
2564 hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2565
2566 rcu_read_lock_bh();
2567 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2568 rth = rcu_dereference(rth->u.rt_next)) {
2569 if (rth->fl.fl4_dst == flp->fl4_dst &&
2570 rth->fl.fl4_src == flp->fl4_src &&
2571 rth->fl.iif == 0 &&
2572 rth->fl.oif == flp->oif &&
2573#ifdef CONFIG_IP_ROUTE_FWMARK
2574 rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2575#endif
2576 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2577 (IPTOS_RT_MASK | RTO_ONLINK))) {
2578
2579 /* check for multipath routes and choose one if
2580 * necessary
2581 */
2582 if (multipath_select_route(flp, rth, rp)) {
2583 dst_hold(&(*rp)->u.dst);
2584 RT_CACHE_STAT_INC(out_hit);
2585 rcu_read_unlock_bh();
2586 return 0;
2587 }
2588
2589 rth->u.dst.lastuse = jiffies;
2590 dst_hold(&rth->u.dst);
2591 rth->u.dst.__use++;
2592 RT_CACHE_STAT_INC(out_hit);
2593 rcu_read_unlock_bh();
2594 *rp = rth;
2595 return 0;
2596 }
2597 RT_CACHE_STAT_INC(out_hlist_search);
2598 }
2599 rcu_read_unlock_bh();
2600
2601 return ip_route_output_slow(rp, flp);
2602}
2603
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002604EXPORT_SYMBOL_GPL(__ip_route_output_key);
2605
Linus Torvalds1da177e2005-04-16 15:20:36 -07002606int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2607{
2608 int err;
2609
2610 if ((err = __ip_route_output_key(rp, flp)) != 0)
2611 return err;
2612
2613 if (flp->proto) {
2614 if (!flp->fl4_src)
2615 flp->fl4_src = (*rp)->rt_src;
2616 if (!flp->fl4_dst)
2617 flp->fl4_dst = (*rp)->rt_dst;
2618 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2619 }
2620
2621 return 0;
2622}
2623
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002624EXPORT_SYMBOL_GPL(ip_route_output_flow);
2625
Linus Torvalds1da177e2005-04-16 15:20:36 -07002626int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2627{
2628 return ip_route_output_flow(rp, flp, NULL, 0);
2629}
2630
2631static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002632 int nowait, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002633{
2634 struct rtable *rt = (struct rtable*)skb->dst;
2635 struct rtmsg *r;
2636 struct nlmsghdr *nlh;
2637 unsigned char *b = skb->tail;
2638 struct rta_cacheinfo ci;
2639#ifdef CONFIG_IP_MROUTE
2640 struct rtattr *eptr;
2641#endif
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002642 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002643 r = NLMSG_DATA(nlh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002644 r->rtm_family = AF_INET;
2645 r->rtm_dst_len = 32;
2646 r->rtm_src_len = 0;
2647 r->rtm_tos = rt->fl.fl4_tos;
2648 r->rtm_table = RT_TABLE_MAIN;
2649 r->rtm_type = rt->rt_type;
2650 r->rtm_scope = RT_SCOPE_UNIVERSE;
2651 r->rtm_protocol = RTPROT_UNSPEC;
2652 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2653 if (rt->rt_flags & RTCF_NOTIFY)
2654 r->rtm_flags |= RTM_F_NOTIFY;
2655 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2656 if (rt->fl.fl4_src) {
2657 r->rtm_src_len = 32;
2658 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2659 }
2660 if (rt->u.dst.dev)
2661 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2662#ifdef CONFIG_NET_CLS_ROUTE
2663 if (rt->u.dst.tclassid)
2664 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2665#endif
2666#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2667 if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2668 __u32 alg = rt->rt_multipath_alg;
2669
2670 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2671 }
2672#endif
2673 if (rt->fl.iif)
2674 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2675 else if (rt->rt_src != rt->fl.fl4_src)
2676 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2677 if (rt->rt_dst != rt->rt_gateway)
2678 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2679 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2680 goto rtattr_failure;
2681 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2682 ci.rta_used = rt->u.dst.__use;
2683 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2684 if (rt->u.dst.expires)
2685 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2686 else
2687 ci.rta_expires = 0;
2688 ci.rta_error = rt->u.dst.error;
2689 ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
2690 if (rt->peer) {
2691 ci.rta_id = rt->peer->ip_id_count;
2692 if (rt->peer->tcp_ts_stamp) {
2693 ci.rta_ts = rt->peer->tcp_ts;
2694 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2695 }
2696 }
2697#ifdef CONFIG_IP_MROUTE
2698 eptr = (struct rtattr*)skb->tail;
2699#endif
2700 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2701 if (rt->fl.iif) {
2702#ifdef CONFIG_IP_MROUTE
2703 u32 dst = rt->rt_dst;
2704
2705 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2706 ipv4_devconf.mc_forwarding) {
2707 int err = ipmr_get_route(skb, r, nowait);
2708 if (err <= 0) {
2709 if (!nowait) {
2710 if (err == 0)
2711 return 0;
2712 goto nlmsg_failure;
2713 } else {
2714 if (err == -EMSGSIZE)
2715 goto nlmsg_failure;
2716 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2717 }
2718 }
2719 } else
2720#endif
2721 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2722 }
2723
2724 nlh->nlmsg_len = skb->tail - b;
2725 return skb->len;
2726
2727nlmsg_failure:
2728rtattr_failure:
2729 skb_trim(skb, b - skb->data);
2730 return -1;
2731}
2732
2733int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2734{
2735 struct rtattr **rta = arg;
2736 struct rtmsg *rtm = NLMSG_DATA(nlh);
2737 struct rtable *rt = NULL;
2738 u32 dst = 0;
2739 u32 src = 0;
2740 int iif = 0;
2741 int err = -ENOBUFS;
2742 struct sk_buff *skb;
2743
2744 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2745 if (!skb)
2746 goto out;
2747
2748 /* Reserve room for dummy headers, this skb can pass
2749 through good chunk of routing engine.
2750 */
2751 skb->mac.raw = skb->data;
2752 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2753
2754 if (rta[RTA_SRC - 1])
2755 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2756 if (rta[RTA_DST - 1])
2757 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2758 if (rta[RTA_IIF - 1])
2759 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2760
2761 if (iif) {
2762 struct net_device *dev = __dev_get_by_index(iif);
2763 err = -ENODEV;
2764 if (!dev)
2765 goto out_free;
2766 skb->protocol = htons(ETH_P_IP);
2767 skb->dev = dev;
2768 local_bh_disable();
2769 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2770 local_bh_enable();
2771 rt = (struct rtable*)skb->dst;
2772 if (!err && rt->u.dst.error)
2773 err = -rt->u.dst.error;
2774 } else {
2775 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2776 .saddr = src,
2777 .tos = rtm->rtm_tos } } };
2778 int oif = 0;
2779 if (rta[RTA_OIF - 1])
2780 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2781 fl.oif = oif;
2782 err = ip_route_output_key(&rt, &fl);
2783 }
2784 if (err)
2785 goto out_free;
2786
2787 skb->dst = &rt->u.dst;
2788 if (rtm->rtm_flags & RTM_F_NOTIFY)
2789 rt->rt_flags |= RTCF_NOTIFY;
2790
2791 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2792
2793 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002794 RTM_NEWROUTE, 0, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002795 if (!err)
2796 goto out_free;
2797 if (err < 0) {
2798 err = -EMSGSIZE;
2799 goto out_free;
2800 }
2801
2802 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2803 if (err > 0)
2804 err = 0;
2805out: return err;
2806
2807out_free:
2808 kfree_skb(skb);
2809 goto out;
2810}
2811
2812int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2813{
2814 struct rtable *rt;
2815 int h, s_h;
2816 int idx, s_idx;
2817
2818 s_h = cb->args[0];
2819 s_idx = idx = cb->args[1];
2820 for (h = 0; h <= rt_hash_mask; h++) {
2821 if (h < s_h) continue;
2822 if (h > s_h)
2823 s_idx = 0;
2824 rcu_read_lock_bh();
2825 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2826 rt = rcu_dereference(rt->u.rt_next), idx++) {
2827 if (idx < s_idx)
2828 continue;
2829 skb->dst = dst_clone(&rt->u.dst);
2830 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002831 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2832 1, NLM_F_MULTI) <= 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002833 dst_release(xchg(&skb->dst, NULL));
2834 rcu_read_unlock_bh();
2835 goto done;
2836 }
2837 dst_release(xchg(&skb->dst, NULL));
2838 }
2839 rcu_read_unlock_bh();
2840 }
2841
2842done:
2843 cb->args[0] = h;
2844 cb->args[1] = idx;
2845 return skb->len;
2846}
2847
/*
 * Multicast event hook: requests a route cache flush with a delay
 * argument of 0.  @in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	const int delay = 0;

	rt_cache_flush(delay);
}
2852
2853#ifdef CONFIG_SYSCTL
2854static int flush_delay;
2855
2856static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2857 struct file *filp, void __user *buffer,
2858 size_t *lenp, loff_t *ppos)
2859{
2860 if (write) {
2861 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2862 rt_cache_flush(flush_delay);
2863 return 0;
2864 }
2865
2866 return -EINVAL;
2867}
2868
2869static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2870 int __user *name,
2871 int nlen,
2872 void __user *oldval,
2873 size_t __user *oldlenp,
2874 void __user *newval,
2875 size_t newlen,
2876 void **context)
2877{
2878 int delay;
2879 if (newlen != sizeof(int))
2880 return -EINVAL;
2881 if (get_user(delay, (int __user *)newval))
2882 return -EFAULT;
2883 rt_cache_flush(delay);
2884 return 0;
2885}
2886
2887ctl_table ipv4_route_table[] = {
2888 {
2889 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2890 .procname = "flush",
2891 .data = &flush_delay,
2892 .maxlen = sizeof(int),
Dave Jones7e3e0362005-04-28 12:11:03 -07002893 .mode = 0200,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002894 .proc_handler = &ipv4_sysctl_rtcache_flush,
2895 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2896 },
2897 {
2898 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2899 .procname = "min_delay",
2900 .data = &ip_rt_min_delay,
2901 .maxlen = sizeof(int),
2902 .mode = 0644,
2903 .proc_handler = &proc_dointvec_jiffies,
2904 .strategy = &sysctl_jiffies,
2905 },
2906 {
2907 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2908 .procname = "max_delay",
2909 .data = &ip_rt_max_delay,
2910 .maxlen = sizeof(int),
2911 .mode = 0644,
2912 .proc_handler = &proc_dointvec_jiffies,
2913 .strategy = &sysctl_jiffies,
2914 },
2915 {
2916 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2917 .procname = "gc_thresh",
2918 .data = &ipv4_dst_ops.gc_thresh,
2919 .maxlen = sizeof(int),
2920 .mode = 0644,
2921 .proc_handler = &proc_dointvec,
2922 },
2923 {
2924 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2925 .procname = "max_size",
2926 .data = &ip_rt_max_size,
2927 .maxlen = sizeof(int),
2928 .mode = 0644,
2929 .proc_handler = &proc_dointvec,
2930 },
2931 {
2932 /* Deprecated. Use gc_min_interval_ms */
2933
2934 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2935 .procname = "gc_min_interval",
2936 .data = &ip_rt_gc_min_interval,
2937 .maxlen = sizeof(int),
2938 .mode = 0644,
2939 .proc_handler = &proc_dointvec_jiffies,
2940 .strategy = &sysctl_jiffies,
2941 },
2942 {
2943 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2944 .procname = "gc_min_interval_ms",
2945 .data = &ip_rt_gc_min_interval,
2946 .maxlen = sizeof(int),
2947 .mode = 0644,
2948 .proc_handler = &proc_dointvec_ms_jiffies,
2949 .strategy = &sysctl_ms_jiffies,
2950 },
2951 {
2952 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2953 .procname = "gc_timeout",
2954 .data = &ip_rt_gc_timeout,
2955 .maxlen = sizeof(int),
2956 .mode = 0644,
2957 .proc_handler = &proc_dointvec_jiffies,
2958 .strategy = &sysctl_jiffies,
2959 },
2960 {
2961 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2962 .procname = "gc_interval",
2963 .data = &ip_rt_gc_interval,
2964 .maxlen = sizeof(int),
2965 .mode = 0644,
2966 .proc_handler = &proc_dointvec_jiffies,
2967 .strategy = &sysctl_jiffies,
2968 },
2969 {
2970 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2971 .procname = "redirect_load",
2972 .data = &ip_rt_redirect_load,
2973 .maxlen = sizeof(int),
2974 .mode = 0644,
2975 .proc_handler = &proc_dointvec,
2976 },
2977 {
2978 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2979 .procname = "redirect_number",
2980 .data = &ip_rt_redirect_number,
2981 .maxlen = sizeof(int),
2982 .mode = 0644,
2983 .proc_handler = &proc_dointvec,
2984 },
2985 {
2986 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2987 .procname = "redirect_silence",
2988 .data = &ip_rt_redirect_silence,
2989 .maxlen = sizeof(int),
2990 .mode = 0644,
2991 .proc_handler = &proc_dointvec,
2992 },
2993 {
2994 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2995 .procname = "error_cost",
2996 .data = &ip_rt_error_cost,
2997 .maxlen = sizeof(int),
2998 .mode = 0644,
2999 .proc_handler = &proc_dointvec,
3000 },
3001 {
3002 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
3003 .procname = "error_burst",
3004 .data = &ip_rt_error_burst,
3005 .maxlen = sizeof(int),
3006 .mode = 0644,
3007 .proc_handler = &proc_dointvec,
3008 },
3009 {
3010 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3011 .procname = "gc_elasticity",
3012 .data = &ip_rt_gc_elasticity,
3013 .maxlen = sizeof(int),
3014 .mode = 0644,
3015 .proc_handler = &proc_dointvec,
3016 },
3017 {
3018 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3019 .procname = "mtu_expires",
3020 .data = &ip_rt_mtu_expires,
3021 .maxlen = sizeof(int),
3022 .mode = 0644,
3023 .proc_handler = &proc_dointvec_jiffies,
3024 .strategy = &sysctl_jiffies,
3025 },
3026 {
3027 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3028 .procname = "min_pmtu",
3029 .data = &ip_rt_min_pmtu,
3030 .maxlen = sizeof(int),
3031 .mode = 0644,
3032 .proc_handler = &proc_dointvec,
3033 },
3034 {
3035 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3036 .procname = "min_adv_mss",
3037 .data = &ip_rt_min_advmss,
3038 .maxlen = sizeof(int),
3039 .mode = 0644,
3040 .proc_handler = &proc_dointvec,
3041 },
3042 {
3043 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3044 .procname = "secret_interval",
3045 .data = &ip_rt_secret_interval,
3046 .maxlen = sizeof(int),
3047 .mode = 0644,
3048 .proc_handler = &proc_dointvec_jiffies,
3049 .strategy = &sysctl_jiffies,
3050 },
3051 { .ctl_name = 0 }
3052};
3053#endif
3054
3055#ifdef CONFIG_NET_CLS_ROUTE
3056struct ip_rt_acct *ip_rt_acct;
3057
3058/* This code sucks. But you should have seen it before! --RR */
3059
3060/* IP route accounting ptr for this logical cpu number. */
/* The argument is parenthesised so expressions such as (a + b) index correctly. */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3062
3063#ifdef CONFIG_PROC_FS
3064static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3065 int length, int *eof, void *data)
3066{
3067 unsigned int i;
3068
3069 if ((offset & 3) || (length & 3))
3070 return -EIO;
3071
3072 if (offset >= sizeof(struct ip_rt_acct) * 256) {
3073 *eof = 1;
3074 return 0;
3075 }
3076
3077 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3078 length = sizeof(struct ip_rt_acct) * 256 - offset;
3079 *eof = 1;
3080 }
3081
3082 offset /= sizeof(u32);
3083
3084 if (length > 0) {
3085 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3086 u32 *dst = (u32 *) buffer;
3087
3088 /* Copy first cpu. */
3089 *start = buffer;
3090 memcpy(dst, src, length);
3091
3092 /* Add the other cpus in, one int at a time */
3093 for_each_cpu(i) {
3094 unsigned int j;
3095
3096 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3097
3098 for (j = 0; j < length/4; j++)
3099 dst[j] += src[j];
3100 }
3101 }
3102 return length;
3103}
3104#endif /* CONFIG_PROC_FS */
3105#endif /* CONFIG_NET_CLS_ROUTE */
3106
3107static __initdata unsigned long rhash_entries;
3108static int __init set_rhash_entries(char *str)
3109{
3110 if (!str)
3111 return 0;
3112 rhash_entries = simple_strtoul(str, &str, 0);
3113 return 1;
3114}
3115__setup("rhash_entries=", set_rhash_entries);
3116
3117int __init ip_rt_init(void)
3118{
Eric Dumazet424c4b72005-07-05 14:58:19 -07003119 int rc = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003120
3121 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3122 (jiffies ^ (jiffies >> 7)));
3123
3124#ifdef CONFIG_NET_CLS_ROUTE
Eric Dumazet424c4b72005-07-05 14:58:19 -07003125 {
3126 int order;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003127 for (order = 0;
3128 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3129 /* NOTHING */;
3130 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3131 if (!ip_rt_acct)
3132 panic("IP: failed to allocate ip_rt_acct\n");
3133 memset(ip_rt_acct, 0, PAGE_SIZE << order);
Eric Dumazet424c4b72005-07-05 14:58:19 -07003134 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003135#endif
3136
3137 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3138 sizeof(struct rtable),
3139 0, SLAB_HWCACHE_ALIGN,
3140 NULL, NULL);
3141
3142 if (!ipv4_dst_ops.kmem_cachep)
3143 panic("IP: failed to allocate ip_dst_cache\n");
3144
Eric Dumazet424c4b72005-07-05 14:58:19 -07003145 rt_hash_table = (struct rt_hash_bucket *)
3146 alloc_large_system_hash("IP route cache",
3147 sizeof(struct rt_hash_bucket),
3148 rhash_entries,
3149 (num_physpages >= 128 * 1024) ?
3150 (27 - PAGE_SHIFT) :
3151 (29 - PAGE_SHIFT),
3152 HASH_HIGHMEM,
3153 &rt_hash_log,
3154 &rt_hash_mask,
3155 0);
Eric Dumazet22c047c2005-07-05 14:55:24 -07003156 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3157 rt_hash_lock_init();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003158
3159 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3160 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3161
3162 rt_cache_stat = alloc_percpu(struct rt_cache_stat);
3163 if (!rt_cache_stat)
3164 return -ENOMEM;
3165
3166 devinet_init();
3167 ip_fib_init();
3168
3169 init_timer(&rt_flush_timer);
3170 rt_flush_timer.function = rt_run_flush;
3171 init_timer(&rt_periodic_timer);
3172 rt_periodic_timer.function = rt_check_expire;
3173 init_timer(&rt_secret_timer);
3174 rt_secret_timer.function = rt_secret_rebuild;
3175
3176 /* All the timers, started at system startup tend
3177 to synchronize. Perturb it a bit.
3178 */
3179 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3180 ip_rt_gc_interval;
3181 add_timer(&rt_periodic_timer);
3182
3183 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3184 ip_rt_secret_interval;
3185 add_timer(&rt_secret_timer);
3186
3187#ifdef CONFIG_PROC_FS
3188 {
3189 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3190 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3191 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3192 proc_net_stat))) {
3193 free_percpu(rt_cache_stat);
3194 return -ENOMEM;
3195 }
3196 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3197 }
3198#ifdef CONFIG_NET_CLS_ROUTE
3199 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3200#endif
3201#endif
3202#ifdef CONFIG_XFRM
3203 xfrm_init();
3204 xfrm4_init();
3205#endif
3206 return rc;
3207}
3208
3209EXPORT_SYMBOL(__ip_select_ident);
3210EXPORT_SYMBOL(ip_route_input);
3211EXPORT_SYMBOL(ip_route_output_key);