blob: 12a1cf306f67468fc532677c647e9db9c6abd786 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9 *
Jesper Juhl02c30a82005-05-05 16:16:16 -070010 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -070011 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 *
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
41 *
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 *
58 * This program is free software; you can redistribute it and/or
59 * modify it under the terms of the GNU General Public License
60 * as published by the Free Software Foundation; either version
61 * 2 of the License, or (at your option) any later version.
62 */
63
64#include <linux/config.h>
65#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
71#include <linux/sched.h>
72#include <linux/mm.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/skbuff.h>
83#include <linux/rtnetlink.h>
84#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
93#include <net/protocol.h>
94#include <net/ip.h>
95#include <net/route.h>
96#include <net/inetpeer.h>
97#include <net/sock.h>
98#include <net/ip_fib.h>
99#include <net/arp.h>
100#include <net/tcp.h>
101#include <net/icmp.h>
102#include <net/xfrm.h>
103#include <net/ip_mp_alg.h>
104#ifdef CONFIG_SYSCTL
105#include <linux/sysctl.h>
106#endif
107
108#define RT_FL_TOS(oldflp) \
109 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
110
111#define IP_MAX_MTU 0xFFF0
112
113#define RT_GC_TIMEOUT (300*HZ)
114
115static int ip_rt_min_delay = 2 * HZ;
116static int ip_rt_max_delay = 10 * HZ;
117static int ip_rt_max_size;
118static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
119static int ip_rt_gc_interval = 60 * HZ;
120static int ip_rt_gc_min_interval = HZ / 2;
121static int ip_rt_redirect_number = 9;
122static int ip_rt_redirect_load = HZ / 50;
123static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
124static int ip_rt_error_cost = HZ;
125static int ip_rt_error_burst = 5 * HZ;
126static int ip_rt_gc_elasticity = 8;
127static int ip_rt_mtu_expires = 10 * 60 * HZ;
128static int ip_rt_min_pmtu = 512 + 20 + 20;
129static int ip_rt_min_advmss = 256;
130static int ip_rt_secret_interval = 10 * 60 * HZ;
131static unsigned long rt_deadline;
132
133#define RTprint(a...) printk(KERN_DEBUG a)
134
135static struct timer_list rt_flush_timer;
136static struct timer_list rt_periodic_timer;
137static struct timer_list rt_secret_timer;
138
139/*
140 * Interface to generic destination cache.
141 */
142
143static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
144static void ipv4_dst_destroy(struct dst_entry *dst);
145static void ipv4_dst_ifdown(struct dst_entry *dst,
146 struct net_device *dev, int how);
147static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
148static void ipv4_link_failure(struct sk_buff *skb);
149static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
150static int rt_garbage_collect(void);
151
152
153static struct dst_ops ipv4_dst_ops = {
154 .family = AF_INET,
155 .protocol = __constant_htons(ETH_P_IP),
156 .gc = rt_garbage_collect,
157 .check = ipv4_dst_check,
158 .destroy = ipv4_dst_destroy,
159 .ifdown = ipv4_dst_ifdown,
160 .negative_advice = ipv4_negative_advice,
161 .link_failure = ipv4_link_failure,
162 .update_pmtu = ip_rt_update_pmtu,
163 .entry_size = sizeof(struct rtable),
164};
165
166#define ECN_OR_COST(class) TC_PRIO_##class
167
168__u8 ip_tos2prio[16] = {
169 TC_PRIO_BESTEFFORT,
170 ECN_OR_COST(FILLER),
171 TC_PRIO_BESTEFFORT,
172 ECN_OR_COST(BESTEFFORT),
173 TC_PRIO_BULK,
174 ECN_OR_COST(BULK),
175 TC_PRIO_BULK,
176 ECN_OR_COST(BULK),
177 TC_PRIO_INTERACTIVE,
178 ECN_OR_COST(INTERACTIVE),
179 TC_PRIO_INTERACTIVE,
180 ECN_OR_COST(INTERACTIVE),
181 TC_PRIO_INTERACTIVE_BULK,
182 ECN_OR_COST(INTERACTIVE_BULK),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK)
185};
186
187
188/*
189 * Route cache.
190 */
191
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
201
202struct rt_hash_bucket {
203 struct rtable *chain;
204 spinlock_t lock;
205} __attribute__((__aligned__(8)));
206
/* The route cache hash table and its geometry. */
static struct rt_hash_bucket	*rt_hash_table;
static unsigned			rt_hash_mask;	/* ANDed with hashes to pick a bucket */
static int			rt_hash_log;
static unsigned int		rt_hash_rnd;	/* random seed mixed into rt_hash_code() */

struct rt_cache_stat *rt_cache_stat;

static int rt_intern_hash(unsigned hash, struct rtable *rth,
			  struct rtable **res);
216
217static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
218{
219 return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
220 & rt_hash_mask);
221}
222
223#ifdef CONFIG_PROC_FS
/* Per-open iterator state of the route cache seq_file (seq->private). */
struct rt_cache_iter_state {
	int bucket;	/* hash bucket currently being walked */
};
227
228static struct rtable *rt_cache_get_first(struct seq_file *seq)
229{
230 struct rtable *r = NULL;
231 struct rt_cache_iter_state *st = seq->private;
232
233 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
234 rcu_read_lock_bh();
235 r = rt_hash_table[st->bucket].chain;
236 if (r)
237 break;
238 rcu_read_unlock_bh();
239 }
240 return r;
241}
242
243static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
244{
245 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
246
247 r = r->u.rt_next;
248 while (!r) {
249 rcu_read_unlock_bh();
250 if (--st->bucket < 0)
251 break;
252 rcu_read_lock_bh();
253 r = rt_hash_table[st->bucket].chain;
254 }
255 return r;
256}
257
258static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
259{
260 struct rtable *r = rt_cache_get_first(seq);
261
262 if (r)
263 while (pos && (r = rt_cache_get_next(seq, r)))
264 --pos;
265 return pos ? NULL : r;
266}
267
268static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
269{
270 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
271}
272
273static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
274{
275 struct rtable *r = NULL;
276
277 if (v == SEQ_START_TOKEN)
278 r = rt_cache_get_first(seq);
279 else
280 r = rt_cache_get_next(seq, v);
281 ++*pos;
282 return r;
283}
284
285static void rt_cache_seq_stop(struct seq_file *seq, void *v)
286{
287 if (v && v != SEQ_START_TOKEN)
288 rcu_read_unlock_bh();
289}
290
291static int rt_cache_seq_show(struct seq_file *seq, void *v)
292{
293 if (v == SEQ_START_TOKEN)
294 seq_printf(seq, "%-127s\n",
295 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
296 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
297 "HHUptod\tSpecDst");
298 else {
299 struct rtable *r = v;
300 char temp[256];
301
302 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
303 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
304 r->u.dst.dev ? r->u.dst.dev->name : "*",
305 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
306 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
307 r->u.dst.__use, 0, (unsigned long)r->rt_src,
308 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
309 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
310 dst_metric(&r->u.dst, RTAX_WINDOW),
311 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
312 dst_metric(&r->u.dst, RTAX_RTTVAR)),
313 r->fl.fl4_tos,
314 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
315 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
316 dev_queue_xmit) : 0,
317 r->rt_spec_dst);
318 seq_printf(seq, "%-127s\n", temp);
319 }
320 return 0;
321}
322
323static struct seq_operations rt_cache_seq_ops = {
324 .start = rt_cache_seq_start,
325 .next = rt_cache_seq_next,
326 .stop = rt_cache_seq_stop,
327 .show = rt_cache_seq_show,
328};
329
330static int rt_cache_seq_open(struct inode *inode, struct file *file)
331{
332 struct seq_file *seq;
333 int rc = -ENOMEM;
334 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
335
336 if (!s)
337 goto out;
338 rc = seq_open(file, &rt_cache_seq_ops);
339 if (rc)
340 goto out_kfree;
341 seq = file->private_data;
342 seq->private = s;
343 memset(s, 0, sizeof(*s));
344out:
345 return rc;
346out_kfree:
347 kfree(s);
348 goto out;
349}
350
351static struct file_operations rt_cache_seq_fops = {
352 .owner = THIS_MODULE,
353 .open = rt_cache_seq_open,
354 .read = seq_read,
355 .llseek = seq_lseek,
356 .release = seq_release_private,
357};
358
359
360static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
361{
362 int cpu;
363
364 if (*pos == 0)
365 return SEQ_START_TOKEN;
366
367 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
368 if (!cpu_possible(cpu))
369 continue;
370 *pos = cpu+1;
371 return per_cpu_ptr(rt_cache_stat, cpu);
372 }
373 return NULL;
374}
375
376static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
377{
378 int cpu;
379
380 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
381 if (!cpu_possible(cpu))
382 continue;
383 *pos = cpu+1;
384 return per_cpu_ptr(rt_cache_stat, cpu);
385 }
386 return NULL;
387
388}
389
/* seq_file .stop for the per-CPU statistics: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
394
395static int rt_cpu_seq_show(struct seq_file *seq, void *v)
396{
397 struct rt_cache_stat *st = v;
398
399 if (v == SEQ_START_TOKEN) {
Olaf Rempel5bec0032005-04-28 12:16:08 -0700400 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700401 return 0;
402 }
403
404 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
405 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
406 atomic_read(&ipv4_dst_ops.entries),
407 st->in_hit,
408 st->in_slow_tot,
409 st->in_slow_mc,
410 st->in_no_route,
411 st->in_brd,
412 st->in_martian_dst,
413 st->in_martian_src,
414
415 st->out_hit,
416 st->out_slow_tot,
417 st->out_slow_mc,
418
419 st->gc_total,
420 st->gc_ignored,
421 st->gc_goal_miss,
422 st->gc_dst_overflow,
423 st->in_hlist_search,
424 st->out_hlist_search
425 );
426 return 0;
427}
428
429static struct seq_operations rt_cpu_seq_ops = {
430 .start = rt_cpu_seq_start,
431 .next = rt_cpu_seq_next,
432 .stop = rt_cpu_seq_stop,
433 .show = rt_cpu_seq_show,
434};
435
436
437static int rt_cpu_seq_open(struct inode *inode, struct file *file)
438{
439 return seq_open(file, &rt_cpu_seq_ops);
440}
441
442static struct file_operations rt_cpu_seq_fops = {
443 .owner = THIS_MODULE,
444 .open = rt_cpu_seq_open,
445 .read = seq_read,
446 .llseek = seq_lseek,
447 .release = seq_release,
448};
449
450#endif /* CONFIG_PROC_FS */
451
452static __inline__ void rt_free(struct rtable *rt)
453{
454 multipath_remove(rt);
455 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
456}
457
458static __inline__ void rt_drop(struct rtable *rt)
459{
460 multipath_remove(rt);
461 ip_rt_put(rt);
462 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
463}
464
465static __inline__ int rt_fast_clean(struct rtable *rth)
466{
467 /* Kill broadcast/multicast entries very aggresively, if they
468 collide in hash table with more useful entries */
469 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
470 rth->fl.iif && rth->u.rt_next;
471}
472
473static __inline__ int rt_valuable(struct rtable *rth)
474{
475 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
476 rth->u.dst.expires;
477}
478
479static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
480{
481 unsigned long age;
482 int ret = 0;
483
484 if (atomic_read(&rth->u.dst.__refcnt))
485 goto out;
486
487 ret = 1;
488 if (rth->u.dst.expires &&
489 time_after_eq(jiffies, rth->u.dst.expires))
490 goto out;
491
492 age = jiffies - rth->u.dst.lastuse;
493 ret = 0;
494 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
495 (age <= tmo2 && rt_valuable(rth)))
496 goto out;
497 ret = 1;
498out: return ret;
499}
500
501/* Bits of score are:
502 * 31: very valuable
503 * 30: not quite useless
504 * 29..0: usage counter
505 */
506static inline u32 rt_score(struct rtable *rt)
507{
508 u32 score = jiffies - rt->u.dst.lastuse;
509
510 score = ~score & ~(3<<30);
511
512 if (rt_valuable(rt))
513 score |= (1<<31);
514
515 if (!rt->fl.iif ||
516 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
517 score |= (1<<30);
518
519 return score;
520}
521
522static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
523{
524 return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
525 fl1->oif == fl2->oif &&
526 fl1->iif == fl2->iif;
527}
528
529#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
530static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
531 struct rtable *expentry,
532 int *removed_count)
533{
534 int passedexpired = 0;
535 struct rtable **nextstep = NULL;
536 struct rtable **rthp = chain_head;
537 struct rtable *rth;
538
539 if (removed_count)
540 *removed_count = 0;
541
542 while ((rth = *rthp) != NULL) {
543 if (rth == expentry)
544 passedexpired = 1;
545
546 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
547 compare_keys(&(*rthp)->fl, &expentry->fl)) {
548 if (*rthp == expentry) {
549 *rthp = rth->u.rt_next;
550 continue;
551 } else {
552 *rthp = rth->u.rt_next;
553 rt_free(rth);
554 if (removed_count)
555 ++(*removed_count);
556 }
557 } else {
558 if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
559 passedexpired && !nextstep)
560 nextstep = &rth->u.rt_next;
561
562 rthp = &rth->u.rt_next;
563 }
564 }
565
566 rt_free(expentry);
567 if (removed_count)
568 ++(*removed_count);
569
570 return nextstep;
571}
572#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
573
574
575/* This runs via a timer and thus is always in BH context. */
576static void rt_check_expire(unsigned long dummy)
577{
578 static int rover;
579 int i = rover, t;
580 struct rtable *rth, **rthp;
581 unsigned long now = jiffies;
582
583 for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
584 t -= ip_rt_gc_timeout) {
585 unsigned long tmo = ip_rt_gc_timeout;
586
587 i = (i + 1) & rt_hash_mask;
588 rthp = &rt_hash_table[i].chain;
589
590 spin_lock(&rt_hash_table[i].lock);
591 while ((rth = *rthp) != NULL) {
592 if (rth->u.dst.expires) {
593 /* Entry is expired even if it is in use */
594 if (time_before_eq(now, rth->u.dst.expires)) {
595 tmo >>= 1;
596 rthp = &rth->u.rt_next;
597 continue;
598 }
599 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
600 tmo >>= 1;
601 rthp = &rth->u.rt_next;
602 continue;
603 }
604
605 /* Cleanup aged off entries. */
606#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
607 /* remove all related balanced entries if necessary */
608 if (rth->u.dst.flags & DST_BALANCED) {
609 rthp = rt_remove_balanced_route(
610 &rt_hash_table[i].chain,
611 rth, NULL);
612 if (!rthp)
613 break;
614 } else {
615 *rthp = rth->u.rt_next;
616 rt_free(rth);
617 }
618#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
619 *rthp = rth->u.rt_next;
620 rt_free(rth);
621#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
622 }
623 spin_unlock(&rt_hash_table[i].lock);
624
625 /* Fallback loop breaker. */
626 if (time_after(jiffies, now))
627 break;
628 }
629 rover = i;
630 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
631}
632
633/* This can run from both BH and non-BH contexts, the latter
634 * in the case of a forced flush event.
635 */
636static void rt_run_flush(unsigned long dummy)
637{
638 int i;
639 struct rtable *rth, *next;
640
641 rt_deadline = 0;
642
643 get_random_bytes(&rt_hash_rnd, 4);
644
645 for (i = rt_hash_mask; i >= 0; i--) {
646 spin_lock_bh(&rt_hash_table[i].lock);
647 rth = rt_hash_table[i].chain;
648 if (rth)
649 rt_hash_table[i].chain = NULL;
650 spin_unlock_bh(&rt_hash_table[i].lock);
651
652 for (; rth; rth = next) {
653 next = rth->u.rt_next;
654 rt_free(rth);
655 }
656 }
657}
658
659static DEFINE_SPINLOCK(rt_flush_lock);
660
661void rt_cache_flush(int delay)
662{
663 unsigned long now = jiffies;
664 int user_mode = !in_softirq();
665
666 if (delay < 0)
667 delay = ip_rt_min_delay;
668
669 /* flush existing multipath state*/
670 multipath_flush();
671
672 spin_lock_bh(&rt_flush_lock);
673
674 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
675 long tmo = (long)(rt_deadline - now);
676
677 /* If flush timer is already running
678 and flush request is not immediate (delay > 0):
679
680 if deadline is not achieved, prolongate timer to "delay",
681 otherwise fire it at deadline time.
682 */
683
684 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
685 tmo = 0;
686
687 if (delay > tmo)
688 delay = tmo;
689 }
690
691 if (delay <= 0) {
692 spin_unlock_bh(&rt_flush_lock);
693 rt_run_flush(0);
694 return;
695 }
696
697 if (rt_deadline == 0)
698 rt_deadline = now + ip_rt_max_delay;
699
700 mod_timer(&rt_flush_timer, now+delay);
701 spin_unlock_bh(&rt_flush_lock);
702}
703
704static void rt_secret_rebuild(unsigned long dummy)
705{
706 unsigned long now = jiffies;
707
708 rt_cache_flush(0);
709 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
710}
711
/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle "expire" is large enough to keep enough warm entries,
   and when load increases it is reduced to limit the cache size.
 */
724
725static int rt_garbage_collect(void)
726{
727 static unsigned long expire = RT_GC_TIMEOUT;
728 static unsigned long last_gc;
729 static int rover;
730 static int equilibrium;
731 struct rtable *rth, **rthp;
732 unsigned long now = jiffies;
733 int goal;
734
735 /*
736 * Garbage collection is pretty expensive,
737 * do not make it too frequently.
738 */
739
740 RT_CACHE_STAT_INC(gc_total);
741
742 if (now - last_gc < ip_rt_gc_min_interval &&
743 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
744 RT_CACHE_STAT_INC(gc_ignored);
745 goto out;
746 }
747
748 /* Calculate number of entries, which we want to expire now. */
749 goal = atomic_read(&ipv4_dst_ops.entries) -
750 (ip_rt_gc_elasticity << rt_hash_log);
751 if (goal <= 0) {
752 if (equilibrium < ipv4_dst_ops.gc_thresh)
753 equilibrium = ipv4_dst_ops.gc_thresh;
754 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
755 if (goal > 0) {
756 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
757 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
758 }
759 } else {
760 /* We are in dangerous area. Try to reduce cache really
761 * aggressively.
762 */
763 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
764 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
765 }
766
767 if (now - last_gc >= ip_rt_gc_min_interval)
768 last_gc = now;
769
770 if (goal <= 0) {
771 equilibrium += goal;
772 goto work_done;
773 }
774
775 do {
776 int i, k;
777
778 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
779 unsigned long tmo = expire;
780
781 k = (k + 1) & rt_hash_mask;
782 rthp = &rt_hash_table[k].chain;
783 spin_lock_bh(&rt_hash_table[k].lock);
784 while ((rth = *rthp) != NULL) {
785 if (!rt_may_expire(rth, tmo, expire)) {
786 tmo >>= 1;
787 rthp = &rth->u.rt_next;
788 continue;
789 }
790#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
791 /* remove all related balanced entries
792 * if necessary
793 */
794 if (rth->u.dst.flags & DST_BALANCED) {
795 int r;
796
797 rthp = rt_remove_balanced_route(
798 &rt_hash_table[i].chain,
799 rth,
800 &r);
801 goal -= r;
802 if (!rthp)
803 break;
804 } else {
805 *rthp = rth->u.rt_next;
806 rt_free(rth);
807 goal--;
808 }
809#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
810 *rthp = rth->u.rt_next;
811 rt_free(rth);
812 goal--;
813#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
814 }
815 spin_unlock_bh(&rt_hash_table[k].lock);
816 if (goal <= 0)
817 break;
818 }
819 rover = k;
820
821 if (goal <= 0)
822 goto work_done;
823
824 /* Goal is not achieved. We stop process if:
825
826 - if expire reduced to zero. Otherwise, expire is halfed.
827 - if table is not full.
828 - if we are called from interrupt.
829 - jiffies check is just fallback/debug loop breaker.
830 We will not spin here for long time in any case.
831 */
832
833 RT_CACHE_STAT_INC(gc_goal_miss);
834
835 if (expire == 0)
836 break;
837
838 expire >>= 1;
839#if RT_CACHE_DEBUG >= 2
840 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
841 atomic_read(&ipv4_dst_ops.entries), goal, i);
842#endif
843
844 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
845 goto out;
846 } while (!in_softirq() && time_before_eq(jiffies, now));
847
848 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
849 goto out;
850 if (net_ratelimit())
851 printk(KERN_WARNING "dst cache overflow\n");
852 RT_CACHE_STAT_INC(gc_dst_overflow);
853 return 1;
854
855work_done:
856 expire += ip_rt_gc_min_interval;
857 if (expire > ip_rt_gc_timeout ||
858 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
859 expire = ip_rt_gc_timeout;
860#if RT_CACHE_DEBUG >= 2
861 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
862 atomic_read(&ipv4_dst_ops.entries), goal, rover);
863#endif
864out: return 0;
865}
866
867static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
868{
869 struct rtable *rth, **rthp;
870 unsigned long now;
871 struct rtable *cand, **candp;
872 u32 min_score;
873 int chain_length;
874 int attempts = !in_softirq();
875
876restart:
877 chain_length = 0;
878 min_score = ~(u32)0;
879 cand = NULL;
880 candp = NULL;
881 now = jiffies;
882
883 rthp = &rt_hash_table[hash].chain;
884
885 spin_lock_bh(&rt_hash_table[hash].lock);
886 while ((rth = *rthp) != NULL) {
887#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
888 if (!(rth->u.dst.flags & DST_BALANCED) &&
889 compare_keys(&rth->fl, &rt->fl)) {
890#else
891 if (compare_keys(&rth->fl, &rt->fl)) {
892#endif
893 /* Put it first */
894 *rthp = rth->u.rt_next;
895 /*
896 * Since lookup is lockfree, the deletion
897 * must be visible to another weakly ordered CPU before
898 * the insertion at the start of the hash chain.
899 */
900 rcu_assign_pointer(rth->u.rt_next,
901 rt_hash_table[hash].chain);
902 /*
903 * Since lookup is lockfree, the update writes
904 * must be ordered for consistency on SMP.
905 */
906 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
907
908 rth->u.dst.__use++;
909 dst_hold(&rth->u.dst);
910 rth->u.dst.lastuse = now;
911 spin_unlock_bh(&rt_hash_table[hash].lock);
912
913 rt_drop(rt);
914 *rp = rth;
915 return 0;
916 }
917
918 if (!atomic_read(&rth->u.dst.__refcnt)) {
919 u32 score = rt_score(rth);
920
921 if (score <= min_score) {
922 cand = rth;
923 candp = rthp;
924 min_score = score;
925 }
926 }
927
928 chain_length++;
929
930 rthp = &rth->u.rt_next;
931 }
932
933 if (cand) {
934 /* ip_rt_gc_elasticity used to be average length of chain
935 * length, when exceeded gc becomes really aggressive.
936 *
937 * The second limit is less certain. At the moment it allows
938 * only 2 entries per bucket. We will see.
939 */
940 if (chain_length > ip_rt_gc_elasticity) {
941 *candp = cand->u.rt_next;
942 rt_free(cand);
943 }
944 }
945
946 /* Try to bind route to arp only if it is output
947 route or unicast forwarding path.
948 */
949 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
950 int err = arp_bind_neighbour(&rt->u.dst);
951 if (err) {
952 spin_unlock_bh(&rt_hash_table[hash].lock);
953
954 if (err != -ENOBUFS) {
955 rt_drop(rt);
956 return err;
957 }
958
959 /* Neighbour tables are full and nothing
960 can be released. Try to shrink route cache,
961 it is most likely it holds some neighbour records.
962 */
963 if (attempts-- > 0) {
964 int saved_elasticity = ip_rt_gc_elasticity;
965 int saved_int = ip_rt_gc_min_interval;
966 ip_rt_gc_elasticity = 1;
967 ip_rt_gc_min_interval = 0;
968 rt_garbage_collect();
969 ip_rt_gc_min_interval = saved_int;
970 ip_rt_gc_elasticity = saved_elasticity;
971 goto restart;
972 }
973
974 if (net_ratelimit())
975 printk(KERN_WARNING "Neighbour table overflow.\n");
976 rt_drop(rt);
977 return -ENOBUFS;
978 }
979 }
980
981 rt->u.rt_next = rt_hash_table[hash].chain;
982#if RT_CACHE_DEBUG >= 2
983 if (rt->u.rt_next) {
984 struct rtable *trt;
985 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
986 NIPQUAD(rt->rt_dst));
987 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
988 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
989 printk("\n");
990 }
991#endif
992 rt_hash_table[hash].chain = rt;
993 spin_unlock_bh(&rt_hash_table[hash].lock);
994 *rp = rt;
995 return 0;
996}
997
998void rt_bind_peer(struct rtable *rt, int create)
999{
1000 static DEFINE_SPINLOCK(rt_peer_lock);
1001 struct inet_peer *peer;
1002
1003 peer = inet_getpeer(rt->rt_dst, create);
1004
1005 spin_lock_bh(&rt_peer_lock);
1006 if (rt->peer == NULL) {
1007 rt->peer = peer;
1008 peer = NULL;
1009 }
1010 spin_unlock_bh(&rt_peer_lock);
1011 if (peer)
1012 inet_putpeer(peer);
1013}
1014
1015/*
1016 * Peer allocation may fail only in serious out-of-memory conditions. However
1017 * we still can generate some output.
1018 * Random ID selection looks a bit dangerous because we have no chances to
1019 * select ID being unique in a reasonable period of time.
1020 * But broken packet identifier may be better than no packet at all.
1021 */
1022static void ip_select_fb_ident(struct iphdr *iph)
1023{
1024 static DEFINE_SPINLOCK(ip_fb_id_lock);
1025 static u32 ip_fallback_id;
1026 u32 salt;
1027
1028 spin_lock_bh(&ip_fb_id_lock);
1029 salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1030 iph->id = htons(salt & 0xFFFF);
1031 ip_fallback_id = salt;
1032 spin_unlock_bh(&ip_fb_id_lock);
1033}
1034
1035void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1036{
1037 struct rtable *rt = (struct rtable *) dst;
1038
1039 if (rt) {
1040 if (rt->peer == NULL)
1041 rt_bind_peer(rt, 1);
1042
1043 /* If peer is attached to destination, it is never detached,
1044 so that we need not to grab a lock to dereference it.
1045 */
1046 if (rt->peer) {
1047 iph->id = htons(inet_getid(rt->peer, more));
1048 return;
1049 }
1050 } else
Stephen Hemminger9c2b3322005-04-19 22:39:42 -07001051 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1052 __builtin_return_address(0));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001053
1054 ip_select_fb_ident(iph);
1055}
1056
1057static void rt_del(unsigned hash, struct rtable *rt)
1058{
1059 struct rtable **rthp;
1060
1061 spin_lock_bh(&rt_hash_table[hash].lock);
1062 ip_rt_put(rt);
1063 for (rthp = &rt_hash_table[hash].chain; *rthp;
1064 rthp = &(*rthp)->u.rt_next)
1065 if (*rthp == rt) {
1066 *rthp = rt->u.rt_next;
1067 rt_free(rt);
1068 break;
1069 }
1070 spin_unlock_bh(&rt_hash_table[hash].lock);
1071}
1072
1073void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1074 u32 saddr, u8 tos, struct net_device *dev)
1075{
1076 int i, k;
1077 struct in_device *in_dev = in_dev_get(dev);
1078 struct rtable *rth, **rthp;
1079 u32 skeys[2] = { saddr, 0 };
1080 int ikeys[2] = { dev->ifindex, 0 };
1081
1082 tos &= IPTOS_RT_MASK;
1083
1084 if (!in_dev)
1085 return;
1086
1087 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1088 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1089 goto reject_redirect;
1090
1091 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1092 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1093 goto reject_redirect;
1094 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1095 goto reject_redirect;
1096 } else {
1097 if (inet_addr_type(new_gw) != RTN_UNICAST)
1098 goto reject_redirect;
1099 }
1100
1101 for (i = 0; i < 2; i++) {
1102 for (k = 0; k < 2; k++) {
1103 unsigned hash = rt_hash_code(daddr,
1104 skeys[i] ^ (ikeys[k] << 5),
1105 tos);
1106
1107 rthp=&rt_hash_table[hash].chain;
1108
1109 rcu_read_lock();
1110 while ((rth = rcu_dereference(*rthp)) != NULL) {
1111 struct rtable *rt;
1112
1113 if (rth->fl.fl4_dst != daddr ||
1114 rth->fl.fl4_src != skeys[i] ||
1115 rth->fl.fl4_tos != tos ||
1116 rth->fl.oif != ikeys[k] ||
1117 rth->fl.iif != 0) {
1118 rthp = &rth->u.rt_next;
1119 continue;
1120 }
1121
1122 if (rth->rt_dst != daddr ||
1123 rth->rt_src != saddr ||
1124 rth->u.dst.error ||
1125 rth->rt_gateway != old_gw ||
1126 rth->u.dst.dev != dev)
1127 break;
1128
1129 dst_hold(&rth->u.dst);
1130 rcu_read_unlock();
1131
1132 rt = dst_alloc(&ipv4_dst_ops);
1133 if (rt == NULL) {
1134 ip_rt_put(rth);
1135 in_dev_put(in_dev);
1136 return;
1137 }
1138
1139 /* Copy all the information. */
1140 *rt = *rth;
1141 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1142 rt->u.dst.__use = 1;
1143 atomic_set(&rt->u.dst.__refcnt, 1);
1144 rt->u.dst.child = NULL;
1145 if (rt->u.dst.dev)
1146 dev_hold(rt->u.dst.dev);
1147 if (rt->idev)
1148 in_dev_hold(rt->idev);
1149 rt->u.dst.obsolete = 0;
1150 rt->u.dst.lastuse = jiffies;
1151 rt->u.dst.path = &rt->u.dst;
1152 rt->u.dst.neighbour = NULL;
1153 rt->u.dst.hh = NULL;
1154 rt->u.dst.xfrm = NULL;
1155
1156 rt->rt_flags |= RTCF_REDIRECTED;
1157
1158 /* Gateway is different ... */
1159 rt->rt_gateway = new_gw;
1160
1161 /* Redirect received -> path was valid */
1162 dst_confirm(&rth->u.dst);
1163
1164 if (rt->peer)
1165 atomic_inc(&rt->peer->refcnt);
1166
1167 if (arp_bind_neighbour(&rt->u.dst) ||
1168 !(rt->u.dst.neighbour->nud_state &
1169 NUD_VALID)) {
1170 if (rt->u.dst.neighbour)
1171 neigh_event_send(rt->u.dst.neighbour, NULL);
1172 ip_rt_put(rth);
1173 rt_drop(rt);
1174 goto do_next;
1175 }
1176
1177 rt_del(hash, rth);
1178 if (!rt_intern_hash(hash, rt, &rt))
1179 ip_rt_put(rt);
1180 goto do_next;
1181 }
1182 rcu_read_unlock();
1183 do_next:
1184 ;
1185 }
1186 }
1187 in_dev_put(in_dev);
1188 return;
1189
1190reject_redirect:
1191#ifdef CONFIG_IP_ROUTE_VERBOSE
1192 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1193 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1194 "%u.%u.%u.%u ignored.\n"
1195 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
1196 "tos %02x\n",
1197 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1198 NIPQUAD(saddr), NIPQUAD(daddr), tos);
1199#endif
1200 in_dev_put(in_dev);
1201}
1202
1203static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1204{
1205 struct rtable *rt = (struct rtable*)dst;
1206 struct dst_entry *ret = dst;
1207
1208 if (rt) {
1209 if (dst->obsolete) {
1210 ip_rt_put(rt);
1211 ret = NULL;
1212 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1213 rt->u.dst.expires) {
1214 unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1215 rt->fl.fl4_src ^
1216 (rt->fl.oif << 5),
1217 rt->fl.fl4_tos);
1218#if RT_CACHE_DEBUG >= 1
1219 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1220 "%u.%u.%u.%u/%02x dropped\n",
1221 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1222#endif
1223 rt_del(hash, rt);
1224 ret = NULL;
1225 }
1226 }
1227 return ret;
1228}
1229
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
1245
1246void ip_rt_send_redirect(struct sk_buff *skb)
1247{
1248 struct rtable *rt = (struct rtable*)skb->dst;
1249 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1250
1251 if (!in_dev)
1252 return;
1253
1254 if (!IN_DEV_TX_REDIRECTS(in_dev))
1255 goto out;
1256
1257 /* No redirected packets during ip_rt_redirect_silence;
1258 * reset the algorithm.
1259 */
1260 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1261 rt->u.dst.rate_tokens = 0;
1262
1263 /* Too many ignored redirects; do not send anything
1264 * set u.dst.rate_last to the last seen redirected packet.
1265 */
1266 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1267 rt->u.dst.rate_last = jiffies;
1268 goto out;
1269 }
1270
1271 /* Check for load limit; set rate_last to the latest sent
1272 * redirect.
1273 */
1274 if (time_after(jiffies,
1275 (rt->u.dst.rate_last +
1276 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1277 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1278 rt->u.dst.rate_last = jiffies;
1279 ++rt->u.dst.rate_tokens;
1280#ifdef CONFIG_IP_ROUTE_VERBOSE
1281 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1282 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1283 net_ratelimit())
1284 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1285 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1286 NIPQUAD(rt->rt_src), rt->rt_iif,
1287 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1288#endif
1289 }
1290out:
1291 in_dev_put(in_dev);
1292}
1293
1294static int ip_error(struct sk_buff *skb)
1295{
1296 struct rtable *rt = (struct rtable*)skb->dst;
1297 unsigned long now;
1298 int code;
1299
1300 switch (rt->u.dst.error) {
1301 case EINVAL:
1302 default:
1303 goto out;
1304 case EHOSTUNREACH:
1305 code = ICMP_HOST_UNREACH;
1306 break;
1307 case ENETUNREACH:
1308 code = ICMP_NET_UNREACH;
1309 break;
1310 case EACCES:
1311 code = ICMP_PKT_FILTERED;
1312 break;
1313 }
1314
1315 now = jiffies;
1316 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1317 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1318 rt->u.dst.rate_tokens = ip_rt_error_burst;
1319 rt->u.dst.rate_last = now;
1320 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1321 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1322 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1323 }
1324
1325out: kfree_skb(skb);
1326 return 0;
1327}
1328
/*
 *	MTU plateau table, in strictly descending order; guess_mtu() returns
 *	the first entry that is smaller than the MTU it is given.
 *
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 *
 *	The table is only ever read, so it is const.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1336
1337static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1338{
1339 int i;
1340
1341 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1342 if (old_mtu > mtu_plateau[i])
1343 return mtu_plateau[i];
1344 return 68;
1345}
1346
1347unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1348{
1349 int i;
1350 unsigned short old_mtu = ntohs(iph->tot_len);
1351 struct rtable *rth;
1352 u32 skeys[2] = { iph->saddr, 0, };
1353 u32 daddr = iph->daddr;
1354 u8 tos = iph->tos & IPTOS_RT_MASK;
1355 unsigned short est_mtu = 0;
1356
1357 if (ipv4_config.no_pmtu_disc)
1358 return 0;
1359
1360 for (i = 0; i < 2; i++) {
1361 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1362
1363 rcu_read_lock();
1364 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1365 rth = rcu_dereference(rth->u.rt_next)) {
1366 if (rth->fl.fl4_dst == daddr &&
1367 rth->fl.fl4_src == skeys[i] &&
1368 rth->rt_dst == daddr &&
1369 rth->rt_src == iph->saddr &&
1370 rth->fl.fl4_tos == tos &&
1371 rth->fl.iif == 0 &&
1372 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1373 unsigned short mtu = new_mtu;
1374
1375 if (new_mtu < 68 || new_mtu >= old_mtu) {
1376
1377 /* BSD 4.2 compatibility hack :-( */
1378 if (mtu == 0 &&
1379 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1380 old_mtu >= 68 + (iph->ihl << 2))
1381 old_mtu -= iph->ihl << 2;
1382
1383 mtu = guess_mtu(old_mtu);
1384 }
1385 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1386 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1387 dst_confirm(&rth->u.dst);
1388 if (mtu < ip_rt_min_pmtu) {
1389 mtu = ip_rt_min_pmtu;
1390 rth->u.dst.metrics[RTAX_LOCK-1] |=
1391 (1 << RTAX_MTU);
1392 }
1393 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1394 dst_set_expires(&rth->u.dst,
1395 ip_rt_mtu_expires);
1396 }
1397 est_mtu = mtu;
1398 }
1399 }
1400 }
1401 rcu_read_unlock();
1402 }
1403 return est_mtu ? : new_mtu;
1404}
1405
1406static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1407{
1408 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1409 !(dst_metric_locked(dst, RTAX_MTU))) {
1410 if (mtu < ip_rt_min_pmtu) {
1411 mtu = ip_rt_min_pmtu;
1412 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1413 }
1414 dst->metrics[RTAX_MTU-1] = mtu;
1415 dst_set_expires(dst, ip_rt_mtu_expires);
1416 }
1417}
1418
1419static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1420{
1421 return NULL;
1422}
1423
1424static void ipv4_dst_destroy(struct dst_entry *dst)
1425{
1426 struct rtable *rt = (struct rtable *) dst;
1427 struct inet_peer *peer = rt->peer;
1428 struct in_device *idev = rt->idev;
1429
1430 if (peer) {
1431 rt->peer = NULL;
1432 inet_putpeer(peer);
1433 }
1434
1435 if (idev) {
1436 rt->idev = NULL;
1437 in_dev_put(idev);
1438 }
1439}
1440
1441static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1442 int how)
1443{
1444 struct rtable *rt = (struct rtable *) dst;
1445 struct in_device *idev = rt->idev;
1446 if (dev != &loopback_dev && idev && idev->dev == dev) {
1447 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1448 if (loopback_idev) {
1449 rt->idev = loopback_idev;
1450 in_dev_put(idev);
1451 }
1452 }
1453}
1454
1455static void ipv4_link_failure(struct sk_buff *skb)
1456{
1457 struct rtable *rt;
1458
1459 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1460
1461 rt = (struct rtable *) skb->dst;
1462 if (rt)
1463 dst_set_expires(&rt->u.dst, 0);
1464}
1465
1466static int ip_rt_bug(struct sk_buff *skb)
1467{
1468 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1469 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1470 skb->dev ? skb->dev->name : "?");
1471 kfree_skb(skb);
1472 return 0;
1473}
1474
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is off the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1483
1484void ip_rt_get_source(u8 *addr, struct rtable *rt)
1485{
1486 u32 src;
1487 struct fib_result res;
1488
1489 if (rt->fl.iif == 0)
1490 src = rt->rt_src;
1491 else if (fib_lookup(&rt->fl, &res) == 0) {
1492 src = FIB_RES_PREFSRC(res);
1493 fib_res_put(&res);
1494 } else
1495 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1496 RT_SCOPE_UNIVERSE);
1497 memcpy(addr, &src, 4);
1498}
1499
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Fill in the halves of the entry's tclassid that are still zero from
 * @tag.  The low and the high 16 bits are handled independently; a half
 * that is already non-zero is left untouched.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if ((rt->u.dst.tclassid & 0xFFFF) == 0)
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if ((rt->u.dst.tclassid & 0xFFFF0000) == 0)
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1509
1510static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1511{
1512 struct fib_info *fi = res->fi;
1513
1514 if (fi) {
1515 if (FIB_RES_GW(*res) &&
1516 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1517 rt->rt_gateway = FIB_RES_GW(*res);
1518 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1519 sizeof(rt->u.dst.metrics));
1520 if (fi->fib_mtu == 0) {
1521 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1522 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1523 rt->rt_gateway != rt->rt_dst &&
1524 rt->u.dst.dev->mtu > 576)
1525 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1526 }
1527#ifdef CONFIG_NET_CLS_ROUTE
1528 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1529#endif
1530 } else
1531 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1532
1533 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1534 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1535 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1536 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1537 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1538 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1539 ip_rt_min_advmss);
1540 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1541 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1542
1543#ifdef CONFIG_NET_CLS_ROUTE
1544#ifdef CONFIG_IP_MULTIPLE_TABLES
1545 set_class_tag(rt, fib_rules_tclass(res));
1546#endif
1547 set_class_tag(rt, itag);
1548#endif
1549 rt->rt_type = res->type;
1550}
1551
1552static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1553 u8 tos, struct net_device *dev, int our)
1554{
1555 unsigned hash;
1556 struct rtable *rth;
1557 u32 spec_dst;
1558 struct in_device *in_dev = in_dev_get(dev);
1559 u32 itag = 0;
1560
1561 /* Primary sanity checks. */
1562
1563 if (in_dev == NULL)
1564 return -EINVAL;
1565
1566 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1567 skb->protocol != htons(ETH_P_IP))
1568 goto e_inval;
1569
1570 if (ZERONET(saddr)) {
1571 if (!LOCAL_MCAST(daddr))
1572 goto e_inval;
1573 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1574 } else if (fib_validate_source(saddr, 0, tos, 0,
1575 dev, &spec_dst, &itag) < 0)
1576 goto e_inval;
1577
1578 rth = dst_alloc(&ipv4_dst_ops);
1579 if (!rth)
1580 goto e_nobufs;
1581
1582 rth->u.dst.output= ip_rt_bug;
1583
1584 atomic_set(&rth->u.dst.__refcnt, 1);
1585 rth->u.dst.flags= DST_HOST;
1586 if (in_dev->cnf.no_policy)
1587 rth->u.dst.flags |= DST_NOPOLICY;
1588 rth->fl.fl4_dst = daddr;
1589 rth->rt_dst = daddr;
1590 rth->fl.fl4_tos = tos;
1591#ifdef CONFIG_IP_ROUTE_FWMARK
1592 rth->fl.fl4_fwmark= skb->nfmark;
1593#endif
1594 rth->fl.fl4_src = saddr;
1595 rth->rt_src = saddr;
1596#ifdef CONFIG_NET_CLS_ROUTE
1597 rth->u.dst.tclassid = itag;
1598#endif
1599 rth->rt_iif =
1600 rth->fl.iif = dev->ifindex;
1601 rth->u.dst.dev = &loopback_dev;
1602 dev_hold(rth->u.dst.dev);
1603 rth->idev = in_dev_get(rth->u.dst.dev);
1604 rth->fl.oif = 0;
1605 rth->rt_gateway = daddr;
1606 rth->rt_spec_dst= spec_dst;
1607 rth->rt_type = RTN_MULTICAST;
1608 rth->rt_flags = RTCF_MULTICAST;
1609 if (our) {
1610 rth->u.dst.input= ip_local_deliver;
1611 rth->rt_flags |= RTCF_LOCAL;
1612 }
1613
1614#ifdef CONFIG_IP_MROUTE
1615 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1616 rth->u.dst.input = ip_mr_input;
1617#endif
1618 RT_CACHE_STAT_INC(in_slow_mc);
1619
1620 in_dev_put(in_dev);
1621 hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1622 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1623
1624e_nobufs:
1625 in_dev_put(in_dev);
1626 return -ENOBUFS;
1627
1628e_inval:
1629 in_dev_put(in_dev);
1630 return -EINVAL;
1631}
1632
1633
1634static void ip_handle_martian_source(struct net_device *dev,
1635 struct in_device *in_dev,
1636 struct sk_buff *skb,
1637 u32 daddr,
1638 u32 saddr)
1639{
1640 RT_CACHE_STAT_INC(in_martian_src);
1641#ifdef CONFIG_IP_ROUTE_VERBOSE
1642 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1643 /*
1644 * RFC1812 recommendation, if source is martian,
1645 * the only hint is MAC header.
1646 */
1647 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1648 "%u.%u.%u.%u, on dev %s\n",
1649 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1650 if (dev->hard_header_len) {
1651 int i;
1652 unsigned char *p = skb->mac.raw;
1653 printk(KERN_WARNING "ll header: ");
1654 for (i = 0; i < dev->hard_header_len; i++, p++) {
1655 printk("%02x", *p);
1656 if (i < (dev->hard_header_len - 1))
1657 printk(":");
1658 }
1659 printk("\n");
1660 }
1661 }
1662#endif
1663}
1664
1665static inline int __mkroute_input(struct sk_buff *skb,
1666 struct fib_result* res,
1667 struct in_device *in_dev,
1668 u32 daddr, u32 saddr, u32 tos,
1669 struct rtable **result)
1670{
1671
1672 struct rtable *rth;
1673 int err;
1674 struct in_device *out_dev;
1675 unsigned flags = 0;
1676 u32 spec_dst, itag;
1677
1678 /* get a working reference to the output device */
1679 out_dev = in_dev_get(FIB_RES_DEV(*res));
1680 if (out_dev == NULL) {
1681 if (net_ratelimit())
1682 printk(KERN_CRIT "Bug in ip_route_input" \
1683 "_slow(). Please, report\n");
1684 return -EINVAL;
1685 }
1686
1687
1688 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1689 in_dev->dev, &spec_dst, &itag);
1690 if (err < 0) {
1691 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1692 saddr);
1693
1694 err = -EINVAL;
1695 goto cleanup;
1696 }
1697
1698 if (err)
1699 flags |= RTCF_DIRECTSRC;
1700
1701 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1702 (IN_DEV_SHARED_MEDIA(out_dev) ||
1703 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1704 flags |= RTCF_DOREDIRECT;
1705
1706 if (skb->protocol != htons(ETH_P_IP)) {
1707 /* Not IP (i.e. ARP). Do not create route, if it is
1708 * invalid for proxy arp. DNAT routes are always valid.
1709 */
1710 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1711 err = -EINVAL;
1712 goto cleanup;
1713 }
1714 }
1715
1716
1717 rth = dst_alloc(&ipv4_dst_ops);
1718 if (!rth) {
1719 err = -ENOBUFS;
1720 goto cleanup;
1721 }
1722
1723 rth->u.dst.flags= DST_HOST;
1724#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1725 if (res->fi->fib_nhs > 1)
1726 rth->u.dst.flags |= DST_BALANCED;
1727#endif
1728 if (in_dev->cnf.no_policy)
1729 rth->u.dst.flags |= DST_NOPOLICY;
1730 if (in_dev->cnf.no_xfrm)
1731 rth->u.dst.flags |= DST_NOXFRM;
1732 rth->fl.fl4_dst = daddr;
1733 rth->rt_dst = daddr;
1734 rth->fl.fl4_tos = tos;
1735#ifdef CONFIG_IP_ROUTE_FWMARK
1736 rth->fl.fl4_fwmark= skb->nfmark;
1737#endif
1738 rth->fl.fl4_src = saddr;
1739 rth->rt_src = saddr;
1740 rth->rt_gateway = daddr;
1741 rth->rt_iif =
1742 rth->fl.iif = in_dev->dev->ifindex;
1743 rth->u.dst.dev = (out_dev)->dev;
1744 dev_hold(rth->u.dst.dev);
1745 rth->idev = in_dev_get(rth->u.dst.dev);
1746 rth->fl.oif = 0;
1747 rth->rt_spec_dst= spec_dst;
1748
1749 rth->u.dst.input = ip_forward;
1750 rth->u.dst.output = ip_output;
1751
1752 rt_set_nexthop(rth, res, itag);
1753
1754 rth->rt_flags = flags;
1755
1756 *result = rth;
1757 err = 0;
1758 cleanup:
1759 /* release the working reference to the output device */
1760 in_dev_put(out_dev);
1761 return err;
1762}
1763
1764static inline int ip_mkroute_input_def(struct sk_buff *skb,
1765 struct fib_result* res,
1766 const struct flowi *fl,
1767 struct in_device *in_dev,
1768 u32 daddr, u32 saddr, u32 tos)
1769{
Chuck Short7abaa272005-06-22 22:10:23 -07001770 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001771 int err;
1772 unsigned hash;
1773
1774#ifdef CONFIG_IP_ROUTE_MULTIPATH
1775 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1776 fib_select_multipath(fl, res);
1777#endif
1778
1779 /* create a routing cache entry */
1780 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1781 if (err)
1782 return err;
1783 atomic_set(&rth->u.dst.__refcnt, 1);
1784
1785 /* put it into the cache */
1786 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1787 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1788}
1789
1790static inline int ip_mkroute_input(struct sk_buff *skb,
1791 struct fib_result* res,
1792 const struct flowi *fl,
1793 struct in_device *in_dev,
1794 u32 daddr, u32 saddr, u32 tos)
1795{
1796#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
Chuck Short7abaa272005-06-22 22:10:23 -07001797 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001798 unsigned char hop, hopcount, lasthop;
1799 int err = -EINVAL;
1800 unsigned int hash;
1801
1802 if (res->fi)
1803 hopcount = res->fi->fib_nhs;
1804 else
1805 hopcount = 1;
1806
1807 lasthop = hopcount - 1;
1808
1809 /* distinguish between multipath and singlepath */
1810 if (hopcount < 2)
1811 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1812 saddr, tos);
1813
1814 /* add all alternatives to the routing cache */
1815 for (hop = 0; hop < hopcount; hop++) {
1816 res->nh_sel = hop;
1817
1818 /* create a routing cache entry */
1819 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1820 &rth);
1821 if (err)
1822 return err;
1823
1824 /* put it into the cache */
1825 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1826 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1827 if (err)
1828 return err;
1829
1830 /* forward hop information to multipath impl. */
1831 multipath_set_nhinfo(rth,
1832 FIB_RES_NETWORK(*res),
1833 FIB_RES_NETMASK(*res),
1834 res->prefixlen,
1835 &FIB_RES_NH(*res));
1836
1837 /* only for the last hop the reference count is handled
1838 * outside
1839 */
1840 if (hop == lasthop)
1841 atomic_set(&(skb->dst->__refcnt), 1);
1842 }
1843 return err;
1844#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1845 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1846#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1847}
1848
1849
/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must already have the correct destination attached by the output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */
1859
1860static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1861 u8 tos, struct net_device *dev)
1862{
1863 struct fib_result res;
1864 struct in_device *in_dev = in_dev_get(dev);
1865 struct flowi fl = { .nl_u = { .ip4_u =
1866 { .daddr = daddr,
1867 .saddr = saddr,
1868 .tos = tos,
1869 .scope = RT_SCOPE_UNIVERSE,
1870#ifdef CONFIG_IP_ROUTE_FWMARK
1871 .fwmark = skb->nfmark
1872#endif
1873 } },
1874 .iif = dev->ifindex };
1875 unsigned flags = 0;
1876 u32 itag = 0;
1877 struct rtable * rth;
1878 unsigned hash;
1879 u32 spec_dst;
1880 int err = -EINVAL;
1881 int free_res = 0;
1882
1883 /* IP on this device is disabled. */
1884
1885 if (!in_dev)
1886 goto out;
1887
1888 /* Check for the most weird martians, which can be not detected
1889 by fib_lookup.
1890 */
1891
1892 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1893 goto martian_source;
1894
1895 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1896 goto brd_input;
1897
1898 /* Accept zero addresses only to limited broadcast;
1899 * I even do not know to fix it or not. Waiting for complains :-)
1900 */
1901 if (ZERONET(saddr))
1902 goto martian_source;
1903
1904 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1905 goto martian_destination;
1906
1907 /*
1908 * Now we are ready to route packet.
1909 */
1910 if ((err = fib_lookup(&fl, &res)) != 0) {
1911 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07001912 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001913 goto no_route;
1914 }
1915 free_res = 1;
1916
1917 RT_CACHE_STAT_INC(in_slow_tot);
1918
1919 if (res.type == RTN_BROADCAST)
1920 goto brd_input;
1921
1922 if (res.type == RTN_LOCAL) {
1923 int result;
1924 result = fib_validate_source(saddr, daddr, tos,
1925 loopback_dev.ifindex,
1926 dev, &spec_dst, &itag);
1927 if (result < 0)
1928 goto martian_source;
1929 if (result)
1930 flags |= RTCF_DIRECTSRC;
1931 spec_dst = daddr;
1932 goto local_input;
1933 }
1934
1935 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07001936 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001937 if (res.type != RTN_UNICAST)
1938 goto martian_destination;
1939
1940 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1941 if (err == -ENOBUFS)
1942 goto e_nobufs;
1943 if (err == -EINVAL)
1944 goto e_inval;
1945
1946done:
1947 in_dev_put(in_dev);
1948 if (free_res)
1949 fib_res_put(&res);
1950out: return err;
1951
1952brd_input:
1953 if (skb->protocol != htons(ETH_P_IP))
1954 goto e_inval;
1955
1956 if (ZERONET(saddr))
1957 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1958 else {
1959 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1960 &itag);
1961 if (err < 0)
1962 goto martian_source;
1963 if (err)
1964 flags |= RTCF_DIRECTSRC;
1965 }
1966 flags |= RTCF_BROADCAST;
1967 res.type = RTN_BROADCAST;
1968 RT_CACHE_STAT_INC(in_brd);
1969
1970local_input:
1971 rth = dst_alloc(&ipv4_dst_ops);
1972 if (!rth)
1973 goto e_nobufs;
1974
1975 rth->u.dst.output= ip_rt_bug;
1976
1977 atomic_set(&rth->u.dst.__refcnt, 1);
1978 rth->u.dst.flags= DST_HOST;
1979 if (in_dev->cnf.no_policy)
1980 rth->u.dst.flags |= DST_NOPOLICY;
1981 rth->fl.fl4_dst = daddr;
1982 rth->rt_dst = daddr;
1983 rth->fl.fl4_tos = tos;
1984#ifdef CONFIG_IP_ROUTE_FWMARK
1985 rth->fl.fl4_fwmark= skb->nfmark;
1986#endif
1987 rth->fl.fl4_src = saddr;
1988 rth->rt_src = saddr;
1989#ifdef CONFIG_NET_CLS_ROUTE
1990 rth->u.dst.tclassid = itag;
1991#endif
1992 rth->rt_iif =
1993 rth->fl.iif = dev->ifindex;
1994 rth->u.dst.dev = &loopback_dev;
1995 dev_hold(rth->u.dst.dev);
1996 rth->idev = in_dev_get(rth->u.dst.dev);
1997 rth->rt_gateway = daddr;
1998 rth->rt_spec_dst= spec_dst;
1999 rth->u.dst.input= ip_local_deliver;
2000 rth->rt_flags = flags|RTCF_LOCAL;
2001 if (res.type == RTN_UNREACHABLE) {
2002 rth->u.dst.input= ip_error;
2003 rth->u.dst.error= -err;
2004 rth->rt_flags &= ~RTCF_LOCAL;
2005 }
2006 rth->rt_type = res.type;
2007 hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
2008 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2009 goto done;
2010
2011no_route:
2012 RT_CACHE_STAT_INC(in_no_route);
2013 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2014 res.type = RTN_UNREACHABLE;
2015 goto local_input;
2016
2017 /*
2018 * Do not cache martian addresses: they should be logged (RFC1812)
2019 */
2020martian_destination:
2021 RT_CACHE_STAT_INC(in_martian_dst);
2022#ifdef CONFIG_IP_ROUTE_VERBOSE
2023 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2024 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2025 "%u.%u.%u.%u, dev %s\n",
2026 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2027#endif
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002028
2029e_hostunreach:
2030 err = -EHOSTUNREACH;
2031 goto done;
2032
Linus Torvalds1da177e2005-04-16 15:20:36 -07002033e_inval:
2034 err = -EINVAL;
2035 goto done;
2036
2037e_nobufs:
2038 err = -ENOBUFS;
2039 goto done;
2040
2041martian_source:
2042 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2043 goto e_inval;
2044}
2045
2046int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2047 u8 tos, struct net_device *dev)
2048{
2049 struct rtable * rth;
2050 unsigned hash;
2051 int iif = dev->ifindex;
2052
2053 tos &= IPTOS_RT_MASK;
2054 hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
2055
2056 rcu_read_lock();
2057 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2058 rth = rcu_dereference(rth->u.rt_next)) {
2059 if (rth->fl.fl4_dst == daddr &&
2060 rth->fl.fl4_src == saddr &&
2061 rth->fl.iif == iif &&
2062 rth->fl.oif == 0 &&
2063#ifdef CONFIG_IP_ROUTE_FWMARK
2064 rth->fl.fl4_fwmark == skb->nfmark &&
2065#endif
2066 rth->fl.fl4_tos == tos) {
2067 rth->u.dst.lastuse = jiffies;
2068 dst_hold(&rth->u.dst);
2069 rth->u.dst.__use++;
2070 RT_CACHE_STAT_INC(in_hit);
2071 rcu_read_unlock();
2072 skb->dst = (struct dst_entry*)rth;
2073 return 0;
2074 }
2075 RT_CACHE_STAT_INC(in_hlist_search);
2076 }
2077 rcu_read_unlock();
2078
2079 /* Multicast recognition logic is moved from route cache to here.
2080 The problem was that too many Ethernet cards have broken/missing
2081 hardware multicast filters :-( As result the host on multicasting
2082 network acquires a lot of useless route cache entries, sort of
2083 SDR messages from all the world. Now we try to get rid of them.
2084 Really, provided software IP multicast filter is organized
2085 reasonably (at least, hashed), it does not result in a slowdown
2086 comparing with route cache reject entries.
2087 Note, that multicast routers are not affected, because
2088 route cache entry is created eventually.
2089 */
2090 if (MULTICAST(daddr)) {
2091 struct in_device *in_dev;
2092
2093 rcu_read_lock();
2094 if ((in_dev = __in_dev_get(dev)) != NULL) {
2095 int our = ip_check_mc(in_dev, daddr, saddr,
2096 skb->nh.iph->protocol);
2097 if (our
2098#ifdef CONFIG_IP_MROUTE
2099 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2100#endif
2101 ) {
2102 rcu_read_unlock();
2103 return ip_route_input_mc(skb, daddr, saddr,
2104 tos, dev, our);
2105 }
2106 }
2107 rcu_read_unlock();
2108 return -EINVAL;
2109 }
2110 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2111}
2112
2113static inline int __mkroute_output(struct rtable **result,
2114 struct fib_result* res,
2115 const struct flowi *fl,
2116 const struct flowi *oldflp,
2117 struct net_device *dev_out,
2118 unsigned flags)
2119{
2120 struct rtable *rth;
2121 struct in_device *in_dev;
2122 u32 tos = RT_FL_TOS(oldflp);
2123 int err = 0;
2124
2125 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2126 return -EINVAL;
2127
2128 if (fl->fl4_dst == 0xFFFFFFFF)
2129 res->type = RTN_BROADCAST;
2130 else if (MULTICAST(fl->fl4_dst))
2131 res->type = RTN_MULTICAST;
2132 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2133 return -EINVAL;
2134
2135 if (dev_out->flags & IFF_LOOPBACK)
2136 flags |= RTCF_LOCAL;
2137
2138 /* get work reference to inet device */
2139 in_dev = in_dev_get(dev_out);
2140 if (!in_dev)
2141 return -EINVAL;
2142
2143 if (res->type == RTN_BROADCAST) {
2144 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2145 if (res->fi) {
2146 fib_info_put(res->fi);
2147 res->fi = NULL;
2148 }
2149 } else if (res->type == RTN_MULTICAST) {
2150 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2151 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2152 oldflp->proto))
2153 flags &= ~RTCF_LOCAL;
2154 /* If multicast route do not exist use
2155 default one, but do not gateway in this case.
2156 Yes, it is hack.
2157 */
2158 if (res->fi && res->prefixlen < 4) {
2159 fib_info_put(res->fi);
2160 res->fi = NULL;
2161 }
2162 }
2163
2164
2165 rth = dst_alloc(&ipv4_dst_ops);
2166 if (!rth) {
2167 err = -ENOBUFS;
2168 goto cleanup;
2169 }
2170
2171 rth->u.dst.flags= DST_HOST;
2172#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2173 if (res->fi) {
2174 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2175 if (res->fi->fib_nhs > 1)
2176 rth->u.dst.flags |= DST_BALANCED;
2177 }
2178#endif
2179 if (in_dev->cnf.no_xfrm)
2180 rth->u.dst.flags |= DST_NOXFRM;
2181 if (in_dev->cnf.no_policy)
2182 rth->u.dst.flags |= DST_NOPOLICY;
2183
2184 rth->fl.fl4_dst = oldflp->fl4_dst;
2185 rth->fl.fl4_tos = tos;
2186 rth->fl.fl4_src = oldflp->fl4_src;
2187 rth->fl.oif = oldflp->oif;
2188#ifdef CONFIG_IP_ROUTE_FWMARK
2189 rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2190#endif
2191 rth->rt_dst = fl->fl4_dst;
2192 rth->rt_src = fl->fl4_src;
2193 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2194 /* get references to the devices that are to be hold by the routing
2195 cache entry */
2196 rth->u.dst.dev = dev_out;
2197 dev_hold(dev_out);
2198 rth->idev = in_dev_get(dev_out);
2199 rth->rt_gateway = fl->fl4_dst;
2200 rth->rt_spec_dst= fl->fl4_src;
2201
2202 rth->u.dst.output=ip_output;
2203
2204 RT_CACHE_STAT_INC(out_slow_tot);
2205
2206 if (flags & RTCF_LOCAL) {
2207 rth->u.dst.input = ip_local_deliver;
2208 rth->rt_spec_dst = fl->fl4_dst;
2209 }
2210 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2211 rth->rt_spec_dst = fl->fl4_src;
2212 if (flags & RTCF_LOCAL &&
2213 !(dev_out->flags & IFF_LOOPBACK)) {
2214 rth->u.dst.output = ip_mc_output;
2215 RT_CACHE_STAT_INC(out_slow_mc);
2216 }
2217#ifdef CONFIG_IP_MROUTE
2218 if (res->type == RTN_MULTICAST) {
2219 if (IN_DEV_MFORWARD(in_dev) &&
2220 !LOCAL_MCAST(oldflp->fl4_dst)) {
2221 rth->u.dst.input = ip_mr_input;
2222 rth->u.dst.output = ip_mc_output;
2223 }
2224 }
2225#endif
2226 }
2227
2228 rt_set_nexthop(rth, res, 0);
2229
2230 rth->rt_flags = flags;
2231
2232 *result = rth;
2233 cleanup:
2234 /* release work reference to inet device */
2235 in_dev_put(in_dev);
2236
2237 return err;
2238}
2239
2240static inline int ip_mkroute_output_def(struct rtable **rp,
2241 struct fib_result* res,
2242 const struct flowi *fl,
2243 const struct flowi *oldflp,
2244 struct net_device *dev_out,
2245 unsigned flags)
2246{
Chuck Short7abaa272005-06-22 22:10:23 -07002247 struct rtable *rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002248 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2249 unsigned hash;
2250 if (err == 0) {
2251 u32 tos = RT_FL_TOS(oldflp);
2252
2253 atomic_set(&rth->u.dst.__refcnt, 1);
2254
2255 hash = rt_hash_code(oldflp->fl4_dst,
2256 oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2257 err = rt_intern_hash(hash, rth, rp);
2258 }
2259
2260 return err;
2261}
2262
/*
 * Create the output route cache entry (or entries) for @res.
 *
 * @rp:      receives the cached entry
 * @res:     FIB result; nh_sel is rewritten while iterating next hops
 * @fl:      resolved flow
 * @oldflp:  flow as requested by the caller (used for the cache key)
 * @dev_out: output device for the single-path case
 * @flags:   initial RTCF_* flags
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED, or for a route with a single
 * next hop, this is ip_mkroute_output_def().  Otherwise one entry per next
 * hop is built, inserted into the cache and reported to the multipath
 * implementation; the entry left in *@rp gets a reference count of 1.
 *
 * Returns 0 on success or the first error encountered.
 */
static inline int ip_mkroute_output(struct rtable** rp,
				    struct fib_result* res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	u32 tos = RT_FL_TOS(oldflp);
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);

			if (err != 0)
				goto cleanup;

			hash = rt_hash_code(oldflp->fl4_dst,
					    oldflp->fl4_src ^
					    (oldflp->oif << 5), tos);
			err = rt_intern_hash(hash, rth, rp);

			/*
			 * Do not hand rth to the multipath code when
			 * interning failed; this mirrors the error check
			 * in ip_mkroute_input().
			 * NOTE(review): presumably rth is no longer valid
			 * after a failed rt_intern_hash() - confirm there.
			 */
			if (err != 0)
				goto cleanup;

			/* forward hop information to multipath impl. */
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));
		cleanup:
			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		atomic_set(&(*rp)->u.dst.__refcnt, 1);
		return err;
	} else {
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
2323
2324/*
2325 * Major route resolver routine.
2326 */
2327
2328static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2329{
2330 u32 tos = RT_FL_TOS(oldflp);
2331 struct flowi fl = { .nl_u = { .ip4_u =
2332 { .daddr = oldflp->fl4_dst,
2333 .saddr = oldflp->fl4_src,
2334 .tos = tos & IPTOS_RT_MASK,
2335 .scope = ((tos & RTO_ONLINK) ?
2336 RT_SCOPE_LINK :
2337 RT_SCOPE_UNIVERSE),
2338#ifdef CONFIG_IP_ROUTE_FWMARK
2339 .fwmark = oldflp->fl4_fwmark
2340#endif
2341 } },
2342 .iif = loopback_dev.ifindex,
2343 .oif = oldflp->oif };
2344 struct fib_result res;
2345 unsigned flags = 0;
2346 struct net_device *dev_out = NULL;
2347 int free_res = 0;
2348 int err;
2349
2350
2351 res.fi = NULL;
2352#ifdef CONFIG_IP_MULTIPLE_TABLES
2353 res.r = NULL;
2354#endif
2355
2356 if (oldflp->fl4_src) {
2357 err = -EINVAL;
2358 if (MULTICAST(oldflp->fl4_src) ||
2359 BADCLASS(oldflp->fl4_src) ||
2360 ZERONET(oldflp->fl4_src))
2361 goto out;
2362
2363 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2364 dev_out = ip_dev_find(oldflp->fl4_src);
2365 if (dev_out == NULL)
2366 goto out;
2367
2368 /* I removed check for oif == dev_out->oif here.
2369 It was wrong for two reasons:
2370 1. ip_dev_find(saddr) can return wrong iface, if saddr is
2371 assigned to multiple interfaces.
2372 2. Moreover, we are allowed to send packets with saddr
2373 of another iface. --ANK
2374 */
2375
2376 if (oldflp->oif == 0
2377 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2378 /* Special hack: user can direct multicasts
2379 and limited broadcast via necessary interface
2380 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2381 This hack is not just for fun, it allows
2382 vic,vat and friends to work.
2383 They bind socket to loopback, set ttl to zero
2384 and expect that it will work.
2385 From the viewpoint of routing cache they are broken,
2386 because we are not allowed to build multicast path
2387 with loopback source addr (look, routing cache
2388 cannot know, that ttl is zero, so that packet
2389 will not leave this host and route is valid).
2390 Luckily, this hack is good workaround.
2391 */
2392
2393 fl.oif = dev_out->ifindex;
2394 goto make_route;
2395 }
2396 if (dev_out)
2397 dev_put(dev_out);
2398 dev_out = NULL;
2399 }
2400
2401
2402 if (oldflp->oif) {
2403 dev_out = dev_get_by_index(oldflp->oif);
2404 err = -ENODEV;
2405 if (dev_out == NULL)
2406 goto out;
2407 if (__in_dev_get(dev_out) == NULL) {
2408 dev_put(dev_out);
2409 goto out; /* Wrong error code */
2410 }
2411
2412 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2413 if (!fl.fl4_src)
2414 fl.fl4_src = inet_select_addr(dev_out, 0,
2415 RT_SCOPE_LINK);
2416 goto make_route;
2417 }
2418 if (!fl.fl4_src) {
2419 if (MULTICAST(oldflp->fl4_dst))
2420 fl.fl4_src = inet_select_addr(dev_out, 0,
2421 fl.fl4_scope);
2422 else if (!oldflp->fl4_dst)
2423 fl.fl4_src = inet_select_addr(dev_out, 0,
2424 RT_SCOPE_HOST);
2425 }
2426 }
2427
2428 if (!fl.fl4_dst) {
2429 fl.fl4_dst = fl.fl4_src;
2430 if (!fl.fl4_dst)
2431 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2432 if (dev_out)
2433 dev_put(dev_out);
2434 dev_out = &loopback_dev;
2435 dev_hold(dev_out);
2436 fl.oif = loopback_dev.ifindex;
2437 res.type = RTN_LOCAL;
2438 flags |= RTCF_LOCAL;
2439 goto make_route;
2440 }
2441
2442 if (fib_lookup(&fl, &res)) {
2443 res.fi = NULL;
2444 if (oldflp->oif) {
2445 /* Apparently, routing tables are wrong. Assume,
2446 that the destination is on link.
2447
2448 WHY? DW.
2449 Because we are allowed to send to iface
2450 even if it has NO routes and NO assigned
2451 addresses. When oif is specified, routing
2452 tables are looked up with only one purpose:
2453 to catch if destination is gatewayed, rather than
2454 direct. Moreover, if MSG_DONTROUTE is set,
2455 we send packet, ignoring both routing tables
2456 and ifaddr state. --ANK
2457
2458
2459 We could make it even if oif is unknown,
2460 likely IPv6, but we do not.
2461 */
2462
2463 if (fl.fl4_src == 0)
2464 fl.fl4_src = inet_select_addr(dev_out, 0,
2465 RT_SCOPE_LINK);
2466 res.type = RTN_UNICAST;
2467 goto make_route;
2468 }
2469 if (dev_out)
2470 dev_put(dev_out);
2471 err = -ENETUNREACH;
2472 goto out;
2473 }
2474 free_res = 1;
2475
2476 if (res.type == RTN_LOCAL) {
2477 if (!fl.fl4_src)
2478 fl.fl4_src = fl.fl4_dst;
2479 if (dev_out)
2480 dev_put(dev_out);
2481 dev_out = &loopback_dev;
2482 dev_hold(dev_out);
2483 fl.oif = dev_out->ifindex;
2484 if (res.fi)
2485 fib_info_put(res.fi);
2486 res.fi = NULL;
2487 flags |= RTCF_LOCAL;
2488 goto make_route;
2489 }
2490
2491#ifdef CONFIG_IP_ROUTE_MULTIPATH
2492 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2493 fib_select_multipath(&fl, &res);
2494 else
2495#endif
2496 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2497 fib_select_default(&fl, &res);
2498
2499 if (!fl.fl4_src)
2500 fl.fl4_src = FIB_RES_PREFSRC(res);
2501
2502 if (dev_out)
2503 dev_put(dev_out);
2504 dev_out = FIB_RES_DEV(res);
2505 dev_hold(dev_out);
2506 fl.oif = dev_out->ifindex;
2507
2508
2509make_route:
2510 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2511
2512
2513 if (free_res)
2514 fib_res_put(&res);
2515 if (dev_out)
2516 dev_put(dev_out);
2517out: return err;
2518}
2519
2520int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2521{
2522 unsigned hash;
2523 struct rtable *rth;
2524
2525 hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2526
2527 rcu_read_lock_bh();
2528 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2529 rth = rcu_dereference(rth->u.rt_next)) {
2530 if (rth->fl.fl4_dst == flp->fl4_dst &&
2531 rth->fl.fl4_src == flp->fl4_src &&
2532 rth->fl.iif == 0 &&
2533 rth->fl.oif == flp->oif &&
2534#ifdef CONFIG_IP_ROUTE_FWMARK
2535 rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2536#endif
2537 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2538 (IPTOS_RT_MASK | RTO_ONLINK))) {
2539
2540 /* check for multipath routes and choose one if
2541 * necessary
2542 */
2543 if (multipath_select_route(flp, rth, rp)) {
2544 dst_hold(&(*rp)->u.dst);
2545 RT_CACHE_STAT_INC(out_hit);
2546 rcu_read_unlock_bh();
2547 return 0;
2548 }
2549
2550 rth->u.dst.lastuse = jiffies;
2551 dst_hold(&rth->u.dst);
2552 rth->u.dst.__use++;
2553 RT_CACHE_STAT_INC(out_hit);
2554 rcu_read_unlock_bh();
2555 *rp = rth;
2556 return 0;
2557 }
2558 RT_CACHE_STAT_INC(out_hlist_search);
2559 }
2560 rcu_read_unlock_bh();
2561
2562 return ip_route_output_slow(rp, flp);
2563}
2564
2565int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2566{
2567 int err;
2568
2569 if ((err = __ip_route_output_key(rp, flp)) != 0)
2570 return err;
2571
2572 if (flp->proto) {
2573 if (!flp->fl4_src)
2574 flp->fl4_src = (*rp)->rt_src;
2575 if (!flp->fl4_dst)
2576 flp->fl4_dst = (*rp)->rt_dst;
2577 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2578 }
2579
2580 return 0;
2581}
2582
2583int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2584{
2585 return ip_route_output_flow(rp, flp, NULL, 0);
2586}
2587
2588static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002589 int nowait, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002590{
2591 struct rtable *rt = (struct rtable*)skb->dst;
2592 struct rtmsg *r;
2593 struct nlmsghdr *nlh;
2594 unsigned char *b = skb->tail;
2595 struct rta_cacheinfo ci;
2596#ifdef CONFIG_IP_MROUTE
2597 struct rtattr *eptr;
2598#endif
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002599 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002600 r = NLMSG_DATA(nlh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002601 r->rtm_family = AF_INET;
2602 r->rtm_dst_len = 32;
2603 r->rtm_src_len = 0;
2604 r->rtm_tos = rt->fl.fl4_tos;
2605 r->rtm_table = RT_TABLE_MAIN;
2606 r->rtm_type = rt->rt_type;
2607 r->rtm_scope = RT_SCOPE_UNIVERSE;
2608 r->rtm_protocol = RTPROT_UNSPEC;
2609 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2610 if (rt->rt_flags & RTCF_NOTIFY)
2611 r->rtm_flags |= RTM_F_NOTIFY;
2612 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2613 if (rt->fl.fl4_src) {
2614 r->rtm_src_len = 32;
2615 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2616 }
2617 if (rt->u.dst.dev)
2618 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2619#ifdef CONFIG_NET_CLS_ROUTE
2620 if (rt->u.dst.tclassid)
2621 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2622#endif
2623#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2624 if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2625 __u32 alg = rt->rt_multipath_alg;
2626
2627 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2628 }
2629#endif
2630 if (rt->fl.iif)
2631 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2632 else if (rt->rt_src != rt->fl.fl4_src)
2633 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2634 if (rt->rt_dst != rt->rt_gateway)
2635 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2636 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2637 goto rtattr_failure;
2638 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2639 ci.rta_used = rt->u.dst.__use;
2640 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2641 if (rt->u.dst.expires)
2642 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2643 else
2644 ci.rta_expires = 0;
2645 ci.rta_error = rt->u.dst.error;
2646 ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
2647 if (rt->peer) {
2648 ci.rta_id = rt->peer->ip_id_count;
2649 if (rt->peer->tcp_ts_stamp) {
2650 ci.rta_ts = rt->peer->tcp_ts;
2651 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2652 }
2653 }
2654#ifdef CONFIG_IP_MROUTE
2655 eptr = (struct rtattr*)skb->tail;
2656#endif
2657 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2658 if (rt->fl.iif) {
2659#ifdef CONFIG_IP_MROUTE
2660 u32 dst = rt->rt_dst;
2661
2662 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2663 ipv4_devconf.mc_forwarding) {
2664 int err = ipmr_get_route(skb, r, nowait);
2665 if (err <= 0) {
2666 if (!nowait) {
2667 if (err == 0)
2668 return 0;
2669 goto nlmsg_failure;
2670 } else {
2671 if (err == -EMSGSIZE)
2672 goto nlmsg_failure;
2673 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2674 }
2675 }
2676 } else
2677#endif
2678 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2679 }
2680
2681 nlh->nlmsg_len = skb->tail - b;
2682 return skb->len;
2683
2684nlmsg_failure:
2685rtattr_failure:
2686 skb_trim(skb, b - skb->data);
2687 return -1;
2688}
2689
2690int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2691{
2692 struct rtattr **rta = arg;
2693 struct rtmsg *rtm = NLMSG_DATA(nlh);
2694 struct rtable *rt = NULL;
2695 u32 dst = 0;
2696 u32 src = 0;
2697 int iif = 0;
2698 int err = -ENOBUFS;
2699 struct sk_buff *skb;
2700
2701 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2702 if (!skb)
2703 goto out;
2704
2705 /* Reserve room for dummy headers, this skb can pass
2706 through good chunk of routing engine.
2707 */
2708 skb->mac.raw = skb->data;
2709 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2710
2711 if (rta[RTA_SRC - 1])
2712 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2713 if (rta[RTA_DST - 1])
2714 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2715 if (rta[RTA_IIF - 1])
2716 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2717
2718 if (iif) {
2719 struct net_device *dev = __dev_get_by_index(iif);
2720 err = -ENODEV;
2721 if (!dev)
2722 goto out_free;
2723 skb->protocol = htons(ETH_P_IP);
2724 skb->dev = dev;
2725 local_bh_disable();
2726 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2727 local_bh_enable();
2728 rt = (struct rtable*)skb->dst;
2729 if (!err && rt->u.dst.error)
2730 err = -rt->u.dst.error;
2731 } else {
2732 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2733 .saddr = src,
2734 .tos = rtm->rtm_tos } } };
2735 int oif = 0;
2736 if (rta[RTA_OIF - 1])
2737 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2738 fl.oif = oif;
2739 err = ip_route_output_key(&rt, &fl);
2740 }
2741 if (err)
2742 goto out_free;
2743
2744 skb->dst = &rt->u.dst;
2745 if (rtm->rtm_flags & RTM_F_NOTIFY)
2746 rt->rt_flags |= RTCF_NOTIFY;
2747
2748 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2749
2750 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002751 RTM_NEWROUTE, 0, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002752 if (!err)
2753 goto out_free;
2754 if (err < 0) {
2755 err = -EMSGSIZE;
2756 goto out_free;
2757 }
2758
2759 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2760 if (err > 0)
2761 err = 0;
2762out: return err;
2763
2764out_free:
2765 kfree_skb(skb);
2766 goto out;
2767}
2768
2769int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2770{
2771 struct rtable *rt;
2772 int h, s_h;
2773 int idx, s_idx;
2774
2775 s_h = cb->args[0];
2776 s_idx = idx = cb->args[1];
2777 for (h = 0; h <= rt_hash_mask; h++) {
2778 if (h < s_h) continue;
2779 if (h > s_h)
2780 s_idx = 0;
2781 rcu_read_lock_bh();
2782 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2783 rt = rcu_dereference(rt->u.rt_next), idx++) {
2784 if (idx < s_idx)
2785 continue;
2786 skb->dst = dst_clone(&rt->u.dst);
2787 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002788 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2789 1, NLM_F_MULTI) <= 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002790 dst_release(xchg(&skb->dst, NULL));
2791 rcu_read_unlock_bh();
2792 goto done;
2793 }
2794 dst_release(xchg(&skb->dst, NULL));
2795 }
2796 rcu_read_unlock_bh();
2797 }
2798
2799done:
2800 cb->args[0] = h;
2801 cb->args[1] = idx;
2802 return skb->len;
2803}
2804
/*
 * Multicast event hook: calls rt_cache_flush() with a delay argument
 * of 0.  @in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}
2809
2810#ifdef CONFIG_SYSCTL
2811static int flush_delay;
2812
2813static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2814 struct file *filp, void __user *buffer,
2815 size_t *lenp, loff_t *ppos)
2816{
2817 if (write) {
2818 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2819 rt_cache_flush(flush_delay);
2820 return 0;
2821 }
2822
2823 return -EINVAL;
2824}
2825
2826static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2827 int __user *name,
2828 int nlen,
2829 void __user *oldval,
2830 size_t __user *oldlenp,
2831 void __user *newval,
2832 size_t newlen,
2833 void **context)
2834{
2835 int delay;
2836 if (newlen != sizeof(int))
2837 return -EINVAL;
2838 if (get_user(delay, (int __user *)newval))
2839 return -EFAULT;
2840 rt_cache_flush(delay);
2841 return 0;
2842}
2843
2844ctl_table ipv4_route_table[] = {
2845 {
2846 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2847 .procname = "flush",
2848 .data = &flush_delay,
2849 .maxlen = sizeof(int),
Dave Jones7e3e0362005-04-28 12:11:03 -07002850 .mode = 0200,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002851 .proc_handler = &ipv4_sysctl_rtcache_flush,
2852 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2853 },
2854 {
2855 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2856 .procname = "min_delay",
2857 .data = &ip_rt_min_delay,
2858 .maxlen = sizeof(int),
2859 .mode = 0644,
2860 .proc_handler = &proc_dointvec_jiffies,
2861 .strategy = &sysctl_jiffies,
2862 },
2863 {
2864 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2865 .procname = "max_delay",
2866 .data = &ip_rt_max_delay,
2867 .maxlen = sizeof(int),
2868 .mode = 0644,
2869 .proc_handler = &proc_dointvec_jiffies,
2870 .strategy = &sysctl_jiffies,
2871 },
2872 {
2873 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2874 .procname = "gc_thresh",
2875 .data = &ipv4_dst_ops.gc_thresh,
2876 .maxlen = sizeof(int),
2877 .mode = 0644,
2878 .proc_handler = &proc_dointvec,
2879 },
2880 {
2881 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2882 .procname = "max_size",
2883 .data = &ip_rt_max_size,
2884 .maxlen = sizeof(int),
2885 .mode = 0644,
2886 .proc_handler = &proc_dointvec,
2887 },
2888 {
2889 /* Deprecated. Use gc_min_interval_ms */
2890
2891 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2892 .procname = "gc_min_interval",
2893 .data = &ip_rt_gc_min_interval,
2894 .maxlen = sizeof(int),
2895 .mode = 0644,
2896 .proc_handler = &proc_dointvec_jiffies,
2897 .strategy = &sysctl_jiffies,
2898 },
2899 {
2900 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2901 .procname = "gc_min_interval_ms",
2902 .data = &ip_rt_gc_min_interval,
2903 .maxlen = sizeof(int),
2904 .mode = 0644,
2905 .proc_handler = &proc_dointvec_ms_jiffies,
2906 .strategy = &sysctl_ms_jiffies,
2907 },
2908 {
2909 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2910 .procname = "gc_timeout",
2911 .data = &ip_rt_gc_timeout,
2912 .maxlen = sizeof(int),
2913 .mode = 0644,
2914 .proc_handler = &proc_dointvec_jiffies,
2915 .strategy = &sysctl_jiffies,
2916 },
2917 {
2918 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2919 .procname = "gc_interval",
2920 .data = &ip_rt_gc_interval,
2921 .maxlen = sizeof(int),
2922 .mode = 0644,
2923 .proc_handler = &proc_dointvec_jiffies,
2924 .strategy = &sysctl_jiffies,
2925 },
2926 {
2927 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2928 .procname = "redirect_load",
2929 .data = &ip_rt_redirect_load,
2930 .maxlen = sizeof(int),
2931 .mode = 0644,
2932 .proc_handler = &proc_dointvec,
2933 },
2934 {
2935 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2936 .procname = "redirect_number",
2937 .data = &ip_rt_redirect_number,
2938 .maxlen = sizeof(int),
2939 .mode = 0644,
2940 .proc_handler = &proc_dointvec,
2941 },
2942 {
2943 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2944 .procname = "redirect_silence",
2945 .data = &ip_rt_redirect_silence,
2946 .maxlen = sizeof(int),
2947 .mode = 0644,
2948 .proc_handler = &proc_dointvec,
2949 },
2950 {
2951 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2952 .procname = "error_cost",
2953 .data = &ip_rt_error_cost,
2954 .maxlen = sizeof(int),
2955 .mode = 0644,
2956 .proc_handler = &proc_dointvec,
2957 },
2958 {
2959 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2960 .procname = "error_burst",
2961 .data = &ip_rt_error_burst,
2962 .maxlen = sizeof(int),
2963 .mode = 0644,
2964 .proc_handler = &proc_dointvec,
2965 },
2966 {
2967 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2968 .procname = "gc_elasticity",
2969 .data = &ip_rt_gc_elasticity,
2970 .maxlen = sizeof(int),
2971 .mode = 0644,
2972 .proc_handler = &proc_dointvec,
2973 },
2974 {
2975 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2976 .procname = "mtu_expires",
2977 .data = &ip_rt_mtu_expires,
2978 .maxlen = sizeof(int),
2979 .mode = 0644,
2980 .proc_handler = &proc_dointvec_jiffies,
2981 .strategy = &sysctl_jiffies,
2982 },
2983 {
2984 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2985 .procname = "min_pmtu",
2986 .data = &ip_rt_min_pmtu,
2987 .maxlen = sizeof(int),
2988 .mode = 0644,
2989 .proc_handler = &proc_dointvec,
2990 },
2991 {
2992 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2993 .procname = "min_adv_mss",
2994 .data = &ip_rt_min_advmss,
2995 .maxlen = sizeof(int),
2996 .mode = 0644,
2997 .proc_handler = &proc_dointvec,
2998 },
2999 {
3000 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3001 .procname = "secret_interval",
3002 .data = &ip_rt_secret_interval,
3003 .maxlen = sizeof(int),
3004 .mode = 0644,
3005 .proc_handler = &proc_dointvec_jiffies,
3006 .strategy = &sysctl_jiffies,
3007 },
3008 { .ctl_name = 0 }
3009};
3010#endif
3011
3012#ifdef CONFIG_NET_CLS_ROUTE
/* Base of the route accounting area allocated in ip_rt_init(). */
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/*
 * IP route accounting ptr for this logical cpu number: every cpu owns a
 * run of 256 consecutive entries.  The argument is parenthesised so that
 * an expression such as IP_RT_ACCT_CPU(a + b) scales the whole index.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3019
3020#ifdef CONFIG_PROC_FS
3021static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3022 int length, int *eof, void *data)
3023{
3024 unsigned int i;
3025
3026 if ((offset & 3) || (length & 3))
3027 return -EIO;
3028
3029 if (offset >= sizeof(struct ip_rt_acct) * 256) {
3030 *eof = 1;
3031 return 0;
3032 }
3033
3034 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3035 length = sizeof(struct ip_rt_acct) * 256 - offset;
3036 *eof = 1;
3037 }
3038
3039 offset /= sizeof(u32);
3040
3041 if (length > 0) {
3042 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3043 u32 *dst = (u32 *) buffer;
3044
3045 /* Copy first cpu. */
3046 *start = buffer;
3047 memcpy(dst, src, length);
3048
3049 /* Add the other cpus in, one int at a time */
3050 for_each_cpu(i) {
3051 unsigned int j;
3052
3053 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3054
3055 for (j = 0; j < length/4; j++)
3056 dst[j] += src[j];
3057 }
3058 }
3059 return length;
3060}
3061#endif /* CONFIG_PROC_FS */
3062#endif /* CONFIG_NET_CLS_ROUTE */
3063
3064static __initdata unsigned long rhash_entries;
3065static int __init set_rhash_entries(char *str)
3066{
3067 if (!str)
3068 return 0;
3069 rhash_entries = simple_strtoul(str, &str, 0);
3070 return 1;
3071}
3072__setup("rhash_entries=", set_rhash_entries);
3073
3074int __init ip_rt_init(void)
3075{
3076 int i, order, goal, rc = 0;
3077
3078 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3079 (jiffies ^ (jiffies >> 7)));
3080
3081#ifdef CONFIG_NET_CLS_ROUTE
3082 for (order = 0;
3083 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3084 /* NOTHING */;
3085 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3086 if (!ip_rt_acct)
3087 panic("IP: failed to allocate ip_rt_acct\n");
3088 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3089#endif
3090
3091 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3092 sizeof(struct rtable),
3093 0, SLAB_HWCACHE_ALIGN,
3094 NULL, NULL);
3095
3096 if (!ipv4_dst_ops.kmem_cachep)
3097 panic("IP: failed to allocate ip_dst_cache\n");
3098
3099 goal = num_physpages >> (26 - PAGE_SHIFT);
3100 if (rhash_entries)
3101 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
3102 for (order = 0; (1UL << order) < goal; order++)
3103 /* NOTHING */;
3104
3105 do {
3106 rt_hash_mask = (1UL << order) * PAGE_SIZE /
3107 sizeof(struct rt_hash_bucket);
3108 while (rt_hash_mask & (rt_hash_mask - 1))
3109 rt_hash_mask--;
3110 rt_hash_table = (struct rt_hash_bucket *)
3111 __get_free_pages(GFP_ATOMIC, order);
3112 } while (rt_hash_table == NULL && --order > 0);
3113
3114 if (!rt_hash_table)
3115 panic("Failed to allocate IP route cache hash table\n");
3116
3117 printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
3118 rt_hash_mask,
3119 (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
3120
3121 for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
3122 /* NOTHING */;
3123
3124 rt_hash_mask--;
3125 for (i = 0; i <= rt_hash_mask; i++) {
3126 spin_lock_init(&rt_hash_table[i].lock);
3127 rt_hash_table[i].chain = NULL;
3128 }
3129
3130 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3131 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3132
3133 rt_cache_stat = alloc_percpu(struct rt_cache_stat);
3134 if (!rt_cache_stat)
3135 return -ENOMEM;
3136
3137 devinet_init();
3138 ip_fib_init();
3139
3140 init_timer(&rt_flush_timer);
3141 rt_flush_timer.function = rt_run_flush;
3142 init_timer(&rt_periodic_timer);
3143 rt_periodic_timer.function = rt_check_expire;
3144 init_timer(&rt_secret_timer);
3145 rt_secret_timer.function = rt_secret_rebuild;
3146
3147 /* All the timers, started at system startup tend
3148 to synchronize. Perturb it a bit.
3149 */
3150 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3151 ip_rt_gc_interval;
3152 add_timer(&rt_periodic_timer);
3153
3154 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3155 ip_rt_secret_interval;
3156 add_timer(&rt_secret_timer);
3157
3158#ifdef CONFIG_PROC_FS
3159 {
3160 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3161 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3162 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3163 proc_net_stat))) {
3164 free_percpu(rt_cache_stat);
3165 return -ENOMEM;
3166 }
3167 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3168 }
3169#ifdef CONFIG_NET_CLS_ROUTE
3170 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3171#endif
3172#endif
3173#ifdef CONFIG_XFRM
3174 xfrm_init();
3175 xfrm4_init();
3176#endif
3177 return rc;
3178}
3179
3180EXPORT_SYMBOL(__ip_select_ident);
3181EXPORT_SYMBOL(ip_route_input);
3182EXPORT_SYMBOL(ip_route_output_key);