blob: fcae074b7ae4f83e87d8e41d6e48b356cea58c27 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9 *
Jesper Juhl02c30a82005-05-05 16:16:16 -070010 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -070011 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 *
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090023 * Alan Cox : Super /proc >4K
Linus Torvalds1da177e2005-04-16 15:20:36 -070024 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090041 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070042 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070057 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080058 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070060 *
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
65 */
66
Linus Torvalds1da177e2005-04-16 15:20:36 -070067#include <linux/module.h>
68#include <asm/uaccess.h>
69#include <asm/system.h>
70#include <linux/bitops.h>
71#include <linux/types.h>
72#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070073#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070074#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070075#include <linux/string.h>
76#include <linux/socket.h>
77#include <linux/sockios.h>
78#include <linux/errno.h>
79#include <linux/in.h>
80#include <linux/inet.h>
81#include <linux/netdevice.h>
82#include <linux/proc_fs.h>
83#include <linux/init.h>
Eric Dumazet39c90ec2007-09-15 10:55:54 -070084#include <linux/workqueue.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070085#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070086#include <linux/inetdevice.h>
87#include <linux/igmp.h>
88#include <linux/pkt_sched.h>
89#include <linux/mroute.h>
90#include <linux/netfilter_ipv4.h>
91#include <linux/random.h>
92#include <linux/jhash.h>
93#include <linux/rcupdate.h>
94#include <linux/times.h>
Herbert Xu352e5122007-11-13 21:34:06 -080095#include <net/dst.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020096#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070097#include <net/protocol.h>
98#include <net/ip.h>
99#include <net/route.h>
100#include <net/inetpeer.h>
101#include <net/sock.h>
102#include <net/ip_fib.h>
103#include <net/arp.h>
104#include <net/tcp.h>
105#include <net/icmp.h>
106#include <net/xfrm.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700107#include <net/netevent.h>
Thomas Graf63f34442007-03-22 11:55:17 -0700108#include <net/rtnetlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700109#ifdef CONFIG_SYSCTL
110#include <linux/sysctl.h>
111#endif
112
/*
 * Mask the fl4_tos field of the flow key pointed to by @oldflp with
 * IPTOS_RT_MASK | RTO_ONLINK and yield the result as a u32.
 * The argument is parenthesized so that any pointer expression may be passed.
 */
#define RT_FL_TOS(oldflp) \
	((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

/* Default value of ip_rt_gc_timeout, in jiffies. */
#define RT_GC_TIMEOUT	(300*HZ)
119
120static int ip_rt_min_delay = 2 * HZ;
121static int ip_rt_max_delay = 10 * HZ;
122static int ip_rt_max_size;
123static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
124static int ip_rt_gc_interval = 60 * HZ;
125static int ip_rt_gc_min_interval = HZ / 2;
126static int ip_rt_redirect_number = 9;
127static int ip_rt_redirect_load = HZ / 50;
128static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
129static int ip_rt_error_cost = HZ;
130static int ip_rt_error_burst = 5 * HZ;
131static int ip_rt_gc_elasticity = 8;
132static int ip_rt_mtu_expires = 10 * 60 * HZ;
133static int ip_rt_min_pmtu = 512 + 20 + 20;
134static int ip_rt_min_advmss = 256;
135static int ip_rt_secret_interval = 10 * 60 * HZ;
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800136static int ip_rt_flush_expected;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700137static unsigned long rt_deadline;
138
139#define RTprint(a...) printk(KERN_DEBUG a)
140
141static struct timer_list rt_flush_timer;
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800142static void rt_worker_func(struct work_struct *work);
143static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700144static struct timer_list rt_secret_timer;
145
146/*
147 * Interface to generic destination cache.
148 */
149
150static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
151static void ipv4_dst_destroy(struct dst_entry *dst);
152static void ipv4_dst_ifdown(struct dst_entry *dst,
153 struct net_device *dev, int how);
154static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
155static void ipv4_link_failure(struct sk_buff *skb);
156static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
157static int rt_garbage_collect(void);
158
159
160static struct dst_ops ipv4_dst_ops = {
161 .family = AF_INET,
162 .protocol = __constant_htons(ETH_P_IP),
163 .gc = rt_garbage_collect,
164 .check = ipv4_dst_check,
165 .destroy = ipv4_dst_destroy,
166 .ifdown = ipv4_dst_ifdown,
167 .negative_advice = ipv4_negative_advice,
168 .link_failure = ipv4_link_failure,
169 .update_pmtu = ip_rt_update_pmtu,
Herbert Xu862b82c2007-11-13 21:43:11 -0800170 .local_out = ip_local_out,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700171 .entry_size = sizeof(struct rtable),
172};
173
174#define ECN_OR_COST(class) TC_PRIO_##class
175
Philippe De Muyter4839c522007-07-09 15:32:57 -0700176const __u8 ip_tos2prio[16] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700177 TC_PRIO_BESTEFFORT,
178 ECN_OR_COST(FILLER),
179 TC_PRIO_BESTEFFORT,
180 ECN_OR_COST(BESTEFFORT),
181 TC_PRIO_BULK,
182 ECN_OR_COST(BULK),
183 TC_PRIO_BULK,
184 ECN_OR_COST(BULK),
185 TC_PRIO_INTERACTIVE,
186 ECN_OR_COST(INTERACTIVE),
187 TC_PRIO_INTERACTIVE,
188 ECN_OR_COST(INTERACTIVE),
189 TC_PRIO_INTERACTIVE_BULK,
190 ECN_OR_COST(INTERACTIVE_BULK),
191 TC_PRIO_INTERACTIVE_BULK,
192 ECN_OR_COST(INTERACTIVE_BULK)
193};
194
195
196/*
197 * Route cache.
198 */
199
/* The locking scheme is rather straightforward:
201 *
202 * 1) Read-Copy Update protects the buckets of the central route hash.
203 * 2) Only writers remove entries, and they hold the lock
204 * as they look at rtable reference counts.
205 * 3) Only readers acquire references to rtable entries,
206 * they do so with atomic increments and with the
207 * lock held.
208 */
209
/* One slot of the route cache hash table: head of a singly linked chain. */
struct rt_hash_bucket {
	struct rtable	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
 * (With lockdep spinlock_t is quite big, so keep the size down there.)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;

/*
 * Address of the spinlock covering hash bucket @slot.  The slot number is
 * folded into the table with a mask, so several buckets share one lock.
 * The expansion is fully parenthesized so it is safe inside any expression.
 */
# define rt_hash_lock_addr(slot) \
	(&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/*
 * Allocate the lock table and initialise every lock in it; panics when the
 * allocation fails.  Wrapped in do { } while (0) so the macro behaves as a
 * single statement (e.g. in an unbraced if/else).
 */
# define rt_hash_lock_init() \
	do { \
		int i; \
		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
			spin_lock_init(&rt_hash_locks[i]); \
	} while (0)
#else
/* No lock table in this configuration. */
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init() do { } while (0)
#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700249
250static struct rt_hash_bucket *rt_hash_table;
251static unsigned rt_hash_mask;
Stephen Hemmingercfcabdc2007-10-09 01:59:42 -0700252static unsigned int rt_hash_log;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700253static unsigned int rt_hash_rnd;
254
Eric Dumazet2f970d82006-01-17 02:54:36 -0800255static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
Andrew Mortondbd29152006-01-17 21:58:01 -0800256#define RT_CACHE_STAT_INC(field) \
Paul Mackerrasbfe5d832006-06-25 05:47:14 -0700257 (__raw_get_cpu_var(rt_cache_stat).field++)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700258
259static int rt_intern_hash(unsigned hash, struct rtable *rth,
260 struct rtable **res);
261
Ilia Sotnikovcef26852006-03-25 01:38:55 -0800262static unsigned int rt_hash_code(u32 daddr, u32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700263{
Ilia Sotnikovcef26852006-03-25 01:38:55 -0800264 return (jhash_2words(daddr, saddr, rt_hash_rnd)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700265 & rt_hash_mask);
266}
267
/*
 * Bucket index for a big-endian (daddr, saddr) pair and an interface index:
 * the index is shifted left by 5 and XORed into the source address before
 * both words are handed to rt_hash_code().
 */
#define rt_hash(daddr, saddr, idx)					\
	rt_hash_code((__force u32)(__be32)(daddr),			\
		     ((__force u32)(__be32)(saddr)) ^ ((idx) << 5))
271
Linus Torvalds1da177e2005-04-16 15:20:36 -0700272#ifdef CONFIG_PROC_FS
/* Private seq_file iterator state for the route cache listing. */
struct rt_cache_iter_state {
	int	bucket;		/* hash bucket currently being walked */
};
276
277static struct rtable *rt_cache_get_first(struct seq_file *seq)
278{
279 struct rtable *r = NULL;
280 struct rt_cache_iter_state *st = seq->private;
281
282 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
283 rcu_read_lock_bh();
284 r = rt_hash_table[st->bucket].chain;
285 if (r)
286 break;
287 rcu_read_unlock_bh();
288 }
Eric Dumazet0bccead2008-01-10 03:55:57 -0800289 return rcu_dereference(r);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700290}
291
292static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
293{
Eric Dumazet0bccead2008-01-10 03:55:57 -0800294 struct rt_cache_iter_state *st = seq->private;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700295
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800296 r = r->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700297 while (!r) {
298 rcu_read_unlock_bh();
299 if (--st->bucket < 0)
300 break;
301 rcu_read_lock_bh();
302 r = rt_hash_table[st->bucket].chain;
303 }
Eric Dumazet0bccead2008-01-10 03:55:57 -0800304 return rcu_dereference(r);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700305}
306
307static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
308{
309 struct rtable *r = rt_cache_get_first(seq);
310
311 if (r)
312 while (pos && (r = rt_cache_get_next(seq, r)))
313 --pos;
314 return pos ? NULL : r;
315}
316
317static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
318{
319 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
320}
321
322static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
323{
324 struct rtable *r = NULL;
325
326 if (v == SEQ_START_TOKEN)
327 r = rt_cache_get_first(seq);
328 else
329 r = rt_cache_get_next(seq, v);
330 ++*pos;
331 return r;
332}
333
334static void rt_cache_seq_stop(struct seq_file *seq, void *v)
335{
336 if (v && v != SEQ_START_TOKEN)
337 rcu_read_unlock_bh();
338}
339
340static int rt_cache_seq_show(struct seq_file *seq, void *v)
341{
342 if (v == SEQ_START_TOKEN)
343 seq_printf(seq, "%-127s\n",
344 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
345 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
346 "HHUptod\tSpecDst");
347 else {
348 struct rtable *r = v;
349 char temp[256];
350
351 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
352 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
353 r->u.dst.dev ? r->u.dst.dev->name : "*",
354 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
355 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
356 r->u.dst.__use, 0, (unsigned long)r->rt_src,
357 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
358 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
359 dst_metric(&r->u.dst, RTAX_WINDOW),
360 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
361 dst_metric(&r->u.dst, RTAX_RTTVAR)),
362 r->fl.fl4_tos,
363 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
364 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
365 dev_queue_xmit) : 0,
366 r->rt_spec_dst);
367 seq_printf(seq, "%-127s\n", temp);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900368 }
369 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700370}
371
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700372static const struct seq_operations rt_cache_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700373 .start = rt_cache_seq_start,
374 .next = rt_cache_seq_next,
375 .stop = rt_cache_seq_stop,
376 .show = rt_cache_seq_show,
377};
378
379static int rt_cache_seq_open(struct inode *inode, struct file *file)
380{
Pavel Emelyanovcf7732e2007-10-10 02:29:29 -0700381 return seq_open_private(file, &rt_cache_seq_ops,
382 sizeof(struct rt_cache_iter_state));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700383}
384
Arjan van de Ven9a321442007-02-12 00:55:35 -0800385static const struct file_operations rt_cache_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700386 .owner = THIS_MODULE,
387 .open = rt_cache_seq_open,
388 .read = seq_read,
389 .llseek = seq_lseek,
390 .release = seq_release_private,
391};
392
393
394static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
395{
396 int cpu;
397
398 if (*pos == 0)
399 return SEQ_START_TOKEN;
400
401 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
402 if (!cpu_possible(cpu))
403 continue;
404 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800405 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700406 }
407 return NULL;
408}
409
410static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
411{
412 int cpu;
413
414 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
415 if (!cpu_possible(cpu))
416 continue;
417 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800418 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700419 }
420 return NULL;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900421
Linus Torvalds1da177e2005-04-16 15:20:36 -0700422}
423
/* seq_file ->stop() for the per-CPU statistics: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
428
429static int rt_cpu_seq_show(struct seq_file *seq, void *v)
430{
431 struct rt_cache_stat *st = v;
432
433 if (v == SEQ_START_TOKEN) {
Olaf Rempel5bec0032005-04-28 12:16:08 -0700434 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700435 return 0;
436 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900437
Linus Torvalds1da177e2005-04-16 15:20:36 -0700438 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
439 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
440 atomic_read(&ipv4_dst_ops.entries),
441 st->in_hit,
442 st->in_slow_tot,
443 st->in_slow_mc,
444 st->in_no_route,
445 st->in_brd,
446 st->in_martian_dst,
447 st->in_martian_src,
448
449 st->out_hit,
450 st->out_slow_tot,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900451 st->out_slow_mc,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700452
453 st->gc_total,
454 st->gc_ignored,
455 st->gc_goal_miss,
456 st->gc_dst_overflow,
457 st->in_hlist_search,
458 st->out_hlist_search
459 );
460 return 0;
461}
462
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700463static const struct seq_operations rt_cpu_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700464 .start = rt_cpu_seq_start,
465 .next = rt_cpu_seq_next,
466 .stop = rt_cpu_seq_stop,
467 .show = rt_cpu_seq_show,
468};
469
470
471static int rt_cpu_seq_open(struct inode *inode, struct file *file)
472{
473 return seq_open(file, &rt_cpu_seq_ops);
474}
475
Arjan van de Ven9a321442007-02-12 00:55:35 -0800476static const struct file_operations rt_cpu_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700477 .owner = THIS_MODULE,
478 .open = rt_cpu_seq_open,
479 .read = seq_read,
480 .llseek = seq_lseek,
481 .release = seq_release,
482};
483
484#endif /* CONFIG_PROC_FS */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900485
Linus Torvalds1da177e2005-04-16 15:20:36 -0700486static __inline__ void rt_free(struct rtable *rt)
487{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700488 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
489}
490
/*
 * Release the caller's reference on @rt with ip_rt_put(), then queue the
 * entry for freeing exactly as rt_free() does.
 */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	rt_free(rt);
}
496
497static __inline__ int rt_fast_clean(struct rtable *rth)
498{
499 /* Kill broadcast/multicast entries very aggresively, if they
500 collide in hash table with more useful entries */
501 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800502 rth->fl.iif && rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700503}
504
505static __inline__ int rt_valuable(struct rtable *rth)
506{
507 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
508 rth->u.dst.expires;
509}
510
511static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
512{
513 unsigned long age;
514 int ret = 0;
515
516 if (atomic_read(&rth->u.dst.__refcnt))
517 goto out;
518
519 ret = 1;
520 if (rth->u.dst.expires &&
521 time_after_eq(jiffies, rth->u.dst.expires))
522 goto out;
523
524 age = jiffies - rth->u.dst.lastuse;
525 ret = 0;
526 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
527 (age <= tmo2 && rt_valuable(rth)))
528 goto out;
529 ret = 1;
530out: return ret;
531}
532
533/* Bits of score are:
534 * 31: very valuable
535 * 30: not quite useless
536 * 29..0: usage counter
537 */
538static inline u32 rt_score(struct rtable *rt)
539{
540 u32 score = jiffies - rt->u.dst.lastuse;
541
542 score = ~score & ~(3<<30);
543
544 if (rt_valuable(rt))
545 score |= (1<<31);
546
547 if (!rt->fl.iif ||
548 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
549 score |= (1<<30);
550
551 return score;
552}
553
554static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
555{
Al Viro714e85b2006-11-14 20:51:49 -0800556 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
557 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
Thomas Graf47dcf0c2006-11-09 15:20:38 -0800558 (fl1->mark ^ fl2->mark) |
David S. Miller8238b212006-10-12 00:49:15 -0700559 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
560 *(u16 *)&fl2->nl_u.ip4_u.tos) |
561 (fl1->oif ^ fl2->oif) |
562 (fl1->iif ^ fl2->iif)) == 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700563}
564
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800565/*
566 * Perform a full scan of hash table and free all entries.
567 * Can be called by a softirq or a process.
568 * In the later case, we want to be reschedule if necessary
569 */
570static void rt_do_flush(int process_context)
571{
572 unsigned int i;
573 struct rtable *rth, *next;
574
575 for (i = 0; i <= rt_hash_mask; i++) {
576 if (process_context && need_resched())
577 cond_resched();
578 rth = rt_hash_table[i].chain;
579 if (!rth)
580 continue;
581
582 spin_lock_bh(rt_hash_lock_addr(i));
583 rth = rt_hash_table[i].chain;
584 rt_hash_table[i].chain = NULL;
585 spin_unlock_bh(rt_hash_lock_addr(i));
586
587 for (; rth; rth = next) {
588 next = rth->u.dst.rt_next;
589 rt_free(rth);
590 }
591 }
592}
593
594static void rt_check_expire(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700595{
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700596 static unsigned int rover;
597 unsigned int i = rover, goal;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700598 struct rtable *rth, **rthp;
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700599 u64 mult;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700600
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700601 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
602 if (ip_rt_gc_timeout > 1)
603 do_div(mult, ip_rt_gc_timeout);
604 goal = (unsigned int)mult;
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700605 if (goal > rt_hash_mask)
606 goal = rt_hash_mask + 1;
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700607 for (; goal > 0; goal--) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700608 unsigned long tmo = ip_rt_gc_timeout;
609
610 i = (i + 1) & rt_hash_mask;
611 rthp = &rt_hash_table[i].chain;
612
Eric Dumazetd90bf5a2007-11-14 16:14:05 -0800613 if (need_resched())
614 cond_resched();
615
Stephen Hemmingercfcabdc2007-10-09 01:59:42 -0700616 if (*rthp == NULL)
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700617 continue;
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700618 spin_lock_bh(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700619 while ((rth = *rthp) != NULL) {
620 if (rth->u.dst.expires) {
621 /* Entry is expired even if it is in use */
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700622 if (time_before_eq(jiffies, rth->u.dst.expires)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700623 tmo >>= 1;
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800624 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700625 continue;
626 }
627 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
628 tmo >>= 1;
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800629 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700630 continue;
631 }
632
633 /* Cleanup aged off entries. */
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800634 *rthp = rth->u.dst.rt_next;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900635 rt_free(rth);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700636 }
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700637 spin_unlock_bh(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700638 }
639 rover = i;
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800640}
641
642/*
643 * rt_worker_func() is run in process context.
644 * If a whole flush was scheduled, it is done.
645 * Else, we call rt_check_expire() to scan part of the hash table
646 */
647static void rt_worker_func(struct work_struct *work)
648{
649 if (ip_rt_flush_expected) {
650 ip_rt_flush_expected = 0;
651 rt_do_flush(1);
652 } else
653 rt_check_expire();
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700654 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700655}
656
657/* This can run from both BH and non-BH contexts, the latter
658 * in the case of a forced flush event.
659 */
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800660static void rt_run_flush(unsigned long process_context)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700661{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700662 rt_deadline = 0;
663
664 get_random_bytes(&rt_hash_rnd, 4);
665
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800666 rt_do_flush(process_context);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700667}
668
669static DEFINE_SPINLOCK(rt_flush_lock);
670
671void rt_cache_flush(int delay)
672{
673 unsigned long now = jiffies;
674 int user_mode = !in_softirq();
675
676 if (delay < 0)
677 delay = ip_rt_min_delay;
678
Linus Torvalds1da177e2005-04-16 15:20:36 -0700679 spin_lock_bh(&rt_flush_lock);
680
681 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
682 long tmo = (long)(rt_deadline - now);
683
684 /* If flush timer is already running
685 and flush request is not immediate (delay > 0):
686
687 if deadline is not achieved, prolongate timer to "delay",
688 otherwise fire it at deadline time.
689 */
690
691 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
692 tmo = 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900693
Linus Torvalds1da177e2005-04-16 15:20:36 -0700694 if (delay > tmo)
695 delay = tmo;
696 }
697
698 if (delay <= 0) {
699 spin_unlock_bh(&rt_flush_lock);
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800700 rt_run_flush(user_mode);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700701 return;
702 }
703
704 if (rt_deadline == 0)
705 rt_deadline = now + ip_rt_max_delay;
706
707 mod_timer(&rt_flush_timer, now+delay);
708 spin_unlock_bh(&rt_flush_lock);
709}
710
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800711/*
712 * We change rt_hash_rnd and ask next rt_worker_func() invocation
713 * to perform a flush in process context
714 */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700715static void rt_secret_rebuild(unsigned long dummy)
716{
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800717 get_random_bytes(&rt_hash_rnd, 4);
718 ip_rt_flush_expected = 1;
719 cancel_delayed_work(&expires_work);
720 schedule_delayed_work(&expires_work, HZ/10);
721 mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700722}
723
724/*
725 Short description of GC goals.
726
727 We want to build algorithm, which will keep routing cache
728 at some equilibrium point, when number of aged off entries
729 is kept approximately equal to newly generated ones.
730
731 Current expiration strength is variable "expire".
732 We try to adjust it dynamically, so that if networking
733 is idle expires is large enough to keep enough of warm entries,
734 and when load increases it reduces to limit cache size.
735 */
736
737static int rt_garbage_collect(void)
738{
739 static unsigned long expire = RT_GC_TIMEOUT;
740 static unsigned long last_gc;
741 static int rover;
742 static int equilibrium;
743 struct rtable *rth, **rthp;
744 unsigned long now = jiffies;
745 int goal;
746
747 /*
748 * Garbage collection is pretty expensive,
749 * do not make it too frequently.
750 */
751
752 RT_CACHE_STAT_INC(gc_total);
753
754 if (now - last_gc < ip_rt_gc_min_interval &&
755 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
756 RT_CACHE_STAT_INC(gc_ignored);
757 goto out;
758 }
759
760 /* Calculate number of entries, which we want to expire now. */
761 goal = atomic_read(&ipv4_dst_ops.entries) -
762 (ip_rt_gc_elasticity << rt_hash_log);
763 if (goal <= 0) {
764 if (equilibrium < ipv4_dst_ops.gc_thresh)
765 equilibrium = ipv4_dst_ops.gc_thresh;
766 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
767 if (goal > 0) {
768 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
769 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
770 }
771 } else {
772 /* We are in dangerous area. Try to reduce cache really
773 * aggressively.
774 */
775 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
776 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
777 }
778
779 if (now - last_gc >= ip_rt_gc_min_interval)
780 last_gc = now;
781
782 if (goal <= 0) {
783 equilibrium += goal;
784 goto work_done;
785 }
786
787 do {
788 int i, k;
789
790 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
791 unsigned long tmo = expire;
792
793 k = (k + 1) & rt_hash_mask;
794 rthp = &rt_hash_table[k].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700795 spin_lock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700796 while ((rth = *rthp) != NULL) {
797 if (!rt_may_expire(rth, tmo, expire)) {
798 tmo >>= 1;
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800799 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700800 continue;
801 }
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800802 *rthp = rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700803 rt_free(rth);
804 goal--;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700805 }
Eric Dumazet22c047c2005-07-05 14:55:24 -0700806 spin_unlock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700807 if (goal <= 0)
808 break;
809 }
810 rover = k;
811
812 if (goal <= 0)
813 goto work_done;
814
815 /* Goal is not achieved. We stop process if:
816
817 - if expire reduced to zero. Otherwise, expire is halfed.
818 - if table is not full.
819 - if we are called from interrupt.
820 - jiffies check is just fallback/debug loop breaker.
821 We will not spin here for long time in any case.
822 */
823
824 RT_CACHE_STAT_INC(gc_goal_miss);
825
826 if (expire == 0)
827 break;
828
829 expire >>= 1;
830#if RT_CACHE_DEBUG >= 2
831 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
832 atomic_read(&ipv4_dst_ops.entries), goal, i);
833#endif
834
835 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
836 goto out;
837 } while (!in_softirq() && time_before_eq(jiffies, now));
838
839 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
840 goto out;
841 if (net_ratelimit())
842 printk(KERN_WARNING "dst cache overflow\n");
843 RT_CACHE_STAT_INC(gc_dst_overflow);
844 return 1;
845
846work_done:
847 expire += ip_rt_gc_min_interval;
848 if (expire > ip_rt_gc_timeout ||
849 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
850 expire = ip_rt_gc_timeout;
851#if RT_CACHE_DEBUG >= 2
852 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
853 atomic_read(&ipv4_dst_ops.entries), goal, rover);
854#endif
855out: return 0;
856}
857
858static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
859{
860 struct rtable *rth, **rthp;
861 unsigned long now;
862 struct rtable *cand, **candp;
863 u32 min_score;
864 int chain_length;
865 int attempts = !in_softirq();
866
867restart:
868 chain_length = 0;
869 min_score = ~(u32)0;
870 cand = NULL;
871 candp = NULL;
872 now = jiffies;
873
874 rthp = &rt_hash_table[hash].chain;
875
Eric Dumazet22c047c2005-07-05 14:55:24 -0700876 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700877 while ((rth = *rthp) != NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700878 if (compare_keys(&rth->fl, &rt->fl)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700879 /* Put it first */
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800880 *rthp = rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700881 /*
882 * Since lookup is lockfree, the deletion
883 * must be visible to another weakly ordered CPU before
884 * the insertion at the start of the hash chain.
885 */
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800886 rcu_assign_pointer(rth->u.dst.rt_next,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700887 rt_hash_table[hash].chain);
888 /*
889 * Since lookup is lockfree, the update writes
890 * must be ordered for consistency on SMP.
891 */
892 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
893
Pavel Emelyanov03f49f32007-11-10 21:28:34 -0800894 dst_use(&rth->u.dst, now);
Eric Dumazet22c047c2005-07-05 14:55:24 -0700895 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700896
897 rt_drop(rt);
898 *rp = rth;
899 return 0;
900 }
901
902 if (!atomic_read(&rth->u.dst.__refcnt)) {
903 u32 score = rt_score(rth);
904
905 if (score <= min_score) {
906 cand = rth;
907 candp = rthp;
908 min_score = score;
909 }
910 }
911
912 chain_length++;
913
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800914 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700915 }
916
917 if (cand) {
918 /* ip_rt_gc_elasticity used to be average length of chain
919 * length, when exceeded gc becomes really aggressive.
920 *
921 * The second limit is less certain. At the moment it allows
922 * only 2 entries per bucket. We will see.
923 */
924 if (chain_length > ip_rt_gc_elasticity) {
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800925 *candp = cand->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700926 rt_free(cand);
927 }
928 }
929
930 /* Try to bind route to arp only if it is output
931 route or unicast forwarding path.
932 */
933 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
934 int err = arp_bind_neighbour(&rt->u.dst);
935 if (err) {
Eric Dumazet22c047c2005-07-05 14:55:24 -0700936 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700937
938 if (err != -ENOBUFS) {
939 rt_drop(rt);
940 return err;
941 }
942
943 /* Neighbour tables are full and nothing
944 can be released. Try to shrink route cache,
945 it is most likely it holds some neighbour records.
946 */
947 if (attempts-- > 0) {
948 int saved_elasticity = ip_rt_gc_elasticity;
949 int saved_int = ip_rt_gc_min_interval;
950 ip_rt_gc_elasticity = 1;
951 ip_rt_gc_min_interval = 0;
952 rt_garbage_collect();
953 ip_rt_gc_min_interval = saved_int;
954 ip_rt_gc_elasticity = saved_elasticity;
955 goto restart;
956 }
957
958 if (net_ratelimit())
959 printk(KERN_WARNING "Neighbour table overflow.\n");
960 rt_drop(rt);
961 return -ENOBUFS;
962 }
963 }
964
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800965 rt->u.dst.rt_next = rt_hash_table[hash].chain;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700966#if RT_CACHE_DEBUG >= 2
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800967 if (rt->u.dst.rt_next) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700968 struct rtable *trt;
969 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
970 NIPQUAD(rt->rt_dst));
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800971 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700972 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
973 printk("\n");
974 }
975#endif
976 rt_hash_table[hash].chain = rt;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700977 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700978 *rp = rt;
979 return 0;
980}
981
982void rt_bind_peer(struct rtable *rt, int create)
983{
984 static DEFINE_SPINLOCK(rt_peer_lock);
985 struct inet_peer *peer;
986
987 peer = inet_getpeer(rt->rt_dst, create);
988
989 spin_lock_bh(&rt_peer_lock);
990 if (rt->peer == NULL) {
991 rt->peer = peer;
992 peer = NULL;
993 }
994 spin_unlock_bh(&rt_peer_lock);
995 if (peer)
996 inet_putpeer(peer);
997}
998
999/*
1000 * Peer allocation may fail only in serious out-of-memory conditions. However
1001 * we still can generate some output.
1002 * Random ID selection looks a bit dangerous because we have no chances to
1003 * select ID being unique in a reasonable period of time.
1004 * But broken packet identifier may be better than no packet at all.
1005 */
1006static void ip_select_fb_ident(struct iphdr *iph)
1007{
1008 static DEFINE_SPINLOCK(ip_fb_id_lock);
1009 static u32 ip_fallback_id;
1010 u32 salt;
1011
1012 spin_lock_bh(&ip_fb_id_lock);
Al Viroe4485152006-09-26 22:15:01 -07001013 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001014 iph->id = htons(salt & 0xFFFF);
1015 ip_fallback_id = salt;
1016 spin_unlock_bh(&ip_fb_id_lock);
1017}
1018
1019void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1020{
1021 struct rtable *rt = (struct rtable *) dst;
1022
1023 if (rt) {
1024 if (rt->peer == NULL)
1025 rt_bind_peer(rt, 1);
1026
1027 /* If peer is attached to destination, it is never detached,
1028 so that we need not to grab a lock to dereference it.
1029 */
1030 if (rt->peer) {
1031 iph->id = htons(inet_getid(rt->peer, more));
1032 return;
1033 }
1034 } else
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001035 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
Stephen Hemminger9c2b3322005-04-19 22:39:42 -07001036 __builtin_return_address(0));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001037
1038 ip_select_fb_ident(iph);
1039}
1040
1041static void rt_del(unsigned hash, struct rtable *rt)
1042{
1043 struct rtable **rthp;
1044
Eric Dumazet22c047c2005-07-05 14:55:24 -07001045 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001046 ip_rt_put(rt);
1047 for (rthp = &rt_hash_table[hash].chain; *rthp;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001048 rthp = &(*rthp)->u.dst.rt_next)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001049 if (*rthp == rt) {
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001050 *rthp = rt->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001051 rt_free(rt);
1052 break;
1053 }
Eric Dumazet22c047c2005-07-05 14:55:24 -07001054 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001055}
1056
Al Virof7655222006-09-26 21:25:43 -07001057void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1058 __be32 saddr, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001059{
1060 int i, k;
1061 struct in_device *in_dev = in_dev_get(dev);
1062 struct rtable *rth, **rthp;
Al Virof7655222006-09-26 21:25:43 -07001063 __be32 skeys[2] = { saddr, 0 };
Linus Torvalds1da177e2005-04-16 15:20:36 -07001064 int ikeys[2] = { dev->ifindex, 0 };
Tom Tucker8d717402006-07-30 20:43:36 -07001065 struct netevent_redirect netevent;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001066
Linus Torvalds1da177e2005-04-16 15:20:36 -07001067 if (!in_dev)
1068 return;
1069
1070 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1071 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1072 goto reject_redirect;
1073
1074 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1075 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1076 goto reject_redirect;
1077 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1078 goto reject_redirect;
1079 } else {
1080 if (inet_addr_type(new_gw) != RTN_UNICAST)
1081 goto reject_redirect;
1082 }
1083
1084 for (i = 0; i < 2; i++) {
1085 for (k = 0; k < 2; k++) {
Al Viro8c7bc842006-09-26 21:26:19 -07001086 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001087
1088 rthp=&rt_hash_table[hash].chain;
1089
1090 rcu_read_lock();
1091 while ((rth = rcu_dereference(*rthp)) != NULL) {
1092 struct rtable *rt;
1093
1094 if (rth->fl.fl4_dst != daddr ||
1095 rth->fl.fl4_src != skeys[i] ||
Linus Torvalds1da177e2005-04-16 15:20:36 -07001096 rth->fl.oif != ikeys[k] ||
1097 rth->fl.iif != 0) {
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001098 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001099 continue;
1100 }
1101
1102 if (rth->rt_dst != daddr ||
1103 rth->rt_src != saddr ||
1104 rth->u.dst.error ||
1105 rth->rt_gateway != old_gw ||
1106 rth->u.dst.dev != dev)
1107 break;
1108
1109 dst_hold(&rth->u.dst);
1110 rcu_read_unlock();
1111
1112 rt = dst_alloc(&ipv4_dst_ops);
1113 if (rt == NULL) {
1114 ip_rt_put(rth);
1115 in_dev_put(in_dev);
1116 return;
1117 }
1118
1119 /* Copy all the information. */
1120 *rt = *rth;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001121 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001122 rt->u.dst.__use = 1;
1123 atomic_set(&rt->u.dst.__refcnt, 1);
1124 rt->u.dst.child = NULL;
1125 if (rt->u.dst.dev)
1126 dev_hold(rt->u.dst.dev);
1127 if (rt->idev)
1128 in_dev_hold(rt->idev);
1129 rt->u.dst.obsolete = 0;
1130 rt->u.dst.lastuse = jiffies;
1131 rt->u.dst.path = &rt->u.dst;
1132 rt->u.dst.neighbour = NULL;
1133 rt->u.dst.hh = NULL;
1134 rt->u.dst.xfrm = NULL;
1135
1136 rt->rt_flags |= RTCF_REDIRECTED;
1137
1138 /* Gateway is different ... */
1139 rt->rt_gateway = new_gw;
1140
1141 /* Redirect received -> path was valid */
1142 dst_confirm(&rth->u.dst);
1143
1144 if (rt->peer)
1145 atomic_inc(&rt->peer->refcnt);
1146
1147 if (arp_bind_neighbour(&rt->u.dst) ||
1148 !(rt->u.dst.neighbour->nud_state &
1149 NUD_VALID)) {
1150 if (rt->u.dst.neighbour)
1151 neigh_event_send(rt->u.dst.neighbour, NULL);
1152 ip_rt_put(rth);
1153 rt_drop(rt);
1154 goto do_next;
1155 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001156
Tom Tucker8d717402006-07-30 20:43:36 -07001157 netevent.old = &rth->u.dst;
1158 netevent.new = &rt->u.dst;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001159 call_netevent_notifiers(NETEVENT_REDIRECT,
1160 &netevent);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001161
1162 rt_del(hash, rth);
1163 if (!rt_intern_hash(hash, rt, &rt))
1164 ip_rt_put(rt);
1165 goto do_next;
1166 }
1167 rcu_read_unlock();
1168 do_next:
1169 ;
1170 }
1171 }
1172 in_dev_put(in_dev);
1173 return;
1174
1175reject_redirect:
1176#ifdef CONFIG_IP_ROUTE_VERBOSE
1177 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1178 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1179 "%u.%u.%u.%u ignored.\n"
Ilia Sotnikovcef26852006-03-25 01:38:55 -08001180 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07001181 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
Ilia Sotnikovcef26852006-03-25 01:38:55 -08001182 NIPQUAD(saddr), NIPQUAD(daddr));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001183#endif
1184 in_dev_put(in_dev);
1185}
1186
1187static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1188{
1189 struct rtable *rt = (struct rtable*)dst;
1190 struct dst_entry *ret = dst;
1191
1192 if (rt) {
1193 if (dst->obsolete) {
1194 ip_rt_put(rt);
1195 ret = NULL;
1196 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1197 rt->u.dst.expires) {
Al Viro8c7bc842006-09-26 21:26:19 -07001198 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1199 rt->fl.oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001200#if RT_CACHE_DEBUG >= 1
Denis V. Lunev56c99d02007-12-06 02:19:07 -08001201 printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
Linus Torvalds1da177e2005-04-16 15:20:36 -07001202 "%u.%u.%u.%u/%02x dropped\n",
1203 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1204#endif
1205 rt_del(hash, rt);
1206 ret = NULL;
1207 }
1208 }
1209 return ret;
1210}
1211
1212/*
1213 * Algorithm:
1214 * 1. The first ip_rt_redirect_number redirects are sent
1215 * with exponential backoff, then we stop sending them at all,
1216 * assuming that the host ignores our redirects.
1217 * 2. If we did not see packets requiring redirects
1218 * during ip_rt_redirect_silence, we assume that the host
1219 * forgot redirected route and start to send redirects again.
1220 *
1221 * This algorithm is much cheaper and more intelligent than dumb load limiting
1222 * in icmp.c.
1223 *
1224 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1225 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1226 */
1227
1228void ip_rt_send_redirect(struct sk_buff *skb)
1229{
1230 struct rtable *rt = (struct rtable*)skb->dst;
1231 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1232
1233 if (!in_dev)
1234 return;
1235
1236 if (!IN_DEV_TX_REDIRECTS(in_dev))
1237 goto out;
1238
1239 /* No redirected packets during ip_rt_redirect_silence;
1240 * reset the algorithm.
1241 */
1242 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1243 rt->u.dst.rate_tokens = 0;
1244
1245 /* Too many ignored redirects; do not send anything
1246 * set u.dst.rate_last to the last seen redirected packet.
1247 */
1248 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1249 rt->u.dst.rate_last = jiffies;
1250 goto out;
1251 }
1252
1253 /* Check for load limit; set rate_last to the latest sent
1254 * redirect.
1255 */
Li Yewang14fb8a72006-12-18 00:26:35 -08001256 if (rt->u.dst.rate_tokens == 0 ||
1257 time_after(jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001258 (rt->u.dst.rate_last +
1259 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1260 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1261 rt->u.dst.rate_last = jiffies;
1262 ++rt->u.dst.rate_tokens;
1263#ifdef CONFIG_IP_ROUTE_VERBOSE
1264 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1265 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1266 net_ratelimit())
1267 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1268 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1269 NIPQUAD(rt->rt_src), rt->rt_iif,
1270 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1271#endif
1272 }
1273out:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001274 in_dev_put(in_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001275}
1276
1277static int ip_error(struct sk_buff *skb)
1278{
1279 struct rtable *rt = (struct rtable*)skb->dst;
1280 unsigned long now;
1281 int code;
1282
1283 switch (rt->u.dst.error) {
1284 case EINVAL:
1285 default:
1286 goto out;
1287 case EHOSTUNREACH:
1288 code = ICMP_HOST_UNREACH;
1289 break;
1290 case ENETUNREACH:
1291 code = ICMP_NET_UNREACH;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08001292 IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001293 break;
1294 case EACCES:
1295 code = ICMP_PKT_FILTERED;
1296 break;
1297 }
1298
1299 now = jiffies;
1300 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1301 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1302 rt->u.dst.rate_tokens = ip_rt_error_burst;
1303 rt->u.dst.rate_last = now;
1304 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1305 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1306 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1307 }
1308
1309out: kfree_skb(skb);
1310 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001311}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001312
/*
 *	MTU plateau table, in descending order.
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest plateau strictly below @old_mtu, or 68 when
 * @old_mtu is not above any table entry.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	for (; plateau != end; plateau++) {
		if (*plateau < old_mtu)
			return *plateau;
	}
	return 68;
}
1330
1331unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1332{
1333 int i;
1334 unsigned short old_mtu = ntohs(iph->tot_len);
1335 struct rtable *rth;
Al Viroe4485152006-09-26 22:15:01 -07001336 __be32 skeys[2] = { iph->saddr, 0, };
1337 __be32 daddr = iph->daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001338 unsigned short est_mtu = 0;
1339
1340 if (ipv4_config.no_pmtu_disc)
1341 return 0;
1342
1343 for (i = 0; i < 2; i++) {
Al Viro8c7bc842006-09-26 21:26:19 -07001344 unsigned hash = rt_hash(daddr, skeys[i], 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001345
1346 rcu_read_lock();
1347 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001348 rth = rcu_dereference(rth->u.dst.rt_next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001349 if (rth->fl.fl4_dst == daddr &&
1350 rth->fl.fl4_src == skeys[i] &&
1351 rth->rt_dst == daddr &&
1352 rth->rt_src == iph->saddr &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001353 rth->fl.iif == 0 &&
1354 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1355 unsigned short mtu = new_mtu;
1356
1357 if (new_mtu < 68 || new_mtu >= old_mtu) {
1358
1359 /* BSD 4.2 compatibility hack :-( */
1360 if (mtu == 0 &&
1361 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1362 old_mtu >= 68 + (iph->ihl << 2))
1363 old_mtu -= iph->ihl << 2;
1364
1365 mtu = guess_mtu(old_mtu);
1366 }
1367 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001368 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001369 dst_confirm(&rth->u.dst);
1370 if (mtu < ip_rt_min_pmtu) {
1371 mtu = ip_rt_min_pmtu;
1372 rth->u.dst.metrics[RTAX_LOCK-1] |=
1373 (1 << RTAX_MTU);
1374 }
1375 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1376 dst_set_expires(&rth->u.dst,
1377 ip_rt_mtu_expires);
1378 }
1379 est_mtu = mtu;
1380 }
1381 }
1382 }
1383 rcu_read_unlock();
1384 }
1385 return est_mtu ? : new_mtu;
1386}
1387
1388static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1389{
1390 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1391 !(dst_metric_locked(dst, RTAX_MTU))) {
1392 if (mtu < ip_rt_min_pmtu) {
1393 mtu = ip_rt_min_pmtu;
1394 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1395 }
1396 dst->metrics[RTAX_MTU-1] = mtu;
1397 dst_set_expires(dst, ip_rt_mtu_expires);
Tom Tucker8d717402006-07-30 20:43:36 -07001398 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001399 }
1400}
1401
1402static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1403{
1404 return NULL;
1405}
1406
1407static void ipv4_dst_destroy(struct dst_entry *dst)
1408{
1409 struct rtable *rt = (struct rtable *) dst;
1410 struct inet_peer *peer = rt->peer;
1411 struct in_device *idev = rt->idev;
1412
1413 if (peer) {
1414 rt->peer = NULL;
1415 inet_putpeer(peer);
1416 }
1417
1418 if (idev) {
1419 rt->idev = NULL;
1420 in_dev_put(idev);
1421 }
1422}
1423
1424static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1425 int how)
1426{
1427 struct rtable *rt = (struct rtable *) dst;
1428 struct in_device *idev = rt->idev;
Eric W. Biederman2774c7a2007-09-26 22:10:56 -07001429 if (dev != init_net.loopback_dev && idev && idev->dev == dev) {
1430 struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001431 if (loopback_idev) {
1432 rt->idev = loopback_idev;
1433 in_dev_put(idev);
1434 }
1435 }
1436}
1437
1438static void ipv4_link_failure(struct sk_buff *skb)
1439{
1440 struct rtable *rt;
1441
1442 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1443
1444 rt = (struct rtable *) skb->dst;
1445 if (rt)
1446 dst_set_expires(&rt->u.dst, 0);
1447}
1448
1449static int ip_rt_bug(struct sk_buff *skb)
1450{
1451 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001452 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
Linus Torvalds1da177e2005-04-16 15:20:36 -07001453 skb->dev ? skb->dev->name : "?");
1454 kfree_skb(skb);
1455 return 0;
1456}
1457
1458/*
1459 We do not cache source address of outgoing interface,
1460 because it is used only by IP RR, TS and SRR options,
1461 so that it out of fast path.
1462
1463 BTW remember: "addr" is allowed to be not aligned
1464 in IP options!
1465 */
1466
1467void ip_rt_get_source(u8 *addr, struct rtable *rt)
1468{
Al Viroa61ced52006-09-26 21:27:54 -07001469 __be32 src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001470 struct fib_result res;
1471
1472 if (rt->fl.iif == 0)
1473 src = rt->rt_src;
1474 else if (fib_lookup(&rt->fl, &res) == 0) {
1475 src = FIB_RES_PREFSRC(res);
1476 fib_res_put(&res);
1477 } else
1478 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1479 RT_SCOPE_UNIVERSE);
1480 memcpy(addr, &src, 4);
1481}
1482
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid.  The low and the high 16-bit
 * halves are handled independently: a half is taken from @tag only
 * when the corresponding half of tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	struct dst_entry *dst = &rt->u.dst;

	if ((dst->tclassid & 0xFFFF) == 0)
		dst->tclassid |= tag & 0xFFFF;
	if ((dst->tclassid & 0xFFFF0000) == 0)
		dst->tclassid |= tag & 0xFFFF0000;
}
#endif
1492
1493static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1494{
1495 struct fib_info *fi = res->fi;
1496
1497 if (fi) {
1498 if (FIB_RES_GW(*res) &&
1499 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1500 rt->rt_gateway = FIB_RES_GW(*res);
1501 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1502 sizeof(rt->u.dst.metrics));
1503 if (fi->fib_mtu == 0) {
1504 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1505 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1506 rt->rt_gateway != rt->rt_dst &&
1507 rt->u.dst.dev->mtu > 576)
1508 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1509 }
1510#ifdef CONFIG_NET_CLS_ROUTE
1511 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1512#endif
1513 } else
1514 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1515
1516 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1517 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1518 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1519 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1520 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1521 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1522 ip_rt_min_advmss);
1523 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1524 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1525
1526#ifdef CONFIG_NET_CLS_ROUTE
1527#ifdef CONFIG_IP_MULTIPLE_TABLES
1528 set_class_tag(rt, fib_rules_tclass(res));
1529#endif
1530 set_class_tag(rt, itag);
1531#endif
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001532 rt->rt_type = res->type;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001533}
1534
Al Viro9e12bb22006-09-26 21:25:20 -07001535static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001536 u8 tos, struct net_device *dev, int our)
1537{
1538 unsigned hash;
1539 struct rtable *rth;
Al Viroa61ced52006-09-26 21:27:54 -07001540 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001541 struct in_device *in_dev = in_dev_get(dev);
1542 u32 itag = 0;
1543
1544 /* Primary sanity checks. */
1545
1546 if (in_dev == NULL)
1547 return -EINVAL;
1548
1549 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1550 skb->protocol != htons(ETH_P_IP))
1551 goto e_inval;
1552
1553 if (ZERONET(saddr)) {
1554 if (!LOCAL_MCAST(daddr))
1555 goto e_inval;
1556 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1557 } else if (fib_validate_source(saddr, 0, tos, 0,
1558 dev, &spec_dst, &itag) < 0)
1559 goto e_inval;
1560
1561 rth = dst_alloc(&ipv4_dst_ops);
1562 if (!rth)
1563 goto e_nobufs;
1564
1565 rth->u.dst.output= ip_rt_bug;
1566
1567 atomic_set(&rth->u.dst.__refcnt, 1);
1568 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07001569 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001570 rth->u.dst.flags |= DST_NOPOLICY;
1571 rth->fl.fl4_dst = daddr;
1572 rth->rt_dst = daddr;
1573 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001574 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001575 rth->fl.fl4_src = saddr;
1576 rth->rt_src = saddr;
1577#ifdef CONFIG_NET_CLS_ROUTE
1578 rth->u.dst.tclassid = itag;
1579#endif
1580 rth->rt_iif =
1581 rth->fl.iif = dev->ifindex;
Eric W. Biederman2774c7a2007-09-26 22:10:56 -07001582 rth->u.dst.dev = init_net.loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001583 dev_hold(rth->u.dst.dev);
1584 rth->idev = in_dev_get(rth->u.dst.dev);
1585 rth->fl.oif = 0;
1586 rth->rt_gateway = daddr;
1587 rth->rt_spec_dst= spec_dst;
1588 rth->rt_type = RTN_MULTICAST;
1589 rth->rt_flags = RTCF_MULTICAST;
1590 if (our) {
1591 rth->u.dst.input= ip_local_deliver;
1592 rth->rt_flags |= RTCF_LOCAL;
1593 }
1594
1595#ifdef CONFIG_IP_MROUTE
1596 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1597 rth->u.dst.input = ip_mr_input;
1598#endif
1599 RT_CACHE_STAT_INC(in_slow_mc);
1600
1601 in_dev_put(in_dev);
Al Viro8c7bc842006-09-26 21:26:19 -07001602 hash = rt_hash(daddr, saddr, dev->ifindex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001603 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1604
1605e_nobufs:
1606 in_dev_put(in_dev);
1607 return -ENOBUFS;
1608
1609e_inval:
1610 in_dev_put(in_dev);
1611 return -EINVAL;
1612}
1613
1614
1615static void ip_handle_martian_source(struct net_device *dev,
1616 struct in_device *in_dev,
1617 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07001618 __be32 daddr,
1619 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001620{
1621 RT_CACHE_STAT_INC(in_martian_src);
1622#ifdef CONFIG_IP_ROUTE_VERBOSE
1623 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1624 /*
1625 * RFC1812 recommendation, if source is martian,
1626 * the only hint is MAC header.
1627 */
1628 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1629 "%u.%u.%u.%u, on dev %s\n",
1630 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001631 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001632 int i;
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001633 const unsigned char *p = skb_mac_header(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001634 printk(KERN_WARNING "ll header: ");
1635 for (i = 0; i < dev->hard_header_len; i++, p++) {
1636 printk("%02x", *p);
1637 if (i < (dev->hard_header_len - 1))
1638 printk(":");
1639 }
1640 printk("\n");
1641 }
1642 }
1643#endif
1644}
1645
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001646static inline int __mkroute_input(struct sk_buff *skb,
1647 struct fib_result* res,
1648 struct in_device *in_dev,
Al Viro9e12bb22006-09-26 21:25:20 -07001649 __be32 daddr, __be32 saddr, u32 tos,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001650 struct rtable **result)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001651{
1652
1653 struct rtable *rth;
1654 int err;
1655 struct in_device *out_dev;
1656 unsigned flags = 0;
Al Virod9c9df82006-09-26 21:28:14 -07001657 __be32 spec_dst;
1658 u32 itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001659
1660 /* get a working reference to the output device */
1661 out_dev = in_dev_get(FIB_RES_DEV(*res));
1662 if (out_dev == NULL) {
1663 if (net_ratelimit())
1664 printk(KERN_CRIT "Bug in ip_route_input" \
1665 "_slow(). Please, report\n");
1666 return -EINVAL;
1667 }
1668
1669
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001670 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
Linus Torvalds1da177e2005-04-16 15:20:36 -07001671 in_dev->dev, &spec_dst, &itag);
1672 if (err < 0) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001673 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001674 saddr);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001675
Linus Torvalds1da177e2005-04-16 15:20:36 -07001676 err = -EINVAL;
1677 goto cleanup;
1678 }
1679
1680 if (err)
1681 flags |= RTCF_DIRECTSRC;
1682
1683 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1684 (IN_DEV_SHARED_MEDIA(out_dev) ||
1685 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1686 flags |= RTCF_DOREDIRECT;
1687
1688 if (skb->protocol != htons(ETH_P_IP)) {
1689 /* Not IP (i.e. ARP). Do not create route, if it is
1690 * invalid for proxy arp. DNAT routes are always valid.
1691 */
1692 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1693 err = -EINVAL;
1694 goto cleanup;
1695 }
1696 }
1697
1698
1699 rth = dst_alloc(&ipv4_dst_ops);
1700 if (!rth) {
1701 err = -ENOBUFS;
1702 goto cleanup;
1703 }
1704
Julian Anastasovce723d82005-09-08 13:34:47 -07001705 atomic_set(&rth->u.dst.__refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001706 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07001707 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001708 rth->u.dst.flags |= DST_NOPOLICY;
Herbert Xu42f811b2007-06-04 23:34:44 -07001709 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001710 rth->u.dst.flags |= DST_NOXFRM;
1711 rth->fl.fl4_dst = daddr;
1712 rth->rt_dst = daddr;
1713 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001714 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001715 rth->fl.fl4_src = saddr;
1716 rth->rt_src = saddr;
1717 rth->rt_gateway = daddr;
1718 rth->rt_iif =
1719 rth->fl.iif = in_dev->dev->ifindex;
1720 rth->u.dst.dev = (out_dev)->dev;
1721 dev_hold(rth->u.dst.dev);
1722 rth->idev = in_dev_get(rth->u.dst.dev);
1723 rth->fl.oif = 0;
1724 rth->rt_spec_dst= spec_dst;
1725
1726 rth->u.dst.input = ip_forward;
1727 rth->u.dst.output = ip_output;
1728
1729 rt_set_nexthop(rth, res, itag);
1730
1731 rth->rt_flags = flags;
1732
1733 *result = rth;
1734 err = 0;
1735 cleanup:
1736 /* release the working reference to the output device */
1737 in_dev_put(out_dev);
1738 return err;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001739}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001740
David S. Millere06e7c62007-06-10 17:22:39 -07001741static inline int ip_mkroute_input(struct sk_buff *skb,
1742 struct fib_result* res,
1743 const struct flowi *fl,
1744 struct in_device *in_dev,
1745 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001746{
Chuck Short7abaa272005-06-22 22:10:23 -07001747 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001748 int err;
1749 unsigned hash;
1750
1751#ifdef CONFIG_IP_ROUTE_MULTIPATH
1752 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1753 fib_select_multipath(fl, res);
1754#endif
1755
1756 /* create a routing cache entry */
1757 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1758 if (err)
1759 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001760
1761 /* put it into the cache */
Al Viro8c7bc842006-09-26 21:26:19 -07001762 hash = rt_hash(daddr, saddr, fl->iif);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001763 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001764}
1765
/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must already have the correct destination attached by the output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */
1775
Al Viro9e12bb22006-09-26 21:25:20 -07001776static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001777 u8 tos, struct net_device *dev)
1778{
1779 struct fib_result res;
1780 struct in_device *in_dev = in_dev_get(dev);
1781 struct flowi fl = { .nl_u = { .ip4_u =
1782 { .daddr = daddr,
1783 .saddr = saddr,
1784 .tos = tos,
1785 .scope = RT_SCOPE_UNIVERSE,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001786 } },
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001787 .mark = skb->mark,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001788 .iif = dev->ifindex };
1789 unsigned flags = 0;
1790 u32 itag = 0;
1791 struct rtable * rth;
1792 unsigned hash;
Al Viro9e12bb22006-09-26 21:25:20 -07001793 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001794 int err = -EINVAL;
1795 int free_res = 0;
1796
1797 /* IP on this device is disabled. */
1798
1799 if (!in_dev)
1800 goto out;
1801
1802 /* Check for the most weird martians, which can be not detected
1803 by fib_lookup.
1804 */
1805
1806 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1807 goto martian_source;
1808
Al Viroe4485152006-09-26 22:15:01 -07001809 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001810 goto brd_input;
1811
1812 /* Accept zero addresses only to limited broadcast;
1813 * I even do not know to fix it or not. Waiting for complains :-)
1814 */
1815 if (ZERONET(saddr))
1816 goto martian_source;
1817
1818 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1819 goto martian_destination;
1820
1821 /*
1822 * Now we are ready to route packet.
1823 */
1824 if ((err = fib_lookup(&fl, &res)) != 0) {
1825 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07001826 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001827 goto no_route;
1828 }
1829 free_res = 1;
1830
1831 RT_CACHE_STAT_INC(in_slow_tot);
1832
1833 if (res.type == RTN_BROADCAST)
1834 goto brd_input;
1835
1836 if (res.type == RTN_LOCAL) {
1837 int result;
1838 result = fib_validate_source(saddr, daddr, tos,
Eric W. Biederman2774c7a2007-09-26 22:10:56 -07001839 init_net.loopback_dev->ifindex,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001840 dev, &spec_dst, &itag);
1841 if (result < 0)
1842 goto martian_source;
1843 if (result)
1844 flags |= RTCF_DIRECTSRC;
1845 spec_dst = daddr;
1846 goto local_input;
1847 }
1848
1849 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07001850 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001851 if (res.type != RTN_UNICAST)
1852 goto martian_destination;
1853
1854 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001855done:
1856 in_dev_put(in_dev);
1857 if (free_res)
1858 fib_res_put(&res);
1859out: return err;
1860
1861brd_input:
1862 if (skb->protocol != htons(ETH_P_IP))
1863 goto e_inval;
1864
1865 if (ZERONET(saddr))
1866 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1867 else {
1868 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1869 &itag);
1870 if (err < 0)
1871 goto martian_source;
1872 if (err)
1873 flags |= RTCF_DIRECTSRC;
1874 }
1875 flags |= RTCF_BROADCAST;
1876 res.type = RTN_BROADCAST;
1877 RT_CACHE_STAT_INC(in_brd);
1878
1879local_input:
1880 rth = dst_alloc(&ipv4_dst_ops);
1881 if (!rth)
1882 goto e_nobufs;
1883
1884 rth->u.dst.output= ip_rt_bug;
1885
1886 atomic_set(&rth->u.dst.__refcnt, 1);
1887 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07001888 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001889 rth->u.dst.flags |= DST_NOPOLICY;
1890 rth->fl.fl4_dst = daddr;
1891 rth->rt_dst = daddr;
1892 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001893 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001894 rth->fl.fl4_src = saddr;
1895 rth->rt_src = saddr;
1896#ifdef CONFIG_NET_CLS_ROUTE
1897 rth->u.dst.tclassid = itag;
1898#endif
1899 rth->rt_iif =
1900 rth->fl.iif = dev->ifindex;
Eric W. Biederman2774c7a2007-09-26 22:10:56 -07001901 rth->u.dst.dev = init_net.loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001902 dev_hold(rth->u.dst.dev);
1903 rth->idev = in_dev_get(rth->u.dst.dev);
1904 rth->rt_gateway = daddr;
1905 rth->rt_spec_dst= spec_dst;
1906 rth->u.dst.input= ip_local_deliver;
1907 rth->rt_flags = flags|RTCF_LOCAL;
1908 if (res.type == RTN_UNREACHABLE) {
1909 rth->u.dst.input= ip_error;
1910 rth->u.dst.error= -err;
1911 rth->rt_flags &= ~RTCF_LOCAL;
1912 }
1913 rth->rt_type = res.type;
Al Viro8c7bc842006-09-26 21:26:19 -07001914 hash = rt_hash(daddr, saddr, fl.iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001915 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1916 goto done;
1917
1918no_route:
1919 RT_CACHE_STAT_INC(in_no_route);
1920 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1921 res.type = RTN_UNREACHABLE;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08001922 if (err == -ESRCH)
1923 err = -ENETUNREACH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001924 goto local_input;
1925
1926 /*
1927 * Do not cache martian addresses: they should be logged (RFC1812)
1928 */
1929martian_destination:
1930 RT_CACHE_STAT_INC(in_martian_dst);
1931#ifdef CONFIG_IP_ROUTE_VERBOSE
1932 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1933 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1934 "%u.%u.%u.%u, dev %s\n",
1935 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1936#endif
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07001937
1938e_hostunreach:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001939 err = -EHOSTUNREACH;
1940 goto done;
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07001941
Linus Torvalds1da177e2005-04-16 15:20:36 -07001942e_inval:
1943 err = -EINVAL;
1944 goto done;
1945
1946e_nobufs:
1947 err = -ENOBUFS;
1948 goto done;
1949
1950martian_source:
1951 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1952 goto e_inval;
1953}
1954
Al Viro9e12bb22006-09-26 21:25:20 -07001955int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001956 u8 tos, struct net_device *dev)
1957{
1958 struct rtable * rth;
1959 unsigned hash;
1960 int iif = dev->ifindex;
1961
1962 tos &= IPTOS_RT_MASK;
Al Viro8c7bc842006-09-26 21:26:19 -07001963 hash = rt_hash(daddr, saddr, iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001964
1965 rcu_read_lock();
1966 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001967 rth = rcu_dereference(rth->u.dst.rt_next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001968 if (rth->fl.fl4_dst == daddr &&
1969 rth->fl.fl4_src == saddr &&
1970 rth->fl.iif == iif &&
1971 rth->fl.oif == 0 &&
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001972 rth->fl.mark == skb->mark &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001973 rth->fl.fl4_tos == tos) {
Pavel Emelyanov03f49f32007-11-10 21:28:34 -08001974 dst_use(&rth->u.dst, jiffies);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001975 RT_CACHE_STAT_INC(in_hit);
1976 rcu_read_unlock();
1977 skb->dst = (struct dst_entry*)rth;
1978 return 0;
1979 }
1980 RT_CACHE_STAT_INC(in_hlist_search);
1981 }
1982 rcu_read_unlock();
1983
1984 /* Multicast recognition logic is moved from route cache to here.
1985 The problem was that too many Ethernet cards have broken/missing
1986 hardware multicast filters :-( As result the host on multicasting
1987 network acquires a lot of useless route cache entries, sort of
1988 SDR messages from all the world. Now we try to get rid of them.
1989 Really, provided software IP multicast filter is organized
1990 reasonably (at least, hashed), it does not result in a slowdown
1991 comparing with route cache reject entries.
1992 Note, that multicast routers are not affected, because
1993 route cache entry is created eventually.
1994 */
1995 if (MULTICAST(daddr)) {
1996 struct in_device *in_dev;
1997
1998 rcu_read_lock();
Herbert Xue5ed6392005-10-03 14:35:55 -07001999 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002000 int our = ip_check_mc(in_dev, daddr, saddr,
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07002001 ip_hdr(skb)->protocol);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002002 if (our
2003#ifdef CONFIG_IP_MROUTE
2004 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2005#endif
2006 ) {
2007 rcu_read_unlock();
2008 return ip_route_input_mc(skb, daddr, saddr,
2009 tos, dev, our);
2010 }
2011 }
2012 rcu_read_unlock();
2013 return -EINVAL;
2014 }
2015 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2016}
2017
2018static inline int __mkroute_output(struct rtable **result,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002019 struct fib_result* res,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002020 const struct flowi *fl,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002021 const struct flowi *oldflp,
2022 struct net_device *dev_out,
2023 unsigned flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002024{
2025 struct rtable *rth;
2026 struct in_device *in_dev;
2027 u32 tos = RT_FL_TOS(oldflp);
2028 int err = 0;
2029
2030 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2031 return -EINVAL;
2032
Al Viroe4485152006-09-26 22:15:01 -07002033 if (fl->fl4_dst == htonl(0xFFFFFFFF))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002034 res->type = RTN_BROADCAST;
2035 else if (MULTICAST(fl->fl4_dst))
2036 res->type = RTN_MULTICAST;
2037 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2038 return -EINVAL;
2039
2040 if (dev_out->flags & IFF_LOOPBACK)
2041 flags |= RTCF_LOCAL;
2042
2043 /* get work reference to inet device */
2044 in_dev = in_dev_get(dev_out);
2045 if (!in_dev)
2046 return -EINVAL;
2047
2048 if (res->type == RTN_BROADCAST) {
2049 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2050 if (res->fi) {
2051 fib_info_put(res->fi);
2052 res->fi = NULL;
2053 }
2054 } else if (res->type == RTN_MULTICAST) {
2055 flags |= RTCF_MULTICAST|RTCF_LOCAL;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002056 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002057 oldflp->proto))
2058 flags &= ~RTCF_LOCAL;
2059 /* If multicast route do not exist use
2060 default one, but do not gateway in this case.
2061 Yes, it is hack.
2062 */
2063 if (res->fi && res->prefixlen < 4) {
2064 fib_info_put(res->fi);
2065 res->fi = NULL;
2066 }
2067 }
2068
2069
2070 rth = dst_alloc(&ipv4_dst_ops);
2071 if (!rth) {
2072 err = -ENOBUFS;
2073 goto cleanup;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002074 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002075
Julian Anastasovce723d82005-09-08 13:34:47 -07002076 atomic_set(&rth->u.dst.__refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002077 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07002078 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002079 rth->u.dst.flags |= DST_NOXFRM;
Herbert Xu42f811b2007-06-04 23:34:44 -07002080 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002081 rth->u.dst.flags |= DST_NOPOLICY;
2082
2083 rth->fl.fl4_dst = oldflp->fl4_dst;
2084 rth->fl.fl4_tos = tos;
2085 rth->fl.fl4_src = oldflp->fl4_src;
2086 rth->fl.oif = oldflp->oif;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002087 rth->fl.mark = oldflp->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002088 rth->rt_dst = fl->fl4_dst;
2089 rth->rt_src = fl->fl4_src;
2090 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002091 /* get references to the devices that are to be hold by the routing
Linus Torvalds1da177e2005-04-16 15:20:36 -07002092 cache entry */
2093 rth->u.dst.dev = dev_out;
2094 dev_hold(dev_out);
2095 rth->idev = in_dev_get(dev_out);
2096 rth->rt_gateway = fl->fl4_dst;
2097 rth->rt_spec_dst= fl->fl4_src;
2098
2099 rth->u.dst.output=ip_output;
2100
2101 RT_CACHE_STAT_INC(out_slow_tot);
2102
2103 if (flags & RTCF_LOCAL) {
2104 rth->u.dst.input = ip_local_deliver;
2105 rth->rt_spec_dst = fl->fl4_dst;
2106 }
2107 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2108 rth->rt_spec_dst = fl->fl4_src;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002109 if (flags & RTCF_LOCAL &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002110 !(dev_out->flags & IFF_LOOPBACK)) {
2111 rth->u.dst.output = ip_mc_output;
2112 RT_CACHE_STAT_INC(out_slow_mc);
2113 }
2114#ifdef CONFIG_IP_MROUTE
2115 if (res->type == RTN_MULTICAST) {
2116 if (IN_DEV_MFORWARD(in_dev) &&
2117 !LOCAL_MCAST(oldflp->fl4_dst)) {
2118 rth->u.dst.input = ip_mr_input;
2119 rth->u.dst.output = ip_mc_output;
2120 }
2121 }
2122#endif
2123 }
2124
2125 rt_set_nexthop(rth, res, 0);
2126
2127 rth->rt_flags = flags;
2128
2129 *result = rth;
2130 cleanup:
2131 /* release work reference to inet device */
2132 in_dev_put(in_dev);
2133
2134 return err;
2135}
2136
David S. Millere06e7c62007-06-10 17:22:39 -07002137static inline int ip_mkroute_output(struct rtable **rp,
2138 struct fib_result* res,
2139 const struct flowi *fl,
2140 const struct flowi *oldflp,
2141 struct net_device *dev_out,
2142 unsigned flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002143{
Chuck Short7abaa272005-06-22 22:10:23 -07002144 struct rtable *rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002145 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2146 unsigned hash;
2147 if (err == 0) {
Al Viro8c7bc842006-09-26 21:26:19 -07002148 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002149 err = rt_intern_hash(hash, rth, rp);
2150 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002151
Linus Torvalds1da177e2005-04-16 15:20:36 -07002152 return err;
2153}
2154
Linus Torvalds1da177e2005-04-16 15:20:36 -07002155/*
2156 * Major route resolver routine.
2157 */
2158
2159static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2160{
2161 u32 tos = RT_FL_TOS(oldflp);
2162 struct flowi fl = { .nl_u = { .ip4_u =
2163 { .daddr = oldflp->fl4_dst,
2164 .saddr = oldflp->fl4_src,
2165 .tos = tos & IPTOS_RT_MASK,
2166 .scope = ((tos & RTO_ONLINK) ?
2167 RT_SCOPE_LINK :
2168 RT_SCOPE_UNIVERSE),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002169 } },
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002170 .mark = oldflp->mark,
Eric W. Biederman2774c7a2007-09-26 22:10:56 -07002171 .iif = init_net.loopback_dev->ifindex,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002172 .oif = oldflp->oif };
2173 struct fib_result res;
2174 unsigned flags = 0;
2175 struct net_device *dev_out = NULL;
2176 int free_res = 0;
2177 int err;
2178
2179
2180 res.fi = NULL;
2181#ifdef CONFIG_IP_MULTIPLE_TABLES
2182 res.r = NULL;
2183#endif
2184
2185 if (oldflp->fl4_src) {
2186 err = -EINVAL;
2187 if (MULTICAST(oldflp->fl4_src) ||
2188 BADCLASS(oldflp->fl4_src) ||
2189 ZERONET(oldflp->fl4_src))
2190 goto out;
2191
2192 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2193 dev_out = ip_dev_find(oldflp->fl4_src);
David S. Millerf6c5d732007-05-18 02:07:50 -07002194 if (dev_out == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002195 goto out;
2196
2197 /* I removed check for oif == dev_out->oif here.
2198 It was wrong for two reasons:
2199 1. ip_dev_find(saddr) can return wrong iface, if saddr is
2200 assigned to multiple interfaces.
2201 2. Moreover, we are allowed to send packets with saddr
2202 of another iface. --ANK
2203 */
2204
David S. Millerf6c5d732007-05-18 02:07:50 -07002205 if (oldflp->oif == 0
Al Viroe4485152006-09-26 22:15:01 -07002206 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002207 /* Special hack: user can direct multicasts
2208 and limited broadcast via necessary interface
2209 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2210 This hack is not just for fun, it allows
2211 vic,vat and friends to work.
2212 They bind socket to loopback, set ttl to zero
2213 and expect that it will work.
2214 From the viewpoint of routing cache they are broken,
2215 because we are not allowed to build multicast path
2216 with loopback source addr (look, routing cache
2217 cannot know, that ttl is zero, so that packet
2218 will not leave this host and route is valid).
2219 Luckily, this hack is good workaround.
2220 */
2221
2222 fl.oif = dev_out->ifindex;
2223 goto make_route;
2224 }
2225 if (dev_out)
2226 dev_put(dev_out);
2227 dev_out = NULL;
2228 }
2229
2230
2231 if (oldflp->oif) {
Eric W. Biederman881d9662007-09-17 11:56:21 -07002232 dev_out = dev_get_by_index(&init_net, oldflp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002233 err = -ENODEV;
2234 if (dev_out == NULL)
2235 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002236
2237 /* RACE: Check return value of inet_select_addr instead. */
2238 if (__in_dev_get_rtnl(dev_out) == NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002239 dev_put(dev_out);
2240 goto out; /* Wrong error code */
2241 }
2242
Al Viroe4485152006-09-26 22:15:01 -07002243 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002244 if (!fl.fl4_src)
2245 fl.fl4_src = inet_select_addr(dev_out, 0,
2246 RT_SCOPE_LINK);
2247 goto make_route;
2248 }
2249 if (!fl.fl4_src) {
2250 if (MULTICAST(oldflp->fl4_dst))
2251 fl.fl4_src = inet_select_addr(dev_out, 0,
2252 fl.fl4_scope);
2253 else if (!oldflp->fl4_dst)
2254 fl.fl4_src = inet_select_addr(dev_out, 0,
2255 RT_SCOPE_HOST);
2256 }
2257 }
2258
2259 if (!fl.fl4_dst) {
2260 fl.fl4_dst = fl.fl4_src;
2261 if (!fl.fl4_dst)
2262 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2263 if (dev_out)
2264 dev_put(dev_out);
Eric W. Biederman2774c7a2007-09-26 22:10:56 -07002265 dev_out = init_net.loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002266 dev_hold(dev_out);
Eric W. Biederman2774c7a2007-09-26 22:10:56 -07002267 fl.oif = init_net.loopback_dev->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002268 res.type = RTN_LOCAL;
2269 flags |= RTCF_LOCAL;
2270 goto make_route;
2271 }
2272
2273 if (fib_lookup(&fl, &res)) {
2274 res.fi = NULL;
2275 if (oldflp->oif) {
2276 /* Apparently, routing tables are wrong. Assume,
2277 that the destination is on link.
2278
2279 WHY? DW.
2280 Because we are allowed to send to iface
2281 even if it has NO routes and NO assigned
2282 addresses. When oif is specified, routing
2283 tables are looked up with only one purpose:
2284 to catch if destination is gatewayed, rather than
2285 direct. Moreover, if MSG_DONTROUTE is set,
2286 we send packet, ignoring both routing tables
2287 and ifaddr state. --ANK
2288
2289
2290 We could make it even if oif is unknown,
2291 likely IPv6, but we do not.
2292 */
2293
2294 if (fl.fl4_src == 0)
2295 fl.fl4_src = inet_select_addr(dev_out, 0,
2296 RT_SCOPE_LINK);
2297 res.type = RTN_UNICAST;
2298 goto make_route;
2299 }
2300 if (dev_out)
2301 dev_put(dev_out);
2302 err = -ENETUNREACH;
2303 goto out;
2304 }
2305 free_res = 1;
2306
2307 if (res.type == RTN_LOCAL) {
2308 if (!fl.fl4_src)
2309 fl.fl4_src = fl.fl4_dst;
2310 if (dev_out)
2311 dev_put(dev_out);
Eric W. Biederman2774c7a2007-09-26 22:10:56 -07002312 dev_out = init_net.loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002313 dev_hold(dev_out);
2314 fl.oif = dev_out->ifindex;
2315 if (res.fi)
2316 fib_info_put(res.fi);
2317 res.fi = NULL;
2318 flags |= RTCF_LOCAL;
2319 goto make_route;
2320 }
2321
2322#ifdef CONFIG_IP_ROUTE_MULTIPATH
2323 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2324 fib_select_multipath(&fl, &res);
2325 else
2326#endif
2327 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2328 fib_select_default(&fl, &res);
2329
2330 if (!fl.fl4_src)
2331 fl.fl4_src = FIB_RES_PREFSRC(res);
2332
2333 if (dev_out)
2334 dev_put(dev_out);
2335 dev_out = FIB_RES_DEV(res);
2336 dev_hold(dev_out);
2337 fl.oif = dev_out->ifindex;
2338
2339
2340make_route:
2341 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2342
2343
2344 if (free_res)
2345 fib_res_put(&res);
2346 if (dev_out)
2347 dev_put(dev_out);
2348out: return err;
2349}
2350
2351int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2352{
2353 unsigned hash;
2354 struct rtable *rth;
2355
Al Viro8c7bc842006-09-26 21:26:19 -07002356 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002357
2358 rcu_read_lock_bh();
2359 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08002360 rth = rcu_dereference(rth->u.dst.rt_next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002361 if (rth->fl.fl4_dst == flp->fl4_dst &&
2362 rth->fl.fl4_src == flp->fl4_src &&
2363 rth->fl.iif == 0 &&
2364 rth->fl.oif == flp->oif &&
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002365 rth->fl.mark == flp->mark &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002366 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2367 (IPTOS_RT_MASK | RTO_ONLINK))) {
Pavel Emelyanov03f49f32007-11-10 21:28:34 -08002368 dst_use(&rth->u.dst, jiffies);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002369 RT_CACHE_STAT_INC(out_hit);
2370 rcu_read_unlock_bh();
2371 *rp = rth;
2372 return 0;
2373 }
2374 RT_CACHE_STAT_INC(out_hlist_search);
2375 }
2376 rcu_read_unlock_bh();
2377
2378 return ip_route_output_slow(rp, flp);
2379}
2380
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002381EXPORT_SYMBOL_GPL(__ip_route_output_key);
2382
David S. Miller14e50e52007-05-24 18:17:54 -07002383static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2384{
2385}
2386
2387static struct dst_ops ipv4_dst_blackhole_ops = {
2388 .family = AF_INET,
2389 .protocol = __constant_htons(ETH_P_IP),
2390 .destroy = ipv4_dst_destroy,
2391 .check = ipv4_dst_check,
2392 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2393 .entry_size = sizeof(struct rtable),
2394};
2395
2396
David S. Miller14e50e52007-05-24 18:17:54 -07002397static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2398{
2399 struct rtable *ort = *rp;
2400 struct rtable *rt = (struct rtable *)
2401 dst_alloc(&ipv4_dst_blackhole_ops);
2402
2403 if (rt) {
2404 struct dst_entry *new = &rt->u.dst;
2405
2406 atomic_set(&new->__refcnt, 1);
2407 new->__use = 1;
Herbert Xu352e5122007-11-13 21:34:06 -08002408 new->input = dst_discard;
2409 new->output = dst_discard;
David S. Miller14e50e52007-05-24 18:17:54 -07002410 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2411
2412 new->dev = ort->u.dst.dev;
2413 if (new->dev)
2414 dev_hold(new->dev);
2415
2416 rt->fl = ort->fl;
2417
2418 rt->idev = ort->idev;
2419 if (rt->idev)
2420 in_dev_hold(rt->idev);
2421 rt->rt_flags = ort->rt_flags;
2422 rt->rt_type = ort->rt_type;
2423 rt->rt_dst = ort->rt_dst;
2424 rt->rt_src = ort->rt_src;
2425 rt->rt_iif = ort->rt_iif;
2426 rt->rt_gateway = ort->rt_gateway;
2427 rt->rt_spec_dst = ort->rt_spec_dst;
2428 rt->peer = ort->peer;
2429 if (rt->peer)
2430 atomic_inc(&rt->peer->refcnt);
2431
2432 dst_free(new);
2433 }
2434
2435 dst_release(&(*rp)->u.dst);
2436 *rp = rt;
2437 return (rt ? 0 : -ENOMEM);
2438}
2439
Linus Torvalds1da177e2005-04-16 15:20:36 -07002440int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2441{
2442 int err;
2443
2444 if ((err = __ip_route_output_key(rp, flp)) != 0)
2445 return err;
2446
2447 if (flp->proto) {
2448 if (!flp->fl4_src)
2449 flp->fl4_src = (*rp)->rt_src;
2450 if (!flp->fl4_dst)
2451 flp->fl4_dst = (*rp)->rt_dst;
David S. Miller14e50e52007-05-24 18:17:54 -07002452 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2453 if (err == -EREMOTE)
2454 err = ipv4_dst_blackhole(rp, flp, sk);
2455
2456 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002457 }
2458
2459 return 0;
2460}
2461
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002462EXPORT_SYMBOL_GPL(ip_route_output_flow);
2463
Linus Torvalds1da177e2005-04-16 15:20:36 -07002464int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2465{
2466 return ip_route_output_flow(rp, flp, NULL, 0);
2467}
2468
2469static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002470 int nowait, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002471{
2472 struct rtable *rt = (struct rtable*)skb->dst;
2473 struct rtmsg *r;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002474 struct nlmsghdr *nlh;
Thomas Grafe3703b32006-11-27 09:27:07 -08002475 long expires;
2476 u32 id = 0, ts = 0, tsage = 0, error;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002477
2478 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2479 if (nlh == NULL)
Patrick McHardy26932562007-01-31 23:16:40 -08002480 return -EMSGSIZE;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002481
2482 r = nlmsg_data(nlh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002483 r->rtm_family = AF_INET;
2484 r->rtm_dst_len = 32;
2485 r->rtm_src_len = 0;
2486 r->rtm_tos = rt->fl.fl4_tos;
2487 r->rtm_table = RT_TABLE_MAIN;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002488 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002489 r->rtm_type = rt->rt_type;
2490 r->rtm_scope = RT_SCOPE_UNIVERSE;
2491 r->rtm_protocol = RTPROT_UNSPEC;
2492 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2493 if (rt->rt_flags & RTCF_NOTIFY)
2494 r->rtm_flags |= RTM_F_NOTIFY;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002495
Al Viro17fb2c62006-09-26 22:15:25 -07002496 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002497
Linus Torvalds1da177e2005-04-16 15:20:36 -07002498 if (rt->fl.fl4_src) {
2499 r->rtm_src_len = 32;
Al Viro17fb2c62006-09-26 22:15:25 -07002500 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002501 }
2502 if (rt->u.dst.dev)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002503 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002504#ifdef CONFIG_NET_CLS_ROUTE
2505 if (rt->u.dst.tclassid)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002506 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002507#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002508 if (rt->fl.iif)
Al Viro17fb2c62006-09-26 22:15:25 -07002509 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002510 else if (rt->rt_src != rt->fl.fl4_src)
Al Viro17fb2c62006-09-26 22:15:25 -07002511 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002512
Linus Torvalds1da177e2005-04-16 15:20:36 -07002513 if (rt->rt_dst != rt->rt_gateway)
Al Viro17fb2c62006-09-26 22:15:25 -07002514 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002515
Linus Torvalds1da177e2005-04-16 15:20:36 -07002516 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002517 goto nla_put_failure;
2518
Thomas Grafe3703b32006-11-27 09:27:07 -08002519 error = rt->u.dst.error;
2520 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002521 if (rt->peer) {
Thomas Grafe3703b32006-11-27 09:27:07 -08002522 id = rt->peer->ip_id_count;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002523 if (rt->peer->tcp_ts_stamp) {
Thomas Grafe3703b32006-11-27 09:27:07 -08002524 ts = rt->peer->tcp_ts;
James Morris9d729f72007-03-04 16:12:44 -08002525 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002526 }
2527 }
Thomas Grafbe403ea2006-08-17 18:15:17 -07002528
Linus Torvalds1da177e2005-04-16 15:20:36 -07002529 if (rt->fl.iif) {
2530#ifdef CONFIG_IP_MROUTE
Al Viroe4485152006-09-26 22:15:01 -07002531 __be32 dst = rt->rt_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002532
2533 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
Herbert Xu42f811b2007-06-04 23:34:44 -07002534 IPV4_DEVCONF_ALL(MC_FORWARDING)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002535 int err = ipmr_get_route(skb, r, nowait);
2536 if (err <= 0) {
2537 if (!nowait) {
2538 if (err == 0)
2539 return 0;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002540 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002541 } else {
2542 if (err == -EMSGSIZE)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002543 goto nla_put_failure;
Thomas Grafe3703b32006-11-27 09:27:07 -08002544 error = err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002545 }
2546 }
2547 } else
2548#endif
Thomas Grafbe403ea2006-08-17 18:15:17 -07002549 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002550 }
2551
Thomas Grafe3703b32006-11-27 09:27:07 -08002552 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2553 expires, error) < 0)
2554 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002555
Thomas Grafbe403ea2006-08-17 18:15:17 -07002556 return nlmsg_end(skb, nlh);
2557
2558nla_put_failure:
Patrick McHardy26932562007-01-31 23:16:40 -08002559 nlmsg_cancel(skb, nlh);
2560 return -EMSGSIZE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002561}
2562
Thomas Graf63f34442007-03-22 11:55:17 -07002563static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002564{
Denis V. Lunevb8542722007-12-01 00:21:31 +11002565 struct net *net = in_skb->sk->sk_net;
Thomas Grafd889ce32006-08-17 18:15:44 -07002566 struct rtmsg *rtm;
2567 struct nlattr *tb[RTA_MAX+1];
Linus Torvalds1da177e2005-04-16 15:20:36 -07002568 struct rtable *rt = NULL;
Al Viro9e12bb22006-09-26 21:25:20 -07002569 __be32 dst = 0;
2570 __be32 src = 0;
2571 u32 iif;
Thomas Grafd889ce32006-08-17 18:15:44 -07002572 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002573 struct sk_buff *skb;
2574
Denis V. Lunevb8542722007-12-01 00:21:31 +11002575 if (net != &init_net)
2576 return -EINVAL;
2577
Thomas Grafd889ce32006-08-17 18:15:44 -07002578 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2579 if (err < 0)
2580 goto errout;
2581
2582 rtm = nlmsg_data(nlh);
2583
Linus Torvalds1da177e2005-04-16 15:20:36 -07002584 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
Thomas Grafd889ce32006-08-17 18:15:44 -07002585 if (skb == NULL) {
2586 err = -ENOBUFS;
2587 goto errout;
2588 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002589
2590 /* Reserve room for dummy headers, this skb can pass
2591 through good chunk of routing engine.
2592 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07002593 skb_reset_mac_header(skb);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07002594 skb_reset_network_header(skb);
Stephen Hemmingerd2c962b2006-04-17 17:27:11 -07002595
2596 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07002597 ip_hdr(skb)->protocol = IPPROTO_ICMP;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002598 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2599
Al Viro17fb2c62006-09-26 22:15:25 -07002600 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2601 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
Thomas Grafd889ce32006-08-17 18:15:44 -07002602 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002603
2604 if (iif) {
Thomas Grafd889ce32006-08-17 18:15:44 -07002605 struct net_device *dev;
2606
Eric W. Biederman881d9662007-09-17 11:56:21 -07002607 dev = __dev_get_by_index(&init_net, iif);
Thomas Grafd889ce32006-08-17 18:15:44 -07002608 if (dev == NULL) {
2609 err = -ENODEV;
2610 goto errout_free;
2611 }
2612
Linus Torvalds1da177e2005-04-16 15:20:36 -07002613 skb->protocol = htons(ETH_P_IP);
2614 skb->dev = dev;
2615 local_bh_disable();
2616 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2617 local_bh_enable();
Thomas Grafd889ce32006-08-17 18:15:44 -07002618
2619 rt = (struct rtable*) skb->dst;
2620 if (err == 0 && rt->u.dst.error)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002621 err = -rt->u.dst.error;
2622 } else {
Thomas Grafd889ce32006-08-17 18:15:44 -07002623 struct flowi fl = {
2624 .nl_u = {
2625 .ip4_u = {
2626 .daddr = dst,
2627 .saddr = src,
2628 .tos = rtm->rtm_tos,
2629 },
2630 },
2631 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2632 };
Linus Torvalds1da177e2005-04-16 15:20:36 -07002633 err = ip_route_output_key(&rt, &fl);
2634 }
Thomas Grafd889ce32006-08-17 18:15:44 -07002635
Linus Torvalds1da177e2005-04-16 15:20:36 -07002636 if (err)
Thomas Grafd889ce32006-08-17 18:15:44 -07002637 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002638
2639 skb->dst = &rt->u.dst;
2640 if (rtm->rtm_flags & RTM_F_NOTIFY)
2641 rt->rt_flags |= RTCF_NOTIFY;
2642
Linus Torvalds1da177e2005-04-16 15:20:36 -07002643 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002644 RTM_NEWROUTE, 0, 0);
Thomas Grafd889ce32006-08-17 18:15:44 -07002645 if (err <= 0)
2646 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002647
Denis V. Lunev97c53ca2007-11-19 22:26:51 -08002648 err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
Thomas Grafd889ce32006-08-17 18:15:44 -07002649errout:
Thomas Graf2942e902006-08-15 00:30:25 -07002650 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002651
Thomas Grafd889ce32006-08-17 18:15:44 -07002652errout_free:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002653 kfree_skb(skb);
Thomas Grafd889ce32006-08-17 18:15:44 -07002654 goto errout;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002655}
2656
2657int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2658{
2659 struct rtable *rt;
2660 int h, s_h;
2661 int idx, s_idx;
2662
2663 s_h = cb->args[0];
Eric Dumazetd8c92832008-01-07 21:52:14 -08002664 if (s_h < 0)
2665 s_h = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002666 s_idx = idx = cb->args[1];
Eric Dumazetd8c92832008-01-07 21:52:14 -08002667 for (h = s_h; h <= rt_hash_mask; h++) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002668 rcu_read_lock_bh();
2669 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08002670 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002671 if (idx < s_idx)
2672 continue;
2673 skb->dst = dst_clone(&rt->u.dst);
2674 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002675 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002676 1, NLM_F_MULTI) <= 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002677 dst_release(xchg(&skb->dst, NULL));
2678 rcu_read_unlock_bh();
2679 goto done;
2680 }
2681 dst_release(xchg(&skb->dst, NULL));
2682 }
2683 rcu_read_unlock_bh();
Eric Dumazetd8c92832008-01-07 21:52:14 -08002684 s_idx = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002685 }
2686
2687done:
2688 cb->args[0] = h;
2689 cb->args[1] = idx;
2690 return skb->len;
2691}
2692
/*
 * Multicast event hook: calls rt_cache_flush() with a delay argument
 * of 0.  @in_dev is accepted for the caller's benefit but not used here.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}
2697
2698#ifdef CONFIG_SYSCTL
2699static int flush_delay;
2700
2701static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2702 struct file *filp, void __user *buffer,
2703 size_t *lenp, loff_t *ppos)
2704{
2705 if (write) {
2706 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2707 rt_cache_flush(flush_delay);
2708 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002709 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002710
2711 return -EINVAL;
2712}
2713
2714static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2715 int __user *name,
2716 int nlen,
2717 void __user *oldval,
2718 size_t __user *oldlenp,
2719 void __user *newval,
Alexey Dobriyan1f29bcd2006-12-10 02:19:10 -08002720 size_t newlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002721{
2722 int delay;
2723 if (newlen != sizeof(int))
2724 return -EINVAL;
2725 if (get_user(delay, (int __user *)newval))
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002726 return -EFAULT;
2727 rt_cache_flush(delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002728 return 0;
2729}
2730
2731ctl_table ipv4_route_table[] = {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002732 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002733 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2734 .procname = "flush",
2735 .data = &flush_delay,
2736 .maxlen = sizeof(int),
Dave Jones7e3e0362005-04-28 12:11:03 -07002737 .mode = 0200,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002738 .proc_handler = &ipv4_sysctl_rtcache_flush,
2739 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2740 },
2741 {
2742 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2743 .procname = "min_delay",
2744 .data = &ip_rt_min_delay,
2745 .maxlen = sizeof(int),
2746 .mode = 0644,
2747 .proc_handler = &proc_dointvec_jiffies,
2748 .strategy = &sysctl_jiffies,
2749 },
2750 {
2751 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2752 .procname = "max_delay",
2753 .data = &ip_rt_max_delay,
2754 .maxlen = sizeof(int),
2755 .mode = 0644,
2756 .proc_handler = &proc_dointvec_jiffies,
2757 .strategy = &sysctl_jiffies,
2758 },
2759 {
2760 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2761 .procname = "gc_thresh",
2762 .data = &ipv4_dst_ops.gc_thresh,
2763 .maxlen = sizeof(int),
2764 .mode = 0644,
2765 .proc_handler = &proc_dointvec,
2766 },
2767 {
2768 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2769 .procname = "max_size",
2770 .data = &ip_rt_max_size,
2771 .maxlen = sizeof(int),
2772 .mode = 0644,
2773 .proc_handler = &proc_dointvec,
2774 },
2775 {
2776 /* Deprecated. Use gc_min_interval_ms */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002777
Linus Torvalds1da177e2005-04-16 15:20:36 -07002778 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2779 .procname = "gc_min_interval",
2780 .data = &ip_rt_gc_min_interval,
2781 .maxlen = sizeof(int),
2782 .mode = 0644,
2783 .proc_handler = &proc_dointvec_jiffies,
2784 .strategy = &sysctl_jiffies,
2785 },
2786 {
2787 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2788 .procname = "gc_min_interval_ms",
2789 .data = &ip_rt_gc_min_interval,
2790 .maxlen = sizeof(int),
2791 .mode = 0644,
2792 .proc_handler = &proc_dointvec_ms_jiffies,
2793 .strategy = &sysctl_ms_jiffies,
2794 },
2795 {
2796 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2797 .procname = "gc_timeout",
2798 .data = &ip_rt_gc_timeout,
2799 .maxlen = sizeof(int),
2800 .mode = 0644,
2801 .proc_handler = &proc_dointvec_jiffies,
2802 .strategy = &sysctl_jiffies,
2803 },
2804 {
2805 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2806 .procname = "gc_interval",
2807 .data = &ip_rt_gc_interval,
2808 .maxlen = sizeof(int),
2809 .mode = 0644,
2810 .proc_handler = &proc_dointvec_jiffies,
2811 .strategy = &sysctl_jiffies,
2812 },
2813 {
2814 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2815 .procname = "redirect_load",
2816 .data = &ip_rt_redirect_load,
2817 .maxlen = sizeof(int),
2818 .mode = 0644,
2819 .proc_handler = &proc_dointvec,
2820 },
2821 {
2822 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2823 .procname = "redirect_number",
2824 .data = &ip_rt_redirect_number,
2825 .maxlen = sizeof(int),
2826 .mode = 0644,
2827 .proc_handler = &proc_dointvec,
2828 },
2829 {
2830 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2831 .procname = "redirect_silence",
2832 .data = &ip_rt_redirect_silence,
2833 .maxlen = sizeof(int),
2834 .mode = 0644,
2835 .proc_handler = &proc_dointvec,
2836 },
2837 {
2838 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2839 .procname = "error_cost",
2840 .data = &ip_rt_error_cost,
2841 .maxlen = sizeof(int),
2842 .mode = 0644,
2843 .proc_handler = &proc_dointvec,
2844 },
2845 {
2846 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2847 .procname = "error_burst",
2848 .data = &ip_rt_error_burst,
2849 .maxlen = sizeof(int),
2850 .mode = 0644,
2851 .proc_handler = &proc_dointvec,
2852 },
2853 {
2854 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2855 .procname = "gc_elasticity",
2856 .data = &ip_rt_gc_elasticity,
2857 .maxlen = sizeof(int),
2858 .mode = 0644,
2859 .proc_handler = &proc_dointvec,
2860 },
2861 {
2862 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2863 .procname = "mtu_expires",
2864 .data = &ip_rt_mtu_expires,
2865 .maxlen = sizeof(int),
2866 .mode = 0644,
2867 .proc_handler = &proc_dointvec_jiffies,
2868 .strategy = &sysctl_jiffies,
2869 },
2870 {
2871 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2872 .procname = "min_pmtu",
2873 .data = &ip_rt_min_pmtu,
2874 .maxlen = sizeof(int),
2875 .mode = 0644,
2876 .proc_handler = &proc_dointvec,
2877 },
2878 {
2879 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2880 .procname = "min_adv_mss",
2881 .data = &ip_rt_min_advmss,
2882 .maxlen = sizeof(int),
2883 .mode = 0644,
2884 .proc_handler = &proc_dointvec,
2885 },
2886 {
2887 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
2888 .procname = "secret_interval",
2889 .data = &ip_rt_secret_interval,
2890 .maxlen = sizeof(int),
2891 .mode = 0644,
2892 .proc_handler = &proc_dointvec_jiffies,
2893 .strategy = &sysctl_jiffies,
2894 },
2895 { .ctl_name = 0 }
2896};
2897#endif
2898
2899#ifdef CONFIG_NET_CLS_ROUTE
Eric Dumazet8dbde282007-11-16 03:32:10 -08002900struct ip_rt_acct *ip_rt_acct __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002901
2902/* IP route accounting ptr for this logical cpu number. */
Eric Dumazet8dbde282007-11-16 03:32:10 -08002903#define IP_RT_ACCT_CPU(cpu) (per_cpu_ptr(ip_rt_acct, cpu))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002904
2905#ifdef CONFIG_PROC_FS
2906static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2907 int length, int *eof, void *data)
2908{
2909 unsigned int i;
2910
2911 if ((offset & 3) || (length & 3))
2912 return -EIO;
2913
2914 if (offset >= sizeof(struct ip_rt_acct) * 256) {
2915 *eof = 1;
2916 return 0;
2917 }
2918
2919 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2920 length = sizeof(struct ip_rt_acct) * 256 - offset;
2921 *eof = 1;
2922 }
2923
2924 offset /= sizeof(u32);
2925
2926 if (length > 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002927 u32 *dst = (u32 *) buffer;
2928
Linus Torvalds1da177e2005-04-16 15:20:36 -07002929 *start = buffer;
Eric Dumazet483b23f2007-11-16 02:29:24 -08002930 memset(dst, 0, length);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002931
KAMEZAWA Hiroyuki6f912042006-04-10 22:52:50 -07002932 for_each_possible_cpu(i) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002933 unsigned int j;
Eric Dumazet483b23f2007-11-16 02:29:24 -08002934 u32 *src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002935
2936 for (j = 0; j < length/4; j++)
2937 dst[j] += src[j];
2938 }
2939 }
2940 return length;
2941}
2942#endif /* CONFIG_PROC_FS */
2943#endif /* CONFIG_NET_CLS_ROUTE */
2944
2945static __initdata unsigned long rhash_entries;
2946static int __init set_rhash_entries(char *str)
2947{
2948 if (!str)
2949 return 0;
2950 rhash_entries = simple_strtoul(str, &str, 0);
2951 return 1;
2952}
2953__setup("rhash_entries=", set_rhash_entries);
2954
2955int __init ip_rt_init(void)
2956{
Eric Dumazet424c4b72005-07-05 14:58:19 -07002957 int rc = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002958
2959 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2960 (jiffies ^ (jiffies >> 7)));
2961
2962#ifdef CONFIG_NET_CLS_ROUTE
Eric Dumazet8dbde282007-11-16 03:32:10 -08002963 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002964 if (!ip_rt_acct)
2965 panic("IP: failed to allocate ip_rt_acct\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07002966#endif
2967
Alexey Dobriyane5d679f332006-08-26 19:25:52 -07002968 ipv4_dst_ops.kmem_cachep =
2969 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
Paul Mundt20c2df82007-07-20 10:11:58 +09002970 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002971
David S. Miller14e50e52007-05-24 18:17:54 -07002972 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2973
Eric Dumazet424c4b72005-07-05 14:58:19 -07002974 rt_hash_table = (struct rt_hash_bucket *)
2975 alloc_large_system_hash("IP route cache",
2976 sizeof(struct rt_hash_bucket),
2977 rhash_entries,
2978 (num_physpages >= 128 * 1024) ?
Mike Stroyan18955cf2005-11-29 16:12:55 -08002979 15 : 17,
Kirill Korotaev8d1502d2006-08-07 20:44:22 -07002980 0,
Eric Dumazet424c4b72005-07-05 14:58:19 -07002981 &rt_hash_log,
2982 &rt_hash_mask,
2983 0);
Eric Dumazet22c047c2005-07-05 14:55:24 -07002984 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
2985 rt_hash_lock_init();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002986
2987 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2988 ip_rt_max_size = (rt_hash_mask + 1) * 16;
2989
Linus Torvalds1da177e2005-04-16 15:20:36 -07002990 devinet_init();
2991 ip_fib_init();
2992
Pavel Emelyanovb24b8a22008-01-23 21:20:07 -08002993 setup_timer(&rt_flush_timer, rt_run_flush, 0);
2994 setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002995
2996 /* All the timers, started at system startup tend
2997 to synchronize. Perturb it a bit.
2998 */
Eric Dumazet39c90ec2007-09-15 10:55:54 -07002999 schedule_delayed_work(&expires_work,
3000 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003001
3002 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3003 ip_rt_secret_interval;
3004 add_timer(&rt_secret_timer);
3005
3006#ifdef CONFIG_PROC_FS
3007 {
3008 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02003009 if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003010 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02003011 init_net.proc_net_stat))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003012 return -ENOMEM;
3013 }
3014 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3015 }
3016#ifdef CONFIG_NET_CLS_ROUTE
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02003017 create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003018#endif
3019#endif
3020#ifdef CONFIG_XFRM
3021 xfrm_init();
3022 xfrm4_init();
3023#endif
Thomas Graf63f34442007-03-22 11:55:17 -07003024 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3025
Linus Torvalds1da177e2005-04-16 15:20:36 -07003026 return rc;
3027}
3028
3029EXPORT_SYMBOL(__ip_select_ident);
3030EXPORT_SYMBOL(ip_route_input);
3031EXPORT_SYMBOL(ip_route_output_key);