blob: 230716c2dfe0ae4db22a93c34e87c1424113f792 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9 *
Jesper Juhl02c30a82005-05-05 16:16:16 -070010 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -070011 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 *
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090023 * Alan Cox : Super /proc >4K
Linus Torvalds1da177e2005-04-16 15:20:36 -070024 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090041 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070042 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070057 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080058 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070060 *
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
65 */
66
Linus Torvalds1da177e2005-04-16 15:20:36 -070067#include <linux/module.h>
68#include <asm/uaccess.h>
69#include <asm/system.h>
70#include <linux/bitops.h>
71#include <linux/types.h>
72#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070073#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070074#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070075#include <linux/string.h>
76#include <linux/socket.h>
77#include <linux/sockios.h>
78#include <linux/errno.h>
79#include <linux/in.h>
80#include <linux/inet.h>
81#include <linux/netdevice.h>
82#include <linux/proc_fs.h>
83#include <linux/init.h>
Eric Dumazet39c90ec2007-09-15 10:55:54 -070084#include <linux/workqueue.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070085#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070086#include <linux/inetdevice.h>
87#include <linux/igmp.h>
88#include <linux/pkt_sched.h>
89#include <linux/mroute.h>
90#include <linux/netfilter_ipv4.h>
91#include <linux/random.h>
92#include <linux/jhash.h>
93#include <linux/rcupdate.h>
94#include <linux/times.h>
Herbert Xu352e5122007-11-13 21:34:06 -080095#include <net/dst.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020096#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070097#include <net/protocol.h>
98#include <net/ip.h>
99#include <net/route.h>
100#include <net/inetpeer.h>
101#include <net/sock.h>
102#include <net/ip_fib.h>
103#include <net/arp.h>
104#include <net/tcp.h>
105#include <net/icmp.h>
106#include <net/xfrm.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700107#include <net/netevent.h>
Thomas Graf63f34442007-03-22 11:55:17 -0700108#include <net/rtnetlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700109#ifdef CONFIG_SYSCTL
110#include <linux/sysctl.h>
111#endif
112
113#define RT_FL_TOS(oldflp) \
114 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116#define IP_MAX_MTU 0xFFF0
117
118#define RT_GC_TIMEOUT (300*HZ)
119
Linus Torvalds1da177e2005-04-16 15:20:36 -0700120static int ip_rt_max_size;
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700121static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
122static int ip_rt_gc_interval __read_mostly = 60 * HZ;
123static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
124static int ip_rt_redirect_number __read_mostly = 9;
125static int ip_rt_redirect_load __read_mostly = HZ / 50;
126static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
127static int ip_rt_error_cost __read_mostly = HZ;
128static int ip_rt_error_burst __read_mostly = 5 * HZ;
129static int ip_rt_gc_elasticity __read_mostly = 8;
130static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
131static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
132static int ip_rt_min_advmss __read_mostly = 256;
133static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700134
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800135static void rt_worker_func(struct work_struct *work);
136static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700137static struct timer_list rt_secret_timer;
138
139/*
140 * Interface to generic destination cache.
141 */
142
143static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
144static void ipv4_dst_destroy(struct dst_entry *dst);
145static void ipv4_dst_ifdown(struct dst_entry *dst,
146 struct net_device *dev, int how);
147static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
148static void ipv4_link_failure(struct sk_buff *skb);
149static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
Daniel Lezcano569d3642008-01-18 03:56:57 -0800150static int rt_garbage_collect(struct dst_ops *ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700151
152
153static struct dst_ops ipv4_dst_ops = {
154 .family = AF_INET,
155 .protocol = __constant_htons(ETH_P_IP),
156 .gc = rt_garbage_collect,
157 .check = ipv4_dst_check,
158 .destroy = ipv4_dst_destroy,
159 .ifdown = ipv4_dst_ifdown,
160 .negative_advice = ipv4_negative_advice,
161 .link_failure = ipv4_link_failure,
162 .update_pmtu = ip_rt_update_pmtu,
Herbert Xu862b82c2007-11-13 21:43:11 -0800163 .local_out = ip_local_out,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700164 .entry_size = sizeof(struct rtable),
Eric Dumazete2422972008-01-30 20:07:45 -0800165 .entries = ATOMIC_INIT(0),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700166};
167
168#define ECN_OR_COST(class) TC_PRIO_##class
169
Philippe De Muyter4839c522007-07-09 15:32:57 -0700170const __u8 ip_tos2prio[16] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700171 TC_PRIO_BESTEFFORT,
172 ECN_OR_COST(FILLER),
173 TC_PRIO_BESTEFFORT,
174 ECN_OR_COST(BESTEFFORT),
175 TC_PRIO_BULK,
176 ECN_OR_COST(BULK),
177 TC_PRIO_BULK,
178 ECN_OR_COST(BULK),
179 TC_PRIO_INTERACTIVE,
180 ECN_OR_COST(INTERACTIVE),
181 TC_PRIO_INTERACTIVE,
182 ECN_OR_COST(INTERACTIVE),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK),
185 TC_PRIO_INTERACTIVE_BULK,
186 ECN_OR_COST(INTERACTIVE_BULK)
187};
188
189
190/*
191 * Route cache.
192 */
193
194/* The locking scheme is rather straight forward:
195 *
196 * 1) Read-Copy Update protects the buckets of the central route hash.
197 * 2) Only writers remove entries, and they hold the lock
198 * as they look at rtable reference counts.
199 * 3) Only readers acquire references to rtable entries,
200 * they do so with atomic increments and with the
201 * lock held.
202 */
203
204struct rt_hash_bucket {
205 struct rtable *chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700206};
Ingo Molnar8a25d5d2006-07-03 00:24:54 -0700207#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
208 defined(CONFIG_PROVE_LOCKING)
Eric Dumazet22c047c2005-07-05 14:55:24 -0700209/*
210 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
211 * The size of this table is a power of two and depends on the number of CPUS.
Ingo Molnar62051202006-07-03 00:24:59 -0700212 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
Eric Dumazet22c047c2005-07-05 14:55:24 -0700213 */
Ingo Molnar62051202006-07-03 00:24:59 -0700214#ifdef CONFIG_LOCKDEP
215# define RT_HASH_LOCK_SZ 256
Eric Dumazet22c047c2005-07-05 14:55:24 -0700216#else
Ingo Molnar62051202006-07-03 00:24:59 -0700217# if NR_CPUS >= 32
218# define RT_HASH_LOCK_SZ 4096
219# elif NR_CPUS >= 16
220# define RT_HASH_LOCK_SZ 2048
221# elif NR_CPUS >= 8
222# define RT_HASH_LOCK_SZ 1024
223# elif NR_CPUS >= 4
224# define RT_HASH_LOCK_SZ 512
225# else
226# define RT_HASH_LOCK_SZ 256
227# endif
Eric Dumazet22c047c2005-07-05 14:55:24 -0700228#endif
229
230static spinlock_t *rt_hash_locks;
231# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
Pavel Emelyanov1ff1cc22007-12-05 21:15:05 -0800232
233static __init void rt_hash_lock_init(void)
234{
235 int i;
236
237 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
238 GFP_KERNEL);
239 if (!rt_hash_locks)
240 panic("IP: failed to allocate rt_hash_locks\n");
241
242 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
243 spin_lock_init(&rt_hash_locks[i]);
244}
Eric Dumazet22c047c2005-07-05 14:55:24 -0700245#else
246# define rt_hash_lock_addr(slot) NULL
Pavel Emelyanov1ff1cc22007-12-05 21:15:05 -0800247
/* No lock table exists in this configuration, so there is nothing to set up. */
static inline void rt_hash_lock_init(void)
{
	return;
}
Eric Dumazet22c047c2005-07-05 14:55:24 -0700251#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700252
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700253static struct rt_hash_bucket *rt_hash_table __read_mostly;
254static unsigned rt_hash_mask __read_mostly;
255static unsigned int rt_hash_log __read_mostly;
256static atomic_t rt_genid __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700257
Eric Dumazet2f970d82006-01-17 02:54:36 -0800258static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
Andrew Mortondbd29152006-01-17 21:58:01 -0800259#define RT_CACHE_STAT_INC(field) \
Paul Mackerrasbfe5d832006-06-25 05:47:14 -0700260 (__raw_get_cpu_var(rt_cache_stat).field++)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700261
Ilia Sotnikovcef26852006-03-25 01:38:55 -0800262static unsigned int rt_hash_code(u32 daddr, u32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700263{
Eric Dumazet29e75252008-01-31 17:05:09 -0800264 return jhash_2words(daddr, saddr, atomic_read(&rt_genid))
265 & rt_hash_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700266}
267
Al Viro8c7bc842006-09-26 21:26:19 -0700268#define rt_hash(daddr, saddr, idx) \
269 rt_hash_code((__force u32)(__be32)(daddr),\
270 (__force u32)(__be32)(saddr) ^ ((idx) << 5))
271
Linus Torvalds1da177e2005-04-16 15:20:36 -0700272#ifdef CONFIG_PROC_FS
273struct rt_cache_iter_state {
Denis V. Luneva75e9362008-02-28 20:50:55 -0800274 struct seq_net_private p;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700275 int bucket;
Eric Dumazet29e75252008-01-31 17:05:09 -0800276 int genid;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700277};
278
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900279static struct rtable *rt_cache_get_first(struct seq_file *seq)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700280{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900281 struct rt_cache_iter_state *st = seq->private;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700282 struct rtable *r = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700283
284 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
285 rcu_read_lock_bh();
Eric Dumazet29e75252008-01-31 17:05:09 -0800286 r = rcu_dereference(rt_hash_table[st->bucket].chain);
287 while (r) {
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900288 if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
Denis V. Luneva75e9362008-02-28 20:50:55 -0800289 r->rt_genid == st->genid)
Eric Dumazet29e75252008-01-31 17:05:09 -0800290 return r;
291 r = rcu_dereference(r->u.dst.rt_next);
292 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700293 rcu_read_unlock_bh();
294 }
Eric Dumazet29e75252008-01-31 17:05:09 -0800295 return r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700296}
297
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900298static struct rtable *__rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800299 struct rtable *r)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700300{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900301 struct rt_cache_iter_state *st = seq->private;
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800302 r = r->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700303 while (!r) {
304 rcu_read_unlock_bh();
305 if (--st->bucket < 0)
306 break;
307 rcu_read_lock_bh();
308 r = rt_hash_table[st->bucket].chain;
309 }
Eric Dumazet0bccead2008-01-10 03:55:57 -0800310 return rcu_dereference(r);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700311}
312
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900313static struct rtable *rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800314 struct rtable *r)
315{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900316 struct rt_cache_iter_state *st = seq->private;
317 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
318 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
Denis V. Luneva75e9362008-02-28 20:50:55 -0800319 continue;
Denis V. Lunev642d6312008-02-28 20:50:33 -0800320 if (r->rt_genid == st->genid)
321 break;
322 }
323 return r;
324}
325
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900326static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700327{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900328 struct rtable *r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700329
330 if (r)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900331 while (pos && (r = rt_cache_get_next(seq, r)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700332 --pos;
333 return pos ? NULL : r;
334}
335
336static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
337{
Eric Dumazet29e75252008-01-31 17:05:09 -0800338 struct rt_cache_iter_state *st = seq->private;
Eric Dumazet29e75252008-01-31 17:05:09 -0800339 if (*pos)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900340 return rt_cache_get_idx(seq, *pos - 1);
Eric Dumazet29e75252008-01-31 17:05:09 -0800341 st->genid = atomic_read(&rt_genid);
342 return SEQ_START_TOKEN;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700343}
344
345static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
346{
Eric Dumazet29e75252008-01-31 17:05:09 -0800347 struct rtable *r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700348
349 if (v == SEQ_START_TOKEN)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900350 r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700351 else
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900352 r = rt_cache_get_next(seq, v);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700353 ++*pos;
354 return r;
355}
356
357static void rt_cache_seq_stop(struct seq_file *seq, void *v)
358{
359 if (v && v != SEQ_START_TOKEN)
360 rcu_read_unlock_bh();
361}
362
363static int rt_cache_seq_show(struct seq_file *seq, void *v)
364{
365 if (v == SEQ_START_TOKEN)
366 seq_printf(seq, "%-127s\n",
367 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
368 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
369 "HHUptod\tSpecDst");
370 else {
371 struct rtable *r = v;
372 char temp[256];
373
374 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
375 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
376 r->u.dst.dev ? r->u.dst.dev->name : "*",
377 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
378 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
379 r->u.dst.__use, 0, (unsigned long)r->rt_src,
380 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
381 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
382 dst_metric(&r->u.dst, RTAX_WINDOW),
383 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
384 dst_metric(&r->u.dst, RTAX_RTTVAR)),
385 r->fl.fl4_tos,
386 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
387 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
388 dev_queue_xmit) : 0,
389 r->rt_spec_dst);
390 seq_printf(seq, "%-127s\n", temp);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900391 }
392 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700393}
394
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700395static const struct seq_operations rt_cache_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700396 .start = rt_cache_seq_start,
397 .next = rt_cache_seq_next,
398 .stop = rt_cache_seq_stop,
399 .show = rt_cache_seq_show,
400};
401
402static int rt_cache_seq_open(struct inode *inode, struct file *file)
403{
Denis V. Luneva75e9362008-02-28 20:50:55 -0800404 return seq_open_net(inode, file, &rt_cache_seq_ops,
Pavel Emelyanovcf7732e2007-10-10 02:29:29 -0700405 sizeof(struct rt_cache_iter_state));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700406}
407
Arjan van de Ven9a321442007-02-12 00:55:35 -0800408static const struct file_operations rt_cache_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700409 .owner = THIS_MODULE,
410 .open = rt_cache_seq_open,
411 .read = seq_read,
412 .llseek = seq_lseek,
Denis V. Luneva75e9362008-02-28 20:50:55 -0800413 .release = seq_release_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700414};
415
416
417static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
418{
419 int cpu;
420
421 if (*pos == 0)
422 return SEQ_START_TOKEN;
423
424 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
425 if (!cpu_possible(cpu))
426 continue;
427 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800428 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700429 }
430 return NULL;
431}
432
433static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
434{
435 int cpu;
436
437 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
438 if (!cpu_possible(cpu))
439 continue;
440 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800441 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700442 }
443 return NULL;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900444
Linus Torvalds1da177e2005-04-16 15:20:36 -0700445}
446
/* seq_file ->stop for the per-CPU statistics: the walk holds nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
	return;
}
451
452static int rt_cpu_seq_show(struct seq_file *seq, void *v)
453{
454 struct rt_cache_stat *st = v;
455
456 if (v == SEQ_START_TOKEN) {
Olaf Rempel5bec0032005-04-28 12:16:08 -0700457 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700458 return 0;
459 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900460
Linus Torvalds1da177e2005-04-16 15:20:36 -0700461 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
462 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
463 atomic_read(&ipv4_dst_ops.entries),
464 st->in_hit,
465 st->in_slow_tot,
466 st->in_slow_mc,
467 st->in_no_route,
468 st->in_brd,
469 st->in_martian_dst,
470 st->in_martian_src,
471
472 st->out_hit,
473 st->out_slow_tot,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900474 st->out_slow_mc,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700475
476 st->gc_total,
477 st->gc_ignored,
478 st->gc_goal_miss,
479 st->gc_dst_overflow,
480 st->in_hlist_search,
481 st->out_hlist_search
482 );
483 return 0;
484}
485
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700486static const struct seq_operations rt_cpu_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700487 .start = rt_cpu_seq_start,
488 .next = rt_cpu_seq_next,
489 .stop = rt_cpu_seq_stop,
490 .show = rt_cpu_seq_show,
491};
492
493
494static int rt_cpu_seq_open(struct inode *inode, struct file *file)
495{
496 return seq_open(file, &rt_cpu_seq_ops);
497}
498
Arjan van de Ven9a321442007-02-12 00:55:35 -0800499static const struct file_operations rt_cpu_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700500 .owner = THIS_MODULE,
501 .open = rt_cpu_seq_open,
502 .read = seq_read,
503 .llseek = seq_lseek,
504 .release = seq_release,
505};
506
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800507#ifdef CONFIG_NET_CLS_ROUTE
508static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
509 int length, int *eof, void *data)
510{
511 unsigned int i;
512
513 if ((offset & 3) || (length & 3))
514 return -EIO;
515
516 if (offset >= sizeof(struct ip_rt_acct) * 256) {
517 *eof = 1;
518 return 0;
519 }
520
521 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
522 length = sizeof(struct ip_rt_acct) * 256 - offset;
523 *eof = 1;
524 }
525
526 offset /= sizeof(u32);
527
528 if (length > 0) {
529 u32 *dst = (u32 *) buffer;
530
531 *start = buffer;
532 memset(dst, 0, length);
533
534 for_each_possible_cpu(i) {
535 unsigned int j;
536 u32 *src;
537
538 src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
539 for (j = 0; j < length/4; j++)
540 dst[j] += src[j];
541 }
542 }
543 return length;
544}
545#endif
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800546
Denis V. Lunev73b38712008-02-28 20:51:18 -0800547static int __net_init ip_rt_do_proc_init(struct net *net)
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800548{
549 struct proc_dir_entry *pde;
550
551 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
552 &rt_cache_seq_fops);
553 if (!pde)
554 goto err1;
555
Wang Chen77020722008-02-28 14:14:25 -0800556 pde = proc_create("rt_cache", S_IRUGO,
557 net->proc_net_stat, &rt_cpu_seq_fops);
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800558 if (!pde)
559 goto err2;
560
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800561#ifdef CONFIG_NET_CLS_ROUTE
562 pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
563 ip_rt_acct_read, NULL);
564 if (!pde)
565 goto err3;
566#endif
567 return 0;
568
569#ifdef CONFIG_NET_CLS_ROUTE
570err3:
571 remove_proc_entry("rt_cache", net->proc_net_stat);
572#endif
573err2:
574 remove_proc_entry("rt_cache", net->proc_net);
575err1:
576 return -ENOMEM;
577}
Denis V. Lunev73b38712008-02-28 20:51:18 -0800578
579static void __net_exit ip_rt_do_proc_exit(struct net *net)
580{
581 remove_proc_entry("rt_cache", net->proc_net_stat);
582 remove_proc_entry("rt_cache", net->proc_net);
583 remove_proc_entry("rt_acct", net->proc_net);
584}
585
586static struct pernet_operations ip_rt_proc_ops __net_initdata = {
587 .init = ip_rt_do_proc_init,
588 .exit = ip_rt_do_proc_exit,
589};
590
591static int __init ip_rt_proc_init(void)
592{
593 return register_pernet_subsys(&ip_rt_proc_ops);
594}
595
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800596#else
/* Without CONFIG_PROC_FS there are no proc entries to register. */
static inline int ip_rt_proc_init(void)
{
	const int ok = 0;

	return ok;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700601#endif /* CONFIG_PROC_FS */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900602
Linus Torvalds1da177e2005-04-16 15:20:36 -0700603static __inline__ void rt_free(struct rtable *rt)
604{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700605 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
606}
607
608static __inline__ void rt_drop(struct rtable *rt)
609{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700610 ip_rt_put(rt);
611 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
612}
613
614static __inline__ int rt_fast_clean(struct rtable *rth)
615{
616 /* Kill broadcast/multicast entries very aggresively, if they
617 collide in hash table with more useful entries */
618 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800619 rth->fl.iif && rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700620}
621
622static __inline__ int rt_valuable(struct rtable *rth)
623{
624 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
625 rth->u.dst.expires;
626}
627
628static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
629{
630 unsigned long age;
631 int ret = 0;
632
633 if (atomic_read(&rth->u.dst.__refcnt))
634 goto out;
635
636 ret = 1;
637 if (rth->u.dst.expires &&
638 time_after_eq(jiffies, rth->u.dst.expires))
639 goto out;
640
641 age = jiffies - rth->u.dst.lastuse;
642 ret = 0;
643 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
644 (age <= tmo2 && rt_valuable(rth)))
645 goto out;
646 ret = 1;
647out: return ret;
648}
649
650/* Bits of score are:
651 * 31: very valuable
652 * 30: not quite useless
653 * 29..0: usage counter
654 */
655static inline u32 rt_score(struct rtable *rt)
656{
657 u32 score = jiffies - rt->u.dst.lastuse;
658
659 score = ~score & ~(3<<30);
660
661 if (rt_valuable(rt))
662 score |= (1<<31);
663
664 if (!rt->fl.iif ||
665 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
666 score |= (1<<30);
667
668 return score;
669}
670
671static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
672{
Al Viro714e85b2006-11-14 20:51:49 -0800673 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
674 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
Thomas Graf47dcf0c2006-11-09 15:20:38 -0800675 (fl1->mark ^ fl2->mark) |
David S. Miller8238b212006-10-12 00:49:15 -0700676 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
677 *(u16 *)&fl2->nl_u.ip4_u.tos) |
678 (fl1->oif ^ fl2->oif) |
679 (fl1->iif ^ fl2->iif)) == 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700680}
681
Denis V. Lunevb5921912008-01-22 23:50:25 -0800682static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
683{
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +0900684 return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
Denis V. Lunevb5921912008-01-22 23:50:25 -0800685}
686
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800687/*
688 * Perform a full scan of hash table and free all entries.
689 * Can be called by a softirq or a process.
690 * In the later case, we want to be reschedule if necessary
691 */
692static void rt_do_flush(int process_context)
693{
694 unsigned int i;
695 struct rtable *rth, *next;
696
697 for (i = 0; i <= rt_hash_mask; i++) {
698 if (process_context && need_resched())
699 cond_resched();
700 rth = rt_hash_table[i].chain;
701 if (!rth)
702 continue;
703
704 spin_lock_bh(rt_hash_lock_addr(i));
705 rth = rt_hash_table[i].chain;
706 rt_hash_table[i].chain = NULL;
707 spin_unlock_bh(rt_hash_lock_addr(i));
708
709 for (; rth; rth = next) {
710 next = rth->u.dst.rt_next;
711 rt_free(rth);
712 }
713 }
714}
715
716static void rt_check_expire(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700717{
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700718 static unsigned int rover;
719 unsigned int i = rover, goal;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700720 struct rtable *rth, **rthp;
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700721 u64 mult;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700722
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700723 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
724 if (ip_rt_gc_timeout > 1)
725 do_div(mult, ip_rt_gc_timeout);
726 goal = (unsigned int)mult;
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700727 if (goal > rt_hash_mask)
728 goal = rt_hash_mask + 1;
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700729 for (; goal > 0; goal--) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700730 unsigned long tmo = ip_rt_gc_timeout;
731
732 i = (i + 1) & rt_hash_mask;
733 rthp = &rt_hash_table[i].chain;
734
Eric Dumazetd90bf5a2007-11-14 16:14:05 -0800735 if (need_resched())
736 cond_resched();
737
Stephen Hemmingercfcabdc2007-10-09 01:59:42 -0700738 if (*rthp == NULL)
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700739 continue;
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700740 spin_lock_bh(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700741 while ((rth = *rthp) != NULL) {
Eric Dumazet29e75252008-01-31 17:05:09 -0800742 if (rth->rt_genid != atomic_read(&rt_genid)) {
743 *rthp = rth->u.dst.rt_next;
744 rt_free(rth);
745 continue;
746 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700747 if (rth->u.dst.expires) {
748 /* Entry is expired even if it is in use */
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700749 if (time_before_eq(jiffies, rth->u.dst.expires)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700750 tmo >>= 1;
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800751 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700752 continue;
753 }
754 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
755 tmo >>= 1;
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800756 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700757 continue;
758 }
759
760 /* Cleanup aged off entries. */
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800761 *rthp = rth->u.dst.rt_next;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900762 rt_free(rth);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700763 }
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700764 spin_unlock_bh(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700765 }
766 rover = i;
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800767}
768
769/*
770 * rt_worker_func() is run in process context.
Eric Dumazet29e75252008-01-31 17:05:09 -0800771 * we call rt_check_expire() to scan part of the hash table
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800772 */
773static void rt_worker_func(struct work_struct *work)
774{
Eric Dumazet29e75252008-01-31 17:05:09 -0800775 rt_check_expire();
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700776 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700777}
778
Eric Dumazet29e75252008-01-31 17:05:09 -0800779/*
780 * Pertubation of rt_genid by a small quantity [1..256]
781 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
782 * many times (2^24) without giving recent rt_genid.
783 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700784 */
Eric Dumazet29e75252008-01-31 17:05:09 -0800785static void rt_cache_invalidate(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700786{
Eric Dumazet29e75252008-01-31 17:05:09 -0800787 unsigned char shuffle;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700788
Eric Dumazet29e75252008-01-31 17:05:09 -0800789 get_random_bytes(&shuffle, sizeof(shuffle));
790 atomic_add(shuffle + 1U, &rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700791}
792
/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 *
 * The flush is told it runs in process context whenever in_softirq()
 * is false.
 */
void rt_cache_flush(int delay)
{
	rt_cache_invalidate();

	if (delay < 0)
		return;

	rt_do_flush(!in_softirq());
}
803
804/*
805 * We change rt_genid and let gc do the cleanup
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800806 */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700807static void rt_secret_rebuild(unsigned long dummy)
808{
Eric Dumazet29e75252008-01-31 17:05:09 -0800809 rt_cache_invalidate();
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800810 mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700811}
812
813/*
814 Short description of GC goals.
815
816 We want to build algorithm, which will keep routing cache
817 at some equilibrium point, when number of aged off entries
818 is kept approximately equal to newly generated ones.
819
820 Current expiration strength is variable "expire".
821 We try to adjust it dynamically, so that if networking
822 is idle expires is large enough to keep enough of warm entries,
823 and when load increases it reduces to limit cache size.
824 */
825
Daniel Lezcano569d3642008-01-18 03:56:57 -0800826static int rt_garbage_collect(struct dst_ops *ops)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700827{
828 static unsigned long expire = RT_GC_TIMEOUT;
829 static unsigned long last_gc;
830 static int rover;
831 static int equilibrium;
832 struct rtable *rth, **rthp;
833 unsigned long now = jiffies;
834 int goal;
835
836 /*
837 * Garbage collection is pretty expensive,
838 * do not make it too frequently.
839 */
840
841 RT_CACHE_STAT_INC(gc_total);
842
843 if (now - last_gc < ip_rt_gc_min_interval &&
844 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
845 RT_CACHE_STAT_INC(gc_ignored);
846 goto out;
847 }
848
849 /* Calculate number of entries, which we want to expire now. */
850 goal = atomic_read(&ipv4_dst_ops.entries) -
851 (ip_rt_gc_elasticity << rt_hash_log);
852 if (goal <= 0) {
853 if (equilibrium < ipv4_dst_ops.gc_thresh)
854 equilibrium = ipv4_dst_ops.gc_thresh;
855 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
856 if (goal > 0) {
Eric Dumazetb790ced2007-12-21 01:49:07 -0800857 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700858 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
859 }
860 } else {
861 /* We are in dangerous area. Try to reduce cache really
862 * aggressively.
863 */
Eric Dumazetb790ced2007-12-21 01:49:07 -0800864 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700865 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
866 }
867
868 if (now - last_gc >= ip_rt_gc_min_interval)
869 last_gc = now;
870
871 if (goal <= 0) {
872 equilibrium += goal;
873 goto work_done;
874 }
875
876 do {
877 int i, k;
878
879 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
880 unsigned long tmo = expire;
881
882 k = (k + 1) & rt_hash_mask;
883 rthp = &rt_hash_table[k].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700884 spin_lock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700885 while ((rth = *rthp) != NULL) {
Eric Dumazet29e75252008-01-31 17:05:09 -0800886 if (rth->rt_genid == atomic_read(&rt_genid) &&
887 !rt_may_expire(rth, tmo, expire)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700888 tmo >>= 1;
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800889 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700890 continue;
891 }
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800892 *rthp = rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700893 rt_free(rth);
894 goal--;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700895 }
Eric Dumazet22c047c2005-07-05 14:55:24 -0700896 spin_unlock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700897 if (goal <= 0)
898 break;
899 }
900 rover = k;
901
902 if (goal <= 0)
903 goto work_done;
904
905 /* Goal is not achieved. We stop process if:
906
907 - if expire reduced to zero. Otherwise, expire is halfed.
908 - if table is not full.
909 - if we are called from interrupt.
910 - jiffies check is just fallback/debug loop breaker.
911 We will not spin here for long time in any case.
912 */
913
914 RT_CACHE_STAT_INC(gc_goal_miss);
915
916 if (expire == 0)
917 break;
918
919 expire >>= 1;
920#if RT_CACHE_DEBUG >= 2
921 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
922 atomic_read(&ipv4_dst_ops.entries), goal, i);
923#endif
924
925 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
926 goto out;
927 } while (!in_softirq() && time_before_eq(jiffies, now));
928
929 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
930 goto out;
931 if (net_ratelimit())
932 printk(KERN_WARNING "dst cache overflow\n");
933 RT_CACHE_STAT_INC(gc_dst_overflow);
934 return 1;
935
936work_done:
937 expire += ip_rt_gc_min_interval;
938 if (expire > ip_rt_gc_timeout ||
939 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
940 expire = ip_rt_gc_timeout;
941#if RT_CACHE_DEBUG >= 2
942 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
943 atomic_read(&ipv4_dst_ops.entries), goal, rover);
944#endif
945out: return 0;
946}
947
948static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
949{
950 struct rtable *rth, **rthp;
951 unsigned long now;
952 struct rtable *cand, **candp;
953 u32 min_score;
954 int chain_length;
955 int attempts = !in_softirq();
956
957restart:
958 chain_length = 0;
959 min_score = ~(u32)0;
960 cand = NULL;
961 candp = NULL;
962 now = jiffies;
963
964 rthp = &rt_hash_table[hash].chain;
965
Eric Dumazet22c047c2005-07-05 14:55:24 -0700966 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700967 while ((rth = *rthp) != NULL) {
Eric Dumazet29e75252008-01-31 17:05:09 -0800968 if (rth->rt_genid != atomic_read(&rt_genid)) {
969 *rthp = rth->u.dst.rt_next;
970 rt_free(rth);
971 continue;
972 }
Denis V. Lunevb5921912008-01-22 23:50:25 -0800973 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700974 /* Put it first */
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800975 *rthp = rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700976 /*
977 * Since lookup is lockfree, the deletion
978 * must be visible to another weakly ordered CPU before
979 * the insertion at the start of the hash chain.
980 */
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800981 rcu_assign_pointer(rth->u.dst.rt_next,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700982 rt_hash_table[hash].chain);
983 /*
984 * Since lookup is lockfree, the update writes
985 * must be ordered for consistency on SMP.
986 */
987 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
988
Pavel Emelyanov03f49f32007-11-10 21:28:34 -0800989 dst_use(&rth->u.dst, now);
Eric Dumazet22c047c2005-07-05 14:55:24 -0700990 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700991
992 rt_drop(rt);
993 *rp = rth;
994 return 0;
995 }
996
997 if (!atomic_read(&rth->u.dst.__refcnt)) {
998 u32 score = rt_score(rth);
999
1000 if (score <= min_score) {
1001 cand = rth;
1002 candp = rthp;
1003 min_score = score;
1004 }
1005 }
1006
1007 chain_length++;
1008
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001009 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001010 }
1011
1012 if (cand) {
1013 /* ip_rt_gc_elasticity used to be average length of chain
1014 * length, when exceeded gc becomes really aggressive.
1015 *
1016 * The second limit is less certain. At the moment it allows
1017 * only 2 entries per bucket. We will see.
1018 */
1019 if (chain_length > ip_rt_gc_elasticity) {
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001020 *candp = cand->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001021 rt_free(cand);
1022 }
1023 }
1024
1025 /* Try to bind route to arp only if it is output
1026 route or unicast forwarding path.
1027 */
1028 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1029 int err = arp_bind_neighbour(&rt->u.dst);
1030 if (err) {
Eric Dumazet22c047c2005-07-05 14:55:24 -07001031 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001032
1033 if (err != -ENOBUFS) {
1034 rt_drop(rt);
1035 return err;
1036 }
1037
1038 /* Neighbour tables are full and nothing
1039 can be released. Try to shrink route cache,
1040 it is most likely it holds some neighbour records.
1041 */
1042 if (attempts-- > 0) {
1043 int saved_elasticity = ip_rt_gc_elasticity;
1044 int saved_int = ip_rt_gc_min_interval;
1045 ip_rt_gc_elasticity = 1;
1046 ip_rt_gc_min_interval = 0;
Daniel Lezcano569d3642008-01-18 03:56:57 -08001047 rt_garbage_collect(&ipv4_dst_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001048 ip_rt_gc_min_interval = saved_int;
1049 ip_rt_gc_elasticity = saved_elasticity;
1050 goto restart;
1051 }
1052
1053 if (net_ratelimit())
1054 printk(KERN_WARNING "Neighbour table overflow.\n");
1055 rt_drop(rt);
1056 return -ENOBUFS;
1057 }
1058 }
1059
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001060 rt->u.dst.rt_next = rt_hash_table[hash].chain;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001061#if RT_CACHE_DEBUG >= 2
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001062 if (rt->u.dst.rt_next) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001063 struct rtable *trt;
1064 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1065 NIPQUAD(rt->rt_dst));
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001066 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001067 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1068 printk("\n");
1069 }
1070#endif
1071 rt_hash_table[hash].chain = rt;
Eric Dumazet22c047c2005-07-05 14:55:24 -07001072 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001073 *rp = rt;
1074 return 0;
1075}
1076
1077void rt_bind_peer(struct rtable *rt, int create)
1078{
1079 static DEFINE_SPINLOCK(rt_peer_lock);
1080 struct inet_peer *peer;
1081
1082 peer = inet_getpeer(rt->rt_dst, create);
1083
1084 spin_lock_bh(&rt_peer_lock);
1085 if (rt->peer == NULL) {
1086 rt->peer = peer;
1087 peer = NULL;
1088 }
1089 spin_unlock_bh(&rt_peer_lock);
1090 if (peer)
1091 inet_putpeer(peer);
1092}
1093
1094/*
1095 * Peer allocation may fail only in serious out-of-memory conditions. However
1096 * we still can generate some output.
1097 * Random ID selection looks a bit dangerous because we have no chances to
1098 * select ID being unique in a reasonable period of time.
1099 * But broken packet identifier may be better than no packet at all.
1100 */
1101static void ip_select_fb_ident(struct iphdr *iph)
1102{
1103 static DEFINE_SPINLOCK(ip_fb_id_lock);
1104 static u32 ip_fallback_id;
1105 u32 salt;
1106
1107 spin_lock_bh(&ip_fb_id_lock);
Al Viroe4485152006-09-26 22:15:01 -07001108 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001109 iph->id = htons(salt & 0xFFFF);
1110 ip_fallback_id = salt;
1111 spin_unlock_bh(&ip_fb_id_lock);
1112}
1113
1114void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1115{
1116 struct rtable *rt = (struct rtable *) dst;
1117
1118 if (rt) {
1119 if (rt->peer == NULL)
1120 rt_bind_peer(rt, 1);
1121
1122 /* If peer is attached to destination, it is never detached,
1123 so that we need not to grab a lock to dereference it.
1124 */
1125 if (rt->peer) {
1126 iph->id = htons(inet_getid(rt->peer, more));
1127 return;
1128 }
1129 } else
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001130 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
Stephen Hemminger9c2b3322005-04-19 22:39:42 -07001131 __builtin_return_address(0));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001132
1133 ip_select_fb_ident(iph);
1134}
1135
1136static void rt_del(unsigned hash, struct rtable *rt)
1137{
Eric Dumazet29e75252008-01-31 17:05:09 -08001138 struct rtable **rthp, *aux;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001139
Eric Dumazet29e75252008-01-31 17:05:09 -08001140 rthp = &rt_hash_table[hash].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -07001141 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001142 ip_rt_put(rt);
Eric Dumazet29e75252008-01-31 17:05:09 -08001143 while ((aux = *rthp) != NULL) {
1144 if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1145 *rthp = aux->u.dst.rt_next;
1146 rt_free(aux);
1147 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001148 }
Eric Dumazet29e75252008-01-31 17:05:09 -08001149 rthp = &aux->u.dst.rt_next;
1150 }
Eric Dumazet22c047c2005-07-05 14:55:24 -07001151 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001152}
1153
/*
 * Handle a redirect received on @dev: the sender @old_gw advises using
 * @new_gw for traffic from @saddr to @daddr.
 *
 * After sanity-checking the advice, the cache buckets for the four key
 * combinations {saddr, 0} x {dev->ifindex, 0} are searched.  For each
 * matching output route currently going via @old_gw, a copy pointing at
 * @new_gw is created and, if its neighbour is in a valid state, swapped
 * in for the old entry.  Rejected redirects are optionally logged.
 */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	__be32 skeys[2] = { saddr, 0 };
	int ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	/* The new gateway must differ from the old one and be a plausible host address. */
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
	    || ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);

			rthp=&rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				/* Skip entries whose lookup key, generation or netns does not match. */
				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    rth->rt_genid != atomic_read(&rt_genid) ||
				    !net_eq(dev_net(rth->u.dst.dev), net)) {
					rthp = &rth->u.dst.rt_next;
					continue;
				}

				/* Key matched but the route is not the one being redirected:
				 * stop scanning this bucket.
				 */
				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				/* Pin rth before leaving the RCU read-side section. */
				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				/* Then reset the per-entry state that must not be shared with rth. */
				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
				rt->u.dst.__use = 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child = NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete = 0;
				rt->u.dst.lastuse = jiffies;
				rt->u.dst.path = &rt->u.dst;
				rt->u.dst.neighbour = NULL;
				rt->u.dst.hh = NULL;
				rt->u.dst.xfrm = NULL;
				rt->rt_genid = atomic_read(&rt_genid);
				rt->rt_flags |= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway = new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				/* The copied peer pointer needs a reference of its own. */
				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				/* Only install the new route if its neighbour binds and is
				 * in a NUD_VALID state; otherwise probe it and give up.
				 */
				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->u.dst;
				netevent.new = &rt->u.dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				/* rt_del() also releases the reference taken by dst_hold() above. */
				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
			"%u.%u.%u.%u ignored.\n"
			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
		       NIPQUAD(saddr), NIPQUAD(daddr));
#endif
	in_dev_put(in_dev);
}
1288
1289static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1290{
Eric Dumazetee6b9672008-03-05 18:30:47 -08001291 struct rtable *rt = (struct rtable *)dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001292 struct dst_entry *ret = dst;
1293
1294 if (rt) {
1295 if (dst->obsolete) {
1296 ip_rt_put(rt);
1297 ret = NULL;
1298 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1299 rt->u.dst.expires) {
Al Viro8c7bc842006-09-26 21:26:19 -07001300 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1301 rt->fl.oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001302#if RT_CACHE_DEBUG >= 1
Denis V. Lunev56c99d02007-12-06 02:19:07 -08001303 printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
Linus Torvalds1da177e2005-04-16 15:20:36 -07001304 "%u.%u.%u.%u/%02x dropped\n",
1305 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1306#endif
1307 rt_del(hash, rt);
1308 ret = NULL;
1309 }
1310 }
1311 return ret;
1312}
1313
1314/*
1315 * Algorithm:
1316 * 1. The first ip_rt_redirect_number redirects are sent
1317 * with exponential backoff, then we stop sending them at all,
1318 * assuming that the host ignores our redirects.
1319 * 2. If we did not see packets requiring redirects
1320 * during ip_rt_redirect_silence, we assume that the host
1321 * forgot redirected route and start to send redirects again.
1322 *
1323 * This algorithm is much cheaper and more intelligent than dumb load limiting
1324 * in icmp.c.
1325 *
1326 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1327 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1328 */
1329
1330void ip_rt_send_redirect(struct sk_buff *skb)
1331{
Eric Dumazetee6b9672008-03-05 18:30:47 -08001332 struct rtable *rt = skb->rtable;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001333 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1334
1335 if (!in_dev)
1336 return;
1337
1338 if (!IN_DEV_TX_REDIRECTS(in_dev))
1339 goto out;
1340
1341 /* No redirected packets during ip_rt_redirect_silence;
1342 * reset the algorithm.
1343 */
1344 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1345 rt->u.dst.rate_tokens = 0;
1346
1347 /* Too many ignored redirects; do not send anything
1348 * set u.dst.rate_last to the last seen redirected packet.
1349 */
1350 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1351 rt->u.dst.rate_last = jiffies;
1352 goto out;
1353 }
1354
1355 /* Check for load limit; set rate_last to the latest sent
1356 * redirect.
1357 */
Li Yewang14fb8a72006-12-18 00:26:35 -08001358 if (rt->u.dst.rate_tokens == 0 ||
1359 time_after(jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001360 (rt->u.dst.rate_last +
1361 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1362 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1363 rt->u.dst.rate_last = jiffies;
1364 ++rt->u.dst.rate_tokens;
1365#ifdef CONFIG_IP_ROUTE_VERBOSE
1366 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1367 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1368 net_ratelimit())
1369 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1370 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1371 NIPQUAD(rt->rt_src), rt->rt_iif,
1372 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1373#endif
1374 }
1375out:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001376 in_dev_put(in_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001377}
1378
1379static int ip_error(struct sk_buff *skb)
1380{
Eric Dumazetee6b9672008-03-05 18:30:47 -08001381 struct rtable *rt = skb->rtable;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001382 unsigned long now;
1383 int code;
1384
1385 switch (rt->u.dst.error) {
1386 case EINVAL:
1387 default:
1388 goto out;
1389 case EHOSTUNREACH:
1390 code = ICMP_HOST_UNREACH;
1391 break;
1392 case ENETUNREACH:
1393 code = ICMP_NET_UNREACH;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08001394 IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001395 break;
1396 case EACCES:
1397 code = ICMP_PKT_FILTERED;
1398 break;
1399 }
1400
1401 now = jiffies;
1402 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1403 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1404 rt->u.dst.rate_tokens = ip_rt_error_burst;
1405 rt->u.dst.rate_last = now;
1406 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1407 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1408 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1409 }
1410
1411out: kfree_skb(skb);
1412 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001413}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001414
/*
 *	MTU plateau table, in descending order.
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau strictly below @old_mtu, or 68 when
 * @old_mtu is not above the smallest plateau.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	for (; plateau != end; plateau++) {
		if (*plateau < old_mtu)
			return *plateau;
	}
	return 68;
}
1432
/*
 * Process a "fragmentation needed" indication for the packet header @iph.
 *
 * Searches the output-route cache entries in @net for iph->daddr, keyed
 * both with iph->saddr and with a zero source, and lowers their MTU
 * metric.  @new_mtu is the advertised next-hop MTU; when it is below 68
 * or not below the original packet length, a value is guessed from the
 * plateau table instead.
 *
 * Returns 0 if PMTU discovery is disabled, the MTU estimated from a
 * matching route if any, and @new_mtu otherwise.
 */
unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
				 unsigned short new_mtu)
{
	int i;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	__be32 skeys[2] = { iph->saddr, 0, };
	__be32 daddr = iph->daddr;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (i = 0; i < 2; i++) {
		unsigned hash = rt_hash(daddr, skeys[i], 0);

		rcu_read_lock();
		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
		     rth = rcu_dereference(rth->u.dst.rt_next)) {
			/* Only current-generation output routes in this netns
			 * whose MTU metric is not locked are updated.
			 */
			if (rth->fl.fl4_dst == daddr &&
			    rth->fl.fl4_src == skeys[i] &&
			    rth->rt_dst == daddr &&
			    rth->rt_src == iph->saddr &&
			    rth->fl.iif == 0 &&
			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
			    net_eq(dev_net(rth->u.dst.dev), net) &&
			    rth->rt_genid == atomic_read(&rt_genid)) {
				unsigned short mtu = new_mtu;

				/* Advertised MTU is unusable: fall back to a guess. */
				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
						dst_confirm(&rth->u.dst);
						/* Clamp to ip_rt_min_pmtu and lock the metric. */
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
		}
		rcu_read_unlock();
	}
	/* GNU "?:" shorthand: est_mtu if non-zero, else new_mtu. */
	return est_mtu ? : new_mtu;
}
1492
1493static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1494{
1495 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1496 !(dst_metric_locked(dst, RTAX_MTU))) {
1497 if (mtu < ip_rt_min_pmtu) {
1498 mtu = ip_rt_min_pmtu;
1499 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1500 }
1501 dst->metrics[RTAX_MTU-1] = mtu;
1502 dst_set_expires(dst, ip_rt_mtu_expires);
Tom Tucker8d717402006-07-30 20:43:36 -07001503 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001504 }
1505}
1506
1507static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1508{
1509 return NULL;
1510}
1511
1512static void ipv4_dst_destroy(struct dst_entry *dst)
1513{
1514 struct rtable *rt = (struct rtable *) dst;
1515 struct inet_peer *peer = rt->peer;
1516 struct in_device *idev = rt->idev;
1517
1518 if (peer) {
1519 rt->peer = NULL;
1520 inet_putpeer(peer);
1521 }
1522
1523 if (idev) {
1524 rt->idev = NULL;
1525 in_dev_put(idev);
1526 }
1527}
1528
1529static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1530 int how)
1531{
1532 struct rtable *rt = (struct rtable *) dst;
1533 struct in_device *idev = rt->idev;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001534 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
Denis V. Lunev5a3e55d2007-12-07 00:38:10 -08001535 struct in_device *loopback_idev =
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001536 in_dev_get(dev_net(dev)->loopback_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001537 if (loopback_idev) {
1538 rt->idev = loopback_idev;
1539 in_dev_put(idev);
1540 }
1541 }
1542}
1543
1544static void ipv4_link_failure(struct sk_buff *skb)
1545{
1546 struct rtable *rt;
1547
1548 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1549
Eric Dumazetee6b9672008-03-05 18:30:47 -08001550 rt = skb->rtable;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001551 if (rt)
1552 dst_set_expires(&rt->u.dst, 0);
1553}
1554
1555static int ip_rt_bug(struct sk_buff *skb)
1556{
1557 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001558 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
Linus Torvalds1da177e2005-04-16 15:20:36 -07001559 skb->dev ? skb->dev->name : "?");
1560 kfree_skb(skb);
1561 return 0;
1562}
1563
1564/*
1565 We do not cache source address of outgoing interface,
1566 because it is used only by IP RR, TS and SRR options,
1567 so that it out of fast path.
1568
1569 BTW remember: "addr" is allowed to be not aligned
1570 in IP options!
1571 */
1572
1573void ip_rt_get_source(u8 *addr, struct rtable *rt)
1574{
Al Viroa61ced52006-09-26 21:27:54 -07001575 __be32 src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001576 struct fib_result res;
1577
1578 if (rt->fl.iif == 0)
1579 src = rt->rt_src;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001580 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001581 src = FIB_RES_PREFSRC(res);
1582 fib_res_put(&res);
1583 } else
1584 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1585 RT_SCOPE_UNIVERSE);
1586 memcpy(addr, &src, 4);
1587}
1588
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid, one 16-bit half at a time:
 * a half of tclassid that is still zero takes the matching half of @tag,
 * a half that is already set is left alone.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	struct dst_entry *dst = &rt->u.dst;

	if ((dst->tclassid & 0xFFFF) == 0)
		dst->tclassid |= tag & 0xFFFF;
	if ((dst->tclassid & 0xFFFF0000) == 0)
		dst->tclassid |= tag & 0xFFFF0000;
}
#endif
1598
1599static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1600{
1601 struct fib_info *fi = res->fi;
1602
1603 if (fi) {
1604 if (FIB_RES_GW(*res) &&
1605 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1606 rt->rt_gateway = FIB_RES_GW(*res);
1607 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1608 sizeof(rt->u.dst.metrics));
1609 if (fi->fib_mtu == 0) {
1610 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1611 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1612 rt->rt_gateway != rt->rt_dst &&
1613 rt->u.dst.dev->mtu > 576)
1614 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1615 }
1616#ifdef CONFIG_NET_CLS_ROUTE
1617 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1618#endif
1619 } else
1620 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1621
1622 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1623 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1624 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1625 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1626 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1627 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1628 ip_rt_min_advmss);
1629 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1630 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1631
1632#ifdef CONFIG_NET_CLS_ROUTE
1633#ifdef CONFIG_IP_MULTIPLE_TABLES
1634 set_class_tag(rt, fib_rules_tclass(res));
1635#endif
1636 set_class_tag(rt, itag);
1637#endif
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001638 rt->rt_type = res->type;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001639}
1640
/*
 * Build an input route cache entry for a packet with multicast
 * destination @daddr received on @dev and insert it into the cache;
 * the resulting entry is stored in skb->rtable by rt_intern_hash().
 *
 * @our non-zero makes the entry deliver locally (ip_local_deliver,
 * RTCF_LOCAL).
 *
 * Returns -EINVAL if @dev has no in_device or the source address fails
 * validation, -ENOBUFS if the entry cannot be allocated, otherwise the
 * result of rt_intern_hash().
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = in_dev_get(dev);
	u32 itag = 0;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	/* A zero-net source is accepted only for local multicast destinations. */
	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0,
					dev, &spec_dst, &itag) < 0)
		goto e_inval;

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	/* This entry is for input only; ip_rt_bug logs and drops on output. */
	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst = daddr;
	rth->fl.fl4_tos = tos;
	rth->fl.mark = skb->mark;
	rth->fl.fl4_src = saddr;
	rth->rt_src = saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif =
	rth->fl.iif = dev->ifindex;
	/* NOTE(review): this uses init_net's loopback device, whereas
	 * ipv4_dst_ifdown() uses dev_net(dev)->loopback_dev - confirm this is
	 * intended for devices in other network namespaces.
	 */
	rth->u.dst.dev = init_net.loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev = in_dev_get(rth->u.dst.dev);
	rth->fl.oif = 0;
	rth->rt_gateway = daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_genid = atomic_read(&rt_genid);
	rth->rt_flags = RTCF_MULTICAST;
	rth->rt_type = RTN_MULTICAST;
	if (our) {
		rth->u.dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	/* With multicast forwarding enabled, non-link-local groups go to ip_mr_input. */
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	in_dev_put(in_dev);
	hash = rt_hash(daddr, saddr, dev->ifindex);
	return rt_intern_hash(hash, rth, &skb->rtable);

e_nobufs:
	in_dev_put(in_dev);
	return -ENOBUFS;

e_inval:
	in_dev_put(in_dev);
	return -EINVAL;
}
1720
1721
1722static void ip_handle_martian_source(struct net_device *dev,
1723 struct in_device *in_dev,
1724 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07001725 __be32 daddr,
1726 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001727{
1728 RT_CACHE_STAT_INC(in_martian_src);
1729#ifdef CONFIG_IP_ROUTE_VERBOSE
1730 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1731 /*
1732 * RFC1812 recommendation, if source is martian,
1733 * the only hint is MAC header.
1734 */
1735 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1736 "%u.%u.%u.%u, on dev %s\n",
1737 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001738 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001739 int i;
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001740 const unsigned char *p = skb_mac_header(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001741 printk(KERN_WARNING "ll header: ");
1742 for (i = 0; i < dev->hard_header_len; i++, p++) {
1743 printk("%02x", *p);
1744 if (i < (dev->hard_header_len - 1))
1745 printk(":");
1746 }
1747 printk("\n");
1748 }
1749 }
1750#endif
1751}
1752
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001753static inline int __mkroute_input(struct sk_buff *skb,
1754 struct fib_result* res,
1755 struct in_device *in_dev,
Al Viro9e12bb22006-09-26 21:25:20 -07001756 __be32 daddr, __be32 saddr, u32 tos,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001757 struct rtable **result)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001758{
1759
1760 struct rtable *rth;
1761 int err;
1762 struct in_device *out_dev;
1763 unsigned flags = 0;
Al Virod9c9df82006-09-26 21:28:14 -07001764 __be32 spec_dst;
1765 u32 itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001766
1767 /* get a working reference to the output device */
1768 out_dev = in_dev_get(FIB_RES_DEV(*res));
1769 if (out_dev == NULL) {
1770 if (net_ratelimit())
1771 printk(KERN_CRIT "Bug in ip_route_input" \
1772 "_slow(). Please, report\n");
1773 return -EINVAL;
1774 }
1775
1776
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001777 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
Linus Torvalds1da177e2005-04-16 15:20:36 -07001778 in_dev->dev, &spec_dst, &itag);
1779 if (err < 0) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001780 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001781 saddr);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001782
Linus Torvalds1da177e2005-04-16 15:20:36 -07001783 err = -EINVAL;
1784 goto cleanup;
1785 }
1786
1787 if (err)
1788 flags |= RTCF_DIRECTSRC;
1789
Rami Rosencb7928a2008-01-09 00:18:24 -08001790 if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001791 (IN_DEV_SHARED_MEDIA(out_dev) ||
1792 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1793 flags |= RTCF_DOREDIRECT;
1794
1795 if (skb->protocol != htons(ETH_P_IP)) {
1796 /* Not IP (i.e. ARP). Do not create route, if it is
1797 * invalid for proxy arp. DNAT routes are always valid.
1798 */
Rami Rosencb7928a2008-01-09 00:18:24 -08001799 if (out_dev == in_dev) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001800 err = -EINVAL;
1801 goto cleanup;
1802 }
1803 }
1804
1805
1806 rth = dst_alloc(&ipv4_dst_ops);
1807 if (!rth) {
1808 err = -ENOBUFS;
1809 goto cleanup;
1810 }
1811
Julian Anastasovce723d82005-09-08 13:34:47 -07001812 atomic_set(&rth->u.dst.__refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001813 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07001814 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001815 rth->u.dst.flags |= DST_NOPOLICY;
Herbert Xu42f811b2007-06-04 23:34:44 -07001816 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001817 rth->u.dst.flags |= DST_NOXFRM;
1818 rth->fl.fl4_dst = daddr;
1819 rth->rt_dst = daddr;
1820 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001821 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001822 rth->fl.fl4_src = saddr;
1823 rth->rt_src = saddr;
1824 rth->rt_gateway = daddr;
1825 rth->rt_iif =
1826 rth->fl.iif = in_dev->dev->ifindex;
1827 rth->u.dst.dev = (out_dev)->dev;
1828 dev_hold(rth->u.dst.dev);
1829 rth->idev = in_dev_get(rth->u.dst.dev);
1830 rth->fl.oif = 0;
1831 rth->rt_spec_dst= spec_dst;
1832
1833 rth->u.dst.input = ip_forward;
1834 rth->u.dst.output = ip_output;
Eric Dumazet29e75252008-01-31 17:05:09 -08001835 rth->rt_genid = atomic_read(&rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001836
1837 rt_set_nexthop(rth, res, itag);
1838
1839 rth->rt_flags = flags;
1840
1841 *result = rth;
1842 err = 0;
1843 cleanup:
1844 /* release the working reference to the output device */
1845 in_dev_put(out_dev);
1846 return err;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001847}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001848
David S. Millere06e7c62007-06-10 17:22:39 -07001849static inline int ip_mkroute_input(struct sk_buff *skb,
1850 struct fib_result* res,
1851 const struct flowi *fl,
1852 struct in_device *in_dev,
1853 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001854{
Chuck Short7abaa272005-06-22 22:10:23 -07001855 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001856 int err;
1857 unsigned hash;
1858
1859#ifdef CONFIG_IP_ROUTE_MULTIPATH
1860 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1861 fib_select_multipath(fl, res);
1862#endif
1863
1864 /* create a routing cache entry */
1865 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1866 if (err)
1867 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001868
1869 /* put it into the cache */
Al Viro8c7bc842006-09-26 21:26:19 -07001870 hash = rt_hash(daddr, saddr, fl->iif);
Eric Dumazetee6b9672008-03-05 18:30:47 -08001871 return rt_intern_hash(hash, rth, &skb->rtable);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001872}
1873
/*
 *	NOTE. We drop all packets that have a local source address,
 *	because every properly looped-back packet must already have the
 *	correct destination attached by the output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */
1883
Al Viro9e12bb22006-09-26 21:25:20 -07001884static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001885 u8 tos, struct net_device *dev)
1886{
1887 struct fib_result res;
1888 struct in_device *in_dev = in_dev_get(dev);
1889 struct flowi fl = { .nl_u = { .ip4_u =
1890 { .daddr = daddr,
1891 .saddr = saddr,
1892 .tos = tos,
1893 .scope = RT_SCOPE_UNIVERSE,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001894 } },
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001895 .mark = skb->mark,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001896 .iif = dev->ifindex };
1897 unsigned flags = 0;
1898 u32 itag = 0;
1899 struct rtable * rth;
1900 unsigned hash;
Al Viro9e12bb22006-09-26 21:25:20 -07001901 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001902 int err = -EINVAL;
1903 int free_res = 0;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001904 struct net * net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001905
1906 /* IP on this device is disabled. */
1907
1908 if (!in_dev)
1909 goto out;
1910
1911 /* Check for the most weird martians, which can be not detected
1912 by fib_lookup.
1913 */
1914
Jan Engelhardt1e637c72008-01-21 03:18:08 -08001915 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08001916 ipv4_is_loopback(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001917 goto martian_source;
1918
Al Viroe4485152006-09-26 22:15:01 -07001919 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001920 goto brd_input;
1921
1922 /* Accept zero addresses only to limited broadcast;
1923 * I even do not know to fix it or not. Waiting for complains :-)
1924 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08001925 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001926 goto martian_source;
1927
Jan Engelhardt1e637c72008-01-21 03:18:08 -08001928 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08001929 ipv4_is_loopback(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001930 goto martian_destination;
1931
1932 /*
1933 * Now we are ready to route packet.
1934 */
Denis V. Lunev84a885f2008-01-21 17:34:35 -08001935 if ((err = fib_lookup(net, &fl, &res)) != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001936 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07001937 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001938 goto no_route;
1939 }
1940 free_res = 1;
1941
1942 RT_CACHE_STAT_INC(in_slow_tot);
1943
1944 if (res.type == RTN_BROADCAST)
1945 goto brd_input;
1946
1947 if (res.type == RTN_LOCAL) {
1948 int result;
1949 result = fib_validate_source(saddr, daddr, tos,
Denis V. Lunev84a885f2008-01-21 17:34:35 -08001950 net->loopback_dev->ifindex,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001951 dev, &spec_dst, &itag);
1952 if (result < 0)
1953 goto martian_source;
1954 if (result)
1955 flags |= RTCF_DIRECTSRC;
1956 spec_dst = daddr;
1957 goto local_input;
1958 }
1959
1960 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07001961 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001962 if (res.type != RTN_UNICAST)
1963 goto martian_destination;
1964
1965 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001966done:
1967 in_dev_put(in_dev);
1968 if (free_res)
1969 fib_res_put(&res);
1970out: return err;
1971
1972brd_input:
1973 if (skb->protocol != htons(ETH_P_IP))
1974 goto e_inval;
1975
Joe Perchesf97c1e02007-12-16 13:45:43 -08001976 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001977 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1978 else {
1979 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1980 &itag);
1981 if (err < 0)
1982 goto martian_source;
1983 if (err)
1984 flags |= RTCF_DIRECTSRC;
1985 }
1986 flags |= RTCF_BROADCAST;
1987 res.type = RTN_BROADCAST;
1988 RT_CACHE_STAT_INC(in_brd);
1989
1990local_input:
1991 rth = dst_alloc(&ipv4_dst_ops);
1992 if (!rth)
1993 goto e_nobufs;
1994
1995 rth->u.dst.output= ip_rt_bug;
Eric Dumazet29e75252008-01-31 17:05:09 -08001996 rth->rt_genid = atomic_read(&rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001997
1998 atomic_set(&rth->u.dst.__refcnt, 1);
1999 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07002000 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002001 rth->u.dst.flags |= DST_NOPOLICY;
2002 rth->fl.fl4_dst = daddr;
2003 rth->rt_dst = daddr;
2004 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002005 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002006 rth->fl.fl4_src = saddr;
2007 rth->rt_src = saddr;
2008#ifdef CONFIG_NET_CLS_ROUTE
2009 rth->u.dst.tclassid = itag;
2010#endif
2011 rth->rt_iif =
2012 rth->fl.iif = dev->ifindex;
Denis V. Lunev84a885f2008-01-21 17:34:35 -08002013 rth->u.dst.dev = net->loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002014 dev_hold(rth->u.dst.dev);
2015 rth->idev = in_dev_get(rth->u.dst.dev);
2016 rth->rt_gateway = daddr;
2017 rth->rt_spec_dst= spec_dst;
2018 rth->u.dst.input= ip_local_deliver;
2019 rth->rt_flags = flags|RTCF_LOCAL;
2020 if (res.type == RTN_UNREACHABLE) {
2021 rth->u.dst.input= ip_error;
2022 rth->u.dst.error= -err;
2023 rth->rt_flags &= ~RTCF_LOCAL;
2024 }
2025 rth->rt_type = res.type;
Al Viro8c7bc842006-09-26 21:26:19 -07002026 hash = rt_hash(daddr, saddr, fl.iif);
Eric Dumazetee6b9672008-03-05 18:30:47 -08002027 err = rt_intern_hash(hash, rth, &skb->rtable);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002028 goto done;
2029
2030no_route:
2031 RT_CACHE_STAT_INC(in_no_route);
2032 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2033 res.type = RTN_UNREACHABLE;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08002034 if (err == -ESRCH)
2035 err = -ENETUNREACH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002036 goto local_input;
2037
2038 /*
2039 * Do not cache martian addresses: they should be logged (RFC1812)
2040 */
2041martian_destination:
2042 RT_CACHE_STAT_INC(in_martian_dst);
2043#ifdef CONFIG_IP_ROUTE_VERBOSE
2044 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2045 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2046 "%u.%u.%u.%u, dev %s\n",
2047 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2048#endif
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002049
2050e_hostunreach:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002051 err = -EHOSTUNREACH;
2052 goto done;
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002053
Linus Torvalds1da177e2005-04-16 15:20:36 -07002054e_inval:
2055 err = -EINVAL;
2056 goto done;
2057
2058e_nobufs:
2059 err = -ENOBUFS;
2060 goto done;
2061
2062martian_source:
2063 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2064 goto e_inval;
2065}
2066
Al Viro9e12bb22006-09-26 21:25:20 -07002067int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002068 u8 tos, struct net_device *dev)
2069{
2070 struct rtable * rth;
2071 unsigned hash;
2072 int iif = dev->ifindex;
Denis V. Lunevb5921912008-01-22 23:50:25 -08002073 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002074
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002075 net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002076 tos &= IPTOS_RT_MASK;
Al Viro8c7bc842006-09-26 21:26:19 -07002077 hash = rt_hash(daddr, saddr, iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002078
2079 rcu_read_lock();
2080 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08002081 rth = rcu_dereference(rth->u.dst.rt_next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002082 if (rth->fl.fl4_dst == daddr &&
2083 rth->fl.fl4_src == saddr &&
2084 rth->fl.iif == iif &&
2085 rth->fl.oif == 0 &&
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002086 rth->fl.mark == skb->mark &&
Denis V. Lunevb5921912008-01-22 23:50:25 -08002087 rth->fl.fl4_tos == tos &&
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09002088 net_eq(dev_net(rth->u.dst.dev), net) &&
Eric Dumazet29e75252008-01-31 17:05:09 -08002089 rth->rt_genid == atomic_read(&rt_genid)) {
Pavel Emelyanov03f49f32007-11-10 21:28:34 -08002090 dst_use(&rth->u.dst, jiffies);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002091 RT_CACHE_STAT_INC(in_hit);
2092 rcu_read_unlock();
Eric Dumazetee6b9672008-03-05 18:30:47 -08002093 skb->rtable = rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002094 return 0;
2095 }
2096 RT_CACHE_STAT_INC(in_hlist_search);
2097 }
2098 rcu_read_unlock();
2099
2100 /* Multicast recognition logic is moved from route cache to here.
2101 The problem was that too many Ethernet cards have broken/missing
2102 hardware multicast filters :-( As result the host on multicasting
2103 network acquires a lot of useless route cache entries, sort of
2104 SDR messages from all the world. Now we try to get rid of them.
2105 Really, provided software IP multicast filter is organized
2106 reasonably (at least, hashed), it does not result in a slowdown
2107 comparing with route cache reject entries.
2108 Note, that multicast routers are not affected, because
2109 route cache entry is created eventually.
2110 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002111 if (ipv4_is_multicast(daddr)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002112 struct in_device *in_dev;
2113
2114 rcu_read_lock();
Herbert Xue5ed6392005-10-03 14:35:55 -07002115 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002116 int our = ip_check_mc(in_dev, daddr, saddr,
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07002117 ip_hdr(skb)->protocol);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002118 if (our
2119#ifdef CONFIG_IP_MROUTE
Joe Perchesf97c1e02007-12-16 13:45:43 -08002120 || (!ipv4_is_local_multicast(daddr) &&
2121 IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002122#endif
2123 ) {
2124 rcu_read_unlock();
2125 return ip_route_input_mc(skb, daddr, saddr,
2126 tos, dev, our);
2127 }
2128 }
2129 rcu_read_unlock();
2130 return -EINVAL;
2131 }
2132 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2133}
2134
2135static inline int __mkroute_output(struct rtable **result,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002136 struct fib_result* res,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002137 const struct flowi *fl,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002138 const struct flowi *oldflp,
2139 struct net_device *dev_out,
2140 unsigned flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002141{
2142 struct rtable *rth;
2143 struct in_device *in_dev;
2144 u32 tos = RT_FL_TOS(oldflp);
2145 int err = 0;
2146
Joe Perchesf97c1e02007-12-16 13:45:43 -08002147 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002148 return -EINVAL;
2149
Al Viroe4485152006-09-26 22:15:01 -07002150 if (fl->fl4_dst == htonl(0xFFFFFFFF))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002151 res->type = RTN_BROADCAST;
Joe Perchesf97c1e02007-12-16 13:45:43 -08002152 else if (ipv4_is_multicast(fl->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002153 res->type = RTN_MULTICAST;
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002154 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002155 return -EINVAL;
2156
2157 if (dev_out->flags & IFF_LOOPBACK)
2158 flags |= RTCF_LOCAL;
2159
2160 /* get work reference to inet device */
2161 in_dev = in_dev_get(dev_out);
2162 if (!in_dev)
2163 return -EINVAL;
2164
2165 if (res->type == RTN_BROADCAST) {
2166 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2167 if (res->fi) {
2168 fib_info_put(res->fi);
2169 res->fi = NULL;
2170 }
2171 } else if (res->type == RTN_MULTICAST) {
2172 flags |= RTCF_MULTICAST|RTCF_LOCAL;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002173 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002174 oldflp->proto))
2175 flags &= ~RTCF_LOCAL;
2176 /* If multicast route do not exist use
2177 default one, but do not gateway in this case.
2178 Yes, it is hack.
2179 */
2180 if (res->fi && res->prefixlen < 4) {
2181 fib_info_put(res->fi);
2182 res->fi = NULL;
2183 }
2184 }
2185
2186
2187 rth = dst_alloc(&ipv4_dst_ops);
2188 if (!rth) {
2189 err = -ENOBUFS;
2190 goto cleanup;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002191 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002192
Julian Anastasovce723d82005-09-08 13:34:47 -07002193 atomic_set(&rth->u.dst.__refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002194 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07002195 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002196 rth->u.dst.flags |= DST_NOXFRM;
Herbert Xu42f811b2007-06-04 23:34:44 -07002197 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002198 rth->u.dst.flags |= DST_NOPOLICY;
2199
2200 rth->fl.fl4_dst = oldflp->fl4_dst;
2201 rth->fl.fl4_tos = tos;
2202 rth->fl.fl4_src = oldflp->fl4_src;
2203 rth->fl.oif = oldflp->oif;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002204 rth->fl.mark = oldflp->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002205 rth->rt_dst = fl->fl4_dst;
2206 rth->rt_src = fl->fl4_src;
2207 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002208 /* get references to the devices that are to be hold by the routing
Linus Torvalds1da177e2005-04-16 15:20:36 -07002209 cache entry */
2210 rth->u.dst.dev = dev_out;
2211 dev_hold(dev_out);
2212 rth->idev = in_dev_get(dev_out);
2213 rth->rt_gateway = fl->fl4_dst;
2214 rth->rt_spec_dst= fl->fl4_src;
2215
2216 rth->u.dst.output=ip_output;
Eric Dumazet29e75252008-01-31 17:05:09 -08002217 rth->rt_genid = atomic_read(&rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002218
2219 RT_CACHE_STAT_INC(out_slow_tot);
2220
2221 if (flags & RTCF_LOCAL) {
2222 rth->u.dst.input = ip_local_deliver;
2223 rth->rt_spec_dst = fl->fl4_dst;
2224 }
2225 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2226 rth->rt_spec_dst = fl->fl4_src;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002227 if (flags & RTCF_LOCAL &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002228 !(dev_out->flags & IFF_LOOPBACK)) {
2229 rth->u.dst.output = ip_mc_output;
2230 RT_CACHE_STAT_INC(out_slow_mc);
2231 }
2232#ifdef CONFIG_IP_MROUTE
2233 if (res->type == RTN_MULTICAST) {
2234 if (IN_DEV_MFORWARD(in_dev) &&
Joe Perchesf97c1e02007-12-16 13:45:43 -08002235 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002236 rth->u.dst.input = ip_mr_input;
2237 rth->u.dst.output = ip_mc_output;
2238 }
2239 }
2240#endif
2241 }
2242
2243 rt_set_nexthop(rth, res, 0);
2244
2245 rth->rt_flags = flags;
2246
2247 *result = rth;
2248 cleanup:
2249 /* release work reference to inet device */
2250 in_dev_put(in_dev);
2251
2252 return err;
2253}
2254
David S. Millere06e7c62007-06-10 17:22:39 -07002255static inline int ip_mkroute_output(struct rtable **rp,
2256 struct fib_result* res,
2257 const struct flowi *fl,
2258 const struct flowi *oldflp,
2259 struct net_device *dev_out,
2260 unsigned flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002261{
Chuck Short7abaa272005-06-22 22:10:23 -07002262 struct rtable *rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002263 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2264 unsigned hash;
2265 if (err == 0) {
Al Viro8c7bc842006-09-26 21:26:19 -07002266 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002267 err = rt_intern_hash(hash, rth, rp);
2268 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002269
Linus Torvalds1da177e2005-04-16 15:20:36 -07002270 return err;
2271}
2272
Linus Torvalds1da177e2005-04-16 15:20:36 -07002273/*
2274 * Major route resolver routine.
2275 */
2276
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002277static int ip_route_output_slow(struct net *net, struct rtable **rp,
2278 const struct flowi *oldflp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002279{
2280 u32 tos = RT_FL_TOS(oldflp);
2281 struct flowi fl = { .nl_u = { .ip4_u =
2282 { .daddr = oldflp->fl4_dst,
2283 .saddr = oldflp->fl4_src,
2284 .tos = tos & IPTOS_RT_MASK,
2285 .scope = ((tos & RTO_ONLINK) ?
2286 RT_SCOPE_LINK :
2287 RT_SCOPE_UNIVERSE),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002288 } },
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002289 .mark = oldflp->mark,
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002290 .iif = net->loopback_dev->ifindex,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002291 .oif = oldflp->oif };
2292 struct fib_result res;
2293 unsigned flags = 0;
2294 struct net_device *dev_out = NULL;
2295 int free_res = 0;
2296 int err;
2297
2298
2299 res.fi = NULL;
2300#ifdef CONFIG_IP_MULTIPLE_TABLES
2301 res.r = NULL;
2302#endif
2303
2304 if (oldflp->fl4_src) {
2305 err = -EINVAL;
Joe Perchesf97c1e02007-12-16 13:45:43 -08002306 if (ipv4_is_multicast(oldflp->fl4_src) ||
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002307 ipv4_is_lbcast(oldflp->fl4_src) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002308 ipv4_is_zeronet(oldflp->fl4_src))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002309 goto out;
2310
2311 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002312 dev_out = ip_dev_find(net, oldflp->fl4_src);
David S. Millerf6c5d732007-05-18 02:07:50 -07002313 if (dev_out == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002314 goto out;
2315
2316 /* I removed check for oif == dev_out->oif here.
2317 It was wrong for two reasons:
Denis V. Lunev1ab35272008-01-22 22:04:30 -08002318 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2319 is assigned to multiple interfaces.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002320 2. Moreover, we are allowed to send packets with saddr
2321 of another iface. --ANK
2322 */
2323
David S. Millerf6c5d732007-05-18 02:07:50 -07002324 if (oldflp->oif == 0
Joe Perchesf97c1e02007-12-16 13:45:43 -08002325 && (ipv4_is_multicast(oldflp->fl4_dst) ||
2326 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002327 /* Special hack: user can direct multicasts
2328 and limited broadcast via necessary interface
2329 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2330 This hack is not just for fun, it allows
2331 vic,vat and friends to work.
2332 They bind socket to loopback, set ttl to zero
2333 and expect that it will work.
2334 From the viewpoint of routing cache they are broken,
2335 because we are not allowed to build multicast path
2336 with loopback source addr (look, routing cache
2337 cannot know, that ttl is zero, so that packet
2338 will not leave this host and route is valid).
2339 Luckily, this hack is good workaround.
2340 */
2341
2342 fl.oif = dev_out->ifindex;
2343 goto make_route;
2344 }
2345 if (dev_out)
2346 dev_put(dev_out);
2347 dev_out = NULL;
2348 }
2349
2350
2351 if (oldflp->oif) {
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002352 dev_out = dev_get_by_index(net, oldflp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002353 err = -ENODEV;
2354 if (dev_out == NULL)
2355 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002356
2357 /* RACE: Check return value of inet_select_addr instead. */
2358 if (__in_dev_get_rtnl(dev_out) == NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002359 dev_put(dev_out);
2360 goto out; /* Wrong error code */
2361 }
2362
Joe Perchesf97c1e02007-12-16 13:45:43 -08002363 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2364 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002365 if (!fl.fl4_src)
2366 fl.fl4_src = inet_select_addr(dev_out, 0,
2367 RT_SCOPE_LINK);
2368 goto make_route;
2369 }
2370 if (!fl.fl4_src) {
Joe Perchesf97c1e02007-12-16 13:45:43 -08002371 if (ipv4_is_multicast(oldflp->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002372 fl.fl4_src = inet_select_addr(dev_out, 0,
2373 fl.fl4_scope);
2374 else if (!oldflp->fl4_dst)
2375 fl.fl4_src = inet_select_addr(dev_out, 0,
2376 RT_SCOPE_HOST);
2377 }
2378 }
2379
2380 if (!fl.fl4_dst) {
2381 fl.fl4_dst = fl.fl4_src;
2382 if (!fl.fl4_dst)
2383 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2384 if (dev_out)
2385 dev_put(dev_out);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002386 dev_out = net->loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002387 dev_hold(dev_out);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002388 fl.oif = net->loopback_dev->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002389 res.type = RTN_LOCAL;
2390 flags |= RTCF_LOCAL;
2391 goto make_route;
2392 }
2393
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002394 if (fib_lookup(net, &fl, &res)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002395 res.fi = NULL;
2396 if (oldflp->oif) {
2397 /* Apparently, routing tables are wrong. Assume,
2398 that the destination is on link.
2399
2400 WHY? DW.
2401 Because we are allowed to send to iface
2402 even if it has NO routes and NO assigned
2403 addresses. When oif is specified, routing
2404 tables are looked up with only one purpose:
2405 to catch if destination is gatewayed, rather than
2406 direct. Moreover, if MSG_DONTROUTE is set,
2407 we send packet, ignoring both routing tables
2408 and ifaddr state. --ANK
2409
2410
2411 We could make it even if oif is unknown,
2412 likely IPv6, but we do not.
2413 */
2414
2415 if (fl.fl4_src == 0)
2416 fl.fl4_src = inet_select_addr(dev_out, 0,
2417 RT_SCOPE_LINK);
2418 res.type = RTN_UNICAST;
2419 goto make_route;
2420 }
2421 if (dev_out)
2422 dev_put(dev_out);
2423 err = -ENETUNREACH;
2424 goto out;
2425 }
2426 free_res = 1;
2427
2428 if (res.type == RTN_LOCAL) {
2429 if (!fl.fl4_src)
2430 fl.fl4_src = fl.fl4_dst;
2431 if (dev_out)
2432 dev_put(dev_out);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002433 dev_out = net->loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002434 dev_hold(dev_out);
2435 fl.oif = dev_out->ifindex;
2436 if (res.fi)
2437 fib_info_put(res.fi);
2438 res.fi = NULL;
2439 flags |= RTCF_LOCAL;
2440 goto make_route;
2441 }
2442
2443#ifdef CONFIG_IP_ROUTE_MULTIPATH
2444 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2445 fib_select_multipath(&fl, &res);
2446 else
2447#endif
2448 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002449 fib_select_default(net, &fl, &res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002450
2451 if (!fl.fl4_src)
2452 fl.fl4_src = FIB_RES_PREFSRC(res);
2453
2454 if (dev_out)
2455 dev_put(dev_out);
2456 dev_out = FIB_RES_DEV(res);
2457 dev_hold(dev_out);
2458 fl.oif = dev_out->ifindex;
2459
2460
2461make_route:
2462 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2463
2464
2465 if (free_res)
2466 fib_res_put(&res);
2467 if (dev_out)
2468 dev_put(dev_out);
2469out: return err;
2470}
2471
Denis V. Lunev611c1832008-01-22 22:06:48 -08002472int __ip_route_output_key(struct net *net, struct rtable **rp,
2473 const struct flowi *flp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002474{
2475 unsigned hash;
2476 struct rtable *rth;
2477
Al Viro8c7bc842006-09-26 21:26:19 -07002478 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002479
2480 rcu_read_lock_bh();
2481 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08002482 rth = rcu_dereference(rth->u.dst.rt_next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002483 if (rth->fl.fl4_dst == flp->fl4_dst &&
2484 rth->fl.fl4_src == flp->fl4_src &&
2485 rth->fl.iif == 0 &&
2486 rth->fl.oif == flp->oif &&
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002487 rth->fl.mark == flp->mark &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002488 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
Denis V. Lunevb5921912008-01-22 23:50:25 -08002489 (IPTOS_RT_MASK | RTO_ONLINK)) &&
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09002490 net_eq(dev_net(rth->u.dst.dev), net) &&
Eric Dumazet29e75252008-01-31 17:05:09 -08002491 rth->rt_genid == atomic_read(&rt_genid)) {
Pavel Emelyanov03f49f32007-11-10 21:28:34 -08002492 dst_use(&rth->u.dst, jiffies);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002493 RT_CACHE_STAT_INC(out_hit);
2494 rcu_read_unlock_bh();
2495 *rp = rth;
2496 return 0;
2497 }
2498 RT_CACHE_STAT_INC(out_hlist_search);
2499 }
2500 rcu_read_unlock_bh();
2501
Denis V. Lunev611c1832008-01-22 22:06:48 -08002502 return ip_route_output_slow(net, rp, flp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002503}
2504
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002505EXPORT_SYMBOL_GPL(__ip_route_output_key);
2506
David S. Miller14e50e52007-05-24 18:17:54 -07002507static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2508{
2509}
2510
2511static struct dst_ops ipv4_dst_blackhole_ops = {
2512 .family = AF_INET,
2513 .protocol = __constant_htons(ETH_P_IP),
2514 .destroy = ipv4_dst_destroy,
2515 .check = ipv4_dst_check,
2516 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2517 .entry_size = sizeof(struct rtable),
Eric Dumazete2422972008-01-30 20:07:45 -08002518 .entries = ATOMIC_INIT(0),
David S. Miller14e50e52007-05-24 18:17:54 -07002519};
2520
2521
Denis V. Lunevce259992008-03-22 17:42:37 -07002522static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
David S. Miller14e50e52007-05-24 18:17:54 -07002523{
2524 struct rtable *ort = *rp;
2525 struct rtable *rt = (struct rtable *)
2526 dst_alloc(&ipv4_dst_blackhole_ops);
2527
2528 if (rt) {
2529 struct dst_entry *new = &rt->u.dst;
2530
2531 atomic_set(&new->__refcnt, 1);
2532 new->__use = 1;
Herbert Xu352e5122007-11-13 21:34:06 -08002533 new->input = dst_discard;
2534 new->output = dst_discard;
David S. Miller14e50e52007-05-24 18:17:54 -07002535 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2536
2537 new->dev = ort->u.dst.dev;
2538 if (new->dev)
2539 dev_hold(new->dev);
2540
2541 rt->fl = ort->fl;
2542
2543 rt->idev = ort->idev;
2544 if (rt->idev)
2545 in_dev_hold(rt->idev);
Eric Dumazet29e75252008-01-31 17:05:09 -08002546 rt->rt_genid = atomic_read(&rt_genid);
David S. Miller14e50e52007-05-24 18:17:54 -07002547 rt->rt_flags = ort->rt_flags;
2548 rt->rt_type = ort->rt_type;
2549 rt->rt_dst = ort->rt_dst;
2550 rt->rt_src = ort->rt_src;
2551 rt->rt_iif = ort->rt_iif;
2552 rt->rt_gateway = ort->rt_gateway;
2553 rt->rt_spec_dst = ort->rt_spec_dst;
2554 rt->peer = ort->peer;
2555 if (rt->peer)
2556 atomic_inc(&rt->peer->refcnt);
2557
2558 dst_free(new);
2559 }
2560
2561 dst_release(&(*rp)->u.dst);
2562 *rp = rt;
2563 return (rt ? 0 : -ENOMEM);
2564}
2565
Denis V. Lunevf1b050b2008-01-22 22:07:10 -08002566int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2567 struct sock *sk, int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002568{
2569 int err;
2570
Denis V. Lunevf1b050b2008-01-22 22:07:10 -08002571 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002572 return err;
2573
2574 if (flp->proto) {
2575 if (!flp->fl4_src)
2576 flp->fl4_src = (*rp)->rt_src;
2577 if (!flp->fl4_dst)
2578 flp->fl4_dst = (*rp)->rt_dst;
Herbert Xubb728452007-12-12 18:48:58 -08002579 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2580 flags ? XFRM_LOOKUP_WAIT : 0);
David S. Miller14e50e52007-05-24 18:17:54 -07002581 if (err == -EREMOTE)
Denis V. Lunevce259992008-03-22 17:42:37 -07002582 err = ipv4_dst_blackhole(rp, flp);
David S. Miller14e50e52007-05-24 18:17:54 -07002583
2584 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002585 }
2586
2587 return 0;
2588}
2589
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002590EXPORT_SYMBOL_GPL(ip_route_output_flow);
2591
Denis V. Lunevf2063512008-01-22 22:07:34 -08002592int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002593{
Denis V. Lunevf2063512008-01-22 22:07:34 -08002594 return ip_route_output_flow(net, rp, flp, NULL, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002595}
2596
2597static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002598 int nowait, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002599{
Eric Dumazetee6b9672008-03-05 18:30:47 -08002600 struct rtable *rt = skb->rtable;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002601 struct rtmsg *r;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002602 struct nlmsghdr *nlh;
Thomas Grafe3703b32006-11-27 09:27:07 -08002603 long expires;
2604 u32 id = 0, ts = 0, tsage = 0, error;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002605
2606 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2607 if (nlh == NULL)
Patrick McHardy26932562007-01-31 23:16:40 -08002608 return -EMSGSIZE;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002609
2610 r = nlmsg_data(nlh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002611 r->rtm_family = AF_INET;
2612 r->rtm_dst_len = 32;
2613 r->rtm_src_len = 0;
2614 r->rtm_tos = rt->fl.fl4_tos;
2615 r->rtm_table = RT_TABLE_MAIN;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002616 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002617 r->rtm_type = rt->rt_type;
2618 r->rtm_scope = RT_SCOPE_UNIVERSE;
2619 r->rtm_protocol = RTPROT_UNSPEC;
2620 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2621 if (rt->rt_flags & RTCF_NOTIFY)
2622 r->rtm_flags |= RTM_F_NOTIFY;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002623
Al Viro17fb2c62006-09-26 22:15:25 -07002624 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002625
Linus Torvalds1da177e2005-04-16 15:20:36 -07002626 if (rt->fl.fl4_src) {
2627 r->rtm_src_len = 32;
Al Viro17fb2c62006-09-26 22:15:25 -07002628 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002629 }
2630 if (rt->u.dst.dev)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002631 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002632#ifdef CONFIG_NET_CLS_ROUTE
2633 if (rt->u.dst.tclassid)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002634 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002635#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002636 if (rt->fl.iif)
Al Viro17fb2c62006-09-26 22:15:25 -07002637 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002638 else if (rt->rt_src != rt->fl.fl4_src)
Al Viro17fb2c62006-09-26 22:15:25 -07002639 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002640
Linus Torvalds1da177e2005-04-16 15:20:36 -07002641 if (rt->rt_dst != rt->rt_gateway)
Al Viro17fb2c62006-09-26 22:15:25 -07002642 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002643
Linus Torvalds1da177e2005-04-16 15:20:36 -07002644 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002645 goto nla_put_failure;
2646
Thomas Grafe3703b32006-11-27 09:27:07 -08002647 error = rt->u.dst.error;
2648 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002649 if (rt->peer) {
Thomas Grafe3703b32006-11-27 09:27:07 -08002650 id = rt->peer->ip_id_count;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002651 if (rt->peer->tcp_ts_stamp) {
Thomas Grafe3703b32006-11-27 09:27:07 -08002652 ts = rt->peer->tcp_ts;
James Morris9d729f72007-03-04 16:12:44 -08002653 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002654 }
2655 }
Thomas Grafbe403ea2006-08-17 18:15:17 -07002656
Linus Torvalds1da177e2005-04-16 15:20:36 -07002657 if (rt->fl.iif) {
2658#ifdef CONFIG_IP_MROUTE
Al Viroe4485152006-09-26 22:15:01 -07002659 __be32 dst = rt->rt_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002660
Joe Perchesf97c1e02007-12-16 13:45:43 -08002661 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
Pavel Emelyanov586f1212007-12-16 13:32:48 -08002662 IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002663 int err = ipmr_get_route(skb, r, nowait);
2664 if (err <= 0) {
2665 if (!nowait) {
2666 if (err == 0)
2667 return 0;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002668 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002669 } else {
2670 if (err == -EMSGSIZE)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002671 goto nla_put_failure;
Thomas Grafe3703b32006-11-27 09:27:07 -08002672 error = err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002673 }
2674 }
2675 } else
2676#endif
Thomas Grafbe403ea2006-08-17 18:15:17 -07002677 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002678 }
2679
Thomas Grafe3703b32006-11-27 09:27:07 -08002680 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2681 expires, error) < 0)
2682 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002683
Thomas Grafbe403ea2006-08-17 18:15:17 -07002684 return nlmsg_end(skb, nlh);
2685
2686nla_put_failure:
Patrick McHardy26932562007-01-31 23:16:40 -08002687 nlmsg_cancel(skb, nlh);
2688 return -EMSGSIZE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002689}
2690
Thomas Graf63f34442007-03-22 11:55:17 -07002691static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002692{
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09002693 struct net *net = sock_net(in_skb->sk);
Thomas Grafd889ce32006-08-17 18:15:44 -07002694 struct rtmsg *rtm;
2695 struct nlattr *tb[RTA_MAX+1];
Linus Torvalds1da177e2005-04-16 15:20:36 -07002696 struct rtable *rt = NULL;
Al Viro9e12bb22006-09-26 21:25:20 -07002697 __be32 dst = 0;
2698 __be32 src = 0;
2699 u32 iif;
Thomas Grafd889ce32006-08-17 18:15:44 -07002700 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002701 struct sk_buff *skb;
2702
Thomas Grafd889ce32006-08-17 18:15:44 -07002703 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2704 if (err < 0)
2705 goto errout;
2706
2707 rtm = nlmsg_data(nlh);
2708
Linus Torvalds1da177e2005-04-16 15:20:36 -07002709 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
Thomas Grafd889ce32006-08-17 18:15:44 -07002710 if (skb == NULL) {
2711 err = -ENOBUFS;
2712 goto errout;
2713 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002714
2715 /* Reserve room for dummy headers, this skb can pass
2716 through good chunk of routing engine.
2717 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07002718 skb_reset_mac_header(skb);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07002719 skb_reset_network_header(skb);
Stephen Hemmingerd2c962b2006-04-17 17:27:11 -07002720
2721 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07002722 ip_hdr(skb)->protocol = IPPROTO_ICMP;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002723 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2724
Al Viro17fb2c62006-09-26 22:15:25 -07002725 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2726 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
Thomas Grafd889ce32006-08-17 18:15:44 -07002727 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002728
2729 if (iif) {
Thomas Grafd889ce32006-08-17 18:15:44 -07002730 struct net_device *dev;
2731
Denis V. Lunev19375042008-02-28 20:52:04 -08002732 dev = __dev_get_by_index(net, iif);
Thomas Grafd889ce32006-08-17 18:15:44 -07002733 if (dev == NULL) {
2734 err = -ENODEV;
2735 goto errout_free;
2736 }
2737
Linus Torvalds1da177e2005-04-16 15:20:36 -07002738 skb->protocol = htons(ETH_P_IP);
2739 skb->dev = dev;
2740 local_bh_disable();
2741 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2742 local_bh_enable();
Thomas Grafd889ce32006-08-17 18:15:44 -07002743
Eric Dumazetee6b9672008-03-05 18:30:47 -08002744 rt = skb->rtable;
Thomas Grafd889ce32006-08-17 18:15:44 -07002745 if (err == 0 && rt->u.dst.error)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002746 err = -rt->u.dst.error;
2747 } else {
Thomas Grafd889ce32006-08-17 18:15:44 -07002748 struct flowi fl = {
2749 .nl_u = {
2750 .ip4_u = {
2751 .daddr = dst,
2752 .saddr = src,
2753 .tos = rtm->rtm_tos,
2754 },
2755 },
2756 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2757 };
Denis V. Lunev19375042008-02-28 20:52:04 -08002758 err = ip_route_output_key(net, &rt, &fl);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002759 }
Thomas Grafd889ce32006-08-17 18:15:44 -07002760
Linus Torvalds1da177e2005-04-16 15:20:36 -07002761 if (err)
Thomas Grafd889ce32006-08-17 18:15:44 -07002762 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002763
Eric Dumazetee6b9672008-03-05 18:30:47 -08002764 skb->rtable = rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002765 if (rtm->rtm_flags & RTM_F_NOTIFY)
2766 rt->rt_flags |= RTCF_NOTIFY;
2767
Linus Torvalds1da177e2005-04-16 15:20:36 -07002768 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
Denis V. Lunev19375042008-02-28 20:52:04 -08002769 RTM_NEWROUTE, 0, 0);
Thomas Grafd889ce32006-08-17 18:15:44 -07002770 if (err <= 0)
2771 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002772
Denis V. Lunev19375042008-02-28 20:52:04 -08002773 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
Thomas Grafd889ce32006-08-17 18:15:44 -07002774errout:
Thomas Graf2942e902006-08-15 00:30:25 -07002775 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002776
Thomas Grafd889ce32006-08-17 18:15:44 -07002777errout_free:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002778 kfree_skb(skb);
Thomas Grafd889ce32006-08-17 18:15:44 -07002779 goto errout;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002780}
2781
2782int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2783{
2784 struct rtable *rt;
2785 int h, s_h;
2786 int idx, s_idx;
Denis V. Lunev19375042008-02-28 20:52:04 -08002787 struct net *net;
2788
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09002789 net = sock_net(skb->sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002790
2791 s_h = cb->args[0];
Eric Dumazetd8c92832008-01-07 21:52:14 -08002792 if (s_h < 0)
2793 s_h = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002794 s_idx = idx = cb->args[1];
Eric Dumazetd8c92832008-01-07 21:52:14 -08002795 for (h = s_h; h <= rt_hash_mask; h++) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002796 rcu_read_lock_bh();
2797 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08002798 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09002799 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002800 continue;
Eric Dumazet29e75252008-01-31 17:05:09 -08002801 if (rt->rt_genid != atomic_read(&rt_genid))
2802 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002803 skb->dst = dst_clone(&rt->u.dst);
2804 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002805 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002806 1, NLM_F_MULTI) <= 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002807 dst_release(xchg(&skb->dst, NULL));
2808 rcu_read_unlock_bh();
2809 goto done;
2810 }
2811 dst_release(xchg(&skb->dst, NULL));
2812 }
2813 rcu_read_unlock_bh();
Eric Dumazetd8c92832008-01-07 21:52:14 -08002814 s_idx = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002815 }
2816
2817done:
2818 cb->args[0] = h;
2819 cb->args[1] = idx;
2820 return skb->len;
2821}
2822
/*
 * Multicast event hook: flushes the route cache with a delay argument
 * of 0.  @in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}
2827
2828#ifdef CONFIG_SYSCTL
2829static int flush_delay;
2830
2831static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2832 struct file *filp, void __user *buffer,
2833 size_t *lenp, loff_t *ppos)
2834{
2835 if (write) {
2836 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2837 rt_cache_flush(flush_delay);
2838 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002839 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002840
2841 return -EINVAL;
2842}
2843
2844static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2845 int __user *name,
2846 int nlen,
2847 void __user *oldval,
2848 size_t __user *oldlenp,
2849 void __user *newval,
Alexey Dobriyan1f29bcd2006-12-10 02:19:10 -08002850 size_t newlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002851{
2852 int delay;
2853 if (newlen != sizeof(int))
2854 return -EINVAL;
2855 if (get_user(delay, (int __user *)newval))
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002856 return -EFAULT;
2857 rt_cache_flush(delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002858 return 0;
2859}
2860
2861ctl_table ipv4_route_table[] = {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002862 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002863 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2864 .procname = "flush",
2865 .data = &flush_delay,
2866 .maxlen = sizeof(int),
Dave Jones7e3e0362005-04-28 12:11:03 -07002867 .mode = 0200,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002868 .proc_handler = &ipv4_sysctl_rtcache_flush,
2869 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2870 },
2871 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002872 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2873 .procname = "gc_thresh",
2874 .data = &ipv4_dst_ops.gc_thresh,
2875 .maxlen = sizeof(int),
2876 .mode = 0644,
2877 .proc_handler = &proc_dointvec,
2878 },
2879 {
2880 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2881 .procname = "max_size",
2882 .data = &ip_rt_max_size,
2883 .maxlen = sizeof(int),
2884 .mode = 0644,
2885 .proc_handler = &proc_dointvec,
2886 },
2887 {
2888 /* Deprecated. Use gc_min_interval_ms */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002889
Linus Torvalds1da177e2005-04-16 15:20:36 -07002890 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2891 .procname = "gc_min_interval",
2892 .data = &ip_rt_gc_min_interval,
2893 .maxlen = sizeof(int),
2894 .mode = 0644,
2895 .proc_handler = &proc_dointvec_jiffies,
2896 .strategy = &sysctl_jiffies,
2897 },
2898 {
2899 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2900 .procname = "gc_min_interval_ms",
2901 .data = &ip_rt_gc_min_interval,
2902 .maxlen = sizeof(int),
2903 .mode = 0644,
2904 .proc_handler = &proc_dointvec_ms_jiffies,
2905 .strategy = &sysctl_ms_jiffies,
2906 },
2907 {
2908 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2909 .procname = "gc_timeout",
2910 .data = &ip_rt_gc_timeout,
2911 .maxlen = sizeof(int),
2912 .mode = 0644,
2913 .proc_handler = &proc_dointvec_jiffies,
2914 .strategy = &sysctl_jiffies,
2915 },
2916 {
2917 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2918 .procname = "gc_interval",
2919 .data = &ip_rt_gc_interval,
2920 .maxlen = sizeof(int),
2921 .mode = 0644,
2922 .proc_handler = &proc_dointvec_jiffies,
2923 .strategy = &sysctl_jiffies,
2924 },
2925 {
2926 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2927 .procname = "redirect_load",
2928 .data = &ip_rt_redirect_load,
2929 .maxlen = sizeof(int),
2930 .mode = 0644,
2931 .proc_handler = &proc_dointvec,
2932 },
2933 {
2934 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2935 .procname = "redirect_number",
2936 .data = &ip_rt_redirect_number,
2937 .maxlen = sizeof(int),
2938 .mode = 0644,
2939 .proc_handler = &proc_dointvec,
2940 },
2941 {
2942 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2943 .procname = "redirect_silence",
2944 .data = &ip_rt_redirect_silence,
2945 .maxlen = sizeof(int),
2946 .mode = 0644,
2947 .proc_handler = &proc_dointvec,
2948 },
2949 {
2950 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2951 .procname = "error_cost",
2952 .data = &ip_rt_error_cost,
2953 .maxlen = sizeof(int),
2954 .mode = 0644,
2955 .proc_handler = &proc_dointvec,
2956 },
2957 {
2958 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2959 .procname = "error_burst",
2960 .data = &ip_rt_error_burst,
2961 .maxlen = sizeof(int),
2962 .mode = 0644,
2963 .proc_handler = &proc_dointvec,
2964 },
2965 {
2966 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2967 .procname = "gc_elasticity",
2968 .data = &ip_rt_gc_elasticity,
2969 .maxlen = sizeof(int),
2970 .mode = 0644,
2971 .proc_handler = &proc_dointvec,
2972 },
2973 {
2974 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2975 .procname = "mtu_expires",
2976 .data = &ip_rt_mtu_expires,
2977 .maxlen = sizeof(int),
2978 .mode = 0644,
2979 .proc_handler = &proc_dointvec_jiffies,
2980 .strategy = &sysctl_jiffies,
2981 },
2982 {
2983 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2984 .procname = "min_pmtu",
2985 .data = &ip_rt_min_pmtu,
2986 .maxlen = sizeof(int),
2987 .mode = 0644,
2988 .proc_handler = &proc_dointvec,
2989 },
2990 {
2991 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2992 .procname = "min_adv_mss",
2993 .data = &ip_rt_min_advmss,
2994 .maxlen = sizeof(int),
2995 .mode = 0644,
2996 .proc_handler = &proc_dointvec,
2997 },
2998 {
2999 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3000 .procname = "secret_interval",
3001 .data = &ip_rt_secret_interval,
3002 .maxlen = sizeof(int),
3003 .mode = 0644,
3004 .proc_handler = &proc_dointvec_jiffies,
3005 .strategy = &sysctl_jiffies,
3006 },
3007 { .ctl_name = 0 }
3008};
3009#endif
3010
#ifdef CONFIG_NET_CLS_ROUTE
/* Per-CPU accounting area, allocated in ip_rt_init() (256 entries). */
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
3014
3015static __initdata unsigned long rhash_entries;
3016static int __init set_rhash_entries(char *str)
3017{
3018 if (!str)
3019 return 0;
3020 rhash_entries = simple_strtoul(str, &str, 0);
3021 return 1;
3022}
3023__setup("rhash_entries=", set_rhash_entries);
3024
3025int __init ip_rt_init(void)
3026{
Eric Dumazet424c4b72005-07-05 14:58:19 -07003027 int rc = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003028
Eric Dumazet29e75252008-01-31 17:05:09 -08003029 atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
3030 (jiffies ^ (jiffies >> 7))));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003031
3032#ifdef CONFIG_NET_CLS_ROUTE
Eric Dumazet8dbde282007-11-16 03:32:10 -08003033 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003034 if (!ip_rt_acct)
3035 panic("IP: failed to allocate ip_rt_acct\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003036#endif
3037
Alexey Dobriyane5d679f332006-08-26 19:25:52 -07003038 ipv4_dst_ops.kmem_cachep =
3039 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
Paul Mundt20c2df82007-07-20 10:11:58 +09003040 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003041
David S. Miller14e50e52007-05-24 18:17:54 -07003042 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3043
Eric Dumazet424c4b72005-07-05 14:58:19 -07003044 rt_hash_table = (struct rt_hash_bucket *)
3045 alloc_large_system_hash("IP route cache",
3046 sizeof(struct rt_hash_bucket),
3047 rhash_entries,
3048 (num_physpages >= 128 * 1024) ?
Mike Stroyan18955cf2005-11-29 16:12:55 -08003049 15 : 17,
Kirill Korotaev8d1502d2006-08-07 20:44:22 -07003050 0,
Eric Dumazet424c4b72005-07-05 14:58:19 -07003051 &rt_hash_log,
3052 &rt_hash_mask,
3053 0);
Eric Dumazet22c047c2005-07-05 14:55:24 -07003054 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3055 rt_hash_lock_init();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003056
3057 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3058 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3059
Linus Torvalds1da177e2005-04-16 15:20:36 -07003060 devinet_init();
3061 ip_fib_init();
3062
Pavel Emelyanovb24b8a22008-01-23 21:20:07 -08003063 setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003064
3065 /* All the timers, started at system startup tend
3066 to synchronize. Perturb it a bit.
3067 */
Eric Dumazet39c90ec2007-09-15 10:55:54 -07003068 schedule_delayed_work(&expires_work,
3069 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003070
3071 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3072 ip_rt_secret_interval;
3073 add_timer(&rt_secret_timer);
3074
Denis V. Lunev73b38712008-02-28 20:51:18 -08003075 if (ip_rt_proc_init())
Pavel Emelyanov107f1632007-12-05 21:14:28 -08003076 printk(KERN_ERR "Unable to create route proc files\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003077#ifdef CONFIG_XFRM
3078 xfrm_init();
3079 xfrm4_init();
3080#endif
Thomas Graf63f34442007-03-22 11:55:17 -07003081 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3082
Linus Torvalds1da177e2005-04-16 15:20:36 -07003083 return rc;
3084}
3085
3086EXPORT_SYMBOL(__ip_select_ident);
3087EXPORT_SYMBOL(ip_route_input);
3088EXPORT_SYMBOL(ip_route_output_key);