blob: c6388e825ed309d7d8d5f239bc18dc6971711ac2 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
Jesper Juhl02c30a82005-05-05 16:16:16 -07008 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -07009 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090021 * Alan Cox : Super /proc >4K
Linus Torvalds1da177e2005-04-16 15:20:36 -070022 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090039 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070055 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080056 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070058 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
Linus Torvalds1da177e2005-04-16 15:20:36 -070065#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070071#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070072#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070073#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
Eric Dumazet39c90ec2007-09-15 10:55:54 -070082#include <linux/workqueue.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070083#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070084#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090093#include <linux/slab.h>
Herbert Xu352e5122007-11-13 21:34:06 -080094#include <net/dst.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020095#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070096#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700106#include <net/netevent.h>
Thomas Graf63f34442007-03-22 11:55:17 -0700107#include <net/rtnetlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700108#ifdef CONFIG_SYSCTL
109#include <linux/sysctl.h>
110#endif
David Miller3769cff2011-07-11 22:44:24 +0000111#include <net/atmclip.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700112
David S. Miller68a5e3d2011-03-11 20:07:33 -0500113#define RT_FL_TOS(oldflp4) \
114 ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700115
116#define IP_MAX_MTU 0xFFF0
117
118#define RT_GC_TIMEOUT (300*HZ)
119
Linus Torvalds1da177e2005-04-16 15:20:36 -0700120static int ip_rt_max_size;
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700121static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
122static int ip_rt_gc_interval __read_mostly = 60 * HZ;
123static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
124static int ip_rt_redirect_number __read_mostly = 9;
125static int ip_rt_redirect_load __read_mostly = HZ / 50;
126static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
127static int ip_rt_error_cost __read_mostly = HZ;
128static int ip_rt_error_burst __read_mostly = 5 * HZ;
129static int ip_rt_gc_elasticity __read_mostly = 8;
130static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
131static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
132static int ip_rt_min_advmss __read_mostly = 256;
Neil Horman1080d702008-10-27 12:28:25 -0700133static int rt_chain_length_max __read_mostly = 20;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700134
Linus Torvalds1da177e2005-04-16 15:20:36 -0700135/*
136 * Interface to generic destination cache.
137 */
138
139static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
David S. Miller0dbaee32010-12-13 12:52:14 -0800140static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
David S. Millerd33e4552010-12-14 13:01:14 -0800141static unsigned int ipv4_default_mtu(const struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700142static void ipv4_dst_destroy(struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700143static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
144static void ipv4_link_failure(struct sk_buff *skb);
145static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
Daniel Lezcano569d3642008-01-18 03:56:57 -0800146static int rt_garbage_collect(struct dst_ops *ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700147
Eric Dumazet72cdd1d2010-11-11 07:14:07 +0000148static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
149 int how)
150{
151}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700152
David S. Miller62fa8a82011-01-26 20:51:05 -0800153static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154{
David S. Miller06582542011-01-27 14:58:42 -0800155 struct rtable *rt = (struct rtable *) dst;
156 struct inet_peer *peer;
157 u32 *p = NULL;
David S. Miller62fa8a82011-01-26 20:51:05 -0800158
David S. Miller06582542011-01-27 14:58:42 -0800159 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -0400160 rt_bind_peer(rt, rt->rt_dst, 1);
David S. Miller06582542011-01-27 14:58:42 -0800161
162 peer = rt->peer;
163 if (peer) {
David S. Miller62fa8a82011-01-26 20:51:05 -0800164 u32 *old_p = __DST_METRICS_PTR(old);
165 unsigned long prev, new;
166
David S. Miller06582542011-01-27 14:58:42 -0800167 p = peer->metrics;
168 if (inet_metrics_new(peer))
169 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
David S. Miller62fa8a82011-01-26 20:51:05 -0800170
171 new = (unsigned long) p;
172 prev = cmpxchg(&dst->_metrics, old, new);
173
174 if (prev != old) {
David S. Miller62fa8a82011-01-26 20:51:05 -0800175 p = __DST_METRICS_PTR(prev);
176 if (prev & DST_METRICS_READ_ONLY)
177 p = NULL;
178 } else {
David S. Miller62fa8a82011-01-26 20:51:05 -0800179 if (rt->fi) {
180 fib_info_put(rt->fi);
181 rt->fi = NULL;
182 }
183 }
184 }
185 return p;
186}
187
Linus Torvalds1da177e2005-04-16 15:20:36 -0700188static struct dst_ops ipv4_dst_ops = {
189 .family = AF_INET,
Harvey Harrison09640e632009-02-01 00:45:17 -0800190 .protocol = cpu_to_be16(ETH_P_IP),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700191 .gc = rt_garbage_collect,
192 .check = ipv4_dst_check,
David S. Miller0dbaee32010-12-13 12:52:14 -0800193 .default_advmss = ipv4_default_advmss,
David S. Millerd33e4552010-12-14 13:01:14 -0800194 .default_mtu = ipv4_default_mtu,
David S. Miller62fa8a82011-01-26 20:51:05 -0800195 .cow_metrics = ipv4_cow_metrics,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700196 .destroy = ipv4_dst_destroy,
197 .ifdown = ipv4_dst_ifdown,
198 .negative_advice = ipv4_negative_advice,
199 .link_failure = ipv4_link_failure,
200 .update_pmtu = ip_rt_update_pmtu,
Herbert Xu1ac06e02008-05-20 14:32:14 -0700201 .local_out = __ip_local_out,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700202};
203
204#define ECN_OR_COST(class) TC_PRIO_##class
205
Philippe De Muyter4839c522007-07-09 15:32:57 -0700206const __u8 ip_tos2prio[16] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700207 TC_PRIO_BESTEFFORT,
Dan Siemon4a2b9c32011-03-15 13:56:07 +0000208 ECN_OR_COST(BESTEFFORT),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700209 TC_PRIO_BESTEFFORT,
210 ECN_OR_COST(BESTEFFORT),
211 TC_PRIO_BULK,
212 ECN_OR_COST(BULK),
213 TC_PRIO_BULK,
214 ECN_OR_COST(BULK),
215 TC_PRIO_INTERACTIVE,
216 ECN_OR_COST(INTERACTIVE),
217 TC_PRIO_INTERACTIVE,
218 ECN_OR_COST(INTERACTIVE),
219 TC_PRIO_INTERACTIVE_BULK,
220 ECN_OR_COST(INTERACTIVE_BULK),
221 TC_PRIO_INTERACTIVE_BULK,
222 ECN_OR_COST(INTERACTIVE_BULK)
223};
224
225
226/*
227 * Route cache.
228 */
229
230/* The locking scheme is rather straight forward:
231 *
232 * 1) Read-Copy Update protects the buckets of the central route hash.
233 * 2) Only writers remove entries, and they hold the lock
234 * as they look at rtable reference counts.
235 * 3) Only readers acquire references to rtable entries,
236 * they do so with atomic increments and with the
237 * lock held.
238 */
239
240struct rt_hash_bucket {
Eric Dumazet1c317202010-10-25 21:02:07 +0000241 struct rtable __rcu *chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700242};
Neil Horman1080d702008-10-27 12:28:25 -0700243
Ingo Molnar8a25d5d2006-07-03 00:24:54 -0700244#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
245 defined(CONFIG_PROVE_LOCKING)
Eric Dumazet22c047c2005-07-05 14:55:24 -0700246/*
247 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
248 * The size of this table is a power of two and depends on the number of CPUS.
Ingo Molnar62051202006-07-03 00:24:59 -0700249 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
Eric Dumazet22c047c2005-07-05 14:55:24 -0700250 */
Ingo Molnar62051202006-07-03 00:24:59 -0700251#ifdef CONFIG_LOCKDEP
252# define RT_HASH_LOCK_SZ 256
Eric Dumazet22c047c2005-07-05 14:55:24 -0700253#else
Ingo Molnar62051202006-07-03 00:24:59 -0700254# if NR_CPUS >= 32
255# define RT_HASH_LOCK_SZ 4096
256# elif NR_CPUS >= 16
257# define RT_HASH_LOCK_SZ 2048
258# elif NR_CPUS >= 8
259# define RT_HASH_LOCK_SZ 1024
260# elif NR_CPUS >= 4
261# define RT_HASH_LOCK_SZ 512
262# else
263# define RT_HASH_LOCK_SZ 256
264# endif
Eric Dumazet22c047c2005-07-05 14:55:24 -0700265#endif
266
267static spinlock_t *rt_hash_locks;
268# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
Pavel Emelyanov1ff1cc22007-12-05 21:15:05 -0800269
270static __init void rt_hash_lock_init(void)
271{
272 int i;
273
274 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
275 GFP_KERNEL);
276 if (!rt_hash_locks)
277 panic("IP: failed to allocate rt_hash_locks\n");
278
279 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
280 spin_lock_init(&rt_hash_locks[i]);
281}
Eric Dumazet22c047c2005-07-05 14:55:24 -0700282#else
283# define rt_hash_lock_addr(slot) NULL
Pavel Emelyanov1ff1cc22007-12-05 21:15:05 -0800284
285static inline void rt_hash_lock_init(void)
286{
287}
Eric Dumazet22c047c2005-07-05 14:55:24 -0700288#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700289
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700290static struct rt_hash_bucket *rt_hash_table __read_mostly;
291static unsigned rt_hash_mask __read_mostly;
292static unsigned int rt_hash_log __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700293
Eric Dumazet2f970d82006-01-17 02:54:36 -0800294static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
Eric Dumazet27f39c73e2010-05-19 22:07:23 +0000295#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700296
Denis V. Lunevb00180d2008-07-05 19:04:09 -0700297static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700298 int genid)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700299{
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700300 return jhash_3words((__force u32)daddr, (__force u32)saddr,
Denis V. Lunevb00180d2008-07-05 19:04:09 -0700301 idx, genid)
Eric Dumazet29e75252008-01-31 17:05:09 -0800302 & rt_hash_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700303}
304
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700305static inline int rt_genid(struct net *net)
306{
307 return atomic_read(&net->ipv4.rt_genid);
308}
309
Linus Torvalds1da177e2005-04-16 15:20:36 -0700310#ifdef CONFIG_PROC_FS
311struct rt_cache_iter_state {
Denis V. Luneva75e9362008-02-28 20:50:55 -0800312 struct seq_net_private p;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700313 int bucket;
Eric Dumazet29e75252008-01-31 17:05:09 -0800314 int genid;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700315};
316
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900317static struct rtable *rt_cache_get_first(struct seq_file *seq)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700318{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900319 struct rt_cache_iter_state *st = seq->private;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700320 struct rtable *r = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700321
322 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
Eric Dumazet1c317202010-10-25 21:02:07 +0000323 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
Eric Dumazeta6272662008-08-28 01:11:25 -0700324 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700325 rcu_read_lock_bh();
Paul E. McKenneya898def2010-02-22 17:04:49 -0800326 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
Eric Dumazet29e75252008-01-31 17:05:09 -0800327 while (r) {
Changli Gaod8d1f302010-06-10 23:31:35 -0700328 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
Denis V. Luneva75e9362008-02-28 20:50:55 -0800329 r->rt_genid == st->genid)
Eric Dumazet29e75252008-01-31 17:05:09 -0800330 return r;
Changli Gaod8d1f302010-06-10 23:31:35 -0700331 r = rcu_dereference_bh(r->dst.rt_next);
Eric Dumazet29e75252008-01-31 17:05:09 -0800332 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700333 rcu_read_unlock_bh();
334 }
Eric Dumazet29e75252008-01-31 17:05:09 -0800335 return r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700336}
337
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900338static struct rtable *__rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800339 struct rtable *r)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700340{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900341 struct rt_cache_iter_state *st = seq->private;
Eric Dumazeta6272662008-08-28 01:11:25 -0700342
Eric Dumazet1c317202010-10-25 21:02:07 +0000343 r = rcu_dereference_bh(r->dst.rt_next);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700344 while (!r) {
345 rcu_read_unlock_bh();
Eric Dumazeta6272662008-08-28 01:11:25 -0700346 do {
347 if (--st->bucket < 0)
348 return NULL;
Eric Dumazet1c317202010-10-25 21:02:07 +0000349 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700350 rcu_read_lock_bh();
Eric Dumazet1c317202010-10-25 21:02:07 +0000351 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700352 }
Eric Dumazet1c317202010-10-25 21:02:07 +0000353 return r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700354}
355
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900356static struct rtable *rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800357 struct rtable *r)
358{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900359 struct rt_cache_iter_state *st = seq->private;
360 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
Changli Gaod8d1f302010-06-10 23:31:35 -0700361 if (dev_net(r->dst.dev) != seq_file_net(seq))
Denis V. Luneva75e9362008-02-28 20:50:55 -0800362 continue;
Denis V. Lunev642d6312008-02-28 20:50:33 -0800363 if (r->rt_genid == st->genid)
364 break;
365 }
366 return r;
367}
368
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900369static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700370{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900371 struct rtable *r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700372
373 if (r)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900374 while (pos && (r = rt_cache_get_next(seq, r)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700375 --pos;
376 return pos ? NULL : r;
377}
378
379static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
380{
Eric Dumazet29e75252008-01-31 17:05:09 -0800381 struct rt_cache_iter_state *st = seq->private;
Eric Dumazet29e75252008-01-31 17:05:09 -0800382 if (*pos)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900383 return rt_cache_get_idx(seq, *pos - 1);
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700384 st->genid = rt_genid(seq_file_net(seq));
Eric Dumazet29e75252008-01-31 17:05:09 -0800385 return SEQ_START_TOKEN;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700386}
387
388static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
389{
Eric Dumazet29e75252008-01-31 17:05:09 -0800390 struct rtable *r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700391
392 if (v == SEQ_START_TOKEN)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900393 r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700394 else
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900395 r = rt_cache_get_next(seq, v);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700396 ++*pos;
397 return r;
398}
399
400static void rt_cache_seq_stop(struct seq_file *seq, void *v)
401{
402 if (v && v != SEQ_START_TOKEN)
403 rcu_read_unlock_bh();
404}
405
406static int rt_cache_seq_show(struct seq_file *seq, void *v)
407{
408 if (v == SEQ_START_TOKEN)
409 seq_printf(seq, "%-127s\n",
410 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
411 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
412 "HHUptod\tSpecDst");
413 else {
414 struct rtable *r = v;
Pavel Emelyanov5e659e42008-04-24 01:02:16 -0700415 int len;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700416
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700417 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
418 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
Changli Gaod8d1f302010-06-10 23:31:35 -0700419 r->dst.dev ? r->dst.dev->name : "*",
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700420 (__force u32)r->rt_dst,
421 (__force u32)r->rt_gateway,
Changli Gaod8d1f302010-06-10 23:31:35 -0700422 r->rt_flags, atomic_read(&r->dst.__refcnt),
423 r->dst.__use, 0, (__force u32)r->rt_src,
David S. Miller0dbaee32010-12-13 12:52:14 -0800424 dst_metric_advmss(&r->dst) + 40,
Changli Gaod8d1f302010-06-10 23:31:35 -0700425 dst_metric(&r->dst, RTAX_WINDOW),
426 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
427 dst_metric(&r->dst, RTAX_RTTVAR)),
David S. Miller475949d2011-05-03 19:45:15 -0700428 r->rt_key_tos,
Changli Gaod8d1f302010-06-10 23:31:35 -0700429 r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
430 r->dst.hh ? (r->dst.hh->hh_output ==
Linus Torvalds1da177e2005-04-16 15:20:36 -0700431 dev_queue_xmit) : 0,
Pavel Emelyanov5e659e42008-04-24 01:02:16 -0700432 r->rt_spec_dst, &len);
433
434 seq_printf(seq, "%*s\n", 127 - len, "");
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900435 }
436 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700437}
438
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700439static const struct seq_operations rt_cache_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700440 .start = rt_cache_seq_start,
441 .next = rt_cache_seq_next,
442 .stop = rt_cache_seq_stop,
443 .show = rt_cache_seq_show,
444};
445
446static int rt_cache_seq_open(struct inode *inode, struct file *file)
447{
Denis V. Luneva75e9362008-02-28 20:50:55 -0800448 return seq_open_net(inode, file, &rt_cache_seq_ops,
Pavel Emelyanovcf7732e2007-10-10 02:29:29 -0700449 sizeof(struct rt_cache_iter_state));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700450}
451
Arjan van de Ven9a321442007-02-12 00:55:35 -0800452static const struct file_operations rt_cache_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700453 .owner = THIS_MODULE,
454 .open = rt_cache_seq_open,
455 .read = seq_read,
456 .llseek = seq_lseek,
Denis V. Luneva75e9362008-02-28 20:50:55 -0800457 .release = seq_release_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700458};
459
460
461static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
462{
463 int cpu;
464
465 if (*pos == 0)
466 return SEQ_START_TOKEN;
467
Rusty Russell0f23174a2008-12-29 12:23:42 +0000468 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700469 if (!cpu_possible(cpu))
470 continue;
471 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800472 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700473 }
474 return NULL;
475}
476
477static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
478{
479 int cpu;
480
Rusty Russell0f23174a2008-12-29 12:23:42 +0000481 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700482 if (!cpu_possible(cpu))
483 continue;
484 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800485 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700486 }
487 return NULL;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900488
Linus Torvalds1da177e2005-04-16 15:20:36 -0700489}
490
491static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
492{
493
494}
495
496static int rt_cpu_seq_show(struct seq_file *seq, void *v)
497{
498 struct rt_cache_stat *st = v;
499
500 if (v == SEQ_START_TOKEN) {
Olaf Rempel5bec0032005-04-28 12:16:08 -0700501 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700502 return 0;
503 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900504
Linus Torvalds1da177e2005-04-16 15:20:36 -0700505 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
506 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
Eric Dumazetfc66f952010-10-08 06:37:34 +0000507 dst_entries_get_slow(&ipv4_dst_ops),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700508 st->in_hit,
509 st->in_slow_tot,
510 st->in_slow_mc,
511 st->in_no_route,
512 st->in_brd,
513 st->in_martian_dst,
514 st->in_martian_src,
515
516 st->out_hit,
517 st->out_slow_tot,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900518 st->out_slow_mc,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700519
520 st->gc_total,
521 st->gc_ignored,
522 st->gc_goal_miss,
523 st->gc_dst_overflow,
524 st->in_hlist_search,
525 st->out_hlist_search
526 );
527 return 0;
528}
529
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700530static const struct seq_operations rt_cpu_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700531 .start = rt_cpu_seq_start,
532 .next = rt_cpu_seq_next,
533 .stop = rt_cpu_seq_stop,
534 .show = rt_cpu_seq_show,
535};
536
537
538static int rt_cpu_seq_open(struct inode *inode, struct file *file)
539{
540 return seq_open(file, &rt_cpu_seq_ops);
541}
542
Arjan van de Ven9a321442007-02-12 00:55:35 -0800543static const struct file_operations rt_cpu_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700544 .owner = THIS_MODULE,
545 .open = rt_cpu_seq_open,
546 .read = seq_read,
547 .llseek = seq_lseek,
548 .release = seq_release,
549};
550
Patrick McHardyc7066f72011-01-14 13:36:42 +0100551#ifdef CONFIG_IP_ROUTE_CLASSID
Alexey Dobriyana661c412009-11-25 15:40:35 -0800552static int rt_acct_proc_show(struct seq_file *m, void *v)
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800553{
Alexey Dobriyana661c412009-11-25 15:40:35 -0800554 struct ip_rt_acct *dst, *src;
555 unsigned int i, j;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800556
Alexey Dobriyana661c412009-11-25 15:40:35 -0800557 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
558 if (!dst)
559 return -ENOMEM;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800560
Alexey Dobriyana661c412009-11-25 15:40:35 -0800561 for_each_possible_cpu(i) {
562 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
563 for (j = 0; j < 256; j++) {
564 dst[j].o_bytes += src[j].o_bytes;
565 dst[j].o_packets += src[j].o_packets;
566 dst[j].i_bytes += src[j].i_bytes;
567 dst[j].i_packets += src[j].i_packets;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800568 }
569 }
Alexey Dobriyana661c412009-11-25 15:40:35 -0800570
571 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
572 kfree(dst);
573 return 0;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800574}
Alexey Dobriyana661c412009-11-25 15:40:35 -0800575
576static int rt_acct_proc_open(struct inode *inode, struct file *file)
577{
578 return single_open(file, rt_acct_proc_show, NULL);
579}
580
581static const struct file_operations rt_acct_proc_fops = {
582 .owner = THIS_MODULE,
583 .open = rt_acct_proc_open,
584 .read = seq_read,
585 .llseek = seq_lseek,
586 .release = single_release,
587};
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800588#endif
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800589
Denis V. Lunev73b38712008-02-28 20:51:18 -0800590static int __net_init ip_rt_do_proc_init(struct net *net)
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800591{
592 struct proc_dir_entry *pde;
593
594 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
595 &rt_cache_seq_fops);
596 if (!pde)
597 goto err1;
598
Wang Chen77020722008-02-28 14:14:25 -0800599 pde = proc_create("rt_cache", S_IRUGO,
600 net->proc_net_stat, &rt_cpu_seq_fops);
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800601 if (!pde)
602 goto err2;
603
Patrick McHardyc7066f72011-01-14 13:36:42 +0100604#ifdef CONFIG_IP_ROUTE_CLASSID
Alexey Dobriyana661c412009-11-25 15:40:35 -0800605 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800606 if (!pde)
607 goto err3;
608#endif
609 return 0;
610
Patrick McHardyc7066f72011-01-14 13:36:42 +0100611#ifdef CONFIG_IP_ROUTE_CLASSID
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800612err3:
613 remove_proc_entry("rt_cache", net->proc_net_stat);
614#endif
615err2:
616 remove_proc_entry("rt_cache", net->proc_net);
617err1:
618 return -ENOMEM;
619}
Denis V. Lunev73b38712008-02-28 20:51:18 -0800620
621static void __net_exit ip_rt_do_proc_exit(struct net *net)
622{
623 remove_proc_entry("rt_cache", net->proc_net_stat);
624 remove_proc_entry("rt_cache", net->proc_net);
Patrick McHardyc7066f72011-01-14 13:36:42 +0100625#ifdef CONFIG_IP_ROUTE_CLASSID
Denis V. Lunev73b38712008-02-28 20:51:18 -0800626 remove_proc_entry("rt_acct", net->proc_net);
Alexey Dobriyan0a931ac2010-01-17 03:32:50 +0000627#endif
Denis V. Lunev73b38712008-02-28 20:51:18 -0800628}
629
630static struct pernet_operations ip_rt_proc_ops __net_initdata = {
631 .init = ip_rt_do_proc_init,
632 .exit = ip_rt_do_proc_exit,
633};
634
635static int __init ip_rt_proc_init(void)
636{
637 return register_pernet_subsys(&ip_rt_proc_ops);
638}
639
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800640#else
/* !CONFIG_PROC_FS: nothing to register. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700645#endif /* CONFIG_PROC_FS */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900646
Stephen Hemminger5969f712008-04-10 01:52:09 -0700647static inline void rt_free(struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700648{
Changli Gaod8d1f302010-06-10 23:31:35 -0700649 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700650}
651
Stephen Hemminger5969f712008-04-10 01:52:09 -0700652static inline void rt_drop(struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700653{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700654 ip_rt_put(rt);
Changli Gaod8d1f302010-06-10 23:31:35 -0700655 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700656}
657
Stephen Hemminger5969f712008-04-10 01:52:09 -0700658static inline int rt_fast_clean(struct rtable *rth)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700659{
660 /* Kill broadcast/multicast entries very aggresively, if they
661 collide in hash table with more useful entries */
662 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
David S. Millerc7537962010-11-11 17:07:48 -0800663 rt_is_input_route(rth) && rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700664}
665
Stephen Hemminger5969f712008-04-10 01:52:09 -0700666static inline int rt_valuable(struct rtable *rth)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700667{
668 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
David S. Miller2c8cec52011-02-09 20:42:07 -0800669 (rth->peer && rth->peer->pmtu_expires);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700670}
671
672static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
673{
674 unsigned long age;
675 int ret = 0;
676
Changli Gaod8d1f302010-06-10 23:31:35 -0700677 if (atomic_read(&rth->dst.__refcnt))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700678 goto out;
679
Changli Gaod8d1f302010-06-10 23:31:35 -0700680 age = jiffies - rth->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700681 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
682 (age <= tmo2 && rt_valuable(rth)))
683 goto out;
684 ret = 1;
685out: return ret;
686}
687
688/* Bits of score are:
689 * 31: very valuable
690 * 30: not quite useless
691 * 29..0: usage counter
692 */
693static inline u32 rt_score(struct rtable *rt)
694{
Changli Gaod8d1f302010-06-10 23:31:35 -0700695 u32 score = jiffies - rt->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700696
697 score = ~score & ~(3<<30);
698
699 if (rt_valuable(rt))
700 score |= (1<<31);
701
David S. Millerc7537962010-11-11 17:07:48 -0800702 if (rt_is_output_route(rt) ||
Linus Torvalds1da177e2005-04-16 15:20:36 -0700703 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
704 score |= (1<<30);
705
706 return score;
707}
708
Neil Horman1080d702008-10-27 12:28:25 -0700709static inline bool rt_caching(const struct net *net)
710{
711 return net->ipv4.current_rt_cache_rebuild_count <=
712 net->ipv4.sysctl_rt_cache_rebuild_count;
713}
714
David S. Miller5e2b61f2011-03-04 21:47:09 -0800715static inline bool compare_hash_inputs(const struct rtable *rt1,
716 const struct rtable *rt2)
Neil Horman1080d702008-10-27 12:28:25 -0700717{
David S. Miller5e2b61f2011-03-04 21:47:09 -0800718 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
719 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
720 (rt1->rt_iif ^ rt2->rt_iif)) == 0);
Neil Horman1080d702008-10-27 12:28:25 -0700721}
722
David S. Miller5e2b61f2011-03-04 21:47:09 -0800723static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700724{
David S. Miller5e2b61f2011-03-04 21:47:09 -0800725 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
726 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
727 (rt1->rt_mark ^ rt2->rt_mark) |
David S. Miller475949d2011-05-03 19:45:15 -0700728 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
David S. Miller5e2b61f2011-03-04 21:47:09 -0800729 (rt1->rt_oif ^ rt2->rt_oif) |
730 (rt1->rt_iif ^ rt2->rt_iif)) == 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700731}
732
Denis V. Lunevb5921912008-01-22 23:50:25 -0800733static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
734{
Changli Gaod8d1f302010-06-10 23:31:35 -0700735 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
Denis V. Lunevb5921912008-01-22 23:50:25 -0800736}
737
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700738static inline int rt_is_expired(struct rtable *rth)
739{
Changli Gaod8d1f302010-06-10 23:31:35 -0700740 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700741}
742
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to be rescheduled if necessary.
 *
 * @net: only entries belonging to this netns are freed; NULL flushes all.
 * @process_context: non-zero when called from process context, enabling
 *                   cond_resched() between buckets.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		/* Lock-free peek: skip empty buckets without taking the lock. */
		rth = rcu_dereference_raw(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		/* Unlink matching entries onto a private 'list' while holding
		 * the bucket lock; they are freed after the lock is dropped.
		 */
		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		/* Free the collected entries outside the bucket lock. */
		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
793
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to have an estimation of rt_chain_length_max
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
804
Eric Dumazet98376382010-03-08 03:20:00 +0000805/*
806 * Given a hash chain and an item in this hash chain,
807 * find if a previous entry has the same hash_inputs
808 * (but differs on tos, mark or oif)
809 * Returns 0 if an alias is found.
810 * Returns ONE if rth has no alias before itself.
811 */
812static int has_noalias(const struct rtable *head, const struct rtable *rth)
813{
814 const struct rtable *aux = head;
815
816 while (aux != rth) {
David S. Miller5e2b61f2011-03-04 21:47:09 -0800817 if (compare_hash_inputs(aux, rth))
Eric Dumazet98376382010-03-08 03:20:00 +0000818 return 0;
Eric Dumazet1c317202010-10-25 21:02:07 +0000819 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +0000820 }
821 return ONE;
822}
823
/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 *
 * Bumping the netns generation id makes every existing cache entry fail
 * rt_is_expired(), so entries die lazily instead of being walked here.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	/* +1 guarantees a non-zero increment even when shuffle == 0. */
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}
837
/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		/* Synchronous flush; may cond_resched() outside softirq. */
		rt_do_flush(net, !in_softirq());
}
848
/* Flush previously invalidated entries from the cache (synchronous walk,
 * unlike the lazy expiry done by rt_cache_invalidate() alone).
 */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}
854
/* Emergency response to an overlong hash chain: warn (rate-limited) and
 * invalidate the whole cache for this netns.  The caller accounts the
 * rebuild against the per-netns rebuild limit.
 */
static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
861
/*
   Short description of GC goals.

   We want to build algorithm, which will keep routing cache
   at some equilibrium point, when number of aged off entries
   is kept approximately equal to newly generated ones.

   Current expiration strength is variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expires is large enough to keep enough of warm entries,
   and when load increases it reduces to limit cache size.
 */

/* dst_ops garbage collector for the IPv4 routing cache.
 * Returns 0 when enough room was made (or GC was skipped), 1 on
 * unrecoverable cache overflow.  Uses function-static state (expire,
 * last_gc, rover, equilibrium) carried across invocations.
 */
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Switch to the exact (slower) entry count before sizing the goal. */
	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		/* Round-robin scan of all buckets, resuming after 'rover'. */
		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					/* Each kept entry halves the timeout for
					 * the rest of this chain.
					 */
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - if expire reduced to zero. Otherwise, expire is halved.
		   - if table is not full.
		   - if we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		   We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	/* Relax expiry strength; reset fully if the cache is comfortable. */
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}
994
Eric Dumazet98376382010-03-08 03:20:00 +0000995/*
996 * Returns number of entries in a hash chain that have different hash_inputs
997 */
998static int slow_chain_length(const struct rtable *head)
999{
1000 int length = 0;
1001 const struct rtable *rth = head;
1002
1003 while (rth) {
1004 length += has_noalias(head, rth);
Eric Dumazet1c317202010-10-25 21:02:07 +00001005 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +00001006 }
1007 return length >> FRACT_BITS;
1008}
1009
/* Attach a neighbour entry (normally ARP) to @rt's dst.
 * The lookup key is the route's gateway, except on loopback and
 * point-to-point devices where INADDR_ANY is used.  With ATM CLIP
 * configured, ATM devices resolve through clip_tbl_hook instead of
 * arp_tbl.  Returns 0 on success or a negative errno from the lookup.
 */
static int rt_bind_neighbour(struct rtable *rt)
{
	static const __be32 inaddr_any = 0;
	struct net_device *dev = rt->dst.dev;
	struct neigh_table *tbl = &arp_tbl;
	const __be32 *nexthop;
	struct neighbour *n;

#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
	if (dev->type == ARPHRD_ATM)
		tbl = clip_tbl_hook;
#endif
	nexthop = &rt->rt_gateway;
	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		nexthop = &inaddr_any;
	n = ipv4_neigh_lookup(tbl, dev, nexthop);
	if (IS_ERR(n))
		return PTR_ERR(n);
	rt->dst.neighbour = n;

	return 0;
}
1032
/*
 * Insert @rt into hash bucket @hash, or return an equivalent entry that
 * is already cached.  On success the returned rtable is also attached to
 * @skb (when non-NULL); if an existing entry matched, @rt itself is
 * dropped.  Returns ERR_PTR() on neighbour-binding failure.  When
 * caching is disabled for the netns, @rt is marked DST_NOCACHE and
 * handed back uncached.
 */
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();	/* retry GC only in process context */

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route. The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching. Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		/* Reap generation-expired entries as we walk. */
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		/* Track the lowest-scoring unreferenced entry as the
		 * eviction candidate should the chain grow too long.
		 */
		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		/* No evictable entry: if the chain is pathologically long
		 * even counting distinct hash inputs, trigger an emergency
		 * rebuild and retry with the new generation's hash.
		 */
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
1212
/* Bumped (see ip_rt_redirect()) whenever peer-derived state changes;
 * cached routes compare their rt_peer_genid against this to detect
 * stale peer information.
 */
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

/* Snapshot of the current peer generation counter. */
static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}
1219
/* Attach the inet_peer for @daddr to @rt->peer.
 * @create: passed through to inet_getpeer_v4(); when zero, an existing
 *          peer is looked up but none is created.
 * The cmpxchg() resolves a race with a concurrent binder: if someone
 * else installed a peer first, our reference is dropped; otherwise the
 * route's peer generation is refreshed.
 */
void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}
1231
/*
 * Peer allocation may fail only in serious out-of-memory conditions. However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chances to
 * select ID being unique in a reasonable period of time.
 * But broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	/* Lock serializes the read-hash-update of the static fallback id. */
	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
1251
/* Fill in iph->id for an outgoing packet.
 * Prefers the per-destination counter kept in the route's inet_peer;
 * falls back to ip_select_fb_ident() when no peer can be bound (or when
 * called without a dst, which is logged as a debug anomaly).
 */
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* If peer is attached to destination, it is never detached,
		   so that we need not to grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001274
/* Remove @rt from hash bucket @hash and drop the caller's reference.
 * Generation-expired entries encountered during the walk are reaped
 * opportunistically under the same bucket lock.
 */
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
1294
/* called in rcu_read_lock() section */
/* Process an ICMP redirect: after sanity-checking the advertised
 * gateway (not multicast/broadcast/zeronet, on-link unless shared
 * media, etc.), record it in the destination's inet_peer and bump the
 * peer generation so cached routes pick it up.  Bogus redirects are
 * optionally logged under CONFIG_IP_ROUTE_VERBOSE.
 */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	peer = inet_getpeer_v4(daddr, 1);
	if (peer) {
		peer->redirect_learned.a4 = new_gw;

		inet_putpeer(peer);

		/* Invalidate peer-derived state cached in routes. */
		atomic_inc(&__rt_peer_genid);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			" Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}
1342
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001343static bool peer_pmtu_expired(struct inet_peer *peer)
1344{
1345 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1346
1347 return orig &&
1348 time_after_eq(jiffies, orig) &&
1349 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1350}
1351
1352static bool peer_pmtu_cleaned(struct inet_peer *peer)
1353{
1354 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1355
1356 return orig &&
1357 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1358}
1359
/* dst_ops->negative_advice handler: called when a socket reports
 * trouble with this route.  Drops obsolete entries, unhashes
 * redirect-learned entries, and restores the original MTU when a
 * peer's learned PMTU has expired.  Returns NULL when the route was
 * discarded, otherwise the (possibly refreshed) dst.
 */
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			/* Remove redirect-created entries from the cache. */
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}
1381
1382/*
1383 * Algorithm:
1384 * 1. The first ip_rt_redirect_number redirects are sent
1385 * with exponential backoff, then we stop sending them at all,
1386 * assuming that the host ignores our redirects.
1387 * 2. If we did not see packets requiring redirects
1388 * during ip_rt_redirect_silence, we assume that the host
1389 * forgot redirected route and start to send redirects again.
1390 *
1391 * This algorithm is much cheaper and more intelligent than dumb load limiting
1392 * in icmp.c.
1393 *
1394 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1395 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1396 */
1397
/* ip_rt_send_redirect - send an ICMP host redirect for @skb's route.
 *
 * Rate limiting state lives in the route's inet_peer: successive
 * redirects are spaced by ip_rt_redirect_load << rate_tokens, and after
 * ip_rt_redirect_number of them we stop until ip_rt_redirect_silence
 * elapses with no redirect-worthy traffic (see the algorithm comment
 * above this function).
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	/* Only consult in_dev under RCU; copy out what we need and drop
	 * the read lock before the (potentially slower) peer work.
	 */
	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		/* No peer entry available: send unthrottled. */
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		/* Log exactly once, when the host has ignored the full
		 * quota of redirects.
		 */
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
			       &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
1456
/* ip_error - handle a packet whose route lookup resulted in an error.
 *
 * Translates dst.error into an ICMP_DEST_UNREACH code and sends it,
 * rate limited by a token bucket kept in the destination's inet_peer
 * (capacity ip_rt_error_burst, cost ip_rt_error_cost per message;
 * tokens accumulate at one per jiffy).  The skb is always consumed.
 * Always returns 0.
 */
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		/* Unknown error: drop silently. */
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	/* Without a peer entry we cannot rate limit, so just send. */
	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001504
1505/*
1506 * The last two values are not from the RFC but
1507 * are needed for AMPRnet AX.25 paths.
1508 */
1509
Arjan van de Ven9b5b5cf2005-11-29 16:21:38 -08001510static const unsigned short mtu_plateau[] =
Linus Torvalds1da177e2005-04-16 15:20:36 -07001511{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1512
Stephen Hemminger5969f712008-04-10 01:52:09 -07001513static inline unsigned short guess_mtu(unsigned short old_mtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001514{
1515 int i;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001516
Linus Torvalds1da177e2005-04-16 15:20:36 -07001517 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1518 if (old_mtu > mtu_plateau[i])
1519 return mtu_plateau[i];
1520 return 68;
1521}
1522
Eric Dumazetb71d1d42011-04-22 04:53:02 +00001523unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
Timo Teras0010e462008-04-29 03:32:25 -07001524 unsigned short new_mtu,
1525 struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001526{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001527 unsigned short old_mtu = ntohs(iph->tot_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001528 unsigned short est_mtu = 0;
David S. Miller2c8cec52011-02-09 20:42:07 -08001529 struct inet_peer *peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001530
David S. Miller2c8cec52011-02-09 20:42:07 -08001531 peer = inet_getpeer_v4(iph->daddr, 1);
1532 if (peer) {
1533 unsigned short mtu = new_mtu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001534
David S. Miller2c8cec52011-02-09 20:42:07 -08001535 if (new_mtu < 68 || new_mtu >= old_mtu) {
1536 /* BSD 4.2 derived systems incorrectly adjust
1537 * tot_len by the IP header length, and report
1538 * a zero MTU in the ICMP message.
1539 */
1540 if (mtu == 0 &&
1541 old_mtu >= 68 + (iph->ihl << 2))
1542 old_mtu -= iph->ihl << 2;
1543 mtu = guess_mtu(old_mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001544 }
David S. Miller2c8cec52011-02-09 20:42:07 -08001545
1546 if (mtu < ip_rt_min_pmtu)
1547 mtu = ip_rt_min_pmtu;
1548 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001549 unsigned long pmtu_expires;
1550
1551 pmtu_expires = jiffies + ip_rt_mtu_expires;
1552 if (!pmtu_expires)
1553 pmtu_expires = 1UL;
1554
David S. Miller2c8cec52011-02-09 20:42:07 -08001555 est_mtu = mtu;
1556 peer->pmtu_learned = mtu;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001557 peer->pmtu_expires = pmtu_expires;
David S. Miller2c8cec52011-02-09 20:42:07 -08001558 }
1559
1560 inet_putpeer(peer);
1561
1562 atomic_inc(&__rt_peer_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001563 }
1564 return est_mtu ? : new_mtu;
1565}
1566
David S. Miller2c8cec52011-02-09 20:42:07 -08001567static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1568{
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001569 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
David S. Miller2c8cec52011-02-09 20:42:07 -08001570
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001571 if (!expires)
1572 return;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001573 if (time_before(jiffies, expires)) {
David S. Miller2c8cec52011-02-09 20:42:07 -08001574 u32 orig_dst_mtu = dst_mtu(dst);
1575 if (peer->pmtu_learned < orig_dst_mtu) {
1576 if (!peer->pmtu_orig)
1577 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1578 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1579 }
1580 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1581 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1582}
1583
Linus Torvalds1da177e2005-04-16 15:20:36 -07001584static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1585{
David S. Miller2c8cec52011-02-09 20:42:07 -08001586 struct rtable *rt = (struct rtable *) dst;
1587 struct inet_peer *peer;
1588
1589 dst_confirm(dst);
1590
1591 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001592 rt_bind_peer(rt, rt->rt_dst, 1);
David S. Miller2c8cec52011-02-09 20:42:07 -08001593 peer = rt->peer;
1594 if (peer) {
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001595 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1596
David S. Miller2c8cec52011-02-09 20:42:07 -08001597 if (mtu < ip_rt_min_pmtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001598 mtu = ip_rt_min_pmtu;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001599 if (!pmtu_expires || mtu < peer->pmtu_learned) {
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001600
1601 pmtu_expires = jiffies + ip_rt_mtu_expires;
1602 if (!pmtu_expires)
1603 pmtu_expires = 1UL;
1604
David S. Miller2c8cec52011-02-09 20:42:07 -08001605 peer->pmtu_learned = mtu;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001606 peer->pmtu_expires = pmtu_expires;
David S. Miller2c8cec52011-02-09 20:42:07 -08001607
1608 atomic_inc(&__rt_peer_genid);
1609 rt->rt_peer_genid = rt_peer_genid();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001610 }
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001611 check_peer_pmtu(dst, peer);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001612 }
1613}
1614
David S. Millerf39925d2011-02-09 22:00:16 -08001615static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1616{
1617 struct rtable *rt = (struct rtable *) dst;
1618 __be32 orig_gw = rt->rt_gateway;
1619
1620 dst_confirm(&rt->dst);
1621
1622 neigh_release(rt->dst.neighbour);
1623 rt->dst.neighbour = NULL;
1624
1625 rt->rt_gateway = peer->redirect_learned.a4;
David Miller3769cff2011-07-11 22:44:24 +00001626 if (rt_bind_neighbour(rt) ||
David S. Millerf39925d2011-02-09 22:00:16 -08001627 !(rt->dst.neighbour->nud_state & NUD_VALID)) {
1628 if (rt->dst.neighbour)
1629 neigh_event_send(rt->dst.neighbour, NULL);
1630 rt->rt_gateway = orig_gw;
1631 return -EAGAIN;
1632 } else {
1633 rt->rt_flags |= RTCF_REDIRECTED;
1634 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
1635 rt->dst.neighbour);
1636 }
1637 return 0;
1638}
1639
Linus Torvalds1da177e2005-04-16 15:20:36 -07001640static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1641{
David S. Miller6431cbc2011-02-07 20:38:06 -08001642 struct rtable *rt = (struct rtable *) dst;
1643
1644 if (rt_is_expired(rt))
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001645 return NULL;
David S. Miller6431cbc2011-02-07 20:38:06 -08001646 if (rt->rt_peer_genid != rt_peer_genid()) {
David S. Miller2c8cec52011-02-09 20:42:07 -08001647 struct inet_peer *peer;
1648
David S. Miller6431cbc2011-02-07 20:38:06 -08001649 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001650 rt_bind_peer(rt, rt->rt_dst, 0);
David S. Miller6431cbc2011-02-07 20:38:06 -08001651
David S. Miller2c8cec52011-02-09 20:42:07 -08001652 peer = rt->peer;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001653 if (peer) {
David S. Miller2c8cec52011-02-09 20:42:07 -08001654 check_peer_pmtu(dst, peer);
1655
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001656 if (peer->redirect_learned.a4 &&
1657 peer->redirect_learned.a4 != rt->rt_gateway) {
1658 if (check_peer_redir(dst, peer))
1659 return NULL;
1660 }
David S. Millerf39925d2011-02-09 22:00:16 -08001661 }
1662
David S. Miller6431cbc2011-02-07 20:38:06 -08001663 rt->rt_peer_genid = rt_peer_genid();
1664 }
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001665 return dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001666}
1667
1668static void ipv4_dst_destroy(struct dst_entry *dst)
1669{
1670 struct rtable *rt = (struct rtable *) dst;
1671 struct inet_peer *peer = rt->peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001672
David S. Miller62fa8a82011-01-26 20:51:05 -08001673 if (rt->fi) {
1674 fib_info_put(rt->fi);
1675 rt->fi = NULL;
1676 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001677 if (peer) {
1678 rt->peer = NULL;
1679 inet_putpeer(peer);
1680 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001681}
1682
Linus Torvalds1da177e2005-04-16 15:20:36 -07001683
1684static void ipv4_link_failure(struct sk_buff *skb)
1685{
1686 struct rtable *rt;
1687
1688 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1689
Eric Dumazet511c3f92009-06-02 05:14:27 +00001690 rt = skb_rtable(skb);
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001691 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1692 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001693}
1694
1695static int ip_rt_bug(struct sk_buff *skb)
1696{
Harvey Harrison673d57e2008-10-31 00:53:57 -07001697 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1698 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001699 skb->dev ? skb->dev->name : "?");
1700 kfree_skb(skb);
Dave Jonesc378a9c2011-05-21 07:16:42 +00001701 WARN_ON(1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001702 return 0;
1703}
1704
1705/*
1706 We do not cache source address of outgoing interface,
1707 because it is used only by IP RR, TS and SRR options,
1708 so that it out of fast path.
1709
1710 BTW remember: "addr" is allowed to be not aligned
1711 in IP options!
1712 */
1713
David S. Miller8e363602011-05-13 17:29:41 -04001714void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001715{
Al Viroa61ced52006-09-26 21:27:54 -07001716 __be32 src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001717
David S. Millerc7537962010-11-11 17:07:48 -08001718 if (rt_is_output_route(rt))
David S. Millerc5be24f2011-05-13 18:01:21 -04001719 src = ip_hdr(skb)->saddr;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001720 else {
David S. Miller8e363602011-05-13 17:29:41 -04001721 struct fib_result res;
1722 struct flowi4 fl4;
1723 struct iphdr *iph;
1724
1725 iph = ip_hdr(skb);
1726
1727 memset(&fl4, 0, sizeof(fl4));
1728 fl4.daddr = iph->daddr;
1729 fl4.saddr = iph->saddr;
1730 fl4.flowi4_tos = iph->tos;
1731 fl4.flowi4_oif = rt->dst.dev->ifindex;
1732 fl4.flowi4_iif = skb->dev->ifindex;
1733 fl4.flowi4_mark = skb->mark;
David S. Miller5e2b61f2011-03-04 21:47:09 -08001734
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001735 rcu_read_lock();
David S. Miller68a5e3d2011-03-11 20:07:33 -05001736 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
David S. Miller436c3b62011-03-24 17:42:21 -07001737 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001738 else
1739 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001740 RT_SCOPE_UNIVERSE);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001741 rcu_read_unlock();
1742 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001743 memcpy(addr, &src, 4);
1744}
1745
Patrick McHardyc7066f72011-01-14 13:36:42 +01001746#ifdef CONFIG_IP_ROUTE_CLASSID
Linus Torvalds1da177e2005-04-16 15:20:36 -07001747static void set_class_tag(struct rtable *rt, u32 tag)
1748{
Changli Gaod8d1f302010-06-10 23:31:35 -07001749 if (!(rt->dst.tclassid & 0xFFFF))
1750 rt->dst.tclassid |= tag & 0xFFFF;
1751 if (!(rt->dst.tclassid & 0xFFFF0000))
1752 rt->dst.tclassid |= tag & 0xFFFF0000;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001753}
1754#endif
1755
David S. Miller0dbaee32010-12-13 12:52:14 -08001756static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1757{
1758 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1759
1760 if (advmss == 0) {
1761 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1762 ip_rt_min_advmss);
1763 if (advmss > 65535 - 40)
1764 advmss = 65535 - 40;
1765 }
1766 return advmss;
1767}
1768
David S. Millerd33e4552010-12-14 13:01:14 -08001769static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1770{
1771 unsigned int mtu = dst->dev->mtu;
1772
1773 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1774 const struct rtable *rt = (const struct rtable *) dst;
1775
1776 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1777 mtu = 576;
1778 }
1779
1780 if (mtu > IP_MAX_MTU)
1781 mtu = IP_MAX_MTU;
1782
1783 return mtu;
1784}
1785
David S. Miller813b3b52011-04-28 14:48:42 -07001786static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
David S. Miller5e2b61f2011-03-04 21:47:09 -08001787 struct fib_info *fi)
David S. Millera4daad62011-01-27 22:01:53 -08001788{
David S. Miller0131ba42011-02-04 14:37:30 -08001789 struct inet_peer *peer;
1790 int create = 0;
1791
1792 /* If a peer entry exists for this destination, we must hook
1793 * it up in order to get at cached metrics.
1794 */
David S. Miller813b3b52011-04-28 14:48:42 -07001795 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
David S. Miller0131ba42011-02-04 14:37:30 -08001796 create = 1;
1797
David S. Miller3c0afdc2011-03-04 21:26:07 -08001798 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
David S. Miller0131ba42011-02-04 14:37:30 -08001799 if (peer) {
David S. Miller3c0afdc2011-03-04 21:26:07 -08001800 rt->rt_peer_genid = rt_peer_genid();
David S. Miller0131ba42011-02-04 14:37:30 -08001801 if (inet_metrics_new(peer))
1802 memcpy(peer->metrics, fi->fib_metrics,
1803 sizeof(u32) * RTAX_MAX);
1804 dst_init_metrics(&rt->dst, peer->metrics, false);
David S. Miller2c8cec52011-02-09 20:42:07 -08001805
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001806 check_peer_pmtu(&rt->dst, peer);
David S. Millerf39925d2011-02-09 22:00:16 -08001807 if (peer->redirect_learned.a4 &&
1808 peer->redirect_learned.a4 != rt->rt_gateway) {
1809 rt->rt_gateway = peer->redirect_learned.a4;
1810 rt->rt_flags |= RTCF_REDIRECTED;
1811 }
David S. Miller0131ba42011-02-04 14:37:30 -08001812 } else {
David S. Millerb8dad612011-01-28 14:07:16 -08001813 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1814 rt->fi = fi;
1815 atomic_inc(&fi->fib_clntref);
1816 }
David S. Millera4daad62011-01-27 22:01:53 -08001817 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
David S. Millera4daad62011-01-27 22:01:53 -08001818 }
1819}
1820
David S. Miller813b3b52011-04-28 14:48:42 -07001821static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
David S. Miller5e2b61f2011-03-04 21:47:09 -08001822 const struct fib_result *res,
David S. Miller982721f2011-02-16 21:44:24 -08001823 struct fib_info *fi, u16 type, u32 itag)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001824{
David S. Millerdefb3512010-12-08 21:16:57 -08001825 struct dst_entry *dst = &rt->dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001826
1827 if (fi) {
1828 if (FIB_RES_GW(*res) &&
1829 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1830 rt->rt_gateway = FIB_RES_GW(*res);
David S. Miller813b3b52011-04-28 14:48:42 -07001831 rt_init_metrics(rt, fl4, fi);
Patrick McHardyc7066f72011-01-14 13:36:42 +01001832#ifdef CONFIG_IP_ROUTE_CLASSID
David S. Millerdefb3512010-12-08 21:16:57 -08001833 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001834#endif
David S. Millerd33e4552010-12-14 13:01:14 -08001835 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001836
David S. Millerdefb3512010-12-08 21:16:57 -08001837 if (dst_mtu(dst) > IP_MAX_MTU)
1838 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
David S. Miller0dbaee32010-12-13 12:52:14 -08001839 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
David S. Millerdefb3512010-12-08 21:16:57 -08001840 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001841
Patrick McHardyc7066f72011-01-14 13:36:42 +01001842#ifdef CONFIG_IP_ROUTE_CLASSID
Linus Torvalds1da177e2005-04-16 15:20:36 -07001843#ifdef CONFIG_IP_MULTIPLE_TABLES
1844 set_class_tag(rt, fib_rules_tclass(res));
1845#endif
1846 set_class_tag(rt, itag);
1847#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001848}
1849
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001850static struct rtable *rt_dst_alloc(struct net_device *dev,
1851 bool nopolicy, bool noxfrm)
David S. Miller0c4dcd52011-02-17 15:42:37 -08001852{
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001853 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1854 DST_HOST |
1855 (nopolicy ? DST_NOPOLICY : 0) |
1856 (noxfrm ? DST_NOXFRM : 0));
David S. Miller0c4dcd52011-02-17 15:42:37 -08001857}
1858
Eric Dumazet96d36222010-06-02 19:21:31 +00001859/* called in rcu_read_lock() section */
Al Viro9e12bb22006-09-26 21:25:20 -07001860static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001861 u8 tos, struct net_device *dev, int our)
1862{
Eric Dumazet96d36222010-06-02 19:21:31 +00001863 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001864 struct rtable *rth;
Al Viroa61ced52006-09-26 21:27:54 -07001865 __be32 spec_dst;
Eric Dumazet96d36222010-06-02 19:21:31 +00001866 struct in_device *in_dev = __in_dev_get_rcu(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001867 u32 itag = 0;
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001868 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001869
1870 /* Primary sanity checks. */
1871
1872 if (in_dev == NULL)
1873 return -EINVAL;
1874
Jan Engelhardt1e637c72008-01-21 03:18:08 -08001875 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08001876 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001877 goto e_inval;
1878
Joe Perchesf97c1e02007-12-16 13:45:43 -08001879 if (ipv4_is_zeronet(saddr)) {
1880 if (!ipv4_is_local_multicast(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001881 goto e_inval;
1882 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001883 } else {
Michael Smith5c04c812011-04-07 04:51:50 +00001884 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1885 &itag);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001886 if (err < 0)
1887 goto e_err;
1888 }
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001889 rth = rt_dst_alloc(init_net.loopback_dev,
1890 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001891 if (!rth)
1892 goto e_nobufs;
1893
Patrick McHardyc7066f72011-01-14 13:36:42 +01001894#ifdef CONFIG_IP_ROUTE_CLASSID
Changli Gaod8d1f302010-06-10 23:31:35 -07001895 rth->dst.tclassid = itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001896#endif
David S. Millercf911662011-04-28 14:31:47 -07001897 rth->dst.output = ip_rt_bug;
1898
1899 rth->rt_key_dst = daddr;
1900 rth->rt_key_src = saddr;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001901 rth->rt_genid = rt_genid(dev_net(dev));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001902 rth->rt_flags = RTCF_MULTICAST;
Eric Dumazet29e75252008-01-31 17:05:09 -08001903 rth->rt_type = RTN_MULTICAST;
David S. Miller475949d2011-05-03 19:45:15 -07001904 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07001905 rth->rt_dst = daddr;
1906 rth->rt_src = saddr;
1907 rth->rt_route_iif = dev->ifindex;
1908 rth->rt_iif = dev->ifindex;
1909 rth->rt_oif = 0;
1910 rth->rt_mark = skb->mark;
1911 rth->rt_gateway = daddr;
1912 rth->rt_spec_dst= spec_dst;
1913 rth->rt_peer_genid = 0;
1914 rth->peer = NULL;
1915 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001916 if (our) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001917 rth->dst.input= ip_local_deliver;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001918 rth->rt_flags |= RTCF_LOCAL;
1919 }
1920
1921#ifdef CONFIG_IP_MROUTE
Joe Perchesf97c1e02007-12-16 13:45:43 -08001922 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
Changli Gaod8d1f302010-06-10 23:31:35 -07001923 rth->dst.input = ip_mr_input;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001924#endif
1925 RT_CACHE_STAT_INC(in_slow_mc);
1926
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001927 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
David S. Millerb23dd4f2011-03-02 14:31:35 -08001928 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
Eric Dumazet9aa3c942011-06-18 11:59:18 -07001929 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001930
1931e_nobufs:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001932 return -ENOBUFS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001933e_inval:
Eric Dumazet96d36222010-06-02 19:21:31 +00001934 return -EINVAL;
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001935e_err:
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001936 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001937}
1938
1939
/* Account for (and, with CONFIG_IP_ROUTE_VERBOSE, log) a packet whose
 * source address failed validation.  Dumps the link-layer header bytes
 * as a hex string since that is the only trustworthy hint about the
 * real origin.
 */
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
1969
Eric Dumazet47360222010-06-03 04:13:21 +00001970/* called in rcu_read_lock() section */
Stephen Hemminger5969f712008-04-10 01:52:09 -07001971static int __mkroute_input(struct sk_buff *skb,
David S. Miller982721f2011-02-16 21:44:24 -08001972 const struct fib_result *res,
Stephen Hemminger5969f712008-04-10 01:52:09 -07001973 struct in_device *in_dev,
1974 __be32 daddr, __be32 saddr, u32 tos,
1975 struct rtable **result)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001976{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001977 struct rtable *rth;
1978 int err;
1979 struct in_device *out_dev;
Eric Dumazet47360222010-06-03 04:13:21 +00001980 unsigned int flags = 0;
Al Virod9c9df82006-09-26 21:28:14 -07001981 __be32 spec_dst;
1982 u32 itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001983
1984 /* get a working reference to the output device */
Eric Dumazet47360222010-06-03 04:13:21 +00001985 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001986 if (out_dev == NULL) {
1987 if (net_ratelimit())
1988 printk(KERN_CRIT "Bug in ip_route_input" \
1989 "_slow(). Please, report\n");
1990 return -EINVAL;
1991 }
1992
1993
Michael Smith5c04c812011-04-07 04:51:50 +00001994 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1995 in_dev->dev, &spec_dst, &itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001996 if (err < 0) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001997 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001998 saddr);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001999
Linus Torvalds1da177e2005-04-16 15:20:36 -07002000 goto cleanup;
2001 }
2002
2003 if (err)
2004 flags |= RTCF_DIRECTSRC;
2005
Thomas Graf51b77ca2008-06-03 16:36:01 -07002006 if (out_dev == in_dev && err &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002007 (IN_DEV_SHARED_MEDIA(out_dev) ||
2008 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2009 flags |= RTCF_DOREDIRECT;
2010
2011 if (skb->protocol != htons(ETH_P_IP)) {
2012 /* Not IP (i.e. ARP). Do not create route, if it is
2013 * invalid for proxy arp. DNAT routes are always valid.
Jesper Dangaard Brouer65324142010-01-05 05:50:47 +00002014 *
2015 * Proxy arp feature have been extended to allow, ARP
2016 * replies back to the same interface, to support
2017 * Private VLAN switch technologies. See arp.c.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002018 */
Jesper Dangaard Brouer65324142010-01-05 05:50:47 +00002019 if (out_dev == in_dev &&
2020 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002021 err = -EINVAL;
2022 goto cleanup;
2023 }
2024 }
2025
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002026 rth = rt_dst_alloc(out_dev->dev,
2027 IN_DEV_CONF_GET(in_dev, NOPOLICY),
David S. Miller0c4dcd52011-02-17 15:42:37 -08002028 IN_DEV_CONF_GET(out_dev, NOXFRM));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002029 if (!rth) {
2030 err = -ENOBUFS;
2031 goto cleanup;
2032 }
2033
David S. Miller5e2b61f2011-03-04 21:47:09 -08002034 rth->rt_key_dst = daddr;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002035 rth->rt_key_src = saddr;
David S. Millercf911662011-04-28 14:31:47 -07002036 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2037 rth->rt_flags = flags;
2038 rth->rt_type = res->type;
David S. Miller475949d2011-05-03 19:45:15 -07002039 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07002040 rth->rt_dst = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002041 rth->rt_src = saddr;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002042 rth->rt_route_iif = in_dev->dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002043 rth->rt_iif = in_dev->dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002044 rth->rt_oif = 0;
David S. Millercf911662011-04-28 14:31:47 -07002045 rth->rt_mark = skb->mark;
2046 rth->rt_gateway = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002047 rth->rt_spec_dst= spec_dst;
David S. Millercf911662011-04-28 14:31:47 -07002048 rth->rt_peer_genid = 0;
2049 rth->peer = NULL;
2050 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002051
Changli Gaod8d1f302010-06-10 23:31:35 -07002052 rth->dst.input = ip_forward;
2053 rth->dst.output = ip_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002054
David S. Miller5e2b61f2011-03-04 21:47:09 -08002055 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002056
Linus Torvalds1da177e2005-04-16 15:20:36 -07002057 *result = rth;
2058 err = 0;
2059 cleanup:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002060 return err;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002061}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002062
Stephen Hemminger5969f712008-04-10 01:52:09 -07002063static int ip_mkroute_input(struct sk_buff *skb,
2064 struct fib_result *res,
David S. Miller68a5e3d2011-03-11 20:07:33 -05002065 const struct flowi4 *fl4,
Stephen Hemminger5969f712008-04-10 01:52:09 -07002066 struct in_device *in_dev,
2067 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002068{
Chuck Short7abaa272005-06-22 22:10:23 -07002069 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002070 int err;
2071 unsigned hash;
2072
2073#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Millerff3fccb2011-03-10 16:23:24 -08002074 if (res->fi && res->fi->fib_nhs > 1)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002075 fib_select_multipath(res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002076#endif
2077
2078 /* create a routing cache entry */
2079 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2080 if (err)
2081 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002082
2083 /* put it into the cache */
David S. Miller68a5e3d2011-03-11 20:07:33 -05002084 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
Changli Gaod8d1f302010-06-10 23:31:35 -07002085 rt_genid(dev_net(rth->dst.dev)));
David S. Miller68a5e3d2011-03-11 20:07:33 -05002086 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002087 if (IS_ERR(rth))
2088 return PTR_ERR(rth);
2089 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002090}
2091
/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must already have the correct destination attached by the
 *	output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *	called with rcu_read_lock()
 */
2102
Al Viro9e12bb22006-09-26 21:25:20 -07002103static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002104 u8 tos, struct net_device *dev)
2105{
2106 struct fib_result res;
Eric Dumazet96d36222010-06-02 19:21:31 +00002107 struct in_device *in_dev = __in_dev_get_rcu(dev);
David S. Miller68a5e3d2011-03-11 20:07:33 -05002108 struct flowi4 fl4;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002109 unsigned flags = 0;
2110 u32 itag = 0;
2111 struct rtable * rth;
2112 unsigned hash;
Al Viro9e12bb22006-09-26 21:25:20 -07002113 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002114 int err = -EINVAL;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002115 struct net * net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002116
2117 /* IP on this device is disabled. */
2118
2119 if (!in_dev)
2120 goto out;
2121
2122 /* Check for the most weird martians, which can be not detected
2123 by fib_lookup.
2124 */
2125
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002126 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002127 ipv4_is_loopback(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002128 goto martian_source;
2129
Andy Walls27a954b2010-10-17 15:11:22 +00002130 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002131 goto brd_input;
2132
2133 /* Accept zero addresses only to limited broadcast;
2134 * I even do not know to fix it or not. Waiting for complains :-)
2135 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002136 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002137 goto martian_source;
2138
Andy Walls27a954b2010-10-17 15:11:22 +00002139 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002140 goto martian_destination;
2141
2142 /*
2143 * Now we are ready to route packet.
2144 */
David S. Miller68a5e3d2011-03-11 20:07:33 -05002145 fl4.flowi4_oif = 0;
2146 fl4.flowi4_iif = dev->ifindex;
2147 fl4.flowi4_mark = skb->mark;
2148 fl4.flowi4_tos = tos;
2149 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2150 fl4.daddr = daddr;
2151 fl4.saddr = saddr;
2152 err = fib_lookup(net, &fl4, &res);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002153 if (err != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002154 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002155 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002156 goto no_route;
2157 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002158
2159 RT_CACHE_STAT_INC(in_slow_tot);
2160
2161 if (res.type == RTN_BROADCAST)
2162 goto brd_input;
2163
2164 if (res.type == RTN_LOCAL) {
Michael Smith5c04c812011-04-07 04:51:50 +00002165 err = fib_validate_source(skb, saddr, daddr, tos,
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002166 net->loopback_dev->ifindex,
Michael Smith5c04c812011-04-07 04:51:50 +00002167 dev, &spec_dst, &itag);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002168 if (err < 0)
2169 goto martian_source_keep_err;
2170 if (err)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002171 flags |= RTCF_DIRECTSRC;
2172 spec_dst = daddr;
2173 goto local_input;
2174 }
2175
2176 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002177 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002178 if (res.type != RTN_UNICAST)
2179 goto martian_destination;
2180
David S. Miller68a5e3d2011-03-11 20:07:33 -05002181 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002182out: return err;
2183
2184brd_input:
2185 if (skb->protocol != htons(ETH_P_IP))
2186 goto e_inval;
2187
Joe Perchesf97c1e02007-12-16 13:45:43 -08002188 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002189 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2190 else {
Michael Smith5c04c812011-04-07 04:51:50 +00002191 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2192 &itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002193 if (err < 0)
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002194 goto martian_source_keep_err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002195 if (err)
2196 flags |= RTCF_DIRECTSRC;
2197 }
2198 flags |= RTCF_BROADCAST;
2199 res.type = RTN_BROADCAST;
2200 RT_CACHE_STAT_INC(in_brd);
2201
2202local_input:
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002203 rth = rt_dst_alloc(net->loopback_dev,
2204 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002205 if (!rth)
2206 goto e_nobufs;
2207
David S. Millercf911662011-04-28 14:31:47 -07002208 rth->dst.input= ip_local_deliver;
Changli Gaod8d1f302010-06-10 23:31:35 -07002209 rth->dst.output= ip_rt_bug;
David S. Millercf911662011-04-28 14:31:47 -07002210#ifdef CONFIG_IP_ROUTE_CLASSID
2211 rth->dst.tclassid = itag;
2212#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002213
David S. Miller5e2b61f2011-03-04 21:47:09 -08002214 rth->rt_key_dst = daddr;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002215 rth->rt_key_src = saddr;
David S. Millercf911662011-04-28 14:31:47 -07002216 rth->rt_genid = rt_genid(net);
2217 rth->rt_flags = flags|RTCF_LOCAL;
2218 rth->rt_type = res.type;
David S. Miller475949d2011-05-03 19:45:15 -07002219 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07002220 rth->rt_dst = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002221 rth->rt_src = saddr;
Patrick McHardyc7066f72011-01-14 13:36:42 +01002222#ifdef CONFIG_IP_ROUTE_CLASSID
Changli Gaod8d1f302010-06-10 23:31:35 -07002223 rth->dst.tclassid = itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002224#endif
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002225 rth->rt_route_iif = dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002226 rth->rt_iif = dev->ifindex;
David S. Millercf911662011-04-28 14:31:47 -07002227 rth->rt_oif = 0;
2228 rth->rt_mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002229 rth->rt_gateway = daddr;
2230 rth->rt_spec_dst= spec_dst;
David S. Millercf911662011-04-28 14:31:47 -07002231 rth->rt_peer_genid = 0;
2232 rth->peer = NULL;
2233 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002234 if (res.type == RTN_UNREACHABLE) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002235 rth->dst.input= ip_error;
2236 rth->dst.error= -err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002237 rth->rt_flags &= ~RTCF_LOCAL;
2238 }
David S. Miller68a5e3d2011-03-11 20:07:33 -05002239 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2240 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002241 err = 0;
2242 if (IS_ERR(rth))
2243 err = PTR_ERR(rth);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002244 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002245
2246no_route:
2247 RT_CACHE_STAT_INC(in_no_route);
2248 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2249 res.type = RTN_UNREACHABLE;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08002250 if (err == -ESRCH)
2251 err = -ENETUNREACH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002252 goto local_input;
2253
2254 /*
2255 * Do not cache martian addresses: they should be logged (RFC1812)
2256 */
2257martian_destination:
2258 RT_CACHE_STAT_INC(in_martian_dst);
2259#ifdef CONFIG_IP_ROUTE_VERBOSE
2260 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
Harvey Harrison673d57e2008-10-31 00:53:57 -07002261 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2262 &daddr, &saddr, dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002263#endif
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002264
2265e_hostunreach:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002266 err = -EHOSTUNREACH;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002267 goto out;
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002268
Linus Torvalds1da177e2005-04-16 15:20:36 -07002269e_inval:
2270 err = -EINVAL;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002271 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002272
2273e_nobufs:
2274 err = -ENOBUFS;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002275 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002276
2277martian_source:
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002278 err = -EINVAL;
2279martian_source_keep_err:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002280 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002281 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002282}
2283
/*
 * Input route lookup entry point: try the routing cache first, then
 * fall back to the slow path (FIB lookup) on a miss.
 *
 * @skb:   packet being routed; on success its dst is set
 * @daddr: destination address from the IP header
 * @saddr: source address from the IP header
 * @tos:   IP TOS byte (masked with IPTOS_RT_MASK before cache compare)
 * @dev:   device the packet arrived on
 * @noref: when true, attach the dst to the skb without taking a
 *         reference (caller stays inside the RCU read side)
 *
 * Returns 0 on success or a negative errno.
 */
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			   u8 tos, struct net_device *dev, bool noref)
{
	struct rtable * rth;
	unsigned	hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	/* Every exit below must pair with this rcu_read_lock(). */
	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	/* Walk the hash chain; the XOR/OR trick compares all key fields
	 * in a single branch-free expression (rt_oif must be 0 for an
	 * input route).
	 */
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_iif ^ iif) |
		     rth->rt_oif |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As result the host on multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   comparing with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				/* NOTE: this inner 'res' shadows the outer
				 * one; harmless, but easy to misread.
				 */
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
Eric Dumazet407eadd2010-05-10 11:32:55 +00002365EXPORT_SYMBOL(ip_route_input_common);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002366
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002367/* called with rcu_read_lock() */
/*
 * Build a routing cache entry for an output route.
 *
 * @res:       FIB lookup result for the flow
 * @fl4:       flow key (daddr/saddr here may have been rewritten by the
 *             resolver; the orig_* parameters hold the caller's values)
 * @orig_daddr/@orig_saddr/@orig_oif: original flow key fields, used as
 *             the cache lookup key
 * @dev_out:   output device
 * @flags:     initial RTCF_* flags
 *
 * Returns the new rtable or an ERR_PTR() on failure.
 * called with rcu_read_lock()
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	u32 tos = RT_FL_TOS(fl4);
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	/* Loopback source addresses may only leave via a loopback device. */
	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
		return ERR_PTR(-EINVAL);

	/* Reclassify the route type from the destination address. */
	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;	/* broadcasts never use FIB next-hop info */
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		/* Not locally joined: do not deliver a local copy. */
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist use
		 * the default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	/* Cache key holds the ORIGINAL flow values so later lookups with
	 * the same key hit this entry.
	 */
	rth->rt_key_dst	= orig_daddr;
	rth->rt_key_src	= orig_saddr;
	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= fl4->daddr;
	rth->rt_src	= fl4->saddr;
	rth->rt_route_iif = 0;
	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
	rth->rt_oif	= orig_oif;
	rth->rt_mark    = fl4->flowi4_mark;
	rth->rt_gateway = fl4->daddr;
	rth->rt_spec_dst= fl4->saddr;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl4->daddr;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl4->saddr;
		/* Locally-delivered copy of a bcast/mcast we also send out:
		 * ip_mc_output handles the loopback duplication.
		 */
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	return rth;
}
2467
Linus Torvalds1da177e2005-04-16 15:20:36 -07002468/*
2469 * Major route resolver routine.
Eric Dumazet0197aa32010-09-30 03:33:58 +00002470 * called with rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002471 */
2472
/*
 * Major route resolver routine (output slow path).
 *
 * Resolves @fl4 against the FIB, possibly rewriting fl4->saddr,
 * fl4->daddr and fl4->flowi4_oif in place (the originals are saved
 * first and used as the cache key), then builds and interns a cache
 * entry via __mkroute_output().
 *
 * Returns the rtable or an ERR_PTR() on failure.
 * called with rcu_read_lock();
 */
static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	u32 tos	= RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	__be32 orig_daddr;
	__be32 orig_saddr;
	int orig_oif;

	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	/* Preserve the caller's key: fl4 is modified below. */
	orig_daddr = fl4->daddr;
	orig_saddr = fl4->saddr;
	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = net->loopback_dev->ifindex;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		/* A specified source must be a unicast address. */
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic,vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		/* No destination at all: route to ourselves via loopback. */
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, routing tables are wrong. Assume,
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;


make_route:
	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
			       dev_out, flags);
	if (!IS_ERR(rth)) {
		unsigned int hash;

		/* Intern under the ORIGINAL key, matching cache lookups. */
		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
			       rt_genid(dev_net(dev_out)));
		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
	}

out:
	rcu_read_unlock();
	return rth;
}
2665
/*
 * Output route lookup: probe the routing cache, falling back to
 * ip_route_output_slow() on a miss or when caching is disabled.
 *
 * On a cache hit, an unset flp4->saddr / flp4->daddr is filled in from
 * the cached entry (the slow path does the equivalent itself).
 *
 * Returns the rtable or an ERR_PTR() from the slow path.
 */
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	/* BH-disabling RCU read side: the cache is also touched from
	 * softirq context.
	 */
	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			/* dst_use() takes a reference, so unlocking before
			 * the flp4 fix-ups below is safe.
			 */
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002704EXPORT_SYMBOL_GPL(__ip_route_output_key);
2705
/* ->check for blackhole routes: unconditionally returns NULL. */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2710
/* ->default_mtu for blackhole routes: always report 0. */
static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
{
	return 0;
}
2715
/* ->update_pmtu for blackhole routes: PMTU updates are ignored. */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
2719
/* ->cow_metrics for blackhole routes: no writable metrics, return NULL. */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2725
/*
 * dst_ops used by ipv4_blackhole_route() clones.  Reuses the regular
 * IPv4 destroy and advmss handlers, but stubs out revalidation, MTU,
 * PMTU updates and metrics copy-on-write (see the stubs above).
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family = AF_INET,
	.protocol = cpu_to_be16(ETH_P_IP),
	.destroy = ipv4_dst_destroy,
	.check = ipv4_blackhole_dst_check,
	.default_mtu = ipv4_blackhole_default_mtu,
	.default_advmss = ipv4_default_advmss,
	.update_pmtu = ipv4_rt_blackhole_update_pmtu,
	.cow_metrics = ipv4_rt_blackhole_cow_metrics,
};
2736
/*
 * ipv4_blackhole_route - clone @dst_orig into a packet-discarding route.
 *
 * Allocates a new dst backed by ipv4_dst_blackhole_ops, copies the
 * identifying key fields, metrics, gateway and the reference-counted
 * peer / fib-info pointers from the original, and points both ->input
 * and ->output at dst_discard so any packet using the clone is dropped.
 * The caller's reference on @dst_orig is released here in all cases.
 *
 * Return: the new blackhole dst, or ERR_PTR(-ENOMEM) on allocation
 * failure.
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		/* Discard everything in both directions. */
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		/* Copy the cache lookup key of the original route. */
		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		/* Shared pointers: take our own references. */
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		/*
		 * NOTE(review): dst_free() here appears to arm the normal
		 * dst release path while the caller keeps the dst_alloc()
		 * reference — confirm against dst refcounting rules.
		 */
		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
2783
David S. Miller9d6ec932011-03-12 01:12:47 -05002784struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
David S. Millerb23dd4f2011-03-02 14:31:35 -08002785 struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002786{
David S. Miller9d6ec932011-03-12 01:12:47 -05002787 struct rtable *rt = __ip_route_output_key(net, flp4);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002788
David S. Millerb23dd4f2011-03-02 14:31:35 -08002789 if (IS_ERR(rt))
2790 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002791
David S. Miller56157872011-05-02 14:37:45 -07002792 if (flp4->flowi4_proto)
David S. Miller9d6ec932011-03-12 01:12:47 -05002793 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2794 flowi4_to_flowi(flp4),
2795 sk, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002796
David S. Millerb23dd4f2011-03-02 14:31:35 -08002797 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002798}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002799EXPORT_SYMBOL_GPL(ip_route_output_flow);
2800
/*
 * rt_fill_info - dump one route into an RTM_NEWROUTE netlink message.
 *
 * Emits a struct rtmsg followed by RTA_* attributes (dst, src, oif,
 * classid, prefsrc, gateway, metrics, mark, iif, cacheinfo) describing
 * skb_rtable(skb).  @nowait is forwarded to ipmr_get_route() for
 * multicast input routes whose resolution may have to be deferred.
 *
 * Return: the value of nlmsg_end() on success, 0 when ipmr_get_route()
 * queued the request, or -EMSGSIZE if @skb ran out of room.
 */
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires = 0;
	const struct inet_peer *peer = rt->peer;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family = AF_INET;
	r->rtm_dst_len = 32;
	r->rtm_src_len = 0;
	r->rtm_tos = rt->rt_key_tos;
	r->rtm_table = RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type = rt->rt_type;
	r->rtm_scope = RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	/* Cache entries are reported as cloned routes. */
	r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	/* Preferred source: spec_dst for input routes, else the chosen src. */
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->rt_key_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);

	error = rt->dst.error;
	/* Per-destination state (ip id, tcp timestamps, pmtu expiry). */
	if (peer) {
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&peer->ip_id_count) & 0xffff;
		if (peer->tcp_ts_stamp) {
			ts = peer->tcp_ts;
			tsage = get_seconds() - peer->tcp_ts_stamp;
		}
		expires = ACCESS_ONCE(peer->pmtu_expires);
		if (expires)
			expires -= jiffies;
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		/* Unresolved multicast forwarding entries go via ipmr. */
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2904
/*
 * inet_rtm_getroute - RTM_GETROUTE handler: resolve one route for userspace.
 *
 * Parses RTA_SRC / RTA_DST / RTA_IIF / RTA_OIF / RTA_MARK from the
 * request.  If an input interface was supplied, a dummy skb is built
 * and pushed through ip_route_input() as if it had been received on
 * that device; otherwise a normal output lookup is done via
 * ip_route_output_key().  The resolved route is serialized with
 * rt_fill_info() and unicast back to the requesting socket.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	/* Missing attributes default to zero (wildcard). */
	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		/* Simulate reception of the dummy skb on @dev. */
		skb->protocol = htons(ETH_P_IP);
		skb->dev = dev;
		skb->mark = mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		/* A "successful" lookup may still carry a route-level error. */
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
2999
/*
 * ip_rt_dump - netlink dump callback: walk the entire route cache.
 *
 * Iterates every hash bucket and chain under rcu_read_lock_bh(),
 * resuming from cb->args[0] (bucket) / cb->args[1] (index in chain).
 * Entries belonging to other namespaces or to a stale cache generation
 * are skipped.  Stops early when rt_fill_info() cannot fit another
 * entry into @skb and records the resume position for the next call.
 *
 * Return: skb->len (the netlink dump convention).
 */
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			/*
			 * Borrow (no refcount) the entry as skb's dst so
			 * rt_fill_info() can use skb_rtable(); dropped
			 * again right after serialization.
			 */
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
3041
/*
 * Flush the route cache of @in_dev's namespace when its multicast
 * configuration changes (second argument is the flush delay; 0 here).
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
3046
3047#ifdef CONFIG_SYSCTL
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003048static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003049 void __user *buffer,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003050 size_t *lenp, loff_t *ppos)
3051{
3052 if (write) {
Denis V. Lunev639e1042008-07-05 19:02:06 -07003053 int flush_delay;
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003054 ctl_table ctl;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003055 struct net *net;
Denis V. Lunev639e1042008-07-05 19:02:06 -07003056
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003057 memcpy(&ctl, __ctl, sizeof(ctl));
3058 ctl.data = &flush_delay;
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003059 proc_dointvec(&ctl, write, buffer, lenp, ppos);
Denis V. Lunev639e1042008-07-05 19:02:06 -07003060
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003061 net = (struct net *)__ctl->extra1;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003062 rt_cache_flush(net, flush_delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003063 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003064 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003065
3066 return -EINVAL;
3067}
3068
/*
 * Global (not per-netns) tunables under /proc/sys/net/ipv4/route/.
 * Entries handled by proc_dointvec_jiffies / proc_dointvec_ms_jiffies
 * store their values internally in jiffies.
 */
static ctl_table ipv4_route_table[] = {
	{
		/* dst count above which garbage collection kicks in. */
		.procname = "gc_thresh",
		.data = &ipv4_dst_ops.gc_thresh,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "max_size",
		.data = &ip_rt_max_size,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname = "gc_min_interval",
		.data = &ip_rt_gc_min_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		/* Millisecond-resolution view of the same variable. */
		.procname = "gc_min_interval_ms",
		.data = &ip_rt_gc_min_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_ms_jiffies,
	},
	{
		.procname = "gc_timeout",
		.data = &ip_rt_gc_timeout,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "gc_interval",
		.data = &ip_rt_gc_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		/* ICMP redirect rate-limiting knobs. */
		.procname = "redirect_load",
		.data = &ip_rt_redirect_load,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "redirect_number",
		.data = &ip_rt_redirect_number,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "redirect_silence",
		.data = &ip_rt_redirect_silence,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		/* ICMP error rate-limiting knobs. */
		.procname = "error_cost",
		.data = &ip_rt_error_cost,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "error_burst",
		.data = &ip_rt_error_burst,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "gc_elasticity",
		.data = &ip_rt_gc_elasticity,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		/* Lifetime of learned PMTU information (jiffies). */
		.procname = "mtu_expires",
		.data = &ip_rt_mtu_expires,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "min_pmtu",
		.data = &ip_rt_min_pmtu,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "min_adv_mss",
		.data = &ip_rt_min_advmss,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{ }
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003179
/* Placeholder table for an (initially) empty sysctl directory. */
static struct ctl_table empty[1];

/*
 * Skeleton for /proc/sys/net/ipv4: makes sure the "route" and "neigh"
 * directories exist early in boot, registered from
 * ip_static_sysctl_init().
 */
static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route", 
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};

/* Path "net/ipv4" under which the skeleton above is registered. */
static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003196
/*
 * Per-netns "flush" sysctl.  This template is kmemdup()'d for each
 * non-initial namespace in sysctl_route_net_init() so that entry [0]
 * can carry the owning struct net in ->extra1.
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname = "flush",
		.maxlen = sizeof(int),
		.mode = 0200,
		.proc_handler = ipv4_sysctl_rtcache_flush,
	},
	{ },
};

/* Path "net/ipv4/route" under which the flush table is registered. */
static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
3213
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003214static __net_init int sysctl_route_net_init(struct net *net)
3215{
3216 struct ctl_table *tbl;
3217
3218 tbl = ipv4_route_flush_table;
Octavian Purdila09ad9bc2009-11-25 15:14:13 -08003219 if (!net_eq(net, &init_net)) {
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003220 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3221 if (tbl == NULL)
3222 goto err_dup;
3223 }
3224 tbl[0].extra1 = net;
3225
3226 net->ipv4.route_hdr =
3227 register_net_sysctl_table(net, ipv4_route_path, tbl);
3228 if (net->ipv4.route_hdr == NULL)
3229 goto err_reg;
3230 return 0;
3231
3232err_reg:
3233 if (tbl != ipv4_route_flush_table)
3234 kfree(tbl);
3235err_dup:
3236 return -ENOMEM;
3237}
3238
3239static __net_exit void sysctl_route_net_exit(struct net *net)
3240{
3241 struct ctl_table *tbl;
3242
3243 tbl = net->ipv4.route_hdr->ctl_table_arg;
3244 unregister_net_sysctl_table(net->ipv4.route_hdr);
3245 BUG_ON(tbl == ipv4_route_flush_table);
3246 kfree(tbl);
3247}
3248
/* Hook the per-netns flush sysctl into namespace creation/teardown. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
Linus Torvalds1da177e2005-04-16 15:20:36 -07003253#endif
3254
/*
 * Seed each new namespace's route-cache generation counter and
 * device-address generation counter with random values, so cache
 * entries from before a flush (or from another boot) never match.
 */
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
3267
3268
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU accounting buckets for routing classids (rt_acct). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

/* "rhash_entries=" boot parameter: overrides the route cache hash size. */
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	/* 1 tells the boot-option parser the argument was consumed. */
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
3282
/*
 * ip_rt_init - boot-time initialization of the IPv4 routing layer.
 *
 * Allocates the dst slab and entry counters, sizes and allocates the
 * route cache hash table (honoring the "rhash_entries=" boot option),
 * derives gc_thresh / ip_rt_max_size from the table size, then brings
 * up devinet, the FIB, /proc files, xfrm, the RTM_GETROUTE handler and
 * the per-netns subsystems.  Unrecoverable allocation failures panic.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole dsts come from the same slab as regular ones. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/* Hash table scaled by available memory unless overridden. */
	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	/* GC thresholds follow the actual hash table size. */
	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}
3338
Al Viroa1bc6eb2008-07-30 06:32:52 -04003339#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
/* Register the static net/ipv4 sysctl skeleton (route + neigh dirs). */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
Al Viroa1bc6eb2008-07-30 06:32:52 -04003348#endif