blob: aabece6b729a8053b7bb07858bf74f35ca038b20 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
Jesper Juhl02c30a82005-05-05 16:16:16 -07008 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -07009 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090021 * Alan Cox : Super /proc >4K
Linus Torvalds1da177e2005-04-16 15:20:36 -070022 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090039 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070055 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080056 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070058 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
Joe Perchesafd465032012-03-12 07:03:32 +000065#define pr_fmt(fmt) "IPv4: " fmt
66
Linus Torvalds1da177e2005-04-16 15:20:36 -070067#include <linux/module.h>
68#include <asm/uaccess.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070069#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070072#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070073#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070074#include <linux/string.h>
75#include <linux/socket.h>
76#include <linux/sockios.h>
77#include <linux/errno.h>
78#include <linux/in.h>
79#include <linux/inet.h>
80#include <linux/netdevice.h>
81#include <linux/proc_fs.h>
82#include <linux/init.h>
Eric Dumazet39c90ec2007-09-15 10:55:54 -070083#include <linux/workqueue.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070084#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070085#include <linux/inetdevice.h>
86#include <linux/igmp.h>
87#include <linux/pkt_sched.h>
88#include <linux/mroute.h>
89#include <linux/netfilter_ipv4.h>
90#include <linux/random.h>
91#include <linux/jhash.h>
92#include <linux/rcupdate.h>
93#include <linux/times.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090094#include <linux/slab.h>
Stephen Rothwellb9eda062011-12-22 17:03:29 +110095#include <linux/prefetch.h>
Herbert Xu352e5122007-11-13 21:34:06 -080096#include <net/dst.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020097#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070098#include <net/protocol.h>
99#include <net/ip.h>
100#include <net/route.h>
101#include <net/inetpeer.h>
102#include <net/sock.h>
103#include <net/ip_fib.h>
104#include <net/arp.h>
105#include <net/tcp.h>
106#include <net/icmp.h>
107#include <net/xfrm.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700108#include <net/netevent.h>
Thomas Graf63f34442007-03-22 11:55:17 -0700109#include <net/rtnetlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700110#ifdef CONFIG_SYSCTL
111#include <linux/sysctl.h>
Shan Wei7426a562012-04-18 18:05:46 +0000112#include <linux/kmemleak.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700113#endif
David S. Miller6e5714e2011-08-03 20:50:44 -0700114#include <net/secure_seq.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700115
David S. Miller68a5e3d2011-03-11 20:07:33 -0500116#define RT_FL_TOS(oldflp4) \
Julian Anastasovf61759e2011-12-02 11:39:42 +0000117 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700118
119#define IP_MAX_MTU 0xFFF0
120
121#define RT_GC_TIMEOUT (300*HZ)
122
Linus Torvalds1da177e2005-04-16 15:20:36 -0700123static int ip_rt_max_size;
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700124static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
Eric Dumazet9f28a2f2011-12-21 15:47:16 -0500125static int ip_rt_gc_interval __read_mostly = 60 * HZ;
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700126static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
127static int ip_rt_redirect_number __read_mostly = 9;
128static int ip_rt_redirect_load __read_mostly = HZ / 50;
129static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
130static int ip_rt_error_cost __read_mostly = HZ;
131static int ip_rt_error_burst __read_mostly = 5 * HZ;
132static int ip_rt_gc_elasticity __read_mostly = 8;
133static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
134static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
135static int ip_rt_min_advmss __read_mostly = 256;
Neil Horman1080d702008-10-27 12:28:25 -0700136static int rt_chain_length_max __read_mostly = 20;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700137
Eric Dumazet9f28a2f2011-12-21 15:47:16 -0500138static struct delayed_work expires_work;
139static unsigned long expires_ljiffies;
140
Linus Torvalds1da177e2005-04-16 15:20:36 -0700141/*
142 * Interface to generic destination cache.
143 */
144
145static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
David S. Miller0dbaee32010-12-13 12:52:14 -0800146static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
Steffen Klassertebb762f2011-11-23 02:12:51 +0000147static unsigned int ipv4_mtu(const struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700148static void ipv4_dst_destroy(struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700149static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150static void ipv4_link_failure(struct sk_buff *skb);
151static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
David S. Millere47a1852012-07-11 20:55:47 -0700152static void ip_do_redirect(struct dst_entry *dst, struct sk_buff *skb);
Daniel Lezcano569d3642008-01-18 03:56:57 -0800153static int rt_garbage_collect(struct dst_ops *ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700154
/*
 * Installed as the .ifdown hook of ipv4_dst_ops. The body is empty, so
 * nothing is done with @dst, @dev or @how.
 */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700159
/*
 * Installed as the .cow_metrics hook of ipv4_dst_ops. It always triggers
 * WARN_ON(1) and returns NULL; @dst and @old are not used.
 * NOTE(review): presumably this hook is never expected to be reached for
 * IPv4 routes - confirm against the callers in the generic dst code.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}
165
David S. Millerf894cbf2012-07-02 21:52:24 -0700166static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
167 struct sk_buff *skb,
168 const void *daddr);
David S. Millerd3aaeb32011-07-18 00:40:17 -0700169
/*
 * Destination-cache operations for AF_INET: binds each dst_ops hook to the
 * corresponding IPv4 handler declared or defined earlier in this file.
 */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
187
188#define ECN_OR_COST(class) TC_PRIO_##class
189
/*
 * Exported 16-entry table of TC_PRIO_* values. ECN_OR_COST(class) expands
 * to TC_PRIO_<class>, so every odd slot holds the same priority as the even
 * slot just before it.
 * NOTE(review): presumably indexed by a value derived from the IP TOS
 * field - confirm against the users of this table.
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700209
210/*
211 * Route cache.
212 */
213
/* The locking scheme is rather straightforward:
215 *
216 * 1) Read-Copy Update protects the buckets of the central route hash.
217 * 2) Only writers remove entries, and they hold the lock
218 * as they look at rtable reference counts.
219 * 3) Only readers acquire references to rtable entries,
220 * they do so with atomic increments and with the
221 * lock held.
222 */
223
/* One slot of the route hash table: the RCU-annotated head of its chain. */
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};
Neil Horman1080d702008-10-27 12:28:25 -0700227
Ingo Molnar8a25d5d2006-07-03 00:24:54 -0700228#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
229 defined(CONFIG_PROVE_LOCKING)
Eric Dumazet22c047c2005-07-05 14:55:24 -0700230/*
231 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
232 * The size of this table is a power of two and depends on the number of CPUS.
Ingo Molnar62051202006-07-03 00:24:59 -0700233 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
Eric Dumazet22c047c2005-07-05 14:55:24 -0700234 */
Ingo Molnar62051202006-07-03 00:24:59 -0700235#ifdef CONFIG_LOCKDEP
236# define RT_HASH_LOCK_SZ 256
Eric Dumazet22c047c2005-07-05 14:55:24 -0700237#else
Ingo Molnar62051202006-07-03 00:24:59 -0700238# if NR_CPUS >= 32
239# define RT_HASH_LOCK_SZ 4096
240# elif NR_CPUS >= 16
241# define RT_HASH_LOCK_SZ 2048
242# elif NR_CPUS >= 8
243# define RT_HASH_LOCK_SZ 1024
244# elif NR_CPUS >= 4
245# define RT_HASH_LOCK_SZ 512
246# else
247# define RT_HASH_LOCK_SZ 256
248# endif
Eric Dumazet22c047c2005-07-05 14:55:24 -0700249#endif
250
251static spinlock_t *rt_hash_locks;
252# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
Pavel Emelyanov1ff1cc22007-12-05 21:15:05 -0800253
254static __init void rt_hash_lock_init(void)
255{
256 int i;
257
258 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
259 GFP_KERNEL);
260 if (!rt_hash_locks)
261 panic("IP: failed to allocate rt_hash_locks\n");
262
263 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
264 spin_lock_init(&rt_hash_locks[i]);
265}
Eric Dumazet22c047c2005-07-05 14:55:24 -0700266#else
267# define rt_hash_lock_addr(slot) NULL
Pavel Emelyanov1ff1cc22007-12-05 21:15:05 -0800268
/* Variant used when rt_hash_lock_addr() is defined as NULL: nothing to set up. */
static inline void rt_hash_lock_init(void)
{
}
Eric Dumazet22c047c2005-07-05 14:55:24 -0700272#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700273
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700274static struct rt_hash_bucket *rt_hash_table __read_mostly;
Eric Dumazet95c96172012-04-15 05:58:06 +0000275static unsigned int rt_hash_mask __read_mostly;
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700276static unsigned int rt_hash_log __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700277
Eric Dumazet2f970d82006-01-17 02:54:36 -0800278static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
Eric Dumazet27f39c73e2010-05-19 22:07:23 +0000279#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700280
Denis V. Lunevb00180d2008-07-05 19:04:09 -0700281static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700282 int genid)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700283{
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700284 return jhash_3words((__force u32)daddr, (__force u32)saddr,
Denis V. Lunevb00180d2008-07-05 19:04:09 -0700285 idx, genid)
Eric Dumazet29e75252008-01-31 17:05:09 -0800286 & rt_hash_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700287}
288
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700289static inline int rt_genid(struct net *net)
290{
291 return atomic_read(&net->ipv4.rt_genid);
292}
293
Linus Torvalds1da177e2005-04-16 15:20:36 -0700294#ifdef CONFIG_PROC_FS
/* Per-open iteration state for the rt_cache seq_file. */
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;	/* hash bucket currently being walked (counts down) */
	int genid;	/* generation id sampled in rt_cache_seq_start() */
};
300
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900301static struct rtable *rt_cache_get_first(struct seq_file *seq)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700302{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900303 struct rt_cache_iter_state *st = seq->private;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700304 struct rtable *r = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700305
306 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
Eric Dumazet33d480c2011-08-11 19:30:52 +0000307 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
Eric Dumazeta6272662008-08-28 01:11:25 -0700308 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700309 rcu_read_lock_bh();
Paul E. McKenneya898def2010-02-22 17:04:49 -0800310 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
Eric Dumazet29e75252008-01-31 17:05:09 -0800311 while (r) {
Changli Gaod8d1f302010-06-10 23:31:35 -0700312 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
Denis V. Luneva75e9362008-02-28 20:50:55 -0800313 r->rt_genid == st->genid)
Eric Dumazet29e75252008-01-31 17:05:09 -0800314 return r;
Changli Gaod8d1f302010-06-10 23:31:35 -0700315 r = rcu_dereference_bh(r->dst.rt_next);
Eric Dumazet29e75252008-01-31 17:05:09 -0800316 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700317 rcu_read_unlock_bh();
318 }
Eric Dumazet29e75252008-01-31 17:05:09 -0800319 return r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700320}
321
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900322static struct rtable *__rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800323 struct rtable *r)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700324{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900325 struct rt_cache_iter_state *st = seq->private;
Eric Dumazeta6272662008-08-28 01:11:25 -0700326
Eric Dumazet1c317202010-10-25 21:02:07 +0000327 r = rcu_dereference_bh(r->dst.rt_next);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700328 while (!r) {
329 rcu_read_unlock_bh();
Eric Dumazeta6272662008-08-28 01:11:25 -0700330 do {
331 if (--st->bucket < 0)
332 return NULL;
Eric Dumazet33d480c2011-08-11 19:30:52 +0000333 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700334 rcu_read_lock_bh();
Eric Dumazet1c317202010-10-25 21:02:07 +0000335 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700336 }
Eric Dumazet1c317202010-10-25 21:02:07 +0000337 return r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700338}
339
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900340static struct rtable *rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800341 struct rtable *r)
342{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900343 struct rt_cache_iter_state *st = seq->private;
344 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
Changli Gaod8d1f302010-06-10 23:31:35 -0700345 if (dev_net(r->dst.dev) != seq_file_net(seq))
Denis V. Luneva75e9362008-02-28 20:50:55 -0800346 continue;
Denis V. Lunev642d6312008-02-28 20:50:33 -0800347 if (r->rt_genid == st->genid)
348 break;
349 }
350 return r;
351}
352
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900353static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700354{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900355 struct rtable *r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700356
357 if (r)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900358 while (pos && (r = rt_cache_get_next(seq, r)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700359 --pos;
360 return pos ? NULL : r;
361}
362
363static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
364{
Eric Dumazet29e75252008-01-31 17:05:09 -0800365 struct rt_cache_iter_state *st = seq->private;
Eric Dumazet29e75252008-01-31 17:05:09 -0800366 if (*pos)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900367 return rt_cache_get_idx(seq, *pos - 1);
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700368 st->genid = rt_genid(seq_file_net(seq));
Eric Dumazet29e75252008-01-31 17:05:09 -0800369 return SEQ_START_TOKEN;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700370}
371
372static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
373{
Eric Dumazet29e75252008-01-31 17:05:09 -0800374 struct rtable *r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700375
376 if (v == SEQ_START_TOKEN)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900377 r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700378 else
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900379 r = rt_cache_get_next(seq, v);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700380 ++*pos;
381 return r;
382}
383
384static void rt_cache_seq_stop(struct seq_file *seq, void *v)
385{
386 if (v && v != SEQ_START_TOKEN)
387 rcu_read_unlock_bh();
388}
389
390static int rt_cache_seq_show(struct seq_file *seq, void *v)
391{
392 if (v == SEQ_START_TOKEN)
393 seq_printf(seq, "%-127s\n",
394 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
395 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
396 "HHUptod\tSpecDst");
397 else {
398 struct rtable *r = v;
David S. Miller3c521f22012-07-02 02:04:13 -0700399 int len;
Eric Dumazet218fa902011-11-29 20:05:55 +0000400
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700401 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
David S. Miller794785b2012-07-10 00:52:56 -0700402 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
403 r->dst.dev ? r->dst.dev->name : "*",
404 (__force u32)r->rt_dst,
405 (__force u32)r->rt_gateway,
406 r->rt_flags, atomic_read(&r->dst.__refcnt),
407 r->dst.__use, 0, (__force u32)r->rt_src,
408 dst_metric_advmss(&r->dst) + 40,
409 dst_metric(&r->dst, RTAX_WINDOW), 0,
410 r->rt_key_tos,
411 -1, 0, 0, &len);
Pavel Emelyanov5e659e42008-04-24 01:02:16 -0700412
413 seq_printf(seq, "%*s\n", 127 - len, "");
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900414 }
415 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700416}
417
/* seq_file iterator callbacks for the route cache listing. */
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};
424
/*
 * Open handler for the route cache listing: sets up a namespace-aware
 * seq_file whose private area is a struct rt_cache_iter_state.
 */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}
430
/* File operations for the "rt_cache" entry created under net->proc_net. */
static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
438
439
440static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
441{
442 int cpu;
443
444 if (*pos == 0)
445 return SEQ_START_TOKEN;
446
Rusty Russell0f23174a2008-12-29 12:23:42 +0000447 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700448 if (!cpu_possible(cpu))
449 continue;
450 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800451 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700452 }
453 return NULL;
454}
455
456static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
457{
458 int cpu;
459
Rusty Russell0f23174a2008-12-29 12:23:42 +0000460 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700461 if (!cpu_possible(cpu))
462 continue;
463 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800464 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700465 }
466 return NULL;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900467
Linus Torvalds1da177e2005-04-16 15:20:36 -0700468}
469
/* seq_file .stop for the per-CPU statistics: the body is empty. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
474
475static int rt_cpu_seq_show(struct seq_file *seq, void *v)
476{
477 struct rt_cache_stat *st = v;
478
479 if (v == SEQ_START_TOKEN) {
Olaf Rempel5bec0032005-04-28 12:16:08 -0700480 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700481 return 0;
482 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900483
Linus Torvalds1da177e2005-04-16 15:20:36 -0700484 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
485 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
Eric Dumazetfc66f952010-10-08 06:37:34 +0000486 dst_entries_get_slow(&ipv4_dst_ops),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700487 st->in_hit,
488 st->in_slow_tot,
489 st->in_slow_mc,
490 st->in_no_route,
491 st->in_brd,
492 st->in_martian_dst,
493 st->in_martian_src,
494
495 st->out_hit,
496 st->out_slow_tot,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900497 st->out_slow_mc,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700498
499 st->gc_total,
500 st->gc_ignored,
501 st->gc_goal_miss,
502 st->gc_dst_overflow,
503 st->in_hlist_search,
504 st->out_hlist_search
505 );
506 return 0;
507}
508
/* seq_file iterator callbacks for the per-CPU route cache statistics. */
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};
515
516
/* Open handler for the per-CPU statistics file; @inode is not used. */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}
521
/* File operations for the "rt_cache" entry created under net->proc_net_stat. */
static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
529
Patrick McHardyc7066f72011-01-14 13:36:42 +0100530#ifdef CONFIG_IP_ROUTE_CLASSID
Alexey Dobriyana661c412009-11-25 15:40:35 -0800531static int rt_acct_proc_show(struct seq_file *m, void *v)
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800532{
Alexey Dobriyana661c412009-11-25 15:40:35 -0800533 struct ip_rt_acct *dst, *src;
534 unsigned int i, j;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800535
Alexey Dobriyana661c412009-11-25 15:40:35 -0800536 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
537 if (!dst)
538 return -ENOMEM;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800539
Alexey Dobriyana661c412009-11-25 15:40:35 -0800540 for_each_possible_cpu(i) {
541 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
542 for (j = 0; j < 256; j++) {
543 dst[j].o_bytes += src[j].o_bytes;
544 dst[j].o_packets += src[j].o_packets;
545 dst[j].i_bytes += src[j].i_bytes;
546 dst[j].i_packets += src[j].i_packets;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800547 }
548 }
Alexey Dobriyana661c412009-11-25 15:40:35 -0800549
550 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
551 kfree(dst);
552 return 0;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800553}
Alexey Dobriyana661c412009-11-25 15:40:35 -0800554
/* Open handler for "rt_acct": a single-record seq_file with no private data. */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}
559
/* File operations for the "rt_acct" entry created under net->proc_net. */
static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800567#endif
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800568
Denis V. Lunev73b38712008-02-28 20:51:18 -0800569static int __net_init ip_rt_do_proc_init(struct net *net)
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800570{
571 struct proc_dir_entry *pde;
572
573 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
574 &rt_cache_seq_fops);
575 if (!pde)
576 goto err1;
577
Wang Chen77020722008-02-28 14:14:25 -0800578 pde = proc_create("rt_cache", S_IRUGO,
579 net->proc_net_stat, &rt_cpu_seq_fops);
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800580 if (!pde)
581 goto err2;
582
Patrick McHardyc7066f72011-01-14 13:36:42 +0100583#ifdef CONFIG_IP_ROUTE_CLASSID
Alexey Dobriyana661c412009-11-25 15:40:35 -0800584 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800585 if (!pde)
586 goto err3;
587#endif
588 return 0;
589
Patrick McHardyc7066f72011-01-14 13:36:42 +0100590#ifdef CONFIG_IP_ROUTE_CLASSID
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800591err3:
592 remove_proc_entry("rt_cache", net->proc_net_stat);
593#endif
594err2:
595 remove_proc_entry("rt_cache", net->proc_net);
596err1:
597 return -ENOMEM;
598}
Denis V. Lunev73b38712008-02-28 20:51:18 -0800599
/* Per-namespace exit: remove the proc entries created by ip_rt_do_proc_init(). */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
608
/* Per-namespace hooks that create and remove the route proc entries. */
static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};
613
/* Register ip_rt_proc_ops as a pernet subsystem; returns that call's result. */
static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
618
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800619#else
/* Variant for builds without CONFIG_PROC_FS: nothing to register. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700624#endif /* CONFIG_PROC_FS */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900625
/* Queue @rt for freeing via dst_rcu_free() through call_rcu_bh(). */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
630
/*
 * Drop one reference on @rt with ip_rt_put(), then queue the entry for
 * RCU-deferred freeing exactly as rt_free() does.
 */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	rt_free(rt);
}
636
Stephen Hemminger5969f712008-04-10 01:52:09 -0700637static inline int rt_fast_clean(struct rtable *rth)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700638{
639 /* Kill broadcast/multicast entries very aggresively, if they
640 collide in hash table with more useful entries */
641 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
David S. Millerc7537962010-11-11 17:07:48 -0800642 rt_is_input_route(rth) && rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700643}
644
Stephen Hemminger5969f712008-04-10 01:52:09 -0700645static inline int rt_valuable(struct rtable *rth)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700646{
647 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
David S. Miller59436342012-07-10 06:58:42 -0700648 rth->dst.expires;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700649}
650
651static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
652{
653 unsigned long age;
654 int ret = 0;
655
Changli Gaod8d1f302010-06-10 23:31:35 -0700656 if (atomic_read(&rth->dst.__refcnt))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700657 goto out;
658
Changli Gaod8d1f302010-06-10 23:31:35 -0700659 age = jiffies - rth->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700660 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
661 (age <= tmo2 && rt_valuable(rth)))
662 goto out;
663 ret = 1;
664out: return ret;
665}
666
667/* Bits of score are:
668 * 31: very valuable
669 * 30: not quite useless
670 * 29..0: usage counter
671 */
672static inline u32 rt_score(struct rtable *rt)
673{
Changli Gaod8d1f302010-06-10 23:31:35 -0700674 u32 score = jiffies - rt->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700675
676 score = ~score & ~(3<<30);
677
678 if (rt_valuable(rt))
679 score |= (1<<31);
680
David S. Millerc7537962010-11-11 17:07:48 -0800681 if (rt_is_output_route(rt) ||
Linus Torvalds1da177e2005-04-16 15:20:36 -0700682 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
683 score |= (1<<30);
684
685 return score;
686}
687
Neil Horman1080d702008-10-27 12:28:25 -0700688static inline bool rt_caching(const struct net *net)
689{
690 return net->ipv4.current_rt_cache_rebuild_count <=
691 net->ipv4.sysctl_rt_cache_rebuild_count;
692}
693
David S. Miller5e2b61f2011-03-04 21:47:09 -0800694static inline bool compare_hash_inputs(const struct rtable *rt1,
695 const struct rtable *rt2)
Neil Horman1080d702008-10-27 12:28:25 -0700696{
David S. Miller5e2b61f2011-03-04 21:47:09 -0800697 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
698 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
Julian Anastasov97a80412011-08-09 04:01:16 +0000699 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
Neil Horman1080d702008-10-27 12:28:25 -0700700}
701
David S. Miller5e2b61f2011-03-04 21:47:09 -0800702static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700703{
David S. Miller5e2b61f2011-03-04 21:47:09 -0800704 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
705 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
706 (rt1->rt_mark ^ rt2->rt_mark) |
David S. Miller475949d2011-05-03 19:45:15 -0700707 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
Julian Anastasovd547f722011-08-07 22:20:20 -0700708 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
Julian Anastasov97a80412011-08-09 04:01:16 +0000709 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700710}
711
Denis V. Lunevb5921912008-01-22 23:50:25 -0800712static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
713{
Changli Gaod8d1f302010-06-10 23:31:35 -0700714 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
Denis V. Lunevb5921912008-01-22 23:50:25 -0800715}
716
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700717static inline int rt_is_expired(struct rtable *rth)
718{
Changli Gaod8d1f302010-06-10 23:31:35 -0700719 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700720}
721
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800722/*
723 * Perform a full scan of hash table and free all entries.
724 * Can be called by a softirq or a process.
725 * In the later case, we want to be reschedule if necessary
726 */
David S. Miller6561a3b2010-12-19 21:11:20 -0800727static void rt_do_flush(struct net *net, int process_context)
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800728{
729 unsigned int i;
730 struct rtable *rth, *next;
731
732 for (i = 0; i <= rt_hash_mask; i++) {
David S. Miller6561a3b2010-12-19 21:11:20 -0800733 struct rtable __rcu **pprev;
734 struct rtable *list;
735
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800736 if (process_context && need_resched())
737 cond_resched();
Eric Dumazet33d480c2011-08-11 19:30:52 +0000738 rth = rcu_access_pointer(rt_hash_table[i].chain);
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800739 if (!rth)
740 continue;
741
742 spin_lock_bh(rt_hash_lock_addr(i));
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700743
David S. Miller6561a3b2010-12-19 21:11:20 -0800744 list = NULL;
745 pprev = &rt_hash_table[i].chain;
746 rth = rcu_dereference_protected(*pprev,
Eric Dumazet1c317202010-10-25 21:02:07 +0000747 lockdep_is_held(rt_hash_lock_addr(i)));
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700748
David S. Miller6561a3b2010-12-19 21:11:20 -0800749 while (rth) {
750 next = rcu_dereference_protected(rth->dst.rt_next,
751 lockdep_is_held(rt_hash_lock_addr(i)));
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700752
David S. Miller6561a3b2010-12-19 21:11:20 -0800753 if (!net ||
754 net_eq(dev_net(rth->dst.dev), net)) {
755 rcu_assign_pointer(*pprev, next);
756 rcu_assign_pointer(rth->dst.rt_next, list);
757 list = rth;
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700758 } else {
David S. Miller6561a3b2010-12-19 21:11:20 -0800759 pprev = &rth->dst.rt_next;
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700760 }
David S. Miller6561a3b2010-12-19 21:11:20 -0800761 rth = next;
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700762 }
David S. Miller6561a3b2010-12-19 21:11:20 -0800763
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800764 spin_unlock_bh(rt_hash_lock_addr(i));
765
David S. Miller6561a3b2010-12-19 21:11:20 -0800766 for (; list; list = next) {
767 next = rcu_dereference_protected(list->dst.rt_next, 1);
768 rt_free(list);
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800769 }
770 }
771}
772
Neil Horman1080d702008-10-27 12:28:25 -0700773/*
774 * While freeing expired entries, we compute average chain length
775 * and standard deviation, using fixed-point arithmetic.
776 * This gives us an estimate of rt_chain_length_max:
777 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
778 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
779 */
780
/* Number of fractional bits in the fixed-point chain-length values. */
781#define FRACT_BITS 3
/* The value 1.0 in that fixed-point format (1 << FRACT_BITS). */
782#define ONE (1UL << FRACT_BITS)
783
Eric Dumazet98376382010-03-08 03:20:00 +0000784/*
785 * Given a hash chain and an item in this hash chain,
786 * find if a previous entry has the same hash_inputs
787 * (but differs on tos, mark or oif)
788 * Returns 0 if an alias is found.
789 * Returns ONE if rth has no alias before itself.
790 */
791static int has_noalias(const struct rtable *head, const struct rtable *rth)
792{
793 const struct rtable *aux = head;
794
795 while (aux != rth) {
David S. Miller5e2b61f2011-03-04 21:47:09 -0800796 if (compare_hash_inputs(aux, rth))
Eric Dumazet98376382010-03-08 03:20:00 +0000797 return 0;
Eric Dumazet1c317202010-10-25 21:02:07 +0000798 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +0000799 }
800 return ONE;
801}
802
Eric Dumazet9f28a2f2011-12-21 15:47:16 -0500803static void rt_check_expire(void)
804{
805 static unsigned int rover;
806 unsigned int i = rover, goal;
807 struct rtable *rth;
808 struct rtable __rcu **rthp;
809 unsigned long samples = 0;
810 unsigned long sum = 0, sum2 = 0;
811 unsigned long delta;
812 u64 mult;
813
814 delta = jiffies - expires_ljiffies;
815 expires_ljiffies = jiffies;
816 mult = ((u64)delta) << rt_hash_log;
817 if (ip_rt_gc_timeout > 1)
818 do_div(mult, ip_rt_gc_timeout);
819 goal = (unsigned int)mult;
820 if (goal > rt_hash_mask)
821 goal = rt_hash_mask + 1;
822 for (; goal > 0; goal--) {
823 unsigned long tmo = ip_rt_gc_timeout;
824 unsigned long length;
825
826 i = (i + 1) & rt_hash_mask;
827 rthp = &rt_hash_table[i].chain;
828
829 if (need_resched())
830 cond_resched();
831
832 samples++;
833
834 if (rcu_dereference_raw(*rthp) == NULL)
835 continue;
836 length = 0;
837 spin_lock_bh(rt_hash_lock_addr(i));
838 while ((rth = rcu_dereference_protected(*rthp,
839 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
840 prefetch(rth->dst.rt_next);
David S. Millerdf67e6c2012-06-26 00:10:09 -0700841 if (rt_is_expired(rth) ||
842 rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
Eric Dumazet9f28a2f2011-12-21 15:47:16 -0500843 *rthp = rth->dst.rt_next;
844 rt_free(rth);
845 continue;
846 }
Eric Dumazet9f28a2f2011-12-21 15:47:16 -0500847
David S. Millerdf67e6c2012-06-26 00:10:09 -0700848 /* We only count entries on a chain with equal
849 * hash inputs once so that entries for
850 * different QOS levels, and other non-hash
851 * input attributes don't unfairly skew the
852 * length computation
853 */
854 tmo >>= 1;
855 rthp = &rth->dst.rt_next;
856 length += has_noalias(rt_hash_table[i].chain, rth);
Eric Dumazet9f28a2f2011-12-21 15:47:16 -0500857 }
858 spin_unlock_bh(rt_hash_lock_addr(i));
859 sum += length;
860 sum2 += length*length;
861 }
862 if (samples) {
863 unsigned long avg = sum / samples;
864 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
865 rt_chain_length_max = max_t(unsigned long,
866 ip_rt_gc_elasticity,
867 (avg + 4*sd) >> FRACT_BITS);
868 }
869 rover = i;
870}
871
872/*
873 * rt_worker_func() is run in process context.
874 * we call rt_check_expire() to scan part of the hash table
875 */
876static void rt_worker_func(struct work_struct *work)
877{
878 rt_check_expire();
879 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
880}
881
Eric Dumazet29e75252008-01-31 17:05:09 -0800882/*
Lucas De Marchi25985ed2011-03-30 22:57:33 -0300883 * Perturbation of rt_genid by a small quantity [1..256]
Eric Dumazet29e75252008-01-31 17:05:09 -0800884 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
885 * many times (2^24) without giving recent rt_genid.
886 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700887 */
Denis V. Lunev86c657f2008-07-05 19:03:31 -0700888static void rt_cache_invalidate(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700889{
Eric Dumazet29e75252008-01-31 17:05:09 -0800890 unsigned char shuffle;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700891
Eric Dumazet29e75252008-01-31 17:05:09 -0800892 get_random_bytes(&shuffle, sizeof(shuffle));
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700893 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700894}
895
/*
 * delay < 0  : only invalidate the cache (fast: entries are deleted later)
 * delay >= 0 : invalidate and flush the cache right away (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);

	if (delay < 0)
		return;

	rt_do_flush(net, !in_softirq());
}
906
/*
 * Flush previously invalidated entries from the cache.  The scan may
 * reschedule only when we are not running in softirq context.
 */
void rt_cache_flush_batch(struct net *net)
{
	int process_context = !in_softirq();

	rt_do_flush(net, process_context);
}
912
/*
 * Called when a hash chain has grown too long: emit a rate-limited warning
 * and invalidate the namespace's cache by bumping its generation id.
 */
static void rt_emergency_hash_rebuild(struct net *net)
{
	net_warn_ratelimited("Route hash chain too long!\n");

	rt_cache_invalidate(net);
}
918
Linus Torvalds1da177e2005-04-16 15:20:36 -0700919/*
920 Short description of GC goals.
921
922 We want to build algorithm, which will keep routing cache
923 at some equilibrium point, when number of aged off entries
924 is kept approximately equal to newly generated ones.
925
926 Current expiration strength is variable "expire".
927 We try to adjust it dynamically, so that if networking
928 is idle expires is large enough to keep enough of warm entries,
929 and when load increases it reduces to limit cache size.
930 */
931
/*
 * Route cache garbage collector.
 *
 * Bumps the gc_total statistic, then returns early (gc_ignored) when the
 * previous run was less than ip_rt_gc_min_interval ago and the fast entry
 * count is below ip_rt_max_size.  Otherwise it derives "goal", the number
 * of entries to free, walks the hash buckets starting after the static
 * "rover" position and frees entries that have a stale generation id or
 * that rt_may_expire() accepts for the current "expire" age.  When a full
 * pass misses the goal, "expire" is halved and the pass is repeated while
 * we are not in softirq and jiffies has not moved past "now".
 *
 * Returns 1 (after a rate-limited "dst cache overflow" warning) when the
 * entry count is still at or above ip_rt_max_size, 0 otherwise.
 *
 * NOTE(review): @ops is not referenced in the body; the global
 * ipv4_dst_ops is used throughout.
 */
Daniel Lezcano569d3642008-01-18 03:56:57 -0800932static int rt_garbage_collect(struct dst_ops *ops)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700933{
/* State kept across calls: age limit, time of last run, scan position, and the entry count "goal" is measured against. */
934 static unsigned long expire = RT_GC_TIMEOUT;
935 static unsigned long last_gc;
936 static int rover;
937 static int equilibrium;
Eric Dumazet1c317202010-10-25 21:02:07 +0000938 struct rtable *rth;
939 struct rtable __rcu **rthp;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700940 unsigned long now = jiffies;
941 int goal;
Eric Dumazetfc66f952010-10-08 06:37:34 +0000942 int entries = dst_entries_get_fast(&ipv4_dst_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700943
944 /*
945 * Garbage collection is pretty expensive,
946 * do not make it too frequently.
947 */
948
949 RT_CACHE_STAT_INC(gc_total);
950
951 if (now - last_gc < ip_rt_gc_min_interval &&
Eric Dumazetfc66f952010-10-08 06:37:34 +0000952 entries < ip_rt_max_size) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700953 RT_CACHE_STAT_INC(gc_ignored);
954 goto out;
955 }
956
/* From here on use the slow counter instead of the fast one read above. */
Eric Dumazetfc66f952010-10-08 06:37:34 +0000957 entries = dst_entries_get_slow(&ipv4_dst_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700958 /* Calculate number of entries, which we want to expire now. */
Eric Dumazetfc66f952010-10-08 06:37:34 +0000959 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700960 if (goal <= 0) {
/* Not above elasticity * table size: measure against equilibrium, which is never below gc_thresh. */
961 if (equilibrium < ipv4_dst_ops.gc_thresh)
962 equilibrium = ipv4_dst_ops.gc_thresh;
Eric Dumazetfc66f952010-10-08 06:37:34 +0000963 goal = entries - equilibrium;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700964 if (goal > 0) {
Eric Dumazetb790ced2007-12-21 01:49:07 -0800965 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
Eric Dumazetfc66f952010-10-08 06:37:34 +0000966 goal = entries - equilibrium;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700967 }
968 } else {
969 /* We are in dangerous area. Try to reduce cache really
970 * aggressively.
971 */
Eric Dumazetb790ced2007-12-21 01:49:07 -0800972 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
Eric Dumazetfc66f952010-10-08 06:37:34 +0000973 equilibrium = entries - goal;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700974 }
975
976 if (now - last_gc >= ip_rt_gc_min_interval)
977 last_gc = now;
978
979 if (goal <= 0) {
980 equilibrium += goal;
981 goto work_done;
982 }
983
984 do {
985 int i, k;
986
/* One pass visits at most rt_hash_mask + 1 buckets, starting just after rover. */
987 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
988 unsigned long tmo = expire;
989
990 k = (k + 1) & rt_hash_mask;
991 rthp = &rt_hash_table[k].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700992 spin_lock_bh(rt_hash_lock_addr(k));
Eric Dumazet1c317202010-10-25 21:02:07 +0000993 while ((rth = rcu_dereference_protected(*rthp,
994 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700995 if (!rt_is_expired(rth) &&
Eric Dumazet29e75252008-01-31 17:05:09 -0800996 !rt_may_expire(rth, tmo, expire)) {
/* Entry is kept: halve the timeout applied to the entries behind it in this chain. */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700997 tmo >>= 1;
Changli Gaod8d1f302010-06-10 23:31:35 -0700998 rthp = &rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700999 continue;
1000 }
/* Entry is reclaimed: unlink it, free it and count it towards the goal. */
Changli Gaod8d1f302010-06-10 23:31:35 -07001001 *rthp = rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001002 rt_free(rth);
1003 goal--;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001004 }
Eric Dumazet22c047c2005-07-05 14:55:24 -07001005 spin_unlock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001006 if (goal <= 0)
1007 break;
1008 }
/* Remember where this pass stopped so the next one resumes from there. */
1009 rover = k;
1010
1011 if (goal <= 0)
1012 goto work_done;
1013
1014 /* Goal is not achieved. We stop process if:
1015
1016 - if expire reduced to zero. Otherwise, expire is halfed.
1017 - if table is not full.
1018 - if we are called from interrupt.
1019 - jiffies check is just fallback/debug loop breaker.
1020 We will not spin here for long time in any case.
1021 */
1022
1023 RT_CACHE_STAT_INC(gc_goal_miss);
1024
1025 if (expire == 0)
1026 break;
1027
1028 expire >>= 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001029
Eric Dumazetfc66f952010-10-08 06:37:34 +00001030 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001031 goto out;
1032 } while (!in_softirq() && time_before_eq(jiffies, now));
1033
/* Goal missed: report overflow only if both the fast and the slow count are at or above ip_rt_max_size. */
Eric Dumazetfc66f952010-10-08 06:37:34 +00001034 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1035 goto out;
1036 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001037 goto out;
Joe Perchese87cc472012-05-13 21:56:26 +00001038 net_warn_ratelimited("dst cache overflow\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07001039 RT_CACHE_STAT_INC(gc_dst_overflow);
1040 return 1;
1041
/* Goal met: raise expire by ip_rt_gc_min_interval; reset it to ip_rt_gc_timeout when it exceeds that or the cache is below gc_thresh. */
1042work_done:
1043 expire += ip_rt_gc_min_interval;
1044 if (expire > ip_rt_gc_timeout ||
Eric Dumazetfc66f952010-10-08 06:37:34 +00001045 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1046 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001047 expire = ip_rt_gc_timeout;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001048out: return 0;
1049}
1050
Eric Dumazet98376382010-03-08 03:20:00 +00001051/*
1052 * Returns number of entries in a hash chain that have different hash_inputs
1053 */
1054static int slow_chain_length(const struct rtable *head)
1055{
1056 int length = 0;
1057 const struct rtable *rth = head;
1058
1059 while (rth) {
1060 length += has_noalias(head, rth);
Eric Dumazet1c317202010-10-25 21:02:07 +00001061 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +00001062 }
1063 return length >> FRACT_BITS;
1064}
1065
David S. Millerf894cbf2012-07-02 21:52:24 -07001066static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
1067 struct sk_buff *skb,
1068 const void *daddr)
David Miller3769cff2011-07-11 22:44:24 +00001069{
David S. Millerd3aaeb32011-07-18 00:40:17 -07001070 struct net_device *dev = dst->dev;
1071 const __be32 *pkey = daddr;
David S. Miller39232972012-01-26 15:22:32 -05001072 const struct rtable *rt;
David Miller3769cff2011-07-11 22:44:24 +00001073 struct neighbour *n;
1074
David S. Miller39232972012-01-26 15:22:32 -05001075 rt = (const struct rtable *) dst;
David S. Millera263b302012-07-02 02:02:15 -07001076 if (rt->rt_gateway)
David S. Miller39232972012-01-26 15:22:32 -05001077 pkey = (const __be32 *) &rt->rt_gateway;
David S. Millerf894cbf2012-07-02 21:52:24 -07001078 else if (skb)
1079 pkey = &ip_hdr(skb)->daddr;
David S. Millerd3aaeb32011-07-18 00:40:17 -07001080
David S. Miller80703d22012-02-15 17:48:35 -05001081 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
David S. Millerd3aaeb32011-07-18 00:40:17 -07001082 if (n)
1083 return n;
David Miller32092ec2011-07-25 00:01:41 +00001084 return neigh_create(&arp_tbl, pkey, dev);
David S. Millerd3aaeb32011-07-18 00:40:17 -07001085}
1086
/*
 * Insert @rt into hash bucket @hash of the route cache, or reuse an
 * equivalent entry that is already there.
 *
 * - When caching is disabled for the route's namespace, or @rt already has
 *   DST_NOCACHE, the route is not hashed: DST_NOCACHE is set and @rt is
 *   returned (and attached to @skb when @skb is non-NULL).
 * - Otherwise the chain is walked under the bucket lock.  Entries with a
 *   stale generation id are freed on the way.  If an entry with the same
 *   keys and namespace exists, it is moved to the head of the chain,
 *   marked used, @rt is dropped and the existing entry is returned.
 * - While walking, the unreferenced entry with the lowest rt_score() is
 *   remembered; it is evicted when the chain is longer than
 *   ip_rt_gc_elasticity.
 * - With no eviction candidate and a chain longer than rt_chain_length_max,
 *   the namespace's rebuild counter is bumped, the cache is invalidated,
 *   the hash is recomputed with the new generation id (using @ifindex) and
 *   the whole procedure restarts.
 *
 * Returns the route the caller should use: @rt or the pre-existing entry.
 */
Eric Dumazet95c96172012-04-15 05:58:06 +00001087static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
David S. Millerb23dd4f2011-03-02 14:31:35 -08001088 struct sk_buff *skb, int ifindex)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001089{
Eric Dumazet1c317202010-10-25 21:02:07 +00001090 struct rtable *rth, *cand;
1091 struct rtable __rcu **rthp, **candp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001092 unsigned long now;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001093 u32 min_score;
1094 int chain_length;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001095
/* Per-attempt state is reset here; the emergency rebuild path below jumps back with a recomputed hash. */
1096restart:
1097 chain_length = 0;
1098 min_score = ~(u32)0;
1099 cand = NULL;
1100 candp = NULL;
1101 now = jiffies;
1102
Eric Dumazet7586ece2012-06-20 05:02:19 +00001103 if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
Neil Horman73e42892009-06-20 01:15:16 -07001104 /*
1105 * If we're not caching, just tell the caller we
1106 * were successful and don't touch the route. The
1107 * caller hold the sole reference to the cache entry, and
1108 * it will be released when the caller is done with it.
1109 * If we drop it here, the callers have no way to resolve routes
1110 * when we're not caching. Instead, just point *rp at rt, so
1111 * the caller gets a single use out of the route
Neil Hormanb6280b42009-06-22 10:18:53 +00001112 * Note that we do rt_free on this new route entry, so that
1113 * once its refcount hits zero, we are still able to reap it
1114 * (Thanks Alexey)
Eric Dumazet27b75c92010-10-15 05:44:11 +00001115 * Note: To avoid expensive rcu stuff for this uncached dst,
1116 * we set DST_NOCACHE so that dst_release() can free dst without
1117 * waiting a grace period.
Neil Horman73e42892009-06-20 01:15:16 -07001118 */
Neil Hormanb6280b42009-06-22 10:18:53 +00001119
Eric Dumazetc7d44262010-10-03 22:17:54 -07001120 rt->dst.flags |= DST_NOCACHE;
Neil Hormanb6280b42009-06-22 10:18:53 +00001121 goto skip_hashing;
Neil Horman1080d702008-10-27 12:28:25 -07001122 }
1123
Linus Torvalds1da177e2005-04-16 15:20:36 -07001124 rthp = &rt_hash_table[hash].chain;
1125
Eric Dumazet22c047c2005-07-05 14:55:24 -07001126 spin_lock_bh(rt_hash_lock_addr(hash));
Eric Dumazet1c317202010-10-25 21:02:07 +00001127 while ((rth = rcu_dereference_protected(*rthp,
1128 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
/* Entries from an older cache generation are dropped as we pass them. */
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001129 if (rt_is_expired(rth)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001130 *rthp = rth->dst.rt_next;
Eric Dumazet29e75252008-01-31 17:05:09 -08001131 rt_free(rth);
1132 continue;
1133 }
David S. Miller5e2b61f2011-03-04 21:47:09 -08001134 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001135 /* Put it first */
Changli Gaod8d1f302010-06-10 23:31:35 -07001136 *rthp = rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001137 /*
1138 * Since lookup is lockfree, the deletion
1139 * must be visible to another weakly ordered CPU before
1140 * the insertion at the start of the hash chain.
1141 */
Changli Gaod8d1f302010-06-10 23:31:35 -07001142 rcu_assign_pointer(rth->dst.rt_next,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001143 rt_hash_table[hash].chain);
1144 /*
1145 * Since lookup is lockfree, the update writes
1146 * must be ordered for consistency on SMP.
1147 */
1148 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1149
Changli Gaod8d1f302010-06-10 23:31:35 -07001150 dst_use(&rth->dst, now);
Eric Dumazet22c047c2005-07-05 14:55:24 -07001151 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001152
/* The caller's new route is redundant: drop it and hand back the cached entry. */
1153 rt_drop(rt);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001154 if (skb)
Changli Gaod8d1f302010-06-10 23:31:35 -07001155 skb_dst_set(skb, &rth->dst);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001156 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001157 }
1158
/* Track the unreferenced entry with the lowest score; with "<=" a later entry wins ties. */
Changli Gaod8d1f302010-06-10 23:31:35 -07001159 if (!atomic_read(&rth->dst.__refcnt)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001160 u32 score = rt_score(rth);
1161
1162 if (score <= min_score) {
1163 cand = rth;
1164 candp = rthp;
1165 min_score = score;
1166 }
1167 }
1168
1169 chain_length++;
1170
Changli Gaod8d1f302010-06-10 23:31:35 -07001171 rthp = &rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001172 }
1173
1174 if (cand) {
1175 /* ip_rt_gc_elasticity used to be average length of chain
1176 * length, when exceeded gc becomes really aggressive.
1177 *
1178 * The second limit is less certain. At the moment it allows
1179 * only 2 entries per bucket. We will see.
1180 */
1181 if (chain_length > ip_rt_gc_elasticity) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001182 *candp = cand->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001183 rt_free(cand);
1184 }
Neil Horman1080d702008-10-27 12:28:25 -07001185 } else {
/* Nothing evictable: the cheap raw count is tested first, then the alias-aware slow_chain_length(). */
Eric Dumazet98376382010-03-08 03:20:00 +00001186 if (chain_length > rt_chain_length_max &&
1187 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001188 struct net *net = dev_net(rt->dst.dev);
Neil Horman1080d702008-10-27 12:28:25 -07001189 int num = ++net->ipv4.current_rt_cache_rebuild_count;
Pavel Emelyanovb35ecb52010-03-24 07:43:17 +00001190 if (!rt_caching(net)) {
Joe Perches058bd4d2012-03-11 18:36:11 +00001191 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
Changli Gaod8d1f302010-06-10 23:31:35 -07001192 rt->dst.dev->name, num);
Neil Horman1080d702008-10-27 12:28:25 -07001193 }
Pavel Emelyanovb35ecb52010-03-24 07:43:17 +00001194 rt_emergency_hash_rebuild(net);
Pavel Emelyanov6a2bad72010-03-24 21:51:22 +00001195 spin_unlock_bh(rt_hash_lock_addr(hash));
1196
/* The generation id has just changed, so the bucket for rt must be recomputed before retrying. */
David S. Miller5e2b61f2011-03-04 21:47:09 -08001197 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
Pavel Emelyanov6a2bad72010-03-24 21:51:22 +00001198 ifindex, rt_genid(net));
1199 goto restart;
Neil Horman1080d702008-10-27 12:28:25 -07001200 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001201 }
1202
/* Link rt in front of the current chain head. */
Changli Gaod8d1f302010-06-10 23:31:35 -07001203 rt->dst.rt_next = rt_hash_table[hash].chain;
Neil Horman1080d702008-10-27 12:28:25 -07001204
Eric Dumazet00269b52008-10-16 14:18:29 -07001205 /*
1206 * Since lookup is lockfree, we must make sure
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001207 * previous writes to rt are committed to memory
Eric Dumazet00269b52008-10-16 14:18:29 -07001208 * before making rt visible to other CPUS.
1209 */
Eric Dumazet1ddbcb02009-05-19 20:14:28 +00001210 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
Neil Horman1080d702008-10-27 12:28:25 -07001211
Eric Dumazet22c047c2005-07-05 14:55:24 -07001212 spin_unlock_bh(rt_hash_lock_addr(hash));
Neil Horman73e42892009-06-20 01:15:16 -07001213
/* Common exit for the hashed and the uncached case: attach rt to the skb, if any, and return it. */
Neil Hormanb6280b42009-06-22 10:18:53 +00001214skip_hashing:
David S. Millerb23dd4f2011-03-02 14:31:35 -08001215 if (skb)
Changli Gaod8d1f302010-06-10 23:31:35 -07001216 skb_dst_set(skb, &rt->dst);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001217 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001218}
1219
Linus Torvalds1da177e2005-04-16 15:20:36 -07001220/*
1221 * Peer allocation may fail only in serious out-of-memory conditions. However
1222 * we still can generate some output.
1223 * Random ID selection looks a bit dangerous because we have no chances to
1224 * select ID being unique in a reasonable period of time.
1225 * But broken packet identifier may be better than no packet at all.
1226 */
1227static void ip_select_fb_ident(struct iphdr *iph)
1228{
1229 static DEFINE_SPINLOCK(ip_fb_id_lock);
1230 static u32 ip_fallback_id;
1231 u32 salt;
1232
1233 spin_lock_bh(&ip_fb_id_lock);
Al Viroe4485152006-09-26 22:15:01 -07001234 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001235 iph->id = htons(salt & 0xFFFF);
1236 ip_fallback_id = salt;
1237 spin_unlock_bh(&ip_fb_id_lock);
1238}
1239
1240void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1241{
David S. Miller1d861aa2012-07-10 03:58:16 -07001242 struct net *net = dev_net(dst->dev);
1243 struct inet_peer *peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001244
David S. Miller1d861aa2012-07-10 03:58:16 -07001245 peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
1246 if (peer) {
1247 iph->id = htons(inet_getid(peer, more));
1248 inet_putpeer(peer);
1249 return;
1250 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001251
1252 ip_select_fb_ident(iph);
1253}
Eric Dumazet4bc2f182010-07-09 21:22:10 +00001254EXPORT_SYMBOL(__ip_select_ident);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001255
Eric Dumazet95c96172012-04-15 05:58:06 +00001256static void rt_del(unsigned int hash, struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001257{
Eric Dumazet1c317202010-10-25 21:02:07 +00001258 struct rtable __rcu **rthp;
1259 struct rtable *aux;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001260
Eric Dumazet29e75252008-01-31 17:05:09 -08001261 rthp = &rt_hash_table[hash].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -07001262 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001263 ip_rt_put(rt);
Eric Dumazet1c317202010-10-25 21:02:07 +00001264 while ((aux = rcu_dereference_protected(*rthp,
1265 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001266 if (aux == rt || rt_is_expired(aux)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001267 *rthp = aux->dst.rt_next;
Eric Dumazet29e75252008-01-31 17:05:09 -08001268 rt_free(aux);
1269 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001270 }
Changli Gaod8d1f302010-06-10 23:31:35 -07001271 rthp = &aux->dst.rt_next;
Eric Dumazet29e75252008-01-31 17:05:09 -08001272 }
Eric Dumazet22c047c2005-07-05 14:55:24 -07001273 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001274}
1275
David S. Millere47a1852012-07-11 20:55:47 -07001276static void ip_do_redirect(struct dst_entry *dst, struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001277{
David S. Miller94206122012-07-11 20:38:08 -07001278 const struct iphdr *iph = (const struct iphdr *) skb->data;
David S. Millere47a1852012-07-11 20:55:47 -07001279 __be32 new_gw = icmp_hdr(skb)->un.gateway;
David S. Miller94206122012-07-11 20:38:08 -07001280 __be32 old_gw = ip_hdr(skb)->saddr;
David S. Millere47a1852012-07-11 20:55:47 -07001281 struct net_device *dev = skb->dev;
David S. Miller94206122012-07-11 20:38:08 -07001282 __be32 daddr = iph->daddr;
1283 __be32 saddr = iph->saddr;
David S. Millere47a1852012-07-11 20:55:47 -07001284 struct in_device *in_dev;
1285 struct neighbour *n;
1286 struct rtable *rt;
Denis V. Lunev317805b2008-02-28 20:50:06 -08001287 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001288
David S. Miller94206122012-07-11 20:38:08 -07001289 switch (icmp_hdr(skb)->code & 7) {
1290 case ICMP_REDIR_NET:
1291 case ICMP_REDIR_NETTOS:
1292 case ICMP_REDIR_HOST:
1293 case ICMP_REDIR_HOSTTOS:
1294 break;
1295
1296 default:
1297 return;
1298 }
1299
David S. Millere47a1852012-07-11 20:55:47 -07001300 rt = (struct rtable *) dst;
1301 if (rt->rt_gateway != old_gw)
1302 return;
1303
1304 in_dev = __in_dev_get_rcu(dev);
1305 if (!in_dev)
1306 return;
1307
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001308 net = dev_net(dev);
Joe Perches9d4fb272009-11-23 10:41:23 -08001309 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1310 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1311 ipv4_is_zeronet(new_gw))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001312 goto reject_redirect;
1313
1314 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1315 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1316 goto reject_redirect;
1317 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1318 goto reject_redirect;
1319 } else {
Denis V. Lunev317805b2008-02-28 20:50:06 -08001320 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001321 goto reject_redirect;
1322 }
1323
David S. Millere47a1852012-07-11 20:55:47 -07001324 n = ipv4_neigh_lookup(dst, NULL, &new_gw);
1325 if (n) {
1326 if (!(n->nud_state & NUD_VALID)) {
1327 neigh_event_send(n, NULL);
1328 } else {
1329 rt->rt_gateway = new_gw;
1330 rt->rt_flags |= RTCF_REDIRECTED;
1331 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1332 }
1333 neigh_release(n);
1334 }
1335 return;
1336
1337reject_redirect:
1338#ifdef CONFIG_IP_ROUTE_VERBOSE
1339 if (IN_DEV_LOG_MARTIANS(in_dev))
1340 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1341 " Advised path = %pI4 -> %pI4\n",
1342 &old_gw, dev->name, &new_gw,
1343 &saddr, &daddr);
1344#endif
1345 ;
1346}
1347
1348/* called in rcu_read_lock() section */
1349void ip_rt_redirect(struct sk_buff *skb, __be32 new_gw)
1350{
1351 const struct iphdr *iph = (const struct iphdr *) skb->data;
1352 __be32 daddr = iph->daddr;
1353 __be32 saddr = iph->saddr;
1354 struct net_device *dev = skb->dev;
1355 int ikeys[2] = { dev->ifindex, 0 };
1356 __be32 skeys[2] = { saddr, 0 };
1357 struct net *net;
1358 int s, i;
1359
1360 net = dev_net(dev);
Flavio Leitner7cc91502011-10-24 02:56:38 -04001361 for (s = 0; s < 2; s++) {
1362 for (i = 0; i < 2; i++) {
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001363 unsigned int hash;
1364 struct rtable __rcu **rthp;
1365 struct rtable *rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001366
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001367 hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1368
1369 rthp = &rt_hash_table[hash].chain;
1370
1371 while ((rt = rcu_dereference(*rthp)) != NULL) {
1372 rthp = &rt->dst.rt_next;
1373
1374 if (rt->rt_key_dst != daddr ||
1375 rt->rt_key_src != skeys[s] ||
1376 rt->rt_oif != ikeys[i] ||
1377 rt_is_input_route(rt) ||
1378 rt_is_expired(rt) ||
1379 !net_eq(dev_net(rt->dst.dev), net) ||
1380 rt->dst.error ||
David S. Millerd0da7202012-07-11 20:27:54 -07001381 rt->dst.dev != dev)
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001382 continue;
1383
David S. Millere47a1852012-07-11 20:55:47 -07001384 ip_do_redirect(&rt->dst, skb);
Flavio Leitner7cc91502011-10-24 02:56:38 -04001385 }
Flavio Leitner7cc91502011-10-24 02:56:38 -04001386 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001387 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001388 return;
1389
Linus Torvalds1da177e2005-04-16 15:20:36 -07001390}
1391
1392static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1393{
Eric Dumazetee6b9672008-03-05 18:30:47 -08001394 struct rtable *rt = (struct rtable *)dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001395 struct dst_entry *ret = dst;
1396
1397 if (rt) {
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001398 if (dst->obsolete > 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001399 ip_rt_put(rt);
1400 ret = NULL;
David S. Miller59436342012-07-10 06:58:42 -07001401 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1402 rt->dst.expires) {
Eric Dumazet95c96172012-04-15 05:58:06 +00001403 unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
David S. Miller5e2b61f2011-03-04 21:47:09 -08001404 rt->rt_oif,
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001405 rt_genid(dev_net(dst->dev)));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001406 rt_del(hash, rt);
1407 ret = NULL;
1408 }
1409 }
1410 return ret;
1411}
1412
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
1428
1429void ip_rt_send_redirect(struct sk_buff *skb)
1430{
Eric Dumazet511c3f92009-06-02 05:14:27 +00001431 struct rtable *rt = skb_rtable(skb);
Eric Dumazet30038fc2009-08-28 23:52:01 -07001432 struct in_device *in_dev;
David S. Miller92d86822011-02-04 15:55:25 -08001433 struct inet_peer *peer;
David S. Miller1d861aa2012-07-10 03:58:16 -07001434 struct net *net;
Eric Dumazet30038fc2009-08-28 23:52:01 -07001435 int log_martians;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001436
Eric Dumazet30038fc2009-08-28 23:52:01 -07001437 rcu_read_lock();
Changli Gaod8d1f302010-06-10 23:31:35 -07001438 in_dev = __in_dev_get_rcu(rt->dst.dev);
Eric Dumazet30038fc2009-08-28 23:52:01 -07001439 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1440 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001441 return;
Eric Dumazet30038fc2009-08-28 23:52:01 -07001442 }
1443 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1444 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001445
David S. Miller1d861aa2012-07-10 03:58:16 -07001446 net = dev_net(rt->dst.dev);
1447 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
David S. Miller92d86822011-02-04 15:55:25 -08001448 if (!peer) {
1449 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1450 return;
1451 }
1452
Linus Torvalds1da177e2005-04-16 15:20:36 -07001453 /* No redirected packets during ip_rt_redirect_silence;
1454 * reset the algorithm.
1455 */
David S. Miller92d86822011-02-04 15:55:25 -08001456 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1457 peer->rate_tokens = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001458
1459 /* Too many ignored redirects; do not send anything
Changli Gaod8d1f302010-06-10 23:31:35 -07001460 * set dst.rate_last to the last seen redirected packet.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001461 */
David S. Miller92d86822011-02-04 15:55:25 -08001462 if (peer->rate_tokens >= ip_rt_redirect_number) {
1463 peer->rate_last = jiffies;
David S. Miller1d861aa2012-07-10 03:58:16 -07001464 goto out_put_peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001465 }
1466
1467 /* Check for load limit; set rate_last to the latest sent
1468 * redirect.
1469 */
David S. Miller92d86822011-02-04 15:55:25 -08001470 if (peer->rate_tokens == 0 ||
Li Yewang14fb8a72006-12-18 00:26:35 -08001471 time_after(jiffies,
David S. Miller92d86822011-02-04 15:55:25 -08001472 (peer->rate_last +
1473 (ip_rt_redirect_load << peer->rate_tokens)))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001474 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
David S. Miller92d86822011-02-04 15:55:25 -08001475 peer->rate_last = jiffies;
1476 ++peer->rate_tokens;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001477#ifdef CONFIG_IP_ROUTE_VERBOSE
Eric Dumazet30038fc2009-08-28 23:52:01 -07001478 if (log_martians &&
Joe Perchese87cc472012-05-13 21:56:26 +00001479 peer->rate_tokens == ip_rt_redirect_number)
1480 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1481 &ip_hdr(skb)->saddr, rt->rt_iif,
1482 &rt->rt_dst, &rt->rt_gateway);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001483#endif
1484 }
David S. Miller1d861aa2012-07-10 03:58:16 -07001485out_put_peer:
1486 inet_putpeer(peer);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001487}
1488
1489static int ip_error(struct sk_buff *skb)
1490{
David S. Miller251da412012-06-26 16:27:09 -07001491 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
Eric Dumazet511c3f92009-06-02 05:14:27 +00001492 struct rtable *rt = skb_rtable(skb);
David S. Miller92d86822011-02-04 15:55:25 -08001493 struct inet_peer *peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001494 unsigned long now;
David S. Miller251da412012-06-26 16:27:09 -07001495 struct net *net;
David S. Miller92d86822011-02-04 15:55:25 -08001496 bool send;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001497 int code;
1498
David S. Miller251da412012-06-26 16:27:09 -07001499 net = dev_net(rt->dst.dev);
1500 if (!IN_DEV_FORWARD(in_dev)) {
1501 switch (rt->dst.error) {
1502 case EHOSTUNREACH:
1503 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
1504 break;
1505
1506 case ENETUNREACH:
1507 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1508 break;
1509 }
1510 goto out;
1511 }
1512
Changli Gaod8d1f302010-06-10 23:31:35 -07001513 switch (rt->dst.error) {
Joe Perches4500ebf2011-07-01 09:43:07 +00001514 case EINVAL:
1515 default:
1516 goto out;
1517 case EHOSTUNREACH:
1518 code = ICMP_HOST_UNREACH;
1519 break;
1520 case ENETUNREACH:
1521 code = ICMP_NET_UNREACH;
David S. Miller251da412012-06-26 16:27:09 -07001522 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
Joe Perches4500ebf2011-07-01 09:43:07 +00001523 break;
1524 case EACCES:
1525 code = ICMP_PKT_FILTERED;
1526 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001527 }
1528
David S. Miller1d861aa2012-07-10 03:58:16 -07001529 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
David S. Miller92d86822011-02-04 15:55:25 -08001530
1531 send = true;
1532 if (peer) {
1533 now = jiffies;
1534 peer->rate_tokens += now - peer->rate_last;
1535 if (peer->rate_tokens > ip_rt_error_burst)
1536 peer->rate_tokens = ip_rt_error_burst;
1537 peer->rate_last = now;
1538 if (peer->rate_tokens >= ip_rt_error_cost)
1539 peer->rate_tokens -= ip_rt_error_cost;
1540 else
1541 send = false;
David S. Miller1d861aa2012-07-10 03:58:16 -07001542 inet_putpeer(peer);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001543 }
David S. Miller92d86822011-02-04 15:55:25 -08001544 if (send)
1545 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001546
1547out: kfree_skb(skb);
1548 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001549}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001550
Linus Torvalds1da177e2005-04-16 15:20:36 -07001551static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1552{
David S. Miller2c8cec52011-02-09 20:42:07 -08001553 struct rtable *rt = (struct rtable *) dst;
David S. Miller2c8cec52011-02-09 20:42:07 -08001554
1555 dst_confirm(dst);
1556
David S. Miller59436342012-07-10 06:58:42 -07001557 if (mtu < ip_rt_min_pmtu)
1558 mtu = ip_rt_min_pmtu;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001559
David S. Miller59436342012-07-10 06:58:42 -07001560 rt->rt_pmtu = mtu;
1561 dst_set_expires(&rt->dst, ip_rt_mtu_expires);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001562}
1563
David S. Miller36393392012-06-14 22:21:46 -07001564void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1565 int oif, u32 mark, u8 protocol, int flow_flags)
1566{
1567 const struct iphdr *iph = (const struct iphdr *)skb->data;
1568 struct flowi4 fl4;
1569 struct rtable *rt;
1570
1571 flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
David S. Miller3e129392012-07-10 04:01:57 -07001572 protocol, flow_flags,
David S. Miller36393392012-06-14 22:21:46 -07001573 iph->daddr, iph->saddr, 0, 0);
1574 rt = __ip_route_output_key(net, &fl4);
1575 if (!IS_ERR(rt)) {
1576 ip_rt_update_pmtu(&rt->dst, mtu);
1577 ip_rt_put(rt);
1578 }
1579}
1580EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1581
1582void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1583{
1584 const struct inet_sock *inet = inet_sk(sk);
1585
1586 return ipv4_update_pmtu(skb, sock_net(sk), mtu,
1587 sk->sk_bound_dev_if, sk->sk_mark,
1588 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1589 inet_sk_flowi_flags(sk));
1590}
1591EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
David S. Millerf39925d2011-02-09 22:00:16 -08001592
David S. Millerb42597e2012-07-11 21:25:45 -07001593void ipv4_redirect(struct sk_buff *skb, struct net *net,
1594 int oif, u32 mark, u8 protocol, int flow_flags)
1595{
1596 const struct iphdr *iph = (const struct iphdr *)skb->data;
1597 struct flowi4 fl4;
1598 struct rtable *rt;
1599
1600 flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
1601 protocol, flow_flags, iph->daddr, iph->saddr, 0, 0);
1602 rt = __ip_route_output_key(net, &fl4);
1603 if (!IS_ERR(rt)) {
1604 ip_do_redirect(&rt->dst, skb);
1605 ip_rt_put(rt);
1606 }
1607}
1608EXPORT_SYMBOL_GPL(ipv4_redirect);
1609
1610void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1611{
1612 const struct inet_sock *inet = inet_sk(sk);
1613
1614 return ipv4_redirect(skb, sock_net(sk), sk->sk_bound_dev_if,
1615 sk->sk_mark,
1616 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1617 inet_sk_flowi_flags(sk));
1618}
1619EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1620
David S. Millerefbc3682011-12-01 13:38:59 -05001621static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1622{
1623 struct rtable *rt = (struct rtable *) dst;
1624
1625 if (rt_is_expired(rt))
1626 return NULL;
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001627 return dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001628}
1629
1630static void ipv4_dst_destroy(struct dst_entry *dst)
1631{
1632 struct rtable *rt = (struct rtable *) dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001633
David S. Miller62fa8a82011-01-26 20:51:05 -08001634 if (rt->fi) {
1635 fib_info_put(rt->fi);
1636 rt->fi = NULL;
1637 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001638}
1639
Linus Torvalds1da177e2005-04-16 15:20:36 -07001640
1641static void ipv4_link_failure(struct sk_buff *skb)
1642{
1643 struct rtable *rt;
1644
1645 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1646
Eric Dumazet511c3f92009-06-02 05:14:27 +00001647 rt = skb_rtable(skb);
David S. Miller59436342012-07-10 06:58:42 -07001648 if (rt)
1649 dst_set_expires(&rt->dst, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001650}
1651
1652static int ip_rt_bug(struct sk_buff *skb)
1653{
Joe Perches91df42b2012-05-15 14:11:54 +00001654 pr_debug("%s: %pI4 -> %pI4, %s\n",
1655 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1656 skb->dev ? skb->dev->name : "?");
Linus Torvalds1da177e2005-04-16 15:20:36 -07001657 kfree_skb(skb);
Dave Jonesc378a9c2011-05-21 07:16:42 +00001658 WARN_ON(1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001659 return 0;
1660}
1661
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is off the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1670
David S. Miller8e363602011-05-13 17:29:41 -04001671void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001672{
Al Viroa61ced52006-09-26 21:27:54 -07001673 __be32 src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001674
David S. Millerc7537962010-11-11 17:07:48 -08001675 if (rt_is_output_route(rt))
David S. Millerc5be24f2011-05-13 18:01:21 -04001676 src = ip_hdr(skb)->saddr;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001677 else {
David S. Miller8e363602011-05-13 17:29:41 -04001678 struct fib_result res;
1679 struct flowi4 fl4;
1680 struct iphdr *iph;
1681
1682 iph = ip_hdr(skb);
1683
1684 memset(&fl4, 0, sizeof(fl4));
1685 fl4.daddr = iph->daddr;
1686 fl4.saddr = iph->saddr;
Julian Anastasovb0fe4a32011-07-23 02:00:41 +00001687 fl4.flowi4_tos = RT_TOS(iph->tos);
David S. Miller8e363602011-05-13 17:29:41 -04001688 fl4.flowi4_oif = rt->dst.dev->ifindex;
1689 fl4.flowi4_iif = skb->dev->ifindex;
1690 fl4.flowi4_mark = skb->mark;
David S. Miller5e2b61f2011-03-04 21:47:09 -08001691
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001692 rcu_read_lock();
David S. Miller68a5e3d2011-03-11 20:07:33 -05001693 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
David S. Miller436c3b62011-03-24 17:42:21 -07001694 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001695 else
1696 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001697 RT_SCOPE_UNIVERSE);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001698 rcu_read_unlock();
1699 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001700 memcpy(addr, &src, 4);
1701}
1702
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid one 16-bit half at a time: a half
 * of tclassid that is already non-zero is left alone, an empty half takes
 * the corresponding half of @tag.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 *classid = &rt->dst.tclassid;

	if ((*classid & 0xFFFF) == 0)
		*classid |= tag & 0xFFFF;

	if ((*classid & 0xFFFF0000) == 0)
		*classid |= tag & 0xFFFF0000;
}
#endif
1712
David S. Miller0dbaee32010-12-13 12:52:14 -08001713static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1714{
1715 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1716
1717 if (advmss == 0) {
1718 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1719 ip_rt_min_advmss);
1720 if (advmss > 65535 - 40)
1721 advmss = 65535 - 40;
1722 }
1723 return advmss;
1724}
1725
Steffen Klassertebb762f2011-11-23 02:12:51 +00001726static unsigned int ipv4_mtu(const struct dst_entry *dst)
David S. Millerd33e4552010-12-14 13:01:14 -08001727{
Steffen Klassert261663b2011-11-23 02:14:50 +00001728 const struct rtable *rt = (const struct rtable *) dst;
David S. Miller59436342012-07-10 06:58:42 -07001729 unsigned int mtu = rt->rt_pmtu;
1730
1731 if (mtu && time_after_eq(jiffies, rt->dst.expires))
1732 mtu = 0;
1733
1734 if (!mtu)
1735 mtu = dst_metric_raw(dst, RTAX_MTU);
Steffen Klassert618f9bc2011-11-23 02:13:31 +00001736
Steffen Klassert261663b2011-11-23 02:14:50 +00001737 if (mtu && rt_is_output_route(rt))
Steffen Klassert618f9bc2011-11-23 02:13:31 +00001738 return mtu;
1739
1740 mtu = dst->dev->mtu;
David S. Millerd33e4552010-12-14 13:01:14 -08001741
1742 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
David S. Millerd33e4552010-12-14 13:01:14 -08001743
1744 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1745 mtu = 576;
1746 }
1747
1748 if (mtu > IP_MAX_MTU)
1749 mtu = IP_MAX_MTU;
1750
1751 return mtu;
1752}
1753
David S. Miller813b3b52011-04-28 14:48:42 -07001754static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
David S. Miller5e2b61f2011-03-04 21:47:09 -08001755 struct fib_info *fi)
David S. Millera4daad62011-01-27 22:01:53 -08001756{
David S. Millerf1850712012-07-10 07:26:01 -07001757 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1758 rt->fi = fi;
1759 atomic_inc(&fi->fib_clntref);
David S. Millera4daad62011-01-27 22:01:53 -08001760 }
David S. Millerf1850712012-07-10 07:26:01 -07001761 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
David S. Millera4daad62011-01-27 22:01:53 -08001762}
1763
David S. Miller813b3b52011-04-28 14:48:42 -07001764static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
David S. Miller5e2b61f2011-03-04 21:47:09 -08001765 const struct fib_result *res,
David S. Miller982721f2011-02-16 21:44:24 -08001766 struct fib_info *fi, u16 type, u32 itag)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001767{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001768 if (fi) {
1769 if (FIB_RES_GW(*res) &&
1770 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1771 rt->rt_gateway = FIB_RES_GW(*res);
David S. Miller813b3b52011-04-28 14:48:42 -07001772 rt_init_metrics(rt, fl4, fi);
Patrick McHardyc7066f72011-01-14 13:36:42 +01001773#ifdef CONFIG_IP_ROUTE_CLASSID
David S. Miller710ab6c2012-07-10 07:02:09 -07001774 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001775#endif
David S. Millerd33e4552010-12-14 13:01:14 -08001776 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001777
Patrick McHardyc7066f72011-01-14 13:36:42 +01001778#ifdef CONFIG_IP_ROUTE_CLASSID
Linus Torvalds1da177e2005-04-16 15:20:36 -07001779#ifdef CONFIG_IP_MULTIPLE_TABLES
1780 set_class_tag(rt, fib_rules_tclass(res));
1781#endif
1782 set_class_tag(rt, itag);
1783#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001784}
1785
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001786static struct rtable *rt_dst_alloc(struct net_device *dev,
1787 bool nopolicy, bool noxfrm)
David S. Miller0c4dcd52011-02-17 15:42:37 -08001788{
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001789 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1790 DST_HOST |
1791 (nopolicy ? DST_NOPOLICY : 0) |
1792 (noxfrm ? DST_NOXFRM : 0));
David S. Miller0c4dcd52011-02-17 15:42:37 -08001793}
1794
Eric Dumazet96d36222010-06-02 19:21:31 +00001795/* called in rcu_read_lock() section */
Al Viro9e12bb22006-09-26 21:25:20 -07001796static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001797 u8 tos, struct net_device *dev, int our)
1798{
Eric Dumazet96d36222010-06-02 19:21:31 +00001799 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001800 struct rtable *rth;
Eric Dumazet96d36222010-06-02 19:21:31 +00001801 struct in_device *in_dev = __in_dev_get_rcu(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001802 u32 itag = 0;
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001803 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001804
1805 /* Primary sanity checks. */
1806
1807 if (in_dev == NULL)
1808 return -EINVAL;
1809
Jan Engelhardt1e637c72008-01-21 03:18:08 -08001810 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Thomas Grafd0daebc32012-06-12 00:44:01 +00001811 skb->protocol != htons(ETH_P_IP))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001812 goto e_inval;
1813
Thomas Grafd0daebc32012-06-12 00:44:01 +00001814 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1815 if (ipv4_is_loopback(saddr))
1816 goto e_inval;
1817
Joe Perchesf97c1e02007-12-16 13:45:43 -08001818 if (ipv4_is_zeronet(saddr)) {
1819 if (!ipv4_is_local_multicast(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001820 goto e_inval;
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001821 } else {
David S. Miller9e56e382012-06-28 18:54:02 -07001822 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1823 in_dev, &itag);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001824 if (err < 0)
1825 goto e_err;
1826 }
Benjamin LaHaise4e7b2f12012-03-27 15:55:32 +00001827 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001828 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001829 if (!rth)
1830 goto e_nobufs;
1831
Patrick McHardyc7066f72011-01-14 13:36:42 +01001832#ifdef CONFIG_IP_ROUTE_CLASSID
Changli Gaod8d1f302010-06-10 23:31:35 -07001833 rth->dst.tclassid = itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001834#endif
David S. Millercf911662011-04-28 14:31:47 -07001835 rth->dst.output = ip_rt_bug;
1836
1837 rth->rt_key_dst = daddr;
1838 rth->rt_key_src = saddr;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001839 rth->rt_genid = rt_genid(dev_net(dev));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001840 rth->rt_flags = RTCF_MULTICAST;
Eric Dumazet29e75252008-01-31 17:05:09 -08001841 rth->rt_type = RTN_MULTICAST;
David S. Miller475949d2011-05-03 19:45:15 -07001842 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07001843 rth->rt_dst = daddr;
1844 rth->rt_src = saddr;
1845 rth->rt_route_iif = dev->ifindex;
1846 rth->rt_iif = dev->ifindex;
1847 rth->rt_oif = 0;
1848 rth->rt_mark = skb->mark;
David S. Miller59436342012-07-10 06:58:42 -07001849 rth->rt_pmtu = 0;
David S. Millercf911662011-04-28 14:31:47 -07001850 rth->rt_gateway = daddr;
David S. Millercf911662011-04-28 14:31:47 -07001851 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001852 if (our) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001853 rth->dst.input= ip_local_deliver;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001854 rth->rt_flags |= RTCF_LOCAL;
1855 }
1856
1857#ifdef CONFIG_IP_MROUTE
Joe Perchesf97c1e02007-12-16 13:45:43 -08001858 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
Changli Gaod8d1f302010-06-10 23:31:35 -07001859 rth->dst.input = ip_mr_input;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001860#endif
1861 RT_CACHE_STAT_INC(in_slow_mc);
1862
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001863 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
David S. Millerb23dd4f2011-03-02 14:31:35 -08001864 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
Eric Dumazet9aa3c942011-06-18 11:59:18 -07001865 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001866
1867e_nobufs:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001868 return -ENOBUFS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001869e_inval:
Eric Dumazet96d36222010-06-02 19:21:31 +00001870 return -EINVAL;
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001871e_err:
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001872 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001873}
1874
1875
1876static void ip_handle_martian_source(struct net_device *dev,
1877 struct in_device *in_dev,
1878 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07001879 __be32 daddr,
1880 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001881{
1882 RT_CACHE_STAT_INC(in_martian_src);
1883#ifdef CONFIG_IP_ROUTE_VERBOSE
1884 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1885 /*
1886 * RFC1812 recommendation, if source is martian,
1887 * the only hint is MAC header.
1888 */
Joe Perches058bd4d2012-03-11 18:36:11 +00001889 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
Harvey Harrison673d57e2008-10-31 00:53:57 -07001890 &daddr, &saddr, dev->name);
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001891 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
Joe Perches058bd4d2012-03-11 18:36:11 +00001892 print_hex_dump(KERN_WARNING, "ll header: ",
1893 DUMP_PREFIX_OFFSET, 16, 1,
1894 skb_mac_header(skb),
1895 dev->hard_header_len, true);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001896 }
1897 }
1898#endif
1899}
1900
Eric Dumazet47360222010-06-03 04:13:21 +00001901/* called in rcu_read_lock() section */
Stephen Hemminger5969f712008-04-10 01:52:09 -07001902static int __mkroute_input(struct sk_buff *skb,
David S. Miller982721f2011-02-16 21:44:24 -08001903 const struct fib_result *res,
Stephen Hemminger5969f712008-04-10 01:52:09 -07001904 struct in_device *in_dev,
1905 __be32 daddr, __be32 saddr, u32 tos,
1906 struct rtable **result)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001907{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001908 struct rtable *rth;
1909 int err;
1910 struct in_device *out_dev;
Eric Dumazet47360222010-06-03 04:13:21 +00001911 unsigned int flags = 0;
Al Virod9c9df82006-09-26 21:28:14 -07001912 u32 itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001913
1914 /* get a working reference to the output device */
Eric Dumazet47360222010-06-03 04:13:21 +00001915 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001916 if (out_dev == NULL) {
Joe Perchese87cc472012-05-13 21:56:26 +00001917 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07001918 return -EINVAL;
1919 }
1920
1921
Michael Smith5c04c812011-04-07 04:51:50 +00001922 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
David S. Miller9e56e382012-06-28 18:54:02 -07001923 in_dev->dev, in_dev, &itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001924 if (err < 0) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001925 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001926 saddr);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001927
Linus Torvalds1da177e2005-04-16 15:20:36 -07001928 goto cleanup;
1929 }
1930
1931 if (err)
1932 flags |= RTCF_DIRECTSRC;
1933
Thomas Graf51b77ca2008-06-03 16:36:01 -07001934 if (out_dev == in_dev && err &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001935 (IN_DEV_SHARED_MEDIA(out_dev) ||
1936 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1937 flags |= RTCF_DOREDIRECT;
1938
1939 if (skb->protocol != htons(ETH_P_IP)) {
1940 /* Not IP (i.e. ARP). Do not create route, if it is
1941 * invalid for proxy arp. DNAT routes are always valid.
Jesper Dangaard Brouer65324142010-01-05 05:50:47 +00001942 *
1943 * Proxy arp feature have been extended to allow, ARP
1944 * replies back to the same interface, to support
1945 * Private VLAN switch technologies. See arp.c.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001946 */
Jesper Dangaard Brouer65324142010-01-05 05:50:47 +00001947 if (out_dev == in_dev &&
1948 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001949 err = -EINVAL;
1950 goto cleanup;
1951 }
1952 }
1953
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001954 rth = rt_dst_alloc(out_dev->dev,
1955 IN_DEV_CONF_GET(in_dev, NOPOLICY),
David S. Miller0c4dcd52011-02-17 15:42:37 -08001956 IN_DEV_CONF_GET(out_dev, NOXFRM));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001957 if (!rth) {
1958 err = -ENOBUFS;
1959 goto cleanup;
1960 }
1961
David S. Miller5e2b61f2011-03-04 21:47:09 -08001962 rth->rt_key_dst = daddr;
David S. Miller5e2b61f2011-03-04 21:47:09 -08001963 rth->rt_key_src = saddr;
David S. Millercf911662011-04-28 14:31:47 -07001964 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1965 rth->rt_flags = flags;
1966 rth->rt_type = res->type;
David S. Miller475949d2011-05-03 19:45:15 -07001967 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07001968 rth->rt_dst = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001969 rth->rt_src = saddr;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07001970 rth->rt_route_iif = in_dev->dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08001971 rth->rt_iif = in_dev->dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08001972 rth->rt_oif = 0;
David S. Millercf911662011-04-28 14:31:47 -07001973 rth->rt_mark = skb->mark;
David S. Miller59436342012-07-10 06:58:42 -07001974 rth->rt_pmtu = 0;
David S. Millercf911662011-04-28 14:31:47 -07001975 rth->rt_gateway = daddr;
David S. Millercf911662011-04-28 14:31:47 -07001976 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001977
Changli Gaod8d1f302010-06-10 23:31:35 -07001978 rth->dst.input = ip_forward;
1979 rth->dst.output = ip_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001980
David S. Miller5e2b61f2011-03-04 21:47:09 -08001981 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001982
Linus Torvalds1da177e2005-04-16 15:20:36 -07001983 *result = rth;
1984 err = 0;
1985 cleanup:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001986 return err;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001987}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001988
Stephen Hemminger5969f712008-04-10 01:52:09 -07001989static int ip_mkroute_input(struct sk_buff *skb,
1990 struct fib_result *res,
David S. Miller68a5e3d2011-03-11 20:07:33 -05001991 const struct flowi4 *fl4,
Stephen Hemminger5969f712008-04-10 01:52:09 -07001992 struct in_device *in_dev,
1993 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001994{
Daniel Baluta5e73ea12012-04-15 01:34:41 +00001995 struct rtable *rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001996 int err;
Eric Dumazet95c96172012-04-15 05:58:06 +00001997 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001998
1999#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Millerff3fccb2011-03-10 16:23:24 -08002000 if (res->fi && res->fi->fib_nhs > 1)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002001 fib_select_multipath(res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002002#endif
2003
2004 /* create a routing cache entry */
2005 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2006 if (err)
2007 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002008
2009 /* put it into the cache */
David S. Miller68a5e3d2011-03-11 20:07:33 -05002010 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
Changli Gaod8d1f302010-06-10 23:31:35 -07002011 rt_genid(dev_net(rth->dst.dev)));
David S. Miller68a5e3d2011-03-11 20:07:33 -05002012 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002013 if (IS_ERR(rth))
2014 return PTR_ERR(rth);
2015 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002016}
2017
/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must already have the correct destination attached by the output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *	called with rcu_read_lock()
 */
2028
Al Viro9e12bb22006-09-26 21:25:20 -07002029static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
David S. Millerc10237e2012-06-27 17:05:06 -07002030 u8 tos, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002031{
2032 struct fib_result res;
Eric Dumazet96d36222010-06-02 19:21:31 +00002033 struct in_device *in_dev = __in_dev_get_rcu(dev);
David S. Miller68a5e3d2011-03-11 20:07:33 -05002034 struct flowi4 fl4;
Eric Dumazet95c96172012-04-15 05:58:06 +00002035 unsigned int flags = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002036 u32 itag = 0;
Eric Dumazet95c96172012-04-15 05:58:06 +00002037 struct rtable *rth;
2038 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002039 int err = -EINVAL;
Daniel Baluta5e73ea12012-04-15 01:34:41 +00002040 struct net *net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002041
2042 /* IP on this device is disabled. */
2043
2044 if (!in_dev)
2045 goto out;
2046
2047 /* Check for the most weird martians, which can be not detected
2048 by fib_lookup.
2049 */
2050
Thomas Grafd0daebc32012-06-12 00:44:01 +00002051 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002052 goto martian_source;
2053
Andy Walls27a954b2010-10-17 15:11:22 +00002054 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002055 goto brd_input;
2056
2057 /* Accept zero addresses only to limited broadcast;
2058 * I even do not know to fix it or not. Waiting for complains :-)
2059 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002060 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002061 goto martian_source;
2062
Thomas Grafd0daebc32012-06-12 00:44:01 +00002063 if (ipv4_is_zeronet(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002064 goto martian_destination;
2065
Thomas Grafd0daebc32012-06-12 00:44:01 +00002066 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
2067 if (ipv4_is_loopback(daddr))
2068 goto martian_destination;
2069
2070 if (ipv4_is_loopback(saddr))
2071 goto martian_source;
2072 }
2073
Linus Torvalds1da177e2005-04-16 15:20:36 -07002074 /*
2075 * Now we are ready to route packet.
2076 */
David S. Miller68a5e3d2011-03-11 20:07:33 -05002077 fl4.flowi4_oif = 0;
2078 fl4.flowi4_iif = dev->ifindex;
2079 fl4.flowi4_mark = skb->mark;
2080 fl4.flowi4_tos = tos;
2081 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2082 fl4.daddr = daddr;
2083 fl4.saddr = saddr;
2084 err = fib_lookup(net, &fl4, &res);
David S. Miller251da412012-06-26 16:27:09 -07002085 if (err != 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002086 goto no_route;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002087
2088 RT_CACHE_STAT_INC(in_slow_tot);
2089
2090 if (res.type == RTN_BROADCAST)
2091 goto brd_input;
2092
2093 if (res.type == RTN_LOCAL) {
Michael Smith5c04c812011-04-07 04:51:50 +00002094 err = fib_validate_source(skb, saddr, daddr, tos,
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002095 net->loopback_dev->ifindex,
David S. Miller9e56e382012-06-28 18:54:02 -07002096 dev, in_dev, &itag);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002097 if (err < 0)
2098 goto martian_source_keep_err;
2099 if (err)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002100 flags |= RTCF_DIRECTSRC;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002101 goto local_input;
2102 }
2103
2104 if (!IN_DEV_FORWARD(in_dev))
David S. Miller251da412012-06-26 16:27:09 -07002105 goto no_route;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002106 if (res.type != RTN_UNICAST)
2107 goto martian_destination;
2108
David S. Miller68a5e3d2011-03-11 20:07:33 -05002109 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002110out: return err;
2111
2112brd_input:
2113 if (skb->protocol != htons(ETH_P_IP))
2114 goto e_inval;
2115
David S. Miller41347dc2012-06-28 04:05:27 -07002116 if (!ipv4_is_zeronet(saddr)) {
David S. Miller9e56e382012-06-28 18:54:02 -07002117 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2118 in_dev, &itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002119 if (err < 0)
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002120 goto martian_source_keep_err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002121 if (err)
2122 flags |= RTCF_DIRECTSRC;
2123 }
2124 flags |= RTCF_BROADCAST;
2125 res.type = RTN_BROADCAST;
2126 RT_CACHE_STAT_INC(in_brd);
2127
2128local_input:
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002129 rth = rt_dst_alloc(net->loopback_dev,
2130 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002131 if (!rth)
2132 goto e_nobufs;
2133
David S. Millercf911662011-04-28 14:31:47 -07002134 rth->dst.input= ip_local_deliver;
Changli Gaod8d1f302010-06-10 23:31:35 -07002135 rth->dst.output= ip_rt_bug;
David S. Millercf911662011-04-28 14:31:47 -07002136#ifdef CONFIG_IP_ROUTE_CLASSID
2137 rth->dst.tclassid = itag;
2138#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002139
David S. Miller5e2b61f2011-03-04 21:47:09 -08002140 rth->rt_key_dst = daddr;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002141 rth->rt_key_src = saddr;
David S. Millercf911662011-04-28 14:31:47 -07002142 rth->rt_genid = rt_genid(net);
2143 rth->rt_flags = flags|RTCF_LOCAL;
2144 rth->rt_type = res.type;
David S. Miller475949d2011-05-03 19:45:15 -07002145 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07002146 rth->rt_dst = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002147 rth->rt_src = saddr;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002148 rth->rt_route_iif = dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002149 rth->rt_iif = dev->ifindex;
David S. Millercf911662011-04-28 14:31:47 -07002150 rth->rt_oif = 0;
2151 rth->rt_mark = skb->mark;
David S. Miller59436342012-07-10 06:58:42 -07002152 rth->rt_pmtu = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002153 rth->rt_gateway = daddr;
David S. Millercf911662011-04-28 14:31:47 -07002154 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002155 if (res.type == RTN_UNREACHABLE) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002156 rth->dst.input= ip_error;
2157 rth->dst.error= -err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002158 rth->rt_flags &= ~RTCF_LOCAL;
2159 }
David S. Miller68a5e3d2011-03-11 20:07:33 -05002160 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2161 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002162 err = 0;
2163 if (IS_ERR(rth))
2164 err = PTR_ERR(rth);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002165 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002166
2167no_route:
2168 RT_CACHE_STAT_INC(in_no_route);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002169 res.type = RTN_UNREACHABLE;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08002170 if (err == -ESRCH)
2171 err = -ENETUNREACH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002172 goto local_input;
2173
2174 /*
2175 * Do not cache martian addresses: they should be logged (RFC1812)
2176 */
2177martian_destination:
2178 RT_CACHE_STAT_INC(in_martian_dst);
2179#ifdef CONFIG_IP_ROUTE_VERBOSE
Joe Perchese87cc472012-05-13 21:56:26 +00002180 if (IN_DEV_LOG_MARTIANS(in_dev))
2181 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2182 &daddr, &saddr, dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002183#endif
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002184
Linus Torvalds1da177e2005-04-16 15:20:36 -07002185e_inval:
2186 err = -EINVAL;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002187 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002188
2189e_nobufs:
2190 err = -ENOBUFS;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002191 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002192
2193martian_source:
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002194 err = -EINVAL;
2195martian_source_keep_err:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002196 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002197 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002198}
2199
Eric Dumazet407eadd2010-05-10 11:32:55 +00002200int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
David S. Millerc10237e2012-06-27 17:05:06 -07002201 u8 tos, struct net_device *dev, bool noref)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002202{
Eric Dumazet95c96172012-04-15 05:58:06 +00002203 struct rtable *rth;
2204 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002205 int iif = dev->ifindex;
Denis V. Lunevb5921912008-01-22 23:50:25 -08002206 struct net *net;
Eric Dumazet96d36222010-06-02 19:21:31 +00002207 int res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002208
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002209 net = dev_net(dev);
Neil Horman1080d702008-10-27 12:28:25 -07002210
Eric Dumazet96d36222010-06-02 19:21:31 +00002211 rcu_read_lock();
2212
Neil Horman1080d702008-10-27 12:28:25 -07002213 if (!rt_caching(net))
2214 goto skip_cache;
2215
Linus Torvalds1da177e2005-04-16 15:20:36 -07002216 tos &= IPTOS_RT_MASK;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002217 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002218
Linus Torvalds1da177e2005-04-16 15:20:36 -07002219 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Changli Gaod8d1f302010-06-10 23:31:35 -07002220 rth = rcu_dereference(rth->dst.rt_next)) {
David S. Miller5e2b61f2011-03-04 21:47:09 -08002221 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2222 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
Julian Anastasov97a80412011-08-09 04:01:16 +00002223 (rth->rt_route_iif ^ iif) |
David S. Miller475949d2011-05-03 19:45:15 -07002224 (rth->rt_key_tos ^ tos)) == 0 &&
David S. Miller5e2b61f2011-03-04 21:47:09 -08002225 rth->rt_mark == skb->mark &&
Changli Gaod8d1f302010-06-10 23:31:35 -07002226 net_eq(dev_net(rth->dst.dev), net) &&
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002227 !rt_is_expired(rth)) {
Eric Dumazet407eadd2010-05-10 11:32:55 +00002228 if (noref) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002229 dst_use_noref(&rth->dst, jiffies);
2230 skb_dst_set_noref(skb, &rth->dst);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002231 } else {
Changli Gaod8d1f302010-06-10 23:31:35 -07002232 dst_use(&rth->dst, jiffies);
2233 skb_dst_set(skb, &rth->dst);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002234 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002235 RT_CACHE_STAT_INC(in_hit);
2236 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002237 return 0;
2238 }
2239 RT_CACHE_STAT_INC(in_hlist_search);
2240 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002241
Neil Horman1080d702008-10-27 12:28:25 -07002242skip_cache:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002243 /* Multicast recognition logic is moved from route cache to here.
2244 The problem was that too many Ethernet cards have broken/missing
2245 hardware multicast filters :-( As result the host on multicasting
2246 network acquires a lot of useless route cache entries, sort of
2247 SDR messages from all the world. Now we try to get rid of them.
2248 Really, provided software IP multicast filter is organized
2249 reasonably (at least, hashed), it does not result in a slowdown
2250 comparing with route cache reject entries.
2251 Note, that multicast routers are not affected, because
2252 route cache entry is created eventually.
2253 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002254 if (ipv4_is_multicast(daddr)) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002255 struct in_device *in_dev = __in_dev_get_rcu(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002256
Eric Dumazet96d36222010-06-02 19:21:31 +00002257 if (in_dev) {
David S. Millerdbdd9a52011-03-10 16:34:38 -08002258 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2259 ip_hdr(skb)->protocol);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002260 if (our
2261#ifdef CONFIG_IP_MROUTE
Joe Perches9d4fb272009-11-23 10:41:23 -08002262 ||
2263 (!ipv4_is_local_multicast(daddr) &&
2264 IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002265#endif
Joe Perches9d4fb272009-11-23 10:41:23 -08002266 ) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002267 int res = ip_route_input_mc(skb, daddr, saddr,
2268 tos, dev, our);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002269 rcu_read_unlock();
Eric Dumazet96d36222010-06-02 19:21:31 +00002270 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002271 }
2272 }
2273 rcu_read_unlock();
2274 return -EINVAL;
2275 }
David S. Millerc10237e2012-06-27 17:05:06 -07002276 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
Eric Dumazet96d36222010-06-02 19:21:31 +00002277 rcu_read_unlock();
2278 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002279}
Eric Dumazet407eadd2010-05-10 11:32:55 +00002280EXPORT_SYMBOL(ip_route_input_common);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002281
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002282/* called with rcu_read_lock() */
David S. Miller982721f2011-02-16 21:44:24 -08002283static struct rtable *__mkroute_output(const struct fib_result *res,
David S. Miller68a5e3d2011-03-11 20:07:33 -05002284 const struct flowi4 *fl4,
David S. Miller813b3b52011-04-28 14:48:42 -07002285 __be32 orig_daddr, __be32 orig_saddr,
Julian Anastasovf61759e2011-12-02 11:39:42 +00002286 int orig_oif, __u8 orig_rtos,
2287 struct net_device *dev_out,
David S. Miller5ada5522011-02-17 15:29:00 -08002288 unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002289{
David S. Miller982721f2011-02-16 21:44:24 -08002290 struct fib_info *fi = res->fi;
David S. Miller5ada5522011-02-17 15:29:00 -08002291 struct in_device *in_dev;
David S. Miller982721f2011-02-16 21:44:24 -08002292 u16 type = res->type;
David S. Miller5ada5522011-02-17 15:29:00 -08002293 struct rtable *rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002294
Thomas Grafd0daebc32012-06-12 00:44:01 +00002295 in_dev = __in_dev_get_rcu(dev_out);
2296 if (!in_dev)
David S. Miller5ada5522011-02-17 15:29:00 -08002297 return ERR_PTR(-EINVAL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002298
Thomas Grafd0daebc32012-06-12 00:44:01 +00002299 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2300 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2301 return ERR_PTR(-EINVAL);
2302
David S. Miller68a5e3d2011-03-11 20:07:33 -05002303 if (ipv4_is_lbcast(fl4->daddr))
David S. Miller982721f2011-02-16 21:44:24 -08002304 type = RTN_BROADCAST;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002305 else if (ipv4_is_multicast(fl4->daddr))
David S. Miller982721f2011-02-16 21:44:24 -08002306 type = RTN_MULTICAST;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002307 else if (ipv4_is_zeronet(fl4->daddr))
David S. Miller5ada5522011-02-17 15:29:00 -08002308 return ERR_PTR(-EINVAL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002309
2310 if (dev_out->flags & IFF_LOOPBACK)
2311 flags |= RTCF_LOCAL;
2312
David S. Miller982721f2011-02-16 21:44:24 -08002313 if (type == RTN_BROADCAST) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002314 flags |= RTCF_BROADCAST | RTCF_LOCAL;
David S. Miller982721f2011-02-16 21:44:24 -08002315 fi = NULL;
2316 } else if (type == RTN_MULTICAST) {
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002317 flags |= RTCF_MULTICAST | RTCF_LOCAL;
David S. Miller813b3b52011-04-28 14:48:42 -07002318 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2319 fl4->flowi4_proto))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002320 flags &= ~RTCF_LOCAL;
2321 /* If multicast route do not exist use
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002322 * default one, but do not gateway in this case.
2323 * Yes, it is hack.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002324 */
David S. Miller982721f2011-02-16 21:44:24 -08002325 if (fi && res->prefixlen < 4)
2326 fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002327 }
2328
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002329 rth = rt_dst_alloc(dev_out,
2330 IN_DEV_CONF_GET(in_dev, NOPOLICY),
David S. Miller0c4dcd52011-02-17 15:42:37 -08002331 IN_DEV_CONF_GET(in_dev, NOXFRM));
Dimitris Michailidis8391d072010-10-07 14:48:38 +00002332 if (!rth)
David S. Miller5ada5522011-02-17 15:29:00 -08002333 return ERR_PTR(-ENOBUFS);
Dimitris Michailidis8391d072010-10-07 14:48:38 +00002334
David S. Millercf911662011-04-28 14:31:47 -07002335 rth->dst.output = ip_output;
2336
David S. Miller813b3b52011-04-28 14:48:42 -07002337 rth->rt_key_dst = orig_daddr;
2338 rth->rt_key_src = orig_saddr;
David S. Millercf911662011-04-28 14:31:47 -07002339 rth->rt_genid = rt_genid(dev_net(dev_out));
2340 rth->rt_flags = flags;
2341 rth->rt_type = type;
Julian Anastasovf61759e2011-12-02 11:39:42 +00002342 rth->rt_key_tos = orig_rtos;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002343 rth->rt_dst = fl4->daddr;
2344 rth->rt_src = fl4->saddr;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002345 rth->rt_route_iif = 0;
David S. Miller813b3b52011-04-28 14:48:42 -07002346 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2347 rth->rt_oif = orig_oif;
2348 rth->rt_mark = fl4->flowi4_mark;
David S. Miller59436342012-07-10 06:58:42 -07002349 rth->rt_pmtu = 0;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002350 rth->rt_gateway = fl4->daddr;
David S. Millercf911662011-04-28 14:31:47 -07002351 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002352
2353 RT_CACHE_STAT_INC(out_slow_tot);
2354
David S. Miller41347dc2012-06-28 04:05:27 -07002355 if (flags & RTCF_LOCAL)
Changli Gaod8d1f302010-06-10 23:31:35 -07002356 rth->dst.input = ip_local_deliver;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002357 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002358 if (flags & RTCF_LOCAL &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002359 !(dev_out->flags & IFF_LOOPBACK)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002360 rth->dst.output = ip_mc_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002361 RT_CACHE_STAT_INC(out_slow_mc);
2362 }
2363#ifdef CONFIG_IP_MROUTE
David S. Miller982721f2011-02-16 21:44:24 -08002364 if (type == RTN_MULTICAST) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002365 if (IN_DEV_MFORWARD(in_dev) &&
David S. Miller813b3b52011-04-28 14:48:42 -07002366 !ipv4_is_local_multicast(fl4->daddr)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002367 rth->dst.input = ip_mr_input;
2368 rth->dst.output = ip_mc_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002369 }
2370 }
2371#endif
2372 }
2373
David S. Miller813b3b52011-04-28 14:48:42 -07002374 rt_set_nexthop(rth, fl4, res, fi, type, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002375
Eric Dumazet7586ece2012-06-20 05:02:19 +00002376 if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
2377 rth->dst.flags |= DST_NOCACHE;
2378
David S. Miller5ada5522011-02-17 15:29:00 -08002379 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002380}
2381
Linus Torvalds1da177e2005-04-16 15:20:36 -07002382/*
2383 * Major route resolver routine.
Eric Dumazet0197aa32010-09-30 03:33:58 +00002384 * called with rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002385 */
2386
David S. Miller813b3b52011-04-28 14:48:42 -07002387static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002388{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002389 struct net_device *dev_out = NULL;
Julian Anastasovf61759e2011-12-02 11:39:42 +00002390 __u8 tos = RT_FL_TOS(fl4);
David S. Miller813b3b52011-04-28 14:48:42 -07002391 unsigned int flags = 0;
2392 struct fib_result res;
David S. Miller5ada5522011-02-17 15:29:00 -08002393 struct rtable *rth;
David S. Miller813b3b52011-04-28 14:48:42 -07002394 __be32 orig_daddr;
2395 __be32 orig_saddr;
2396 int orig_oif;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002397
2398 res.fi = NULL;
David S. Miller8b96d222012-06-11 02:01:56 -07002399 res.table = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002400#ifdef CONFIG_IP_MULTIPLE_TABLES
2401 res.r = NULL;
2402#endif
2403
David S. Miller813b3b52011-04-28 14:48:42 -07002404 orig_daddr = fl4->daddr;
2405 orig_saddr = fl4->saddr;
2406 orig_oif = fl4->flowi4_oif;
2407
2408 fl4->flowi4_iif = net->loopback_dev->ifindex;
2409 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2410 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2411 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
David S. Miller44713b62011-03-04 21:24:47 -08002412
David S. Miller010c2702011-02-17 15:37:09 -08002413 rcu_read_lock();
David S. Miller813b3b52011-04-28 14:48:42 -07002414 if (fl4->saddr) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002415 rth = ERR_PTR(-EINVAL);
David S. Miller813b3b52011-04-28 14:48:42 -07002416 if (ipv4_is_multicast(fl4->saddr) ||
2417 ipv4_is_lbcast(fl4->saddr) ||
2418 ipv4_is_zeronet(fl4->saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002419 goto out;
2420
Linus Torvalds1da177e2005-04-16 15:20:36 -07002421 /* I removed check for oif == dev_out->oif here.
2422 It was wrong for two reasons:
Denis V. Lunev1ab35272008-01-22 22:04:30 -08002423 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2424 is assigned to multiple interfaces.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002425 2. Moreover, we are allowed to send packets with saddr
2426 of another iface. --ANK
2427 */
2428
David S. Miller813b3b52011-04-28 14:48:42 -07002429 if (fl4->flowi4_oif == 0 &&
2430 (ipv4_is_multicast(fl4->daddr) ||
2431 ipv4_is_lbcast(fl4->daddr))) {
Julian Anastasova210d012008-10-01 07:28:28 -07002432 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002433 dev_out = __ip_dev_find(net, fl4->saddr, false);
Julian Anastasova210d012008-10-01 07:28:28 -07002434 if (dev_out == NULL)
2435 goto out;
2436
Linus Torvalds1da177e2005-04-16 15:20:36 -07002437 /* Special hack: user can direct multicasts
2438 and limited broadcast via necessary interface
2439 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2440 This hack is not just for fun, it allows
2441 vic,vat and friends to work.
2442 They bind socket to loopback, set ttl to zero
2443 and expect that it will work.
2444 From the viewpoint of routing cache they are broken,
2445 because we are not allowed to build multicast path
2446 with loopback source addr (look, routing cache
2447 cannot know, that ttl is zero, so that packet
2448 will not leave this host and route is valid).
2449 Luckily, this hack is good workaround.
2450 */
2451
David S. Miller813b3b52011-04-28 14:48:42 -07002452 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002453 goto make_route;
2454 }
Julian Anastasova210d012008-10-01 07:28:28 -07002455
David S. Miller813b3b52011-04-28 14:48:42 -07002456 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
Julian Anastasova210d012008-10-01 07:28:28 -07002457 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002458 if (!__ip_dev_find(net, fl4->saddr, false))
Julian Anastasova210d012008-10-01 07:28:28 -07002459 goto out;
Julian Anastasova210d012008-10-01 07:28:28 -07002460 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002461 }
2462
2463
David S. Miller813b3b52011-04-28 14:48:42 -07002464 if (fl4->flowi4_oif) {
2465 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002466 rth = ERR_PTR(-ENODEV);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002467 if (dev_out == NULL)
2468 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002469
2470 /* RACE: Check return value of inet_select_addr instead. */
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002471 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002472 rth = ERR_PTR(-ENETUNREACH);
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002473 goto out;
2474 }
David S. Miller813b3b52011-04-28 14:48:42 -07002475 if (ipv4_is_local_multicast(fl4->daddr) ||
2476 ipv4_is_lbcast(fl4->daddr)) {
2477 if (!fl4->saddr)
2478 fl4->saddr = inet_select_addr(dev_out, 0,
2479 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002480 goto make_route;
2481 }
David S. Miller813b3b52011-04-28 14:48:42 -07002482 if (fl4->saddr) {
2483 if (ipv4_is_multicast(fl4->daddr))
2484 fl4->saddr = inet_select_addr(dev_out, 0,
2485 fl4->flowi4_scope);
2486 else if (!fl4->daddr)
2487 fl4->saddr = inet_select_addr(dev_out, 0,
2488 RT_SCOPE_HOST);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002489 }
2490 }
2491
David S. Miller813b3b52011-04-28 14:48:42 -07002492 if (!fl4->daddr) {
2493 fl4->daddr = fl4->saddr;
2494 if (!fl4->daddr)
2495 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002496 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002497 fl4->flowi4_oif = net->loopback_dev->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002498 res.type = RTN_LOCAL;
2499 flags |= RTCF_LOCAL;
2500 goto make_route;
2501 }
2502
David S. Miller813b3b52011-04-28 14:48:42 -07002503 if (fib_lookup(net, fl4, &res)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002504 res.fi = NULL;
David S. Miller8b96d222012-06-11 02:01:56 -07002505 res.table = NULL;
David S. Miller813b3b52011-04-28 14:48:42 -07002506 if (fl4->flowi4_oif) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002507 /* Apparently, routing tables are wrong. Assume,
2508 that the destination is on link.
2509
2510 WHY? DW.
2511 Because we are allowed to send to iface
2512 even if it has NO routes and NO assigned
2513 addresses. When oif is specified, routing
2514 tables are looked up with only one purpose:
2515 to catch if destination is gatewayed, rather than
2516 direct. Moreover, if MSG_DONTROUTE is set,
2517 we send packet, ignoring both routing tables
2518 and ifaddr state. --ANK
2519
2520
2521 We could make it even if oif is unknown,
2522 likely IPv6, but we do not.
2523 */
2524
David S. Miller813b3b52011-04-28 14:48:42 -07002525 if (fl4->saddr == 0)
2526 fl4->saddr = inet_select_addr(dev_out, 0,
2527 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002528 res.type = RTN_UNICAST;
2529 goto make_route;
2530 }
David S. Millerb23dd4f2011-03-02 14:31:35 -08002531 rth = ERR_PTR(-ENETUNREACH);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002532 goto out;
2533 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002534
2535 if (res.type == RTN_LOCAL) {
David S. Miller813b3b52011-04-28 14:48:42 -07002536 if (!fl4->saddr) {
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002537 if (res.fi->fib_prefsrc)
David S. Miller813b3b52011-04-28 14:48:42 -07002538 fl4->saddr = res.fi->fib_prefsrc;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002539 else
David S. Miller813b3b52011-04-28 14:48:42 -07002540 fl4->saddr = fl4->daddr;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002541 }
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002542 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002543 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002544 res.fi = NULL;
2545 flags |= RTCF_LOCAL;
2546 goto make_route;
2547 }
2548
2549#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Miller813b3b52011-04-28 14:48:42 -07002550 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002551 fib_select_multipath(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002552 else
2553#endif
David S. Miller21d8c492011-04-14 14:49:37 -07002554 if (!res.prefixlen &&
2555 res.table->tb_num_default > 1 &&
David S. Miller813b3b52011-04-28 14:48:42 -07002556 res.type == RTN_UNICAST && !fl4->flowi4_oif)
David S. Miller0c838ff2011-01-31 16:16:50 -08002557 fib_select_default(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002558
David S. Miller813b3b52011-04-28 14:48:42 -07002559 if (!fl4->saddr)
2560 fl4->saddr = FIB_RES_PREFSRC(net, res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002561
Linus Torvalds1da177e2005-04-16 15:20:36 -07002562 dev_out = FIB_RES_DEV(res);
David S. Miller813b3b52011-04-28 14:48:42 -07002563 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002564
2565
2566make_route:
David S. Miller813b3b52011-04-28 14:48:42 -07002567 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
Julian Anastasovf61759e2011-12-02 11:39:42 +00002568 tos, dev_out, flags);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002569 if (!IS_ERR(rth)) {
David S. Miller5ada5522011-02-17 15:29:00 -08002570 unsigned int hash;
2571
David S. Miller813b3b52011-04-28 14:48:42 -07002572 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
David S. Miller5ada5522011-02-17 15:29:00 -08002573 rt_genid(dev_net(dev_out)));
David S. Miller813b3b52011-04-28 14:48:42 -07002574 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
David S. Miller5ada5522011-02-17 15:29:00 -08002575 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002576
David S. Miller010c2702011-02-17 15:37:09 -08002577out:
2578 rcu_read_unlock();
David S. Millerb23dd4f2011-03-02 14:31:35 -08002579 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002580}
2581
David S. Miller813b3b52011-04-28 14:48:42 -07002582struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002583{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002584 struct rtable *rth;
David S. Miller010c2702011-02-17 15:37:09 -08002585 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002586
Neil Horman1080d702008-10-27 12:28:25 -07002587 if (!rt_caching(net))
2588 goto slow_output;
2589
David S. Miller9d6ec932011-03-12 01:12:47 -05002590 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002591
2592 rcu_read_lock_bh();
Paul E. McKenneya898def2010-02-22 17:04:49 -08002593 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
Changli Gaod8d1f302010-06-10 23:31:35 -07002594 rth = rcu_dereference_bh(rth->dst.rt_next)) {
David S. Miller9d6ec932011-03-12 01:12:47 -05002595 if (rth->rt_key_dst == flp4->daddr &&
2596 rth->rt_key_src == flp4->saddr &&
David S. Millerc7537962010-11-11 17:07:48 -08002597 rt_is_output_route(rth) &&
David S. Miller9d6ec932011-03-12 01:12:47 -05002598 rth->rt_oif == flp4->flowi4_oif &&
2599 rth->rt_mark == flp4->flowi4_mark &&
David S. Miller475949d2011-05-03 19:45:15 -07002600 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
Denis V. Lunevb5921912008-01-22 23:50:25 -08002601 (IPTOS_RT_MASK | RTO_ONLINK)) &&
Changli Gaod8d1f302010-06-10 23:31:35 -07002602 net_eq(dev_net(rth->dst.dev), net) &&
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002603 !rt_is_expired(rth)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002604 dst_use(&rth->dst, jiffies);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002605 RT_CACHE_STAT_INC(out_hit);
2606 rcu_read_unlock_bh();
David S. Miller56157872011-05-02 14:37:45 -07002607 if (!flp4->saddr)
2608 flp4->saddr = rth->rt_src;
2609 if (!flp4->daddr)
2610 flp4->daddr = rth->rt_dst;
David S. Millerb23dd4f2011-03-02 14:31:35 -08002611 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002612 }
2613 RT_CACHE_STAT_INC(out_hlist_search);
2614 }
2615 rcu_read_unlock_bh();
2616
Neil Horman1080d702008-10-27 12:28:25 -07002617slow_output:
David S. Miller9d6ec932011-03-12 01:12:47 -05002618 return ip_route_output_slow(net, flp4);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002619}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002620EXPORT_SYMBOL_GPL(__ip_route_output_key);
2621
Jianzhao Wangae2688d2010-09-08 14:35:43 -07002622static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2623{
2624 return NULL;
2625}
2626
Steffen Klassertebb762f2011-11-23 02:12:51 +00002627static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
Roland Dreierec831ea2011-01-31 13:16:00 -08002628{
Steffen Klassert618f9bc2011-11-23 02:13:31 +00002629 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2630
2631 return mtu ? : dst->dev->mtu;
Roland Dreierec831ea2011-01-31 13:16:00 -08002632}
2633
David S. Miller14e50e52007-05-24 18:17:54 -07002634static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2635{
2636}
2637
Held Bernhard0972ddb2011-04-24 22:07:32 +00002638static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2639 unsigned long old)
2640{
2641 return NULL;
2642}
2643
David S. Miller14e50e52007-05-24 18:17:54 -07002644static struct dst_ops ipv4_dst_blackhole_ops = {
2645 .family = AF_INET,
Harvey Harrison09640e62009-02-01 00:45:17 -08002646 .protocol = cpu_to_be16(ETH_P_IP),
David S. Miller14e50e52007-05-24 18:17:54 -07002647 .destroy = ipv4_dst_destroy,
Jianzhao Wangae2688d2010-09-08 14:35:43 -07002648 .check = ipv4_blackhole_dst_check,
Steffen Klassertebb762f2011-11-23 02:12:51 +00002649 .mtu = ipv4_blackhole_mtu,
Eric Dumazet214f45c2011-02-18 11:39:01 -08002650 .default_advmss = ipv4_default_advmss,
David S. Miller14e50e52007-05-24 18:17:54 -07002651 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
Held Bernhard0972ddb2011-04-24 22:07:32 +00002652 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
David S. Millerd3aaeb32011-07-18 00:40:17 -07002653 .neigh_lookup = ipv4_neigh_lookup,
David S. Miller14e50e52007-05-24 18:17:54 -07002654};
2655
David S. Miller2774c132011-03-01 14:59:04 -08002656struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
David S. Miller14e50e52007-05-24 18:17:54 -07002657{
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002658 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
David S. Miller2774c132011-03-01 14:59:04 -08002659 struct rtable *ort = (struct rtable *) dst_orig;
David S. Miller14e50e52007-05-24 18:17:54 -07002660
2661 if (rt) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002662 struct dst_entry *new = &rt->dst;
David S. Miller14e50e52007-05-24 18:17:54 -07002663
David S. Miller14e50e52007-05-24 18:17:54 -07002664 new->__use = 1;
Herbert Xu352e5122007-11-13 21:34:06 -08002665 new->input = dst_discard;
2666 new->output = dst_discard;
David S. Miller14e50e52007-05-24 18:17:54 -07002667
Changli Gaod8d1f302010-06-10 23:31:35 -07002668 new->dev = ort->dst.dev;
David S. Miller14e50e52007-05-24 18:17:54 -07002669 if (new->dev)
2670 dev_hold(new->dev);
2671
David S. Miller5e2b61f2011-03-04 21:47:09 -08002672 rt->rt_key_dst = ort->rt_key_dst;
2673 rt->rt_key_src = ort->rt_key_src;
David S. Miller475949d2011-05-03 19:45:15 -07002674 rt->rt_key_tos = ort->rt_key_tos;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002675 rt->rt_route_iif = ort->rt_route_iif;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002676 rt->rt_iif = ort->rt_iif;
2677 rt->rt_oif = ort->rt_oif;
2678 rt->rt_mark = ort->rt_mark;
David S. Miller59436342012-07-10 06:58:42 -07002679 rt->rt_pmtu = ort->rt_pmtu;
David S. Miller14e50e52007-05-24 18:17:54 -07002680
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002681 rt->rt_genid = rt_genid(net);
David S. Miller14e50e52007-05-24 18:17:54 -07002682 rt->rt_flags = ort->rt_flags;
2683 rt->rt_type = ort->rt_type;
2684 rt->rt_dst = ort->rt_dst;
2685 rt->rt_src = ort->rt_src;
David S. Miller14e50e52007-05-24 18:17:54 -07002686 rt->rt_gateway = ort->rt_gateway;
David S. Miller62fa8a82011-01-26 20:51:05 -08002687 rt->fi = ort->fi;
2688 if (rt->fi)
2689 atomic_inc(&rt->fi->fib_clntref);
David S. Miller14e50e52007-05-24 18:17:54 -07002690
2691 dst_free(new);
2692 }
2693
David S. Miller2774c132011-03-01 14:59:04 -08002694 dst_release(dst_orig);
2695
2696 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
David S. Miller14e50e52007-05-24 18:17:54 -07002697}
2698
David S. Miller9d6ec932011-03-12 01:12:47 -05002699struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
David S. Millerb23dd4f2011-03-02 14:31:35 -08002700 struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002701{
David S. Miller9d6ec932011-03-12 01:12:47 -05002702 struct rtable *rt = __ip_route_output_key(net, flp4);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002703
David S. Millerb23dd4f2011-03-02 14:31:35 -08002704 if (IS_ERR(rt))
2705 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002706
David S. Miller56157872011-05-02 14:37:45 -07002707 if (flp4->flowi4_proto)
David S. Miller9d6ec932011-03-12 01:12:47 -05002708 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2709 flowi4_to_flowi(flp4),
2710 sk, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002711
David S. Millerb23dd4f2011-03-02 14:31:35 -08002712 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002713}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002714EXPORT_SYMBOL_GPL(ip_route_output_flow);
2715
Benjamin Thery4feb88e2009-01-22 04:56:23 +00002716static int rt_fill_info(struct net *net,
2717 struct sk_buff *skb, u32 pid, u32 seq, int event,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002718 int nowait, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002719{
Eric Dumazet511c3f92009-06-02 05:14:27 +00002720 struct rtable *rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002721 struct rtmsg *r;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002722 struct nlmsghdr *nlh;
Steffen Klassert2bc8ca42011-10-11 01:12:02 +00002723 unsigned long expires = 0;
David S. Millerf1850712012-07-10 07:26:01 -07002724 u32 error;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002725
2726 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2727 if (nlh == NULL)
Patrick McHardy26932562007-01-31 23:16:40 -08002728 return -EMSGSIZE;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002729
2730 r = nlmsg_data(nlh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002731 r->rtm_family = AF_INET;
2732 r->rtm_dst_len = 32;
2733 r->rtm_src_len = 0;
David S. Miller475949d2011-05-03 19:45:15 -07002734 r->rtm_tos = rt->rt_key_tos;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002735 r->rtm_table = RT_TABLE_MAIN;
David S. Millerf3756b72012-04-01 20:39:02 -04002736 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2737 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002738 r->rtm_type = rt->rt_type;
2739 r->rtm_scope = RT_SCOPE_UNIVERSE;
2740 r->rtm_protocol = RTPROT_UNSPEC;
2741 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2742 if (rt->rt_flags & RTCF_NOTIFY)
2743 r->rtm_flags |= RTM_F_NOTIFY;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002744
David S. Millerf3756b72012-04-01 20:39:02 -04002745 if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2746 goto nla_put_failure;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002747 if (rt->rt_key_src) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002748 r->rtm_src_len = 32;
David S. Millerf3756b72012-04-01 20:39:02 -04002749 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2750 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002751 }
David S. Millerf3756b72012-04-01 20:39:02 -04002752 if (rt->dst.dev &&
2753 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2754 goto nla_put_failure;
Patrick McHardyc7066f72011-01-14 13:36:42 +01002755#ifdef CONFIG_IP_ROUTE_CLASSID
David S. Millerf3756b72012-04-01 20:39:02 -04002756 if (rt->dst.tclassid &&
2757 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2758 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002759#endif
David S. Miller41347dc2012-06-28 04:05:27 -07002760 if (!rt_is_input_route(rt) &&
2761 rt->rt_src != rt->rt_key_src) {
David S. Millerf3756b72012-04-01 20:39:02 -04002762 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2763 goto nla_put_failure;
2764 }
2765 if (rt->rt_dst != rt->rt_gateway &&
2766 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2767 goto nla_put_failure;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002768
David S. Millerdefb3512010-12-08 21:16:57 -08002769 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002770 goto nla_put_failure;
2771
David S. Millerf3756b72012-04-01 20:39:02 -04002772 if (rt->rt_mark &&
2773 nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2774 goto nla_put_failure;
Eric Dumazet963bfee2010-07-20 22:03:14 +00002775
Changli Gaod8d1f302010-06-10 23:31:35 -07002776 error = rt->dst.error;
David S. Miller59436342012-07-10 06:58:42 -07002777 expires = rt->dst.expires;
2778 if (expires) {
2779 if (time_before(jiffies, expires))
2780 expires -= jiffies;
2781 else
2782 expires = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002783 }
Thomas Grafbe403ea2006-08-17 18:15:17 -07002784
David S. Millerc7537962010-11-11 17:07:48 -08002785 if (rt_is_input_route(rt)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002786#ifdef CONFIG_IP_MROUTE
Al Viroe4485152006-09-26 22:15:01 -07002787 __be32 dst = rt->rt_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002788
Joe Perchesf97c1e02007-12-16 13:45:43 -08002789 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
Benjamin Thery4feb88e2009-01-22 04:56:23 +00002790 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
David S. Miller9a1b9492011-05-04 12:18:54 -07002791 int err = ipmr_get_route(net, skb,
2792 rt->rt_src, rt->rt_dst,
2793 r, nowait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002794 if (err <= 0) {
2795 if (!nowait) {
2796 if (err == 0)
2797 return 0;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002798 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002799 } else {
2800 if (err == -EMSGSIZE)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002801 goto nla_put_failure;
Thomas Grafe3703b32006-11-27 09:27:07 -08002802 error = err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002803 }
2804 }
2805 } else
2806#endif
David S. Millerf3756b72012-04-01 20:39:02 -04002807 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2808 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002809 }
2810
David S. Millerf1850712012-07-10 07:26:01 -07002811 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
Thomas Grafe3703b32006-11-27 09:27:07 -08002812 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002813
Thomas Grafbe403ea2006-08-17 18:15:17 -07002814 return nlmsg_end(skb, nlh);
2815
2816nla_put_failure:
Patrick McHardy26932562007-01-31 23:16:40 -08002817 nlmsg_cancel(skb, nlh);
2818 return -EMSGSIZE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002819}
2820
Daniel Baluta5e73ea12012-04-15 01:34:41 +00002821static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002822{
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09002823 struct net *net = sock_net(in_skb->sk);
Thomas Grafd889ce32006-08-17 18:15:44 -07002824 struct rtmsg *rtm;
2825 struct nlattr *tb[RTA_MAX+1];
Linus Torvalds1da177e2005-04-16 15:20:36 -07002826 struct rtable *rt = NULL;
Al Viro9e12bb22006-09-26 21:25:20 -07002827 __be32 dst = 0;
2828 __be32 src = 0;
2829 u32 iif;
Thomas Grafd889ce32006-08-17 18:15:44 -07002830 int err;
Eric Dumazet963bfee2010-07-20 22:03:14 +00002831 int mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002832 struct sk_buff *skb;
2833
Thomas Grafd889ce32006-08-17 18:15:44 -07002834 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2835 if (err < 0)
2836 goto errout;
2837
2838 rtm = nlmsg_data(nlh);
2839
Linus Torvalds1da177e2005-04-16 15:20:36 -07002840 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
Thomas Grafd889ce32006-08-17 18:15:44 -07002841 if (skb == NULL) {
2842 err = -ENOBUFS;
2843 goto errout;
2844 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002845
2846 /* Reserve room for dummy headers, this skb can pass
2847 through good chunk of routing engine.
2848 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07002849 skb_reset_mac_header(skb);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07002850 skb_reset_network_header(skb);
Stephen Hemmingerd2c962b2006-04-17 17:27:11 -07002851
2852 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07002853 ip_hdr(skb)->protocol = IPPROTO_ICMP;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002854 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2855
Al Viro17fb2c62006-09-26 22:15:25 -07002856 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2857 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
Thomas Grafd889ce32006-08-17 18:15:44 -07002858 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
Eric Dumazet963bfee2010-07-20 22:03:14 +00002859 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002860
2861 if (iif) {
Thomas Grafd889ce32006-08-17 18:15:44 -07002862 struct net_device *dev;
2863
Denis V. Lunev19375042008-02-28 20:52:04 -08002864 dev = __dev_get_by_index(net, iif);
Thomas Grafd889ce32006-08-17 18:15:44 -07002865 if (dev == NULL) {
2866 err = -ENODEV;
2867 goto errout_free;
2868 }
2869
Linus Torvalds1da177e2005-04-16 15:20:36 -07002870 skb->protocol = htons(ETH_P_IP);
2871 skb->dev = dev;
Eric Dumazet963bfee2010-07-20 22:03:14 +00002872 skb->mark = mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002873 local_bh_disable();
2874 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2875 local_bh_enable();
Thomas Grafd889ce32006-08-17 18:15:44 -07002876
Eric Dumazet511c3f92009-06-02 05:14:27 +00002877 rt = skb_rtable(skb);
Changli Gaod8d1f302010-06-10 23:31:35 -07002878 if (err == 0 && rt->dst.error)
2879 err = -rt->dst.error;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002880 } else {
David S. Miller68a5e3d2011-03-11 20:07:33 -05002881 struct flowi4 fl4 = {
2882 .daddr = dst,
2883 .saddr = src,
2884 .flowi4_tos = rtm->rtm_tos,
2885 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2886 .flowi4_mark = mark,
Thomas Grafd889ce32006-08-17 18:15:44 -07002887 };
David S. Miller9d6ec932011-03-12 01:12:47 -05002888 rt = ip_route_output_key(net, &fl4);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002889
2890 err = 0;
2891 if (IS_ERR(rt))
2892 err = PTR_ERR(rt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002893 }
Thomas Grafd889ce32006-08-17 18:15:44 -07002894
Linus Torvalds1da177e2005-04-16 15:20:36 -07002895 if (err)
Thomas Grafd889ce32006-08-17 18:15:44 -07002896 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002897
Changli Gaod8d1f302010-06-10 23:31:35 -07002898 skb_dst_set(skb, &rt->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002899 if (rtm->rtm_flags & RTM_F_NOTIFY)
2900 rt->rt_flags |= RTCF_NOTIFY;
2901
Benjamin Thery4feb88e2009-01-22 04:56:23 +00002902 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
Denis V. Lunev19375042008-02-28 20:52:04 -08002903 RTM_NEWROUTE, 0, 0);
Thomas Grafd889ce32006-08-17 18:15:44 -07002904 if (err <= 0)
2905 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002906
Denis V. Lunev19375042008-02-28 20:52:04 -08002907 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
Thomas Grafd889ce32006-08-17 18:15:44 -07002908errout:
Thomas Graf2942e902006-08-15 00:30:25 -07002909 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002910
Thomas Grafd889ce32006-08-17 18:15:44 -07002911errout_free:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002912 kfree_skb(skb);
Thomas Grafd889ce32006-08-17 18:15:44 -07002913 goto errout;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002914}
2915
2916int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2917{
2918 struct rtable *rt;
2919 int h, s_h;
2920 int idx, s_idx;
Denis V. Lunev19375042008-02-28 20:52:04 -08002921 struct net *net;
2922
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09002923 net = sock_net(skb->sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002924
2925 s_h = cb->args[0];
Eric Dumazetd8c92832008-01-07 21:52:14 -08002926 if (s_h < 0)
2927 s_h = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002928 s_idx = idx = cb->args[1];
Eric Dumazeta6272662008-08-28 01:11:25 -07002929 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2930 if (!rt_hash_table[h].chain)
2931 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002932 rcu_read_lock_bh();
Paul E. McKenneya898def2010-02-22 17:04:49 -08002933 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
Changli Gaod8d1f302010-06-10 23:31:35 -07002934 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
2935 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002936 continue;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002937 if (rt_is_expired(rt))
Eric Dumazet29e75252008-01-31 17:05:09 -08002938 continue;
Changli Gaod8d1f302010-06-10 23:31:35 -07002939 skb_dst_set_noref(skb, &rt->dst);
Benjamin Thery4feb88e2009-01-22 04:56:23 +00002940 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002941 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002942 1, NLM_F_MULTI) <= 0) {
Eric Dumazetadf30902009-06-02 05:19:30 +00002943 skb_dst_drop(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002944 rcu_read_unlock_bh();
2945 goto done;
2946 }
Eric Dumazetadf30902009-06-02 05:19:30 +00002947 skb_dst_drop(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002948 }
2949 rcu_read_unlock_bh();
2950 }
2951
2952done:
2953 cb->args[0] = h;
2954 cb->args[1] = idx;
2955 return skb->len;
2956}
2957
2958void ip_rt_multicast_event(struct in_device *in_dev)
2959{
Denis V. Lunev76e6ebf2008-07-05 19:00:44 -07002960 rt_cache_flush(dev_net(in_dev->dev), 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002961}
2962
2963#ifdef CONFIG_SYSCTL
Denis V. Lunev81c684d2008-07-08 03:05:28 -07002964static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07002965 void __user *buffer,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002966 size_t *lenp, loff_t *ppos)
2967{
2968 if (write) {
Denis V. Lunev639e1042008-07-05 19:02:06 -07002969 int flush_delay;
Denis V. Lunev81c684d2008-07-08 03:05:28 -07002970 ctl_table ctl;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07002971 struct net *net;
Denis V. Lunev639e1042008-07-05 19:02:06 -07002972
Denis V. Lunev81c684d2008-07-08 03:05:28 -07002973 memcpy(&ctl, __ctl, sizeof(ctl));
2974 ctl.data = &flush_delay;
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07002975 proc_dointvec(&ctl, write, buffer, lenp, ppos);
Denis V. Lunev639e1042008-07-05 19:02:06 -07002976
Denis V. Lunev81c684d2008-07-08 03:05:28 -07002977 net = (struct net *)__ctl->extra1;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07002978 rt_cache_flush(net, flush_delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002979 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002980 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002981
2982 return -EINVAL;
2983}
2984
Al Viroeeb61f72008-07-27 08:59:33 +01002985static ctl_table ipv4_route_table[] = {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002986 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002987 .procname = "gc_thresh",
2988 .data = &ipv4_dst_ops.gc_thresh,
2989 .maxlen = sizeof(int),
2990 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08002991 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002992 },
2993 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002994 .procname = "max_size",
2995 .data = &ip_rt_max_size,
2996 .maxlen = sizeof(int),
2997 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08002998 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002999 },
3000 {
3001 /* Deprecated. Use gc_min_interval_ms */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003002
Linus Torvalds1da177e2005-04-16 15:20:36 -07003003 .procname = "gc_min_interval",
3004 .data = &ip_rt_gc_min_interval,
3005 .maxlen = sizeof(int),
3006 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003007 .proc_handler = proc_dointvec_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003008 },
3009 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003010 .procname = "gc_min_interval_ms",
3011 .data = &ip_rt_gc_min_interval,
3012 .maxlen = sizeof(int),
3013 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003014 .proc_handler = proc_dointvec_ms_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003015 },
3016 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003017 .procname = "gc_timeout",
3018 .data = &ip_rt_gc_timeout,
3019 .maxlen = sizeof(int),
3020 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003021 .proc_handler = proc_dointvec_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003022 },
3023 {
Eric Dumazet9f28a2f2011-12-21 15:47:16 -05003024 .procname = "gc_interval",
3025 .data = &ip_rt_gc_interval,
3026 .maxlen = sizeof(int),
3027 .mode = 0644,
3028 .proc_handler = proc_dointvec_jiffies,
3029 },
3030 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003031 .procname = "redirect_load",
3032 .data = &ip_rt_redirect_load,
3033 .maxlen = sizeof(int),
3034 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003035 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003036 },
3037 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003038 .procname = "redirect_number",
3039 .data = &ip_rt_redirect_number,
3040 .maxlen = sizeof(int),
3041 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003042 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003043 },
3044 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003045 .procname = "redirect_silence",
3046 .data = &ip_rt_redirect_silence,
3047 .maxlen = sizeof(int),
3048 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003049 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003050 },
3051 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003052 .procname = "error_cost",
3053 .data = &ip_rt_error_cost,
3054 .maxlen = sizeof(int),
3055 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003056 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003057 },
3058 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003059 .procname = "error_burst",
3060 .data = &ip_rt_error_burst,
3061 .maxlen = sizeof(int),
3062 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003063 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003064 },
3065 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003066 .procname = "gc_elasticity",
3067 .data = &ip_rt_gc_elasticity,
3068 .maxlen = sizeof(int),
3069 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003070 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003071 },
3072 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003073 .procname = "mtu_expires",
3074 .data = &ip_rt_mtu_expires,
3075 .maxlen = sizeof(int),
3076 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003077 .proc_handler = proc_dointvec_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003078 },
3079 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003080 .procname = "min_pmtu",
3081 .data = &ip_rt_min_pmtu,
3082 .maxlen = sizeof(int),
3083 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003084 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003085 },
3086 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003087 .procname = "min_adv_mss",
3088 .data = &ip_rt_min_advmss,
3089 .maxlen = sizeof(int),
3090 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003091 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003092 },
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003093 { }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003094};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003095
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003096static struct ctl_table ipv4_route_flush_table[] = {
3097 {
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003098 .procname = "flush",
3099 .maxlen = sizeof(int),
3100 .mode = 0200,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003101 .proc_handler = ipv4_sysctl_rtcache_flush,
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003102 },
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003103 { },
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003104};
3105
3106static __net_init int sysctl_route_net_init(struct net *net)
3107{
3108 struct ctl_table *tbl;
3109
3110 tbl = ipv4_route_flush_table;
Octavian Purdila09ad9bc2009-11-25 15:14:13 -08003111 if (!net_eq(net, &init_net)) {
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003112 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3113 if (tbl == NULL)
3114 goto err_dup;
3115 }
3116 tbl[0].extra1 = net;
3117
Eric W. Biedermanec8f23c2012-04-19 13:44:49 +00003118 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003119 if (net->ipv4.route_hdr == NULL)
3120 goto err_reg;
3121 return 0;
3122
3123err_reg:
3124 if (tbl != ipv4_route_flush_table)
3125 kfree(tbl);
3126err_dup:
3127 return -ENOMEM;
3128}
3129
3130static __net_exit void sysctl_route_net_exit(struct net *net)
3131{
3132 struct ctl_table *tbl;
3133
3134 tbl = net->ipv4.route_hdr->ctl_table_arg;
3135 unregister_net_sysctl_table(net->ipv4.route_hdr);
3136 BUG_ON(tbl == ipv4_route_flush_table);
3137 kfree(tbl);
3138}
3139
3140static __net_initdata struct pernet_operations sysctl_route_ops = {
3141 .init = sysctl_route_net_init,
3142 .exit = sysctl_route_net_exit,
3143};
Linus Torvalds1da177e2005-04-16 15:20:36 -07003144#endif
3145
Neil Horman3ee94372010-05-08 01:57:52 -07003146static __net_init int rt_genid_init(struct net *net)
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003147{
Neil Horman3ee94372010-05-08 01:57:52 -07003148 get_random_bytes(&net->ipv4.rt_genid,
3149 sizeof(net->ipv4.rt_genid));
David S. Miller436c3b62011-03-24 17:42:21 -07003150 get_random_bytes(&net->ipv4.dev_addr_genid,
3151 sizeof(net->ipv4.dev_addr_genid));
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003152 return 0;
3153}
3154
Neil Horman3ee94372010-05-08 01:57:52 -07003155static __net_initdata struct pernet_operations rt_genid_ops = {
3156 .init = rt_genid_init,
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003157};
3158
David S. Millerc3426b42012-06-09 16:27:05 -07003159static int __net_init ipv4_inetpeer_init(struct net *net)
3160{
3161 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3162
3163 if (!bp)
3164 return -ENOMEM;
3165 inet_peer_base_init(bp);
3166 net->ipv4.peers = bp;
3167 return 0;
3168}
3169
3170static void __net_exit ipv4_inetpeer_exit(struct net *net)
3171{
3172 struct inet_peer_base *bp = net->ipv4.peers;
3173
3174 net->ipv4.peers = NULL;
David S. Miller56a6b242012-06-09 16:32:41 -07003175 inetpeer_invalidate_tree(bp);
David S. Millerc3426b42012-06-09 16:27:05 -07003176 kfree(bp);
3177}
3178
3179static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3180 .init = ipv4_inetpeer_init,
3181 .exit = ipv4_inetpeer_exit,
3182};
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003183
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu accounting area, allocated in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003187
3188static __initdata unsigned long rhash_entries;
3189static int __init set_rhash_entries(char *str)
3190{
Eldad Zack413c27d2012-05-19 14:13:18 +00003191 ssize_t ret;
3192
Linus Torvalds1da177e2005-04-16 15:20:36 -07003193 if (!str)
3194 return 0;
Eldad Zack413c27d2012-05-19 14:13:18 +00003195
3196 ret = kstrtoul(str, 0, &rhash_entries);
3197 if (ret)
3198 return 0;
3199
Linus Torvalds1da177e2005-04-16 15:20:36 -07003200 return 1;
3201}
3202__setup("rhash_entries=", set_rhash_entries);
3203
3204int __init ip_rt_init(void)
3205{
Eric Dumazet424c4b72005-07-05 14:58:19 -07003206 int rc = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003207
Patrick McHardyc7066f72011-01-14 13:36:42 +01003208#ifdef CONFIG_IP_ROUTE_CLASSID
Ingo Molnar0dcec8c2009-02-25 14:07:33 +01003209 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003210 if (!ip_rt_acct)
3211 panic("IP: failed to allocate ip_rt_acct\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003212#endif
3213
Alexey Dobriyane5d679f332006-08-26 19:25:52 -07003214 ipv4_dst_ops.kmem_cachep =
3215 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
Paul Mundt20c2df82007-07-20 10:11:58 +09003216 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003217
David S. Miller14e50e52007-05-24 18:17:54 -07003218 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3219
Eric Dumazetfc66f952010-10-08 06:37:34 +00003220 if (dst_entries_init(&ipv4_dst_ops) < 0)
3221 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3222
3223 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3224 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3225
Eric Dumazet424c4b72005-07-05 14:58:19 -07003226 rt_hash_table = (struct rt_hash_bucket *)
3227 alloc_large_system_hash("IP route cache",
3228 sizeof(struct rt_hash_bucket),
3229 rhash_entries,
Jan Beulich44813742009-09-21 17:03:05 -07003230 (totalram_pages >= 128 * 1024) ?
Mike Stroyan18955cf2005-11-29 16:12:55 -08003231 15 : 17,
Kirill Korotaev8d1502d2006-08-07 20:44:22 -07003232 0,
Eric Dumazet424c4b72005-07-05 14:58:19 -07003233 &rt_hash_log,
3234 &rt_hash_mask,
Tim Bird31fe62b2012-05-23 13:33:35 +00003235 0,
Anton Blanchardc9503e02009-04-27 05:42:24 -07003236 rhash_entries ? 0 : 512 * 1024);
Eric Dumazet22c047c2005-07-05 14:55:24 -07003237 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3238 rt_hash_lock_init();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003239
3240 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3241 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3242
Linus Torvalds1da177e2005-04-16 15:20:36 -07003243 devinet_init();
3244 ip_fib_init();
3245
Eric Dumazet9f28a2f2011-12-21 15:47:16 -05003246 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3247 expires_ljiffies = jiffies;
3248 schedule_delayed_work(&expires_work,
3249 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3250
Denis V. Lunev73b38712008-02-28 20:51:18 -08003251 if (ip_rt_proc_init())
Joe Perches058bd4d2012-03-11 18:36:11 +00003252 pr_err("Unable to create route proc files\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003253#ifdef CONFIG_XFRM
3254 xfrm_init();
Neil Hormana33bc5c2009-07-30 18:52:15 -07003255 xfrm4_init(ip_rt_max_size);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003256#endif
Greg Rosec7ac8672011-06-10 01:27:09 +00003257 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
Thomas Graf63f34442007-03-22 11:55:17 -07003258
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003259#ifdef CONFIG_SYSCTL
3260 register_pernet_subsys(&sysctl_route_ops);
3261#endif
Neil Horman3ee94372010-05-08 01:57:52 -07003262 register_pernet_subsys(&rt_genid_ops);
David S. Millerc3426b42012-06-09 16:27:05 -07003263 register_pernet_subsys(&ipv4_inetpeer_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003264 return rc;
3265}
3266
Al Viroa1bc6eb2008-07-30 06:32:52 -04003267#ifdef CONFIG_SYSCTL
Al Viroeeb61f72008-07-27 08:59:33 +01003268/*
3269 * We really need to sanitize the damn ipv4 init order, then all
3270 * this nonsense will go away.
3271 */
3272void __init ip_static_sysctl_init(void)
3273{
Eric W. Biederman4e5ca782012-04-19 13:32:39 +00003274 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
Al Viroeeb61f72008-07-27 08:59:33 +01003275}
Al Viroa1bc6eb2008-07-30 06:32:52 -04003276#endif