/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;

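/*
 * The ip_rt_* knobs above are exported as sysctls under
 * /proc/sys/net/ipv4/route/ by the sysctl table later in this file
 * (outside this excerpt), so these defaults can be tuned at runtime.
 */
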
static struct delayed_work expires_work;
static unsigned long expires_ljiffies;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

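/*
 * Route metrics (RTAX_*) start out shared and read-only.  On the first
 * write, ipv4_cow_metrics() below copies them into the inet_peer entry
 * for the destination and swings dst->_metrics over to that private
 * copy with cmpxchg(), so concurrent writers race safely and the
 * fib_info reference can be dropped once the peer owns the metrics.
 */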
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

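/*
 * ip_tos2prio[] maps the four IPv4 TOS bits to an skb priority band.
 * Illustrative use (rt_tos2priority() in <net/route.h> wraps this
 * lookup; shown here only as a sketch):
 *
 *	skb->priority = ip_tos2prio[IPTOS_TOS(iph->tos) >> 1];
 */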
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

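/*
 * Reader-side sketch of that scheme (illustrative only; the real
 * lookups live in the ip_route_input()/ip_route_output_key() paths,
 * and keys_match() below is a stand-in for the compare_keys() test):
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain);
 *	     rth;
 *	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
 *		if (!rt_is_expired(rth) && keys_match(rth))
 *			dst_use(&rth->dst, jiffies);	(takes a reference)
 *	}
 *	rcu_read_unlock_bh();
 */
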
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}

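/*
 * Because the per-netns generation counter is folded into the hash and
 * stored in each entry (rt->rt_genid), bumping it instantly orphans
 * every cached route.  Sketch of how a lookup derives its bucket
 * (illustrative; real callers pass the flow key fields):
 *
 *	hash = rt_hash(fl4->daddr, fl4->saddr, fl4->flowi4_oif,
 *		       rt_genid(net));
 *	rth  = rcu_dereference_bh(rt_hash_table[hash].chain);
 */
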
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len, HHUptod;

		rcu_read_lock();
		n = dst_get_neighbour_noref(&r->dst);
		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
		rcu_read_unlock();

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->rt_key_tos,
			-1,
			HHUptod,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
	);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}

static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

/*
 * Perform a full scan of the hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to be rescheduled if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}

/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This gives an estimate of rt_chain_length_max:
 *   rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

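/*
 * Worked example of the fixed-point format above: with FRACT_BITS = 3,
 * ONE == 8 represents 1.0, so a raw value of 20 means 20/8 = 2.5
 * entries.  If the sampled average is AVG = 2.0 (raw 16) and the
 * standard deviation SD = 0.5 (raw 4), then
 *	(avg + 4*sd) >> FRACT_BITS == (16 + 16) >> 3 == 4
 * and rt_chain_length_max becomes max(ip_rt_gc_elasticity, 4).
 * (Illustrative figures only.)
 */
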
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif)
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}

static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    ip_rt_gc_elasticity,
					    (avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}

/*
 * rt_worker_func() is run in process context.
 * We call rt_check_expire() to scan part of the hash table.
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving a recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	inetpeer_invalidate_tree(AF_INET);
}

/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previously cache-invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		pr_warn("Route hash chain too long!\n");
	rt_cache_invalidate(net);
}

/*
   Short description of GC goals.

   We want an algorithm that keeps the routing cache at an
   equilibrium point, where the number of aged-off entries stays
   approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle
   "expire" is large enough to keep enough warm entries, and when
   load increases it shrinks to limit the cache size.
 */

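/*
 * A rough numeric sketch of the goal computation below (illustrative
 * figures only): with rt_hash_log == 10 (1024 buckets) and
 * ip_rt_gc_elasticity == 8, garbage collection aims at
 *	goal = entries - (8 << 10)
 * victims, i.e. it only works hard once the cache holds more than
 * about 8 entries per bucket on average.
 */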
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire is reduced to zero; otherwise expire is halved
		   - the table is not full
		   - we are called from interrupt
		   - the jiffies check is just a fallback/debug loop breaker;
		     we will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		pr_warn("dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}

/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	static const __be32 inaddr_any = 0;
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;

	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		pkey = &inaddr_any;
	else if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}

static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}

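/*
 * rt_intern_hash() inserts a freshly built rtable into the cache and
 * returns the entry the caller should use:
 *
 *   - if caching is disabled, the route is marked DST_NOCACHE, bound to
 *     its neighbour, and handed back without touching the hash table;
 *   - if an equivalent entry already exists in the bucket, it is moved
 *     to the front and returned while the new route is dropped;
 *   - otherwise the new route is linked in at the head of the chain,
 *     possibly after evicting the lowest-scored unreferenced entry or
 *     triggering an emergency rebuild when the chain grows too long.
 */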
David S. Millerb23dd4f2011-03-02 14:31:35 -08001145static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1146 struct sk_buff *skb, int ifindex)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001147{
Eric Dumazet1c317202010-10-25 21:02:07 +00001148 struct rtable *rth, *cand;
1149 struct rtable __rcu **rthp, **candp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001150 unsigned long now;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001151 u32 min_score;
1152 int chain_length;
1153 int attempts = !in_softirq();
1154
1155restart:
1156 chain_length = 0;
1157 min_score = ~(u32)0;
1158 cand = NULL;
1159 candp = NULL;
1160 now = jiffies;
1161
Changli Gaod8d1f302010-06-10 23:31:35 -07001162 if (!rt_caching(dev_net(rt->dst.dev))) {
Neil Horman73e42892009-06-20 01:15:16 -07001163 /*
1164 * If we're not caching, just tell the caller we
1165 * were successful and don't touch the route. The
1166 * caller hold the sole reference to the cache entry, and
1167 * it will be released when the caller is done with it.
1168 * If we drop it here, the callers have no way to resolve routes
1169 * when we're not caching. Instead, just point *rp at rt, so
1170 * the caller gets a single use out of the route
Neil Hormanb6280b42009-06-22 10:18:53 +00001171 * Note that we do rt_free on this new route entry, so that
1172 * once its refcount hits zero, we are still able to reap it
1173 * (Thanks Alexey)
Eric Dumazet27b75c92010-10-15 05:44:11 +00001174 * Note: To avoid expensive rcu stuff for this uncached dst,
1175 * we set DST_NOCACHE so that dst_release() can free dst without
1176 * waiting a grace period.
Neil Horman73e42892009-06-20 01:15:16 -07001177 */
Neil Hormanb6280b42009-06-22 10:18:53 +00001178
Eric Dumazetc7d44262010-10-03 22:17:54 -07001179 rt->dst.flags |= DST_NOCACHE;
David S. Millerc7537962010-11-11 17:07:48 -08001180 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
David Miller3769cff2011-07-11 22:44:24 +00001181 int err = rt_bind_neighbour(rt);
Neil Hormanb6280b42009-06-22 10:18:53 +00001182 if (err) {
1183 if (net_ratelimit())
Joe Perches058bd4d2012-03-11 18:36:11 +00001184 pr_warn("Neighbour table failure & not caching routes\n");
Eric Dumazet27b75c92010-10-15 05:44:11 +00001185 ip_rt_put(rt);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001186 return ERR_PTR(err);
Neil Hormanb6280b42009-06-22 10:18:53 +00001187 }
1188 }
1189
Neil Hormanb6280b42009-06-22 10:18:53 +00001190 goto skip_hashing;
Neil Horman1080d702008-10-27 12:28:25 -07001191 }
1192
Linus Torvalds1da177e2005-04-16 15:20:36 -07001193 rthp = &rt_hash_table[hash].chain;
1194
Eric Dumazet22c047c2005-07-05 14:55:24 -07001195 spin_lock_bh(rt_hash_lock_addr(hash));
Eric Dumazet1c317202010-10-25 21:02:07 +00001196 while ((rth = rcu_dereference_protected(*rthp,
1197 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001198 if (rt_is_expired(rth)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001199 *rthp = rth->dst.rt_next;
Eric Dumazet29e75252008-01-31 17:05:09 -08001200 rt_free(rth);
1201 continue;
1202 }
David S. Miller5e2b61f2011-03-04 21:47:09 -08001203 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001204 /* Put it first */
Changli Gaod8d1f302010-06-10 23:31:35 -07001205 *rthp = rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001206 /*
1207 * Since lookup is lockfree, the deletion
1208 * must be visible to another weakly ordered CPU before
1209 * the insertion at the start of the hash chain.
1210 */
Changli Gaod8d1f302010-06-10 23:31:35 -07001211 rcu_assign_pointer(rth->dst.rt_next,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001212 rt_hash_table[hash].chain);
1213 /*
1214 * Since lookup is lockfree, the update writes
1215 * must be ordered for consistency on SMP.
1216 */
1217 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1218
Changli Gaod8d1f302010-06-10 23:31:35 -07001219 dst_use(&rth->dst, now);
Eric Dumazet22c047c2005-07-05 14:55:24 -07001220 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001221
1222 rt_drop(rt);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001223 if (skb)
Changli Gaod8d1f302010-06-10 23:31:35 -07001224 skb_dst_set(skb, &rth->dst);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001225 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001226 }
1227
Changli Gaod8d1f302010-06-10 23:31:35 -07001228 if (!atomic_read(&rth->dst.__refcnt)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001229 u32 score = rt_score(rth);
1230
1231 if (score <= min_score) {
1232 cand = rth;
1233 candp = rthp;
1234 min_score = score;
1235 }
1236 }
1237
1238 chain_length++;
1239
Changli Gaod8d1f302010-06-10 23:31:35 -07001240 rthp = &rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001241 }
1242
1243 if (cand) {
1244 /* ip_rt_gc_elasticity used to be average length of chain
1245 * length, when exceeded gc becomes really aggressive.
1246 *
1247 * The second limit is less certain. At the moment it allows
1248 * only 2 entries per bucket. We will see.
1249 */
1250 if (chain_length > ip_rt_gc_elasticity) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001251 *candp = cand->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001252 rt_free(cand);
1253 }
Neil Horman1080d702008-10-27 12:28:25 -07001254 } else {
Eric Dumazet98376382010-03-08 03:20:00 +00001255 if (chain_length > rt_chain_length_max &&
1256 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001257 struct net *net = dev_net(rt->dst.dev);
Neil Horman1080d702008-10-27 12:28:25 -07001258 int num = ++net->ipv4.current_rt_cache_rebuild_count;
Pavel Emelyanovb35ecb52010-03-24 07:43:17 +00001259 if (!rt_caching(net)) {
Joe Perches058bd4d2012-03-11 18:36:11 +00001260 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
Changli Gaod8d1f302010-06-10 23:31:35 -07001261 rt->dst.dev->name, num);
Neil Horman1080d702008-10-27 12:28:25 -07001262 }
Pavel Emelyanovb35ecb52010-03-24 07:43:17 +00001263 rt_emergency_hash_rebuild(net);
Pavel Emelyanov6a2bad72010-03-24 21:51:22 +00001264 spin_unlock_bh(rt_hash_lock_addr(hash));
1265
David S. Miller5e2b61f2011-03-04 21:47:09 -08001266 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
Pavel Emelyanov6a2bad72010-03-24 21:51:22 +00001267 ifindex, rt_genid(net));
1268 goto restart;
Neil Horman1080d702008-10-27 12:28:25 -07001269 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001270 }
1271
1272	/* Try to bind the route to an ARP neighbour only if it is
1273	   an output route or on the unicast forwarding path.
1274	 */
David S. Millerc7537962010-11-11 17:07:48 -08001275 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
David Miller3769cff2011-07-11 22:44:24 +00001276 int err = rt_bind_neighbour(rt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001277 if (err) {
Eric Dumazet22c047c2005-07-05 14:55:24 -07001278 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001279
1280 if (err != -ENOBUFS) {
1281 rt_drop(rt);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001282 return ERR_PTR(err);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001283 }
1284
1285			/* Neighbour tables are full and nothing
1286			   can be released. Try to shrink the route cache,
1287			   as it most likely holds some neighbour records.
1288			 */
1289 if (attempts-- > 0) {
1290 int saved_elasticity = ip_rt_gc_elasticity;
1291 int saved_int = ip_rt_gc_min_interval;
1292 ip_rt_gc_elasticity = 1;
1293 ip_rt_gc_min_interval = 0;
Daniel Lezcano569d3642008-01-18 03:56:57 -08001294 rt_garbage_collect(&ipv4_dst_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001295 ip_rt_gc_min_interval = saved_int;
1296 ip_rt_gc_elasticity = saved_elasticity;
1297 goto restart;
1298 }
1299
1300 if (net_ratelimit())
Joe Perches058bd4d2012-03-11 18:36:11 +00001301 pr_warn("ipv4: Neighbour table overflow\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07001302 rt_drop(rt);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001303 return ERR_PTR(-ENOBUFS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001304 }
1305 }
1306
Changli Gaod8d1f302010-06-10 23:31:35 -07001307 rt->dst.rt_next = rt_hash_table[hash].chain;
Neil Horman1080d702008-10-27 12:28:25 -07001308
Eric Dumazet00269b52008-10-16 14:18:29 -07001309 /*
1310 * Since lookup is lockfree, we must make sure
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001311 * previous writes to rt are committed to memory
Eric Dumazet00269b52008-10-16 14:18:29 -07001312	 * before making rt visible to other CPUs.
1313 */
Eric Dumazet1ddbcb02009-05-19 20:14:28 +00001314 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
Neil Horman1080d702008-10-27 12:28:25 -07001315
Eric Dumazet22c047c2005-07-05 14:55:24 -07001316 spin_unlock_bh(rt_hash_lock_addr(hash));
Neil Horman73e42892009-06-20 01:15:16 -07001317
Neil Hormanb6280b42009-06-22 10:18:53 +00001318skip_hashing:
David S. Millerb23dd4f2011-03-02 14:31:35 -08001319 if (skb)
Changli Gaod8d1f302010-06-10 23:31:35 -07001320 skb_dst_set(skb, &rt->dst);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001321 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001322}
1323
David S. Miller6431cbc2011-02-07 20:38:06 -08001324static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1325
1326static u32 rt_peer_genid(void)
1327{
1328 return atomic_read(&__rt_peer_genid);
1329}
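/*
 * __rt_peer_genid is a global generation counter: events that change
 * peer-derived state (a learned PMTU, a learned redirect) bump it, and
 * each cached route carries a snapshot in rt->rt_peer_genid.  Comparing
 * the two on lookup lets routes revalidate lazily instead of requiring
 * a walk of the whole cache at event time.
 */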
1330
David S. Millera48eff12011-05-18 18:42:43 -04001331void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001332{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001333 struct inet_peer *peer;
1334
David S. Millera48eff12011-05-18 18:42:43 -04001335 peer = inet_getpeer_v4(daddr, create);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001336
Eric Dumazet49e8ab02010-08-19 06:10:45 +00001337 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001338 inet_putpeer(peer);
David S. Miller6431cbc2011-02-07 20:38:06 -08001339 else
1340 rt->rt_peer_genid = rt_peer_genid();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001341}
1342
1343/*
1344 * Peer allocation may fail only in serious out-of-memory conditions.
1345 * However, we can still generate some output.
1346 * Random ID selection looks a bit dangerous because we have no chance of
1347 * selecting an ID that stays unique for a reasonable period of time.
1348 * But a broken packet identifier may be better than no packet at all.
1349 */
1350static void ip_select_fb_ident(struct iphdr *iph)
1351{
1352 static DEFINE_SPINLOCK(ip_fb_id_lock);
1353 static u32 ip_fallback_id;
1354 u32 salt;
1355
1356 spin_lock_bh(&ip_fb_id_lock);
Al Viroe4485152006-09-26 22:15:01 -07001357 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001358 iph->id = htons(salt & 0xFFFF);
1359 ip_fallback_id = salt;
1360 spin_unlock_bh(&ip_fb_id_lock);
1361}
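/*
 * Note that only the low 16 bits of the salted hash end up in iph->id,
 * so the fallback identifier is best-effort, as the comment above warns.
 */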
1362
1363void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1364{
1365 struct rtable *rt = (struct rtable *) dst;
1366
Eric Dumazete688a602011-12-22 04:15:53 +00001367 if (rt && !(rt->dst.flags & DST_NOPEER)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001368 if (rt->peer == NULL)
David S. Millera48eff12011-05-18 18:42:43 -04001369 rt_bind_peer(rt, rt->rt_dst, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001370
1371		/* Once a peer is attached to a destination, it is never
1372		   detached, so we need not grab a lock to dereference it.
1373		 */
1374 if (rt->peer) {
1375 iph->id = htons(inet_getid(rt->peer, more));
1376 return;
1377 }
Eric Dumazete688a602011-12-22 04:15:53 +00001378 } else if (!rt)
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001379 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
Stephen Hemminger9c2b3322005-04-19 22:39:42 -07001380 __builtin_return_address(0));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001381
1382 ip_select_fb_ident(iph);
1383}
Eric Dumazet4bc2f182010-07-09 21:22:10 +00001384EXPORT_SYMBOL(__ip_select_ident);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001385
1386static void rt_del(unsigned hash, struct rtable *rt)
1387{
Eric Dumazet1c317202010-10-25 21:02:07 +00001388 struct rtable __rcu **rthp;
1389 struct rtable *aux;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001390
Eric Dumazet29e75252008-01-31 17:05:09 -08001391 rthp = &rt_hash_table[hash].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -07001392 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001393 ip_rt_put(rt);
Eric Dumazet1c317202010-10-25 21:02:07 +00001394 while ((aux = rcu_dereference_protected(*rthp,
1395 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001396 if (aux == rt || rt_is_expired(aux)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001397 *rthp = aux->dst.rt_next;
Eric Dumazet29e75252008-01-31 17:05:09 -08001398 rt_free(aux);
1399 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001400 }
Changli Gaod8d1f302010-06-10 23:31:35 -07001401 rthp = &aux->dst.rt_next;
Eric Dumazet29e75252008-01-31 17:05:09 -08001402 }
Eric Dumazet22c047c2005-07-05 14:55:24 -07001403 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001404}
1405
David S. Millerde398fb2011-12-05 13:21:42 -05001406static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001407{
1408 struct rtable *rt = (struct rtable *) dst;
1409 __be32 orig_gw = rt->rt_gateway;
1410 struct neighbour *n, *old_n;
1411
1412 dst_confirm(&rt->dst);
1413
1414 rt->rt_gateway = peer->redirect_learned.a4;
1415
1416 n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
David S. Millerde398fb2011-12-05 13:21:42 -05001417 if (IS_ERR(n)) {
1418 rt->rt_gateway = orig_gw;
1419 return;
1420 }
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001421 old_n = xchg(&rt->dst._neighbour, n);
1422 if (old_n)
1423 neigh_release(old_n);
David S. Millerde398fb2011-12-05 13:21:42 -05001424 if (!(n->nud_state & NUD_VALID)) {
1425 neigh_event_send(n, NULL);
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001426 } else {
1427 rt->rt_flags |= RTCF_REDIRECTED;
1428 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1429 }
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001430}
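/*
 * The xchg() above installs the new neighbour and returns the old one in
 * a single atomic step, so references stay balanced even if two CPUs
 * process a redirect for the same route concurrently.
 */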
1431
Eric Dumazeted7865a42010-06-07 21:49:44 -07001432/* called in rcu_read_lock() section */
Al Virof7655222006-09-26 21:25:43 -07001433void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1434 __be32 saddr, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001435{
Flavio Leitner7cc91502011-10-24 02:56:38 -04001436 int s, i;
Eric Dumazeted7865a42010-06-07 21:49:44 -07001437 struct in_device *in_dev = __in_dev_get_rcu(dev);
Flavio Leitner7cc91502011-10-24 02:56:38 -04001438 __be32 skeys[2] = { saddr, 0 };
1439 int ikeys[2] = { dev->ifindex, 0 };
David S. Millerf39925d2011-02-09 22:00:16 -08001440 struct inet_peer *peer;
Denis V. Lunev317805b2008-02-28 20:50:06 -08001441 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001442
Linus Torvalds1da177e2005-04-16 15:20:36 -07001443 if (!in_dev)
1444 return;
1445
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001446 net = dev_net(dev);
Joe Perches9d4fb272009-11-23 10:41:23 -08001447 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1448 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1449 ipv4_is_zeronet(new_gw))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001450 goto reject_redirect;
1451
1452 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1453 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1454 goto reject_redirect;
1455 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1456 goto reject_redirect;
1457 } else {
Denis V. Lunev317805b2008-02-28 20:50:06 -08001458 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001459 goto reject_redirect;
1460 }
1461
Flavio Leitner7cc91502011-10-24 02:56:38 -04001462 for (s = 0; s < 2; s++) {
1463 for (i = 0; i < 2; i++) {
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001464 unsigned int hash;
1465 struct rtable __rcu **rthp;
1466 struct rtable *rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001467
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001468 hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1469
1470 rthp = &rt_hash_table[hash].chain;
1471
1472 while ((rt = rcu_dereference(*rthp)) != NULL) {
1473 rthp = &rt->dst.rt_next;
1474
1475 if (rt->rt_key_dst != daddr ||
1476 rt->rt_key_src != skeys[s] ||
1477 rt->rt_oif != ikeys[i] ||
1478 rt_is_input_route(rt) ||
1479 rt_is_expired(rt) ||
1480 !net_eq(dev_net(rt->dst.dev), net) ||
1481 rt->dst.error ||
1482 rt->dst.dev != dev ||
1483 rt->rt_gateway != old_gw)
1484 continue;
1485
1486 if (!rt->peer)
1487 rt_bind_peer(rt, rt->rt_dst, 1);
1488
1489 peer = rt->peer;
1490 if (peer) {
Steffen Klassertac3f48d2012-03-06 21:21:10 +00001491 if (peer->redirect_learned.a4 != new_gw) {
Eric Dumazet9cc20b22011-11-18 15:24:32 -05001492 peer->redirect_learned.a4 = new_gw;
1493 atomic_inc(&__rt_peer_genid);
1494 }
1495 check_peer_redir(&rt->dst, peer);
1496 }
Flavio Leitner7cc91502011-10-24 02:56:38 -04001497 }
Flavio Leitner7cc91502011-10-24 02:56:38 -04001498 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001499 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001500 return;
1501
1502reject_redirect:
1503#ifdef CONFIG_IP_ROUTE_VERBOSE
1504 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
Joe Perches058bd4d2012-03-11 18:36:11 +00001505 pr_info("Redirect from %pI4 on %s about %pI4 ignored\n"
Harvey Harrison673d57e2008-10-31 00:53:57 -07001506 " Advised path = %pI4 -> %pI4\n",
Joe Perches058bd4d2012-03-11 18:36:11 +00001507 &old_gw, dev->name, &new_gw,
1508 &saddr, &daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001509#endif
Eric Dumazeted7865a42010-06-07 21:49:44 -07001510 ;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001511}
1512
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001513static bool peer_pmtu_expired(struct inet_peer *peer)
1514{
1515 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1516
1517 return orig &&
1518 time_after_eq(jiffies, orig) &&
1519 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1520}
1521
1522static bool peer_pmtu_cleaned(struct inet_peer *peer)
1523{
1524 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1525
1526 return orig &&
1527 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1528}
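/*
 * Both helpers above claim peer->pmtu_expires with cmpxchg(), so when
 * several CPUs race only one sees the non-zero value and restores the
 * original MTU; the losers read 0 and do nothing.
 */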
1529
Linus Torvalds1da177e2005-04-16 15:20:36 -07001530static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1531{
Eric Dumazetee6b9672008-03-05 18:30:47 -08001532 struct rtable *rt = (struct rtable *)dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001533 struct dst_entry *ret = dst;
1534
1535 if (rt) {
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001536 if (dst->obsolete > 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001537 ip_rt_put(rt);
1538 ret = NULL;
David S. Miller2c8cec52011-02-09 20:42:07 -08001539 } else if (rt->rt_flags & RTCF_REDIRECTED) {
David S. Miller5e2b61f2011-03-04 21:47:09 -08001540 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1541 rt->rt_oif,
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001542 rt_genid(dev_net(dst->dev)));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001543 rt_del(hash, rt);
1544 ret = NULL;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001545 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1546 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001547 }
1548 }
1549 return ret;
1550}
1551
1552/*
1553 * Algorithm:
1554 * 1. The first ip_rt_redirect_number redirects are sent
1555 * with exponential backoff, then we stop sending them at all,
1556 * assuming that the host ignores our redirects.
1557 * 2. If we did not see packets requiring redirects
1558 * during ip_rt_redirect_silence, we assume that the host
1559 *	forgot the redirected route and start sending redirects again.
1560 *
1561 * This algorithm is much cheaper and more intelligent than dumb load limiting
1562 * in icmp.c.
1563 *
1564 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1565 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1566 */
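/*
 * A back-of-the-envelope sketch, assuming the default sysctls and HZ=100:
 * with ip_rt_redirect_load = HZ/50 and ip_rt_redirect_number = 9, the
 * k-th redirect is sent (HZ/50) << k jiffies after the previous one,
 * i.e. roughly 20 ms, 40 ms, ... up to ~5 s; after nine redirects we go
 * silent until ip_rt_redirect_silence (about 20 s) of quiet has passed.
 */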
1567
1568void ip_rt_send_redirect(struct sk_buff *skb)
1569{
Eric Dumazet511c3f92009-06-02 05:14:27 +00001570 struct rtable *rt = skb_rtable(skb);
Eric Dumazet30038fc2009-08-28 23:52:01 -07001571 struct in_device *in_dev;
David S. Miller92d86822011-02-04 15:55:25 -08001572 struct inet_peer *peer;
Eric Dumazet30038fc2009-08-28 23:52:01 -07001573 int log_martians;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001574
Eric Dumazet30038fc2009-08-28 23:52:01 -07001575 rcu_read_lock();
Changli Gaod8d1f302010-06-10 23:31:35 -07001576 in_dev = __in_dev_get_rcu(rt->dst.dev);
Eric Dumazet30038fc2009-08-28 23:52:01 -07001577 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1578 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001579 return;
Eric Dumazet30038fc2009-08-28 23:52:01 -07001580 }
1581 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1582 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001583
David S. Miller92d86822011-02-04 15:55:25 -08001584 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001585 rt_bind_peer(rt, rt->rt_dst, 1);
David S. Miller92d86822011-02-04 15:55:25 -08001586 peer = rt->peer;
1587 if (!peer) {
1588 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1589 return;
1590 }
1591
Linus Torvalds1da177e2005-04-16 15:20:36 -07001592 /* No redirected packets during ip_rt_redirect_silence;
1593 * reset the algorithm.
1594 */
David S. Miller92d86822011-02-04 15:55:25 -08001595 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1596 peer->rate_tokens = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001597
1598	/* Too many ignored redirects; do not send anything.
Changli Gaod8d1f302010-06-10 23:31:35 -07001599	 * Set dst.rate_last to the last seen redirected packet.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001600 */
David S. Miller92d86822011-02-04 15:55:25 -08001601 if (peer->rate_tokens >= ip_rt_redirect_number) {
1602 peer->rate_last = jiffies;
Eric Dumazet30038fc2009-08-28 23:52:01 -07001603 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001604 }
1605
1606 /* Check for load limit; set rate_last to the latest sent
1607 * redirect.
1608 */
David S. Miller92d86822011-02-04 15:55:25 -08001609 if (peer->rate_tokens == 0 ||
Li Yewang14fb8a72006-12-18 00:26:35 -08001610 time_after(jiffies,
David S. Miller92d86822011-02-04 15:55:25 -08001611 (peer->rate_last +
1612 (ip_rt_redirect_load << peer->rate_tokens)))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001613 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
David S. Miller92d86822011-02-04 15:55:25 -08001614 peer->rate_last = jiffies;
1615 ++peer->rate_tokens;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001616#ifdef CONFIG_IP_ROUTE_VERBOSE
Eric Dumazet30038fc2009-08-28 23:52:01 -07001617 if (log_martians &&
David S. Miller92d86822011-02-04 15:55:25 -08001618 peer->rate_tokens == ip_rt_redirect_number &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001619 net_ratelimit())
Joe Perches058bd4d2012-03-11 18:36:11 +00001620 pr_warn("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1621 &ip_hdr(skb)->saddr, rt->rt_iif,
Harvey Harrison673d57e2008-10-31 00:53:57 -07001622 &rt->rt_dst, &rt->rt_gateway);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001623#endif
1624 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001625}
1626
1627static int ip_error(struct sk_buff *skb)
1628{
Eric Dumazet511c3f92009-06-02 05:14:27 +00001629 struct rtable *rt = skb_rtable(skb);
David S. Miller92d86822011-02-04 15:55:25 -08001630 struct inet_peer *peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001631 unsigned long now;
David S. Miller92d86822011-02-04 15:55:25 -08001632 bool send;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001633 int code;
1634
Changli Gaod8d1f302010-06-10 23:31:35 -07001635 switch (rt->dst.error) {
Joe Perches4500ebf2011-07-01 09:43:07 +00001636 case EINVAL:
1637 default:
1638 goto out;
1639 case EHOSTUNREACH:
1640 code = ICMP_HOST_UNREACH;
1641 break;
1642 case ENETUNREACH:
1643 code = ICMP_NET_UNREACH;
1644 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1645 IPSTATS_MIB_INNOROUTES);
1646 break;
1647 case EACCES:
1648 code = ICMP_PKT_FILTERED;
1649 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001650 }
1651
David S. Miller92d86822011-02-04 15:55:25 -08001652 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001653 rt_bind_peer(rt, rt->rt_dst, 1);
David S. Miller92d86822011-02-04 15:55:25 -08001654 peer = rt->peer;
1655
1656 send = true;
1657 if (peer) {
1658 now = jiffies;
1659 peer->rate_tokens += now - peer->rate_last;
1660 if (peer->rate_tokens > ip_rt_error_burst)
1661 peer->rate_tokens = ip_rt_error_burst;
1662 peer->rate_last = now;
1663 if (peer->rate_tokens >= ip_rt_error_cost)
1664 peer->rate_tokens -= ip_rt_error_cost;
1665 else
1666 send = false;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001667 }
David S. Miller92d86822011-02-04 15:55:25 -08001668 if (send)
1669 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001670
1671out: kfree_skb(skb);
1672 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001673}
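/*
 * The rate limiter above is a token bucket.  A sketch with the usual
 * defaults (ip_rt_error_cost = HZ, ip_rt_error_burst = 5*HZ): tokens
 * accrue one per jiffy up to five seconds' worth, and each ICMP error
 * spends HZ of them, so a peer gets a burst of at most five errors and
 * then roughly one per second.
 */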
Linus Torvalds1da177e2005-04-16 15:20:36 -07001674
1675/*
1676 * The last two values are not from the RFC but
1677 * are needed for AMPRnet AX.25 paths.
1678 */
1679
Arjan van de Ven9b5b5cf2005-11-29 16:21:38 -08001680static const unsigned short mtu_plateau[] =
Linus Torvalds1da177e2005-04-16 15:20:36 -07001681{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1682
Stephen Hemminger5969f712008-04-10 01:52:09 -07001683static inline unsigned short guess_mtu(unsigned short old_mtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001684{
1685 int i;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001686
Linus Torvalds1da177e2005-04-16 15:20:36 -07001687 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1688 if (old_mtu > mtu_plateau[i])
1689 return mtu_plateau[i];
1690 return 68;
1691}
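/*
 * Example: if a broken router reports a zero MTU for a path that carried
 * old_mtu = 1500, the walk above returns 1492, the next plateau below
 * 1500; an old_mtu of 128 or less falls through to the 68-byte minimum.
 */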
1692
Eric Dumazetb71d1d42011-04-22 04:53:02 +00001693unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
Timo Teras0010e462008-04-29 03:32:25 -07001694 unsigned short new_mtu,
1695 struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001696{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001697 unsigned short old_mtu = ntohs(iph->tot_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001698 unsigned short est_mtu = 0;
David S. Miller2c8cec52011-02-09 20:42:07 -08001699 struct inet_peer *peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001700
David S. Miller2c8cec52011-02-09 20:42:07 -08001701 peer = inet_getpeer_v4(iph->daddr, 1);
1702 if (peer) {
1703 unsigned short mtu = new_mtu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001704
David S. Miller2c8cec52011-02-09 20:42:07 -08001705 if (new_mtu < 68 || new_mtu >= old_mtu) {
1706 /* BSD 4.2 derived systems incorrectly adjust
1707 * tot_len by the IP header length, and report
1708 * a zero MTU in the ICMP message.
1709 */
1710 if (mtu == 0 &&
1711 old_mtu >= 68 + (iph->ihl << 2))
1712 old_mtu -= iph->ihl << 2;
1713 mtu = guess_mtu(old_mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001714 }
David S. Miller2c8cec52011-02-09 20:42:07 -08001715
1716 if (mtu < ip_rt_min_pmtu)
1717 mtu = ip_rt_min_pmtu;
1718 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001719 unsigned long pmtu_expires;
1720
1721 pmtu_expires = jiffies + ip_rt_mtu_expires;
1722 if (!pmtu_expires)
1723 pmtu_expires = 1UL;
1724
David S. Miller2c8cec52011-02-09 20:42:07 -08001725 est_mtu = mtu;
1726 peer->pmtu_learned = mtu;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001727 peer->pmtu_expires = pmtu_expires;
Gao feng59445b62011-10-19 15:34:09 +00001728 atomic_inc(&__rt_peer_genid);
David S. Miller2c8cec52011-02-09 20:42:07 -08001729 }
1730
1731 inet_putpeer(peer);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001732 }
1733 return est_mtu ? : new_mtu;
1734}
1735
David S. Miller2c8cec52011-02-09 20:42:07 -08001736static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1737{
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001738 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
David S. Miller2c8cec52011-02-09 20:42:07 -08001739
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001740 if (!expires)
1741 return;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001742 if (time_before(jiffies, expires)) {
David S. Miller2c8cec52011-02-09 20:42:07 -08001743 u32 orig_dst_mtu = dst_mtu(dst);
1744 if (peer->pmtu_learned < orig_dst_mtu) {
1745 if (!peer->pmtu_orig)
1746 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1747 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1748 }
1749 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1750 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1751}
1752
Linus Torvalds1da177e2005-04-16 15:20:36 -07001753static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1754{
David S. Miller2c8cec52011-02-09 20:42:07 -08001755 struct rtable *rt = (struct rtable *) dst;
1756 struct inet_peer *peer;
1757
1758 dst_confirm(dst);
1759
1760 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001761 rt_bind_peer(rt, rt->rt_dst, 1);
David S. Miller2c8cec52011-02-09 20:42:07 -08001762 peer = rt->peer;
1763 if (peer) {
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001764 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1765
David S. Miller2c8cec52011-02-09 20:42:07 -08001766 if (mtu < ip_rt_min_pmtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001767 mtu = ip_rt_min_pmtu;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001768 if (!pmtu_expires || mtu < peer->pmtu_learned) {
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001769
1770 pmtu_expires = jiffies + ip_rt_mtu_expires;
1771 if (!pmtu_expires)
1772 pmtu_expires = 1UL;
1773
David S. Miller2c8cec52011-02-09 20:42:07 -08001774 peer->pmtu_learned = mtu;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001775 peer->pmtu_expires = pmtu_expires;
David S. Miller2c8cec52011-02-09 20:42:07 -08001776
1777 atomic_inc(&__rt_peer_genid);
1778 rt->rt_peer_genid = rt_peer_genid();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001779 }
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001780 check_peer_pmtu(dst, peer);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001781 }
1782}
1783
David S. Millerf39925d2011-02-09 22:00:16 -08001784
David S. Millerde398fb2011-12-05 13:21:42 -05001785static void ipv4_validate_peer(struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001786{
David S. Miller6431cbc2011-02-07 20:38:06 -08001787 if (rt->rt_peer_genid != rt_peer_genid()) {
David S. Miller2c8cec52011-02-09 20:42:07 -08001788 struct inet_peer *peer;
1789
David S. Miller6431cbc2011-02-07 20:38:06 -08001790 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001791 rt_bind_peer(rt, rt->rt_dst, 0);
David S. Miller6431cbc2011-02-07 20:38:06 -08001792
David S. Miller2c8cec52011-02-09 20:42:07 -08001793 peer = rt->peer;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001794 if (peer) {
David S. Millerefbc368d2011-12-01 13:38:59 -05001795 check_peer_pmtu(&rt->dst, peer);
David S. Miller2c8cec52011-02-09 20:42:07 -08001796
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001797 if (peer->redirect_learned.a4 &&
David S. Millerde398fb2011-12-05 13:21:42 -05001798 peer->redirect_learned.a4 != rt->rt_gateway)
1799 check_peer_redir(&rt->dst, peer);
David S. Millerf39925d2011-02-09 22:00:16 -08001800 }
1801
David S. Miller6431cbc2011-02-07 20:38:06 -08001802 rt->rt_peer_genid = rt_peer_genid();
1803 }
David S. Millerefbc368d2011-12-01 13:38:59 -05001804}
1805
1806static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1807{
1808 struct rtable *rt = (struct rtable *) dst;
1809
1810 if (rt_is_expired(rt))
1811 return NULL;
David S. Millerde398fb2011-12-05 13:21:42 -05001812 ipv4_validate_peer(rt);
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001813 return dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001814}
1815
1816static void ipv4_dst_destroy(struct dst_entry *dst)
1817{
1818 struct rtable *rt = (struct rtable *) dst;
1819 struct inet_peer *peer = rt->peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001820
David S. Miller62fa8a82011-01-26 20:51:05 -08001821 if (rt->fi) {
1822 fib_info_put(rt->fi);
1823 rt->fi = NULL;
1824 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001825 if (peer) {
1826 rt->peer = NULL;
1827 inet_putpeer(peer);
1828 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001829}
1830
Linus Torvalds1da177e2005-04-16 15:20:36 -07001831
1832static void ipv4_link_failure(struct sk_buff *skb)
1833{
1834 struct rtable *rt;
1835
1836 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1837
Eric Dumazet511c3f92009-06-02 05:14:27 +00001838 rt = skb_rtable(skb);
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001839 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1840 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001841}
1842
1843static int ip_rt_bug(struct sk_buff *skb)
1844{
Harvey Harrison673d57e2008-10-31 00:53:57 -07001845 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1846 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001847 skb->dev ? skb->dev->name : "?");
1848 kfree_skb(skb);
Dave Jonesc378a9c2011-05-21 07:16:42 +00001849 WARN_ON(1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001850 return 0;
1851}
1852
1853/*
1854   We do not cache the source address of the outgoing interface,
1855   because it is used only by the IP RR, TS and SRR options,
1856   so it is out of the fast path.
1857
1858   BTW remember: "addr" is allowed to be unaligned
1859 in IP options!
1860 */
1861
David S. Miller8e363602011-05-13 17:29:41 -04001862void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001863{
Al Viroa61ced52006-09-26 21:27:54 -07001864 __be32 src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001865
David S. Millerc7537962010-11-11 17:07:48 -08001866 if (rt_is_output_route(rt))
David S. Millerc5be24f2011-05-13 18:01:21 -04001867 src = ip_hdr(skb)->saddr;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001868 else {
David S. Miller8e363602011-05-13 17:29:41 -04001869 struct fib_result res;
1870 struct flowi4 fl4;
1871 struct iphdr *iph;
1872
1873 iph = ip_hdr(skb);
1874
1875 memset(&fl4, 0, sizeof(fl4));
1876 fl4.daddr = iph->daddr;
1877 fl4.saddr = iph->saddr;
Julian Anastasovb0fe4a32011-07-23 02:00:41 +00001878 fl4.flowi4_tos = RT_TOS(iph->tos);
David S. Miller8e363602011-05-13 17:29:41 -04001879 fl4.flowi4_oif = rt->dst.dev->ifindex;
1880 fl4.flowi4_iif = skb->dev->ifindex;
1881 fl4.flowi4_mark = skb->mark;
David S. Miller5e2b61f2011-03-04 21:47:09 -08001882
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001883 rcu_read_lock();
David S. Miller68a5e3d2011-03-11 20:07:33 -05001884 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
David S. Miller436c3b62011-03-24 17:42:21 -07001885 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001886 else
1887 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001888 RT_SCOPE_UNIVERSE);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001889 rcu_read_unlock();
1890 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001891 memcpy(addr, &src, 4);
1892}
1893
Patrick McHardyc7066f72011-01-14 13:36:42 +01001894#ifdef CONFIG_IP_ROUTE_CLASSID
Linus Torvalds1da177e2005-04-16 15:20:36 -07001895static void set_class_tag(struct rtable *rt, u32 tag)
1896{
Changli Gaod8d1f302010-06-10 23:31:35 -07001897 if (!(rt->dst.tclassid & 0xFFFF))
1898 rt->dst.tclassid |= tag & 0xFFFF;
1899 if (!(rt->dst.tclassid & 0xFFFF0000))
1900 rt->dst.tclassid |= tag & 0xFFFF0000;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001901}
1902#endif
1903
David S. Miller0dbaee32010-12-13 12:52:14 -08001904static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1905{
1906 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1907
1908 if (advmss == 0) {
1909 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1910 ip_rt_min_advmss);
1911 if (advmss > 65535 - 40)
1912 advmss = 65535 - 40;
1913 }
1914 return advmss;
1915}
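/*
 * The "- 40" reserves room for the fixed IPv4 and TCP headers (20 bytes
 * each), so e.g. a 1500-byte Ethernet MTU advertises an MSS of 1460
 * unless the metric was set explicitly.
 */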
1916
Steffen Klassertebb762f2011-11-23 02:12:51 +00001917static unsigned int ipv4_mtu(const struct dst_entry *dst)
David S. Millerd33e4552010-12-14 13:01:14 -08001918{
Steffen Klassert261663b2011-11-23 02:14:50 +00001919 const struct rtable *rt = (const struct rtable *) dst;
Steffen Klassert618f9bc2011-11-23 02:13:31 +00001920 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1921
Steffen Klassert261663b2011-11-23 02:14:50 +00001922 if (mtu && rt_is_output_route(rt))
Steffen Klassert618f9bc2011-11-23 02:13:31 +00001923 return mtu;
1924
1925 mtu = dst->dev->mtu;
David S. Millerd33e4552010-12-14 13:01:14 -08001926
1927 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
David S. Millerd33e4552010-12-14 13:01:14 -08001928
1929 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1930 mtu = 576;
1931 }
1932
1933 if (mtu > IP_MAX_MTU)
1934 mtu = IP_MAX_MTU;
1935
1936 return mtu;
1937}
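/*
 * Note the conservative clamp above: with a locked MTU metric and a
 * gateway route (rt_gateway != rt_dst), the MTU is capped at 576, the
 * traditional default datagram size for non-local destinations.
 */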
1938
David S. Miller813b3b52011-04-28 14:48:42 -07001939static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
David S. Miller5e2b61f2011-03-04 21:47:09 -08001940 struct fib_info *fi)
David S. Millera4daad62011-01-27 22:01:53 -08001941{
David S. Miller0131ba42011-02-04 14:37:30 -08001942 struct inet_peer *peer;
1943 int create = 0;
1944
1945 /* If a peer entry exists for this destination, we must hook
1946 * it up in order to get at cached metrics.
1947 */
David S. Miller813b3b52011-04-28 14:48:42 -07001948 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
David S. Miller0131ba42011-02-04 14:37:30 -08001949 create = 1;
1950
David S. Miller3c0afdc2011-03-04 21:26:07 -08001951 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
David S. Miller0131ba42011-02-04 14:37:30 -08001952 if (peer) {
David S. Miller3c0afdc2011-03-04 21:26:07 -08001953 rt->rt_peer_genid = rt_peer_genid();
David S. Miller0131ba42011-02-04 14:37:30 -08001954 if (inet_metrics_new(peer))
1955 memcpy(peer->metrics, fi->fib_metrics,
1956 sizeof(u32) * RTAX_MAX);
1957 dst_init_metrics(&rt->dst, peer->metrics, false);
David S. Miller2c8cec52011-02-09 20:42:07 -08001958
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001959 check_peer_pmtu(&rt->dst, peer);
Steffen Klassertac3f48d2012-03-06 21:21:10 +00001960
David S. Millerf39925d2011-02-09 22:00:16 -08001961 if (peer->redirect_learned.a4 &&
1962 peer->redirect_learned.a4 != rt->rt_gateway) {
1963 rt->rt_gateway = peer->redirect_learned.a4;
1964 rt->rt_flags |= RTCF_REDIRECTED;
1965 }
David S. Miller0131ba42011-02-04 14:37:30 -08001966 } else {
David S. Millerb8dad612011-01-28 14:07:16 -08001967 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1968 rt->fi = fi;
1969 atomic_inc(&fi->fib_clntref);
1970 }
David S. Millera4daad62011-01-27 22:01:53 -08001971 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
David S. Millera4daad62011-01-27 22:01:53 -08001972 }
1973}
1974
David S. Miller813b3b52011-04-28 14:48:42 -07001975static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
David S. Miller5e2b61f2011-03-04 21:47:09 -08001976 const struct fib_result *res,
David S. Miller982721f2011-02-16 21:44:24 -08001977 struct fib_info *fi, u16 type, u32 itag)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001978{
David S. Millerdefb3512010-12-08 21:16:57 -08001979 struct dst_entry *dst = &rt->dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001980
1981 if (fi) {
1982 if (FIB_RES_GW(*res) &&
1983 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1984 rt->rt_gateway = FIB_RES_GW(*res);
David S. Miller813b3b52011-04-28 14:48:42 -07001985 rt_init_metrics(rt, fl4, fi);
Patrick McHardyc7066f72011-01-14 13:36:42 +01001986#ifdef CONFIG_IP_ROUTE_CLASSID
David S. Millerdefb3512010-12-08 21:16:57 -08001987 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001988#endif
David S. Millerd33e4552010-12-14 13:01:14 -08001989 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001990
David S. Millerdefb3512010-12-08 21:16:57 -08001991 if (dst_mtu(dst) > IP_MAX_MTU)
1992 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
David S. Miller0dbaee32010-12-13 12:52:14 -08001993 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
David S. Millerdefb3512010-12-08 21:16:57 -08001994 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001995
Patrick McHardyc7066f72011-01-14 13:36:42 +01001996#ifdef CONFIG_IP_ROUTE_CLASSID
Linus Torvalds1da177e2005-04-16 15:20:36 -07001997#ifdef CONFIG_IP_MULTIPLE_TABLES
1998 set_class_tag(rt, fib_rules_tclass(res));
1999#endif
2000 set_class_tag(rt, itag);
2001#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002002}
2003
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002004static struct rtable *rt_dst_alloc(struct net_device *dev,
2005 bool nopolicy, bool noxfrm)
David S. Miller0c4dcd52011-02-17 15:42:37 -08002006{
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002007 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2008 DST_HOST |
2009 (nopolicy ? DST_NOPOLICY : 0) |
2010 (noxfrm ? DST_NOXFRM : 0));
David S. Miller0c4dcd52011-02-17 15:42:37 -08002011}
2012
Eric Dumazet96d36222010-06-02 19:21:31 +00002013/* called in rcu_read_lock() section */
Al Viro9e12bb22006-09-26 21:25:20 -07002014static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002015 u8 tos, struct net_device *dev, int our)
2016{
Eric Dumazet96d36222010-06-02 19:21:31 +00002017 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002018 struct rtable *rth;
Al Viroa61ced52006-09-26 21:27:54 -07002019 __be32 spec_dst;
Eric Dumazet96d36222010-06-02 19:21:31 +00002020 struct in_device *in_dev = __in_dev_get_rcu(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002021 u32 itag = 0;
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002022 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002023
2024 /* Primary sanity checks. */
2025
2026 if (in_dev == NULL)
2027 return -EINVAL;
2028
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002029 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002030 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002031 goto e_inval;
2032
Joe Perchesf97c1e02007-12-16 13:45:43 -08002033 if (ipv4_is_zeronet(saddr)) {
2034 if (!ipv4_is_local_multicast(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002035 goto e_inval;
2036 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002037 } else {
Michael Smith5c04c812011-04-07 04:51:50 +00002038 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2039 &itag);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002040 if (err < 0)
2041 goto e_err;
2042 }
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002043 rth = rt_dst_alloc(init_net.loopback_dev,
2044 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002045 if (!rth)
2046 goto e_nobufs;
2047
Patrick McHardyc7066f72011-01-14 13:36:42 +01002048#ifdef CONFIG_IP_ROUTE_CLASSID
Changli Gaod8d1f302010-06-10 23:31:35 -07002049 rth->dst.tclassid = itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002050#endif
David S. Millercf911662011-04-28 14:31:47 -07002051 rth->dst.output = ip_rt_bug;
2052
2053 rth->rt_key_dst = daddr;
2054 rth->rt_key_src = saddr;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002055 rth->rt_genid = rt_genid(dev_net(dev));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002056 rth->rt_flags = RTCF_MULTICAST;
Eric Dumazet29e75252008-01-31 17:05:09 -08002057 rth->rt_type = RTN_MULTICAST;
David S. Miller475949d2011-05-03 19:45:15 -07002058 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07002059 rth->rt_dst = daddr;
2060 rth->rt_src = saddr;
2061 rth->rt_route_iif = dev->ifindex;
2062 rth->rt_iif = dev->ifindex;
2063 rth->rt_oif = 0;
2064 rth->rt_mark = skb->mark;
2065 rth->rt_gateway = daddr;
2066 rth->rt_spec_dst= spec_dst;
2067 rth->rt_peer_genid = 0;
2068 rth->peer = NULL;
2069 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002070 if (our) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002071 rth->dst.input= ip_local_deliver;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002072 rth->rt_flags |= RTCF_LOCAL;
2073 }
2074
2075#ifdef CONFIG_IP_MROUTE
Joe Perchesf97c1e02007-12-16 13:45:43 -08002076 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
Changli Gaod8d1f302010-06-10 23:31:35 -07002077 rth->dst.input = ip_mr_input;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002078#endif
2079 RT_CACHE_STAT_INC(in_slow_mc);
2080
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002081 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
David S. Millerb23dd4f2011-03-02 14:31:35 -08002082 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
Eric Dumazet9aa3c942011-06-18 11:59:18 -07002083 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002084
2085e_nobufs:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002086 return -ENOBUFS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002087e_inval:
Eric Dumazet96d36222010-06-02 19:21:31 +00002088 return -EINVAL;
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002089e_err:
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002090 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002091}
2092
2093
2094static void ip_handle_martian_source(struct net_device *dev,
2095 struct in_device *in_dev,
2096 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07002097 __be32 daddr,
2098 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002099{
2100 RT_CACHE_STAT_INC(in_martian_src);
2101#ifdef CONFIG_IP_ROUTE_VERBOSE
2102 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2103 /*
2104		 * RFC1812 recommendation: if the source is martian,
2105		 * the only hint is the MAC header.
2106 */
Joe Perches058bd4d2012-03-11 18:36:11 +00002107 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
Harvey Harrison673d57e2008-10-31 00:53:57 -07002108 &daddr, &saddr, dev->name);
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07002109 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
Joe Perches058bd4d2012-03-11 18:36:11 +00002110 print_hex_dump(KERN_WARNING, "ll header: ",
2111 DUMP_PREFIX_OFFSET, 16, 1,
2112 skb_mac_header(skb),
2113 dev->hard_header_len, true);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002114 }
2115 }
2116#endif
2117}
2118
Eric Dumazet47360222010-06-03 04:13:21 +00002119/* called in rcu_read_lock() section */
Stephen Hemminger5969f712008-04-10 01:52:09 -07002120static int __mkroute_input(struct sk_buff *skb,
David S. Miller982721f2011-02-16 21:44:24 -08002121 const struct fib_result *res,
Stephen Hemminger5969f712008-04-10 01:52:09 -07002122 struct in_device *in_dev,
2123 __be32 daddr, __be32 saddr, u32 tos,
2124 struct rtable **result)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002125{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002126 struct rtable *rth;
2127 int err;
2128 struct in_device *out_dev;
Eric Dumazet47360222010-06-03 04:13:21 +00002129 unsigned int flags = 0;
Al Virod9c9df82006-09-26 21:28:14 -07002130 __be32 spec_dst;
2131 u32 itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002132
2133 /* get a working reference to the output device */
Eric Dumazet47360222010-06-03 04:13:21 +00002134 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002135 if (out_dev == NULL) {
2136 if (net_ratelimit())
Joe Perches058bd4d2012-03-11 18:36:11 +00002137 pr_crit("Bug in ip_route_input_slow(). Please report.\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07002138 return -EINVAL;
2139 }
2140
2141
Michael Smith5c04c812011-04-07 04:51:50 +00002142 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2143 in_dev->dev, &spec_dst, &itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002144 if (err < 0) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002145 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002146 saddr);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002147
Linus Torvalds1da177e2005-04-16 15:20:36 -07002148 goto cleanup;
2149 }
2150
2151 if (err)
2152 flags |= RTCF_DIRECTSRC;
2153
Thomas Graf51b77ca2008-06-03 16:36:01 -07002154 if (out_dev == in_dev && err &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002155 (IN_DEV_SHARED_MEDIA(out_dev) ||
2156 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2157 flags |= RTCF_DOREDIRECT;
2158
2159 if (skb->protocol != htons(ETH_P_IP)) {
2160		/* Not IP (i.e. ARP). Do not create a route if it is
2161		 * invalid for proxy arp. DNAT routes are always valid.
Jesper Dangaard Brouer65324142010-01-05 05:50:47 +00002162		 *
2163		 * The proxy arp feature has been extended to allow ARP
2164		 * replies back on the same interface, to support
2165		 * private VLAN switch technologies. See arp.c.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002166 */
Jesper Dangaard Brouer65324142010-01-05 05:50:47 +00002167 if (out_dev == in_dev &&
2168 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002169 err = -EINVAL;
2170 goto cleanup;
2171 }
2172 }
2173
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002174 rth = rt_dst_alloc(out_dev->dev,
2175 IN_DEV_CONF_GET(in_dev, NOPOLICY),
David S. Miller0c4dcd52011-02-17 15:42:37 -08002176 IN_DEV_CONF_GET(out_dev, NOXFRM));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002177 if (!rth) {
2178 err = -ENOBUFS;
2179 goto cleanup;
2180 }
2181
David S. Miller5e2b61f2011-03-04 21:47:09 -08002182 rth->rt_key_dst = daddr;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002183 rth->rt_key_src = saddr;
David S. Millercf911662011-04-28 14:31:47 -07002184 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2185 rth->rt_flags = flags;
2186 rth->rt_type = res->type;
David S. Miller475949d2011-05-03 19:45:15 -07002187 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07002188 rth->rt_dst = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002189 rth->rt_src = saddr;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002190 rth->rt_route_iif = in_dev->dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002191 rth->rt_iif = in_dev->dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002192 rth->rt_oif = 0;
David S. Millercf911662011-04-28 14:31:47 -07002193 rth->rt_mark = skb->mark;
2194 rth->rt_gateway = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002195 rth->rt_spec_dst= spec_dst;
David S. Millercf911662011-04-28 14:31:47 -07002196 rth->rt_peer_genid = 0;
2197 rth->peer = NULL;
2198 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002199
Changli Gaod8d1f302010-06-10 23:31:35 -07002200 rth->dst.input = ip_forward;
2201 rth->dst.output = ip_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002202
David S. Miller5e2b61f2011-03-04 21:47:09 -08002203 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002204
Linus Torvalds1da177e2005-04-16 15:20:36 -07002205 *result = rth;
2206 err = 0;
2207 cleanup:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002208 return err;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002209}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002210
Stephen Hemminger5969f712008-04-10 01:52:09 -07002211static int ip_mkroute_input(struct sk_buff *skb,
2212 struct fib_result *res,
David S. Miller68a5e3d2011-03-11 20:07:33 -05002213 const struct flowi4 *fl4,
Stephen Hemminger5969f712008-04-10 01:52:09 -07002214 struct in_device *in_dev,
2215 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002216{
Chuck Short7abaa272005-06-22 22:10:23 -07002217 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002218 int err;
2219 unsigned hash;
2220
2221#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Millerff3fccb2011-03-10 16:23:24 -08002222 if (res->fi && res->fi->fib_nhs > 1)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002223 fib_select_multipath(res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002224#endif
2225
2226 /* create a routing cache entry */
2227 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2228 if (err)
2229 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002230
2231 /* put it into the cache */
David S. Miller68a5e3d2011-03-11 20:07:33 -05002232 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
Changli Gaod8d1f302010-06-10 23:31:35 -07002233 rt_genid(dev_net(rth->dst.dev)));
David S. Miller68a5e3d2011-03-11 20:07:33 -05002234 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002235 if (IS_ERR(rth))
2236 return PTR_ERR(rth);
2237 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002238}
2239
Linus Torvalds1da177e2005-04-16 15:20:36 -07002240/*
2241 *	NOTE. We drop all packets that have a local source
2242 *	address, because every properly looped-back packet
2243 *	must already have the correct destination attached by the output routine.
2244 *
2245 *	This approach solves two big problems:
2246 *	1. Non-simplex devices are handled properly.
2247 *	2. IP spoofing attempts are filtered with a 100% guarantee.
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002248 * called with rcu_read_lock()
Linus Torvalds1da177e2005-04-16 15:20:36 -07002249 */
2250
Al Viro9e12bb22006-09-26 21:25:20 -07002251static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002252 u8 tos, struct net_device *dev)
2253{
2254 struct fib_result res;
Eric Dumazet96d36222010-06-02 19:21:31 +00002255 struct in_device *in_dev = __in_dev_get_rcu(dev);
David S. Miller68a5e3d2011-03-11 20:07:33 -05002256 struct flowi4 fl4;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002257 unsigned flags = 0;
2258 u32 itag = 0;
2259 struct rtable * rth;
2260 unsigned hash;
Al Viro9e12bb22006-09-26 21:25:20 -07002261 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002262 int err = -EINVAL;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002263 struct net * net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002264
2265 /* IP on this device is disabled. */
2266
2267 if (!in_dev)
2268 goto out;
2269
2270	/* Check for the weirdest martians, which cannot be detected
2271	   by fib_lookup.
2272 */
2273
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002274 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002275 ipv4_is_loopback(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002276 goto martian_source;
2277
Andy Walls27a954b2010-10-17 15:11:22 +00002278 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002279 goto brd_input;
2280
2281	/* Accept zero addresses only to limited broadcast;
2282	 * I do not even know whether to fix it or not. Waiting for complaints :-)
2283 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002284 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002285 goto martian_source;
2286
Andy Walls27a954b2010-10-17 15:11:22 +00002287 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002288 goto martian_destination;
2289
2290 /*
2291 * Now we are ready to route packet.
2292 */
David S. Miller68a5e3d2011-03-11 20:07:33 -05002293 fl4.flowi4_oif = 0;
2294 fl4.flowi4_iif = dev->ifindex;
2295 fl4.flowi4_mark = skb->mark;
2296 fl4.flowi4_tos = tos;
2297 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2298 fl4.daddr = daddr;
2299 fl4.saddr = saddr;
2300 err = fib_lookup(net, &fl4, &res);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002301 if (err != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002302 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002303 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002304 goto no_route;
2305 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002306
2307 RT_CACHE_STAT_INC(in_slow_tot);
2308
2309 if (res.type == RTN_BROADCAST)
2310 goto brd_input;
2311
2312 if (res.type == RTN_LOCAL) {
Michael Smith5c04c812011-04-07 04:51:50 +00002313 err = fib_validate_source(skb, saddr, daddr, tos,
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002314 net->loopback_dev->ifindex,
Michael Smith5c04c812011-04-07 04:51:50 +00002315 dev, &spec_dst, &itag);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002316 if (err < 0)
2317 goto martian_source_keep_err;
2318 if (err)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002319 flags |= RTCF_DIRECTSRC;
2320 spec_dst = daddr;
2321 goto local_input;
2322 }
2323
2324 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002325 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002326 if (res.type != RTN_UNICAST)
2327 goto martian_destination;
2328
David S. Miller68a5e3d2011-03-11 20:07:33 -05002329 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002330out: return err;
2331
2332brd_input:
2333 if (skb->protocol != htons(ETH_P_IP))
2334 goto e_inval;
2335
Joe Perchesf97c1e02007-12-16 13:45:43 -08002336 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002337 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2338 else {
Michael Smith5c04c812011-04-07 04:51:50 +00002339 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2340 &itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002341 if (err < 0)
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002342 goto martian_source_keep_err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002343 if (err)
2344 flags |= RTCF_DIRECTSRC;
2345 }
2346 flags |= RTCF_BROADCAST;
2347 res.type = RTN_BROADCAST;
2348 RT_CACHE_STAT_INC(in_brd);
2349
2350local_input:
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002351 rth = rt_dst_alloc(net->loopback_dev,
2352 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002353 if (!rth)
2354 goto e_nobufs;
2355
David S. Millercf911662011-04-28 14:31:47 -07002356 rth->dst.input= ip_local_deliver;
Changli Gaod8d1f302010-06-10 23:31:35 -07002357 rth->dst.output= ip_rt_bug;
David S. Millercf911662011-04-28 14:31:47 -07002358#ifdef CONFIG_IP_ROUTE_CLASSID
2359 rth->dst.tclassid = itag;
2360#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002361
David S. Miller5e2b61f2011-03-04 21:47:09 -08002362 rth->rt_key_dst = daddr;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002363 rth->rt_key_src = saddr;
David S. Millercf911662011-04-28 14:31:47 -07002364 rth->rt_genid = rt_genid(net);
2365 rth->rt_flags = flags|RTCF_LOCAL;
2366 rth->rt_type = res.type;
David S. Miller475949d2011-05-03 19:45:15 -07002367 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07002368 rth->rt_dst = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002369 rth->rt_src = saddr;
Patrick McHardyc7066f72011-01-14 13:36:42 +01002370#ifdef CONFIG_IP_ROUTE_CLASSID
Changli Gaod8d1f302010-06-10 23:31:35 -07002371 rth->dst.tclassid = itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002372#endif
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002373 rth->rt_route_iif = dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002374 rth->rt_iif = dev->ifindex;
David S. Millercf911662011-04-28 14:31:47 -07002375 rth->rt_oif = 0;
2376 rth->rt_mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002377 rth->rt_gateway = daddr;
2378 rth->rt_spec_dst= spec_dst;
David S. Millercf911662011-04-28 14:31:47 -07002379 rth->rt_peer_genid = 0;
2380 rth->peer = NULL;
2381 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002382 if (res.type == RTN_UNREACHABLE) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002383 rth->dst.input= ip_error;
2384 rth->dst.error= -err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002385 rth->rt_flags &= ~RTCF_LOCAL;
2386 }
David S. Miller68a5e3d2011-03-11 20:07:33 -05002387 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2388 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002389 err = 0;
2390 if (IS_ERR(rth))
2391 err = PTR_ERR(rth);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002392 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002393
2394no_route:
2395 RT_CACHE_STAT_INC(in_no_route);
2396 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2397 res.type = RTN_UNREACHABLE;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08002398 if (err == -ESRCH)
2399 err = -ENETUNREACH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002400 goto local_input;
2401
2402 /*
2403 * Do not cache martian addresses: they should be logged (RFC1812)
2404 */
2405martian_destination:
2406 RT_CACHE_STAT_INC(in_martian_dst);
2407#ifdef CONFIG_IP_ROUTE_VERBOSE
2408 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
Joe Perches058bd4d2012-03-11 18:36:11 +00002409 pr_warn("martian destination %pI4 from %pI4, dev %s\n",
Harvey Harrison673d57e2008-10-31 00:53:57 -07002410 &daddr, &saddr, dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002411#endif
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002412
2413e_hostunreach:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002414 err = -EHOSTUNREACH;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002415 goto out;
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002416
Linus Torvalds1da177e2005-04-16 15:20:36 -07002417e_inval:
2418 err = -EINVAL;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002419 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002420
2421e_nobufs:
2422 err = -ENOBUFS;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002423 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002424
2425martian_source:
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002426 err = -EINVAL;
2427martian_source_keep_err:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002428 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002429 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002430}

int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable *rth;
	unsigned int hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_route_iif ^ iif) |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic was moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicast
	   network acquires a lot of useless route cache entries, e.g. for
	   SDR messages from all over the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), this does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
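
/*
 * Typical call site (illustrative sketch, not part of the original file):
 * the receive path resolves the input route through the inline wrappers
 * around ip_route_input_common(), roughly as ip_rcv_finish() does:
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				       iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;	// e.g. martian source, no route
 *
 * The noref variant skips the dst refcount for the common case where the
 * skb is consumed within the RCU read-side section that protects the
 * cached route.
 */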

/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, __u8 orig_rtos,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
		return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist, use
		 * the default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_key_dst	= orig_daddr;
	rth->rt_key_src	= orig_saddr;
	rth->rt_genid	= rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_key_tos	= orig_rtos;
	rth->rt_dst	= fl4->daddr;
	rth->rt_src	= fl4->saddr;
	rth->rt_route_iif = 0;
	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
	rth->rt_oif	= orig_oif;
	rth->rt_mark	= fl4->flowi4_mark;
	rth->rt_gateway	= fl4->daddr;
	rth->rt_spec_dst = fl4->saddr;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl4->daddr;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl4->saddr;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	return rth;
}
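
/*
 * Summary of what __mkroute_output() wires up (descriptive note, added):
 * the route type selects the dst handlers, roughly as follows:
 *
 *	default (unicast)	-> dst.output = ip_output
 *	RTCF_LOCAL		-> dst.input  = ip_local_deliver
 *	bcast/mcast, not lo	-> dst.output = ip_mc_output
 *	RTN_MULTICAST + mroute	-> dst.input  = ip_mr_input
 */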

/*
 * Major route resolver routine.
 * Called with rcu_read_lock().
 */

static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	__u8 tos = RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	__be32 orig_daddr;
	__be32 orig_saddr;
	int orig_oif;

	res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	orig_daddr = fl4->daddr;
	orig_saddr = fl4->saddr;
	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = net->loopback_dev->ifindex;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			     RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface if
		      saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with a saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: the user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source addr (look,
			   the routing cache cannot know that ttl is zero, so
			   the packet will not leave this host and the route
			   is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, the routing tables are wrong. Assume
			   that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, the routing
			   tables are looked up with only one purpose:
			   to catch whether the destination is gatewayed
			   rather than direct. Moreover, if MSG_DONTROUTE
			   is set, we send the packet, ignoring both the
			   routing tables and the ifaddr state. --ANK

			   We could do this even when oif is unknown,
			   as IPv6 likely does, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;


make_route:
	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
			       tos, dev_out, flags);
	if (!IS_ERR(rth)) {
		unsigned int hash;

		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
			       rt_genid(dev_net(dev_out)));
		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
	}

out:
	rcu_read_unlock();
	return rth;
}
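
/*
 * Resolution order in ip_route_output_slow(), as a descriptive note
 * (added): validate any caller-supplied saddr, honour an explicit oif,
 * fall back to a loopback/local route when daddr is missing or local,
 * otherwise consult the FIB (with multipath and default-route
 * selection), and finally materialize and hash the cache entry in
 * __mkroute_output().
 */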

struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
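
/*
 * Illustrative caller sketch (added; not part of the original file):
 * resolving an output route for a destination via the
 * ip_route_output_key() wrapper around this function:
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= dst_ip,	// hypothetical __be32
 *		.flowi4_tos	= RT_TOS(tos),
 *		.flowi4_oif	= 0,		// let the FIB pick the device
 *	};
 *	struct rtable *rt = ip_route_output_key(net, &fl4);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	// ... use rt->dst, then drop the reference:
 *	ip_rt_put(rt);
 *
 * On success the flowi4 is updated in place (saddr/daddr filled in),
 * which is why callers pass it by pointer.
 */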

static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}
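
/*
 * Note (added): "mtu ? : dst->dev->mtu" is the GCC conditional-with-
 * omitted-operand extension, equivalent to "mtu ? mtu : dst->dev->mtu";
 * a raw RTAX_MTU metric of 0 means "unset", so we fall back to the
 * device MTU.
 */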

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			= AF_INET,
	.protocol		= cpu_to_be16(ETH_P_IP),
	.destroy		= ipv4_dst_destroy,
	.check			= ipv4_blackhole_dst_check,
	.mtu			= ipv4_blackhole_mtu,
	.default_advmss		= ipv4_default_advmss,
	.update_pmtu		= ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		= ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		= ipv4_neigh_lookup,
};

struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
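
/*
 * Background note (added): a blackhole route clones the flow-matching
 * fields of a real route but discards every packet, since both input
 * and output are dst_discard.  xfrm uses this when a lookup must return
 * a valid dst while IPsec SA negotiation is still in progress, so the
 * traffic is silently dropped instead of being sent unprotected.
 */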

struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
						   flowi4_to_flowi(flp4),
						   sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
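
/*
 * Note (added): the xfrm_lookup() step above only runs when
 * flowi4_proto is set, so callers that fill in the protocol get a dst
 * with any matching IPsec transformation applied, while protocol-less
 * lookups bypass the policy check entirely.
 */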
2952
Benjamin Thery4feb88e2009-01-22 04:56:23 +00002953static int rt_fill_info(struct net *net,
2954 struct sk_buff *skb, u32 pid, u32 seq, int event,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002955 int nowait, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002956{
Eric Dumazet511c3f92009-06-02 05:14:27 +00002957 struct rtable *rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002958 struct rtmsg *r;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002959 struct nlmsghdr *nlh;
Steffen Klassert2bc8ca42011-10-11 01:12:02 +00002960 unsigned long expires = 0;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00002961 const struct inet_peer *peer = rt->peer;
Thomas Grafe3703b32006-11-27 09:27:07 -08002962 u32 id = 0, ts = 0, tsage = 0, error;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002963
2964 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2965 if (nlh == NULL)
Patrick McHardy26932562007-01-31 23:16:40 -08002966 return -EMSGSIZE;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002967
2968 r = nlmsg_data(nlh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002969 r->rtm_family = AF_INET;
2970 r->rtm_dst_len = 32;
2971 r->rtm_src_len = 0;
David S. Miller475949d2011-05-03 19:45:15 -07002972 r->rtm_tos = rt->rt_key_tos;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002973 r->rtm_table = RT_TABLE_MAIN;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002974 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002975 r->rtm_type = rt->rt_type;
2976 r->rtm_scope = RT_SCOPE_UNIVERSE;
2977 r->rtm_protocol = RTPROT_UNSPEC;
2978 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2979 if (rt->rt_flags & RTCF_NOTIFY)
2980 r->rtm_flags |= RTM_F_NOTIFY;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002981
Al Viro17fb2c62006-09-26 22:15:25 -07002982 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002983
David S. Miller5e2b61f2011-03-04 21:47:09 -08002984 if (rt->rt_key_src) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002985 r->rtm_src_len = 32;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002986 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002987 }
Changli Gaod8d1f302010-06-10 23:31:35 -07002988 if (rt->dst.dev)
2989 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
Patrick McHardyc7066f72011-01-14 13:36:42 +01002990#ifdef CONFIG_IP_ROUTE_CLASSID
Changli Gaod8d1f302010-06-10 23:31:35 -07002991 if (rt->dst.tclassid)
2992 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002993#endif
David S. Millerc7537962010-11-11 17:07:48 -08002994 if (rt_is_input_route(rt))
Al Viro17fb2c62006-09-26 22:15:25 -07002995 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
David S. Miller5e2b61f2011-03-04 21:47:09 -08002996 else if (rt->rt_src != rt->rt_key_src)
Al Viro17fb2c62006-09-26 22:15:25 -07002997 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002998
Linus Torvalds1da177e2005-04-16 15:20:36 -07002999 if (rt->rt_dst != rt->rt_gateway)
Al Viro17fb2c62006-09-26 22:15:25 -07003000 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
Thomas Grafbe403ea2006-08-17 18:15:17 -07003001
David S. Millerdefb3512010-12-08 21:16:57 -08003002 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
Thomas Grafbe403ea2006-08-17 18:15:17 -07003003 goto nla_put_failure;
3004
David S. Miller5e2b61f2011-03-04 21:47:09 -08003005 if (rt->rt_mark)
3006 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
Eric Dumazet963bfee2010-07-20 22:03:14 +00003007
Changli Gaod8d1f302010-06-10 23:31:35 -07003008 error = rt->dst.error;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00003009 if (peer) {
Eric Dumazet317fe0e2010-06-16 04:52:13 +00003010 inet_peer_refcheck(rt->peer);
Eric Dumazetfe6fe792011-06-08 06:07:07 +00003011 id = atomic_read(&peer->ip_id_count) & 0xffff;
3012 if (peer->tcp_ts_stamp) {
3013 ts = peer->tcp_ts;
3014 tsage = get_seconds() - peer->tcp_ts_stamp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003015 }
Eric Dumazetfe6fe792011-06-08 06:07:07 +00003016 expires = ACCESS_ONCE(peer->pmtu_expires);
Steffen Klassert2bc8ca42011-10-11 01:12:02 +00003017 if (expires) {
3018 if (time_before(jiffies, expires))
3019 expires -= jiffies;
3020 else
3021 expires = 0;
3022 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003023 }
Thomas Grafbe403ea2006-08-17 18:15:17 -07003024
David S. Millerc7537962010-11-11 17:07:48 -08003025 if (rt_is_input_route(rt)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003026#ifdef CONFIG_IP_MROUTE
Al Viroe4485152006-09-26 22:15:01 -07003027 __be32 dst = rt->rt_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003028
Joe Perchesf97c1e02007-12-16 13:45:43 -08003029 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
Benjamin Thery4feb88e2009-01-22 04:56:23 +00003030 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
David S. Miller9a1b9492011-05-04 12:18:54 -07003031 int err = ipmr_get_route(net, skb,
3032 rt->rt_src, rt->rt_dst,
3033 r, nowait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003034 if (err <= 0) {
3035 if (!nowait) {
3036 if (err == 0)
3037 return 0;
Thomas Grafbe403ea2006-08-17 18:15:17 -07003038 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003039 } else {
3040 if (err == -EMSGSIZE)
Thomas Grafbe403ea2006-08-17 18:15:17 -07003041 goto nla_put_failure;
Thomas Grafe3703b32006-11-27 09:27:07 -08003042 error = err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003043 }
3044 }
3045 } else
3046#endif
David S. Miller5e2b61f2011-03-04 21:47:09 -08003047 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003048 }
3049
Changli Gaod8d1f302010-06-10 23:31:35 -07003050 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
Thomas Grafe3703b32006-11-27 09:27:07 -08003051 expires, error) < 0)
3052 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003053
Thomas Grafbe403ea2006-08-17 18:15:17 -07003054 return nlmsg_end(skb, nlh);
3055
3056nla_put_failure:
Patrick McHardy26932562007-01-31 23:16:40 -08003057 nlmsg_cancel(skb, nlh);
3058 return -EMSGSIZE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003059}
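
/*
 * Note (added): the NLA_PUT_* macros used above hide a conditional
 * "goto nla_put_failure" on insufficient skb tailroom, which is why
 * rt_fill_info() can bail out to that label with no visible jump at the
 * call sites.
 */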

static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	 * through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
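
/*
 * Usage note (added): this handler is what services "ip route get";
 * e.g. "ip route get 8.8.8.8" sends an RTM_GETROUTE request whose
 * answer is built by rt_fill_info() above.  Supplying RTA_IIF makes
 * the kernel exercise the input path (ip_route_input) instead of the
 * output path.
 */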

int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}

#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
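
/*
 * Usage note (added): this handler backs /proc/sys/net/ipv4/route/flush
 * (see ipv4_route_flush_table below); writing a delay value, e.g.
 * "echo 0 > /proc/sys/net/ipv4/route/flush", flushes the routing cache
 * for the owning netns.  The file is write-only (mode 0200), so reads
 * return -EINVAL.
 */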

static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};

static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};


#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
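
/*
 * Usage note (added): "rhash_entries=N" on the kernel command line
 * overrides the automatically sized route cache hash table; see the
 * alloc_large_system_hash() call in ip_rt_init() below, where a zero
 * value lets the allocator size the table from available memory.
 */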

int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif