blob: cb7efe0567f096b8c0e67a07f942e77aea3c2fc2 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
Jesper Juhl02c30a82005-05-05 16:16:16 -07008 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -07009 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090021 * Alan Cox : Super /proc >4K
Linus Torvalds1da177e2005-04-16 15:20:36 -070022 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090039 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070055 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080056 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070058 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
Linus Torvalds1da177e2005-04-16 15:20:36 -070065#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070071#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070072#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070073#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
Eric Dumazet39c90ec2007-09-15 10:55:54 -070082#include <linux/workqueue.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070083#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070084#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090093#include <linux/slab.h>
Herbert Xu352e5122007-11-13 21:34:06 -080094#include <net/dst.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020095#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070096#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700106#include <net/netevent.h>
Thomas Graf63f34442007-03-22 11:55:17 -0700107#include <net/rtnetlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700108#ifdef CONFIG_SYSCTL
109#include <linux/sysctl.h>
110#endif
David Miller3769cff2011-07-11 22:44:24 +0000111#include <net/atmclip.h>
David S. Miller6e5714e2011-08-03 20:50:44 -0700112#include <net/secure_seq.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700113
/*
 * Extract from the flow key pointed to by @oldflp4 the bits of flowi4_tos
 * selected by (IPTOS_RT_MASK | RTO_ONLINK), as a u32.
 *
 * The argument is parenthesized so that any pointer-valued expression
 * (not just a plain identifier) can be passed safely.
 */
#define RT_FL_TOS(oldflp4) \
	((u32)((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700116
117#define IP_MAX_MTU 0xFFF0
118
119#define RT_GC_TIMEOUT (300*HZ)
120
Linus Torvalds1da177e2005-04-16 15:20:36 -0700121static int ip_rt_max_size;
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700122static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
123static int ip_rt_gc_interval __read_mostly = 60 * HZ;
124static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
125static int ip_rt_redirect_number __read_mostly = 9;
126static int ip_rt_redirect_load __read_mostly = HZ / 50;
127static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128static int ip_rt_error_cost __read_mostly = HZ;
129static int ip_rt_error_burst __read_mostly = 5 * HZ;
130static int ip_rt_gc_elasticity __read_mostly = 8;
131static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
132static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
133static int ip_rt_min_advmss __read_mostly = 256;
Neil Horman1080d702008-10-27 12:28:25 -0700134static int rt_chain_length_max __read_mostly = 20;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700135
Linus Torvalds1da177e2005-04-16 15:20:36 -0700136/*
137 * Interface to generic destination cache.
138 */
139
140static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
David S. Miller0dbaee32010-12-13 12:52:14 -0800141static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
David S. Millerd33e4552010-12-14 13:01:14 -0800142static unsigned int ipv4_default_mtu(const struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700143static void ipv4_dst_destroy(struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700144static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
145static void ipv4_link_failure(struct sk_buff *skb);
146static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
Daniel Lezcano569d3642008-01-18 03:56:57 -0800147static int rt_garbage_collect(struct dst_ops *ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700148
/*
 * Callback installed as ipv4_dst_ops.ifdown.  It deliberately does nothing:
 * all three arguments are ignored.
 */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	/* intentionally empty */
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700153
David S. Miller62fa8a82011-01-26 20:51:05 -0800154static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
155{
David S. Miller06582542011-01-27 14:58:42 -0800156 struct rtable *rt = (struct rtable *) dst;
157 struct inet_peer *peer;
158 u32 *p = NULL;
David S. Miller62fa8a82011-01-26 20:51:05 -0800159
David S. Miller06582542011-01-27 14:58:42 -0800160 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -0400161 rt_bind_peer(rt, rt->rt_dst, 1);
David S. Miller06582542011-01-27 14:58:42 -0800162
163 peer = rt->peer;
164 if (peer) {
David S. Miller62fa8a82011-01-26 20:51:05 -0800165 u32 *old_p = __DST_METRICS_PTR(old);
166 unsigned long prev, new;
167
David S. Miller06582542011-01-27 14:58:42 -0800168 p = peer->metrics;
169 if (inet_metrics_new(peer))
170 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
David S. Miller62fa8a82011-01-26 20:51:05 -0800171
172 new = (unsigned long) p;
173 prev = cmpxchg(&dst->_metrics, old, new);
174
175 if (prev != old) {
David S. Miller62fa8a82011-01-26 20:51:05 -0800176 p = __DST_METRICS_PTR(prev);
177 if (prev & DST_METRICS_READ_ONLY)
178 p = NULL;
179 } else {
David S. Miller62fa8a82011-01-26 20:51:05 -0800180 if (rt->fi) {
181 fib_info_put(rt->fi);
182 rt->fi = NULL;
183 }
184 }
185 }
186 return p;
187}
188
David S. Millerd3aaeb32011-07-18 00:40:17 -0700189static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
190
Linus Torvalds1da177e2005-04-16 15:20:36 -0700191static struct dst_ops ipv4_dst_ops = {
192 .family = AF_INET,
Harvey Harrison09640e632009-02-01 00:45:17 -0800193 .protocol = cpu_to_be16(ETH_P_IP),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700194 .gc = rt_garbage_collect,
195 .check = ipv4_dst_check,
David S. Miller0dbaee32010-12-13 12:52:14 -0800196 .default_advmss = ipv4_default_advmss,
David S. Millerd33e4552010-12-14 13:01:14 -0800197 .default_mtu = ipv4_default_mtu,
David S. Miller62fa8a82011-01-26 20:51:05 -0800198 .cow_metrics = ipv4_cow_metrics,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700199 .destroy = ipv4_dst_destroy,
200 .ifdown = ipv4_dst_ifdown,
201 .negative_advice = ipv4_negative_advice,
202 .link_failure = ipv4_link_failure,
203 .update_pmtu = ip_rt_update_pmtu,
Herbert Xu1ac06e02008-05-20 14:32:14 -0700204 .local_out = __ip_local_out,
David S. Millerd3aaeb32011-07-18 00:40:17 -0700205 .neigh_lookup = ipv4_neigh_lookup,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700206};
207
208#define ECN_OR_COST(class) TC_PRIO_##class
209
Philippe De Muyter4839c522007-07-09 15:32:57 -0700210const __u8 ip_tos2prio[16] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700211 TC_PRIO_BESTEFFORT,
Dan Siemon4a2b9c32011-03-15 13:56:07 +0000212 ECN_OR_COST(BESTEFFORT),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700213 TC_PRIO_BESTEFFORT,
214 ECN_OR_COST(BESTEFFORT),
215 TC_PRIO_BULK,
216 ECN_OR_COST(BULK),
217 TC_PRIO_BULK,
218 ECN_OR_COST(BULK),
219 TC_PRIO_INTERACTIVE,
220 ECN_OR_COST(INTERACTIVE),
221 TC_PRIO_INTERACTIVE,
222 ECN_OR_COST(INTERACTIVE),
223 TC_PRIO_INTERACTIVE_BULK,
224 ECN_OR_COST(INTERACTIVE_BULK),
225 TC_PRIO_INTERACTIVE_BULK,
226 ECN_OR_COST(INTERACTIVE_BULK)
227};
228
229
230/*
231 * Route cache.
232 */
233
234/* The locking scheme is rather straight forward:
235 *
236 * 1) Read-Copy Update protects the buckets of the central route hash.
237 * 2) Only writers remove entries, and they hold the lock
238 * as they look at rtable reference counts.
239 * 3) Only readers acquire references to rtable entries,
240 * they do so with atomic increments and with the
241 * lock held.
242 */
243
244struct rt_hash_bucket {
Eric Dumazet1c317202010-10-25 21:02:07 +0000245 struct rtable __rcu *chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700246};
Neil Horman1080d702008-10-27 12:28:25 -0700247
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
 * (With lockdep spinlock_t is quite big, so the table is kept small there.)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;

/*
 * Map a hash slot to the spinlock that covers it.  RT_HASH_LOCK_SZ is a
 * power of two, so masking with (RT_HASH_LOCK_SZ - 1) folds the slot into
 * the lock table.  The whole expansion is parenthesized so the macro can be
 * used safely inside larger expressions.
 */
# define rt_hash_lock_addr(slot) \
	(&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/*
 * Allocate the table of RT_HASH_LOCK_SZ spinlocks and initialize each one.
 * Panics if the allocation fails.
 */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(*rt_hash_locks) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
/* No lock table in this configuration: every slot maps to NULL. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700293
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700294static struct rt_hash_bucket *rt_hash_table __read_mostly;
295static unsigned rt_hash_mask __read_mostly;
296static unsigned int rt_hash_log __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700297
Eric Dumazet2f970d82006-01-17 02:54:36 -0800298static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
Eric Dumazet27f39c73e2010-05-19 22:07:23 +0000299#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700300
Denis V. Lunevb00180d2008-07-05 19:04:09 -0700301static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700302 int genid)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700303{
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700304 return jhash_3words((__force u32)daddr, (__force u32)saddr,
Denis V. Lunevb00180d2008-07-05 19:04:09 -0700305 idx, genid)
Eric Dumazet29e75252008-01-31 17:05:09 -0800306 & rt_hash_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700307}
308
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700309static inline int rt_genid(struct net *net)
310{
311 return atomic_read(&net->ipv4.rt_genid);
312}
313
Linus Torvalds1da177e2005-04-16 15:20:36 -0700314#ifdef CONFIG_PROC_FS
315struct rt_cache_iter_state {
Denis V. Luneva75e9362008-02-28 20:50:55 -0800316 struct seq_net_private p;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700317 int bucket;
Eric Dumazet29e75252008-01-31 17:05:09 -0800318 int genid;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700319};
320
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900321static struct rtable *rt_cache_get_first(struct seq_file *seq)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700322{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900323 struct rt_cache_iter_state *st = seq->private;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700324 struct rtable *r = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700325
326 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
Eric Dumazet1c317202010-10-25 21:02:07 +0000327 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
Eric Dumazeta6272662008-08-28 01:11:25 -0700328 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700329 rcu_read_lock_bh();
Paul E. McKenneya898def2010-02-22 17:04:49 -0800330 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
Eric Dumazet29e75252008-01-31 17:05:09 -0800331 while (r) {
Changli Gaod8d1f302010-06-10 23:31:35 -0700332 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
Denis V. Luneva75e9362008-02-28 20:50:55 -0800333 r->rt_genid == st->genid)
Eric Dumazet29e75252008-01-31 17:05:09 -0800334 return r;
Changli Gaod8d1f302010-06-10 23:31:35 -0700335 r = rcu_dereference_bh(r->dst.rt_next);
Eric Dumazet29e75252008-01-31 17:05:09 -0800336 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700337 rcu_read_unlock_bh();
338 }
Eric Dumazet29e75252008-01-31 17:05:09 -0800339 return r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700340}
341
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900342static struct rtable *__rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800343 struct rtable *r)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700344{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900345 struct rt_cache_iter_state *st = seq->private;
Eric Dumazeta6272662008-08-28 01:11:25 -0700346
Eric Dumazet1c317202010-10-25 21:02:07 +0000347 r = rcu_dereference_bh(r->dst.rt_next);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700348 while (!r) {
349 rcu_read_unlock_bh();
Eric Dumazeta6272662008-08-28 01:11:25 -0700350 do {
351 if (--st->bucket < 0)
352 return NULL;
Eric Dumazet1c317202010-10-25 21:02:07 +0000353 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700354 rcu_read_lock_bh();
Eric Dumazet1c317202010-10-25 21:02:07 +0000355 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700356 }
Eric Dumazet1c317202010-10-25 21:02:07 +0000357 return r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700358}
359
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900360static struct rtable *rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800361 struct rtable *r)
362{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900363 struct rt_cache_iter_state *st = seq->private;
364 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
Changli Gaod8d1f302010-06-10 23:31:35 -0700365 if (dev_net(r->dst.dev) != seq_file_net(seq))
Denis V. Luneva75e9362008-02-28 20:50:55 -0800366 continue;
Denis V. Lunev642d6312008-02-28 20:50:33 -0800367 if (r->rt_genid == st->genid)
368 break;
369 }
370 return r;
371}
372
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900373static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700374{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900375 struct rtable *r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700376
377 if (r)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900378 while (pos && (r = rt_cache_get_next(seq, r)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700379 --pos;
380 return pos ? NULL : r;
381}
382
383static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
384{
Eric Dumazet29e75252008-01-31 17:05:09 -0800385 struct rt_cache_iter_state *st = seq->private;
Eric Dumazet29e75252008-01-31 17:05:09 -0800386 if (*pos)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900387 return rt_cache_get_idx(seq, *pos - 1);
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700388 st->genid = rt_genid(seq_file_net(seq));
Eric Dumazet29e75252008-01-31 17:05:09 -0800389 return SEQ_START_TOKEN;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700390}
391
392static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
393{
Eric Dumazet29e75252008-01-31 17:05:09 -0800394 struct rtable *r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700395
396 if (v == SEQ_START_TOKEN)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900397 r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700398 else
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900399 r = rt_cache_get_next(seq, v);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700400 ++*pos;
401 return r;
402}
403
404static void rt_cache_seq_stop(struct seq_file *seq, void *v)
405{
406 if (v && v != SEQ_START_TOKEN)
407 rcu_read_unlock_bh();
408}
409
410static int rt_cache_seq_show(struct seq_file *seq, void *v)
411{
412 if (v == SEQ_START_TOKEN)
413 seq_printf(seq, "%-127s\n",
414 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
415 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
416 "HHUptod\tSpecDst");
417 else {
418 struct rtable *r = v;
David S. Miller69cce1d2011-07-17 23:09:49 -0700419 struct neighbour *n;
Pavel Emelyanov5e659e42008-04-24 01:02:16 -0700420 int len;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700421
David S. Miller69cce1d2011-07-17 23:09:49 -0700422 n = dst_get_neighbour(&r->dst);
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700423 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
424 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
Changli Gaod8d1f302010-06-10 23:31:35 -0700425 r->dst.dev ? r->dst.dev->name : "*",
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700426 (__force u32)r->rt_dst,
427 (__force u32)r->rt_gateway,
Changli Gaod8d1f302010-06-10 23:31:35 -0700428 r->rt_flags, atomic_read(&r->dst.__refcnt),
429 r->dst.__use, 0, (__force u32)r->rt_src,
David S. Miller0dbaee32010-12-13 12:52:14 -0800430 dst_metric_advmss(&r->dst) + 40,
Changli Gaod8d1f302010-06-10 23:31:35 -0700431 dst_metric(&r->dst, RTAX_WINDOW),
432 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
433 dst_metric(&r->dst, RTAX_RTTVAR)),
David S. Miller475949d2011-05-03 19:45:15 -0700434 r->rt_key_tos,
David S. Millerf6b72b622011-07-14 07:53:20 -0700435 -1,
David S. Miller69cce1d2011-07-17 23:09:49 -0700436 (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0,
Pavel Emelyanov5e659e42008-04-24 01:02:16 -0700437 r->rt_spec_dst, &len);
438
439 seq_printf(seq, "%*s\n", 127 - len, "");
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900440 }
441 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700442}
443
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700444static const struct seq_operations rt_cache_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700445 .start = rt_cache_seq_start,
446 .next = rt_cache_seq_next,
447 .stop = rt_cache_seq_stop,
448 .show = rt_cache_seq_show,
449};
450
451static int rt_cache_seq_open(struct inode *inode, struct file *file)
452{
Denis V. Luneva75e9362008-02-28 20:50:55 -0800453 return seq_open_net(inode, file, &rt_cache_seq_ops,
Pavel Emelyanovcf7732e2007-10-10 02:29:29 -0700454 sizeof(struct rt_cache_iter_state));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700455}
456
Arjan van de Ven9a321442007-02-12 00:55:35 -0800457static const struct file_operations rt_cache_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700458 .owner = THIS_MODULE,
459 .open = rt_cache_seq_open,
460 .read = seq_read,
461 .llseek = seq_lseek,
Denis V. Luneva75e9362008-02-28 20:50:55 -0800462 .release = seq_release_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700463};
464
465
466static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
467{
468 int cpu;
469
470 if (*pos == 0)
471 return SEQ_START_TOKEN;
472
Rusty Russell0f23174a2008-12-29 12:23:42 +0000473 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700474 if (!cpu_possible(cpu))
475 continue;
476 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800477 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700478 }
479 return NULL;
480}
481
482static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
483{
484 int cpu;
485
Rusty Russell0f23174a2008-12-29 12:23:42 +0000486 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700487 if (!cpu_possible(cpu))
488 continue;
489 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800490 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700491 }
492 return NULL;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900493
Linus Torvalds1da177e2005-04-16 15:20:36 -0700494}
495
/* seq_file ->stop for the per-CPU statistics file: nothing to undo. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
	/* intentionally empty */
}
500
501static int rt_cpu_seq_show(struct seq_file *seq, void *v)
502{
503 struct rt_cache_stat *st = v;
504
505 if (v == SEQ_START_TOKEN) {
Olaf Rempel5bec0032005-04-28 12:16:08 -0700506 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700507 return 0;
508 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900509
Linus Torvalds1da177e2005-04-16 15:20:36 -0700510 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
511 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
Eric Dumazetfc66f952010-10-08 06:37:34 +0000512 dst_entries_get_slow(&ipv4_dst_ops),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700513 st->in_hit,
514 st->in_slow_tot,
515 st->in_slow_mc,
516 st->in_no_route,
517 st->in_brd,
518 st->in_martian_dst,
519 st->in_martian_src,
520
521 st->out_hit,
522 st->out_slow_tot,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900523 st->out_slow_mc,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700524
525 st->gc_total,
526 st->gc_ignored,
527 st->gc_goal_miss,
528 st->gc_dst_overflow,
529 st->in_hlist_search,
530 st->out_hlist_search
531 );
532 return 0;
533}
534
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700535static const struct seq_operations rt_cpu_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700536 .start = rt_cpu_seq_start,
537 .next = rt_cpu_seq_next,
538 .stop = rt_cpu_seq_stop,
539 .show = rt_cpu_seq_show,
540};
541
542
543static int rt_cpu_seq_open(struct inode *inode, struct file *file)
544{
545 return seq_open(file, &rt_cpu_seq_ops);
546}
547
Arjan van de Ven9a321442007-02-12 00:55:35 -0800548static const struct file_operations rt_cpu_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700549 .owner = THIS_MODULE,
550 .open = rt_cpu_seq_open,
551 .read = seq_read,
552 .llseek = seq_lseek,
553 .release = seq_release,
554};
555
Patrick McHardyc7066f72011-01-14 13:36:42 +0100556#ifdef CONFIG_IP_ROUTE_CLASSID
Alexey Dobriyana661c412009-11-25 15:40:35 -0800557static int rt_acct_proc_show(struct seq_file *m, void *v)
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800558{
Alexey Dobriyana661c412009-11-25 15:40:35 -0800559 struct ip_rt_acct *dst, *src;
560 unsigned int i, j;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800561
Alexey Dobriyana661c412009-11-25 15:40:35 -0800562 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
563 if (!dst)
564 return -ENOMEM;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800565
Alexey Dobriyana661c412009-11-25 15:40:35 -0800566 for_each_possible_cpu(i) {
567 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
568 for (j = 0; j < 256; j++) {
569 dst[j].o_bytes += src[j].o_bytes;
570 dst[j].o_packets += src[j].o_packets;
571 dst[j].i_bytes += src[j].i_bytes;
572 dst[j].i_packets += src[j].i_packets;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800573 }
574 }
Alexey Dobriyana661c412009-11-25 15:40:35 -0800575
576 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
577 kfree(dst);
578 return 0;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800579}
Alexey Dobriyana661c412009-11-25 15:40:35 -0800580
581static int rt_acct_proc_open(struct inode *inode, struct file *file)
582{
583 return single_open(file, rt_acct_proc_show, NULL);
584}
585
586static const struct file_operations rt_acct_proc_fops = {
587 .owner = THIS_MODULE,
588 .open = rt_acct_proc_open,
589 .read = seq_read,
590 .llseek = seq_lseek,
591 .release = single_release,
592};
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800593#endif
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800594
Denis V. Lunev73b38712008-02-28 20:51:18 -0800595static int __net_init ip_rt_do_proc_init(struct net *net)
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800596{
597 struct proc_dir_entry *pde;
598
599 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
600 &rt_cache_seq_fops);
601 if (!pde)
602 goto err1;
603
Wang Chen77020722008-02-28 14:14:25 -0800604 pde = proc_create("rt_cache", S_IRUGO,
605 net->proc_net_stat, &rt_cpu_seq_fops);
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800606 if (!pde)
607 goto err2;
608
Patrick McHardyc7066f72011-01-14 13:36:42 +0100609#ifdef CONFIG_IP_ROUTE_CLASSID
Alexey Dobriyana661c412009-11-25 15:40:35 -0800610 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800611 if (!pde)
612 goto err3;
613#endif
614 return 0;
615
Patrick McHardyc7066f72011-01-14 13:36:42 +0100616#ifdef CONFIG_IP_ROUTE_CLASSID
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800617err3:
618 remove_proc_entry("rt_cache", net->proc_net_stat);
619#endif
620err2:
621 remove_proc_entry("rt_cache", net->proc_net);
622err1:
623 return -ENOMEM;
624}
Denis V. Lunev73b38712008-02-28 20:51:18 -0800625
626static void __net_exit ip_rt_do_proc_exit(struct net *net)
627{
628 remove_proc_entry("rt_cache", net->proc_net_stat);
629 remove_proc_entry("rt_cache", net->proc_net);
Patrick McHardyc7066f72011-01-14 13:36:42 +0100630#ifdef CONFIG_IP_ROUTE_CLASSID
Denis V. Lunev73b38712008-02-28 20:51:18 -0800631 remove_proc_entry("rt_acct", net->proc_net);
Alexey Dobriyan0a931ac2010-01-17 03:32:50 +0000632#endif
Denis V. Lunev73b38712008-02-28 20:51:18 -0800633}
634
635static struct pernet_operations ip_rt_proc_ops __net_initdata = {
636 .init = ip_rt_do_proc_init,
637 .exit = ip_rt_do_proc_exit,
638};
639
640static int __init ip_rt_proc_init(void)
641{
642 return register_pernet_subsys(&ip_rt_proc_ops);
643}
644
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800645#else
/* Without CONFIG_PROC_FS there is nothing to register: report success. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700650#endif /* CONFIG_PROC_FS */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900651
Stephen Hemminger5969f712008-04-10 01:52:09 -0700652static inline void rt_free(struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700653{
Changli Gaod8d1f302010-06-10 23:31:35 -0700654 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700655}
656
/*
 * Drop the caller's reference on @rt with ip_rt_put() and then queue the
 * entry for freeing, exactly as rt_free() does.
 */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	rt_free(rt);
}
662
Stephen Hemminger5969f712008-04-10 01:52:09 -0700663static inline int rt_fast_clean(struct rtable *rth)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700664{
665 /* Kill broadcast/multicast entries very aggresively, if they
666 collide in hash table with more useful entries */
667 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
David S. Millerc7537962010-11-11 17:07:48 -0800668 rt_is_input_route(rth) && rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700669}
670
Stephen Hemminger5969f712008-04-10 01:52:09 -0700671static inline int rt_valuable(struct rtable *rth)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700672{
673 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
David S. Miller2c8cec52011-02-09 20:42:07 -0800674 (rth->peer && rth->peer->pmtu_expires);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700675}
676
677static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
678{
679 unsigned long age;
680 int ret = 0;
681
Changli Gaod8d1f302010-06-10 23:31:35 -0700682 if (atomic_read(&rth->dst.__refcnt))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700683 goto out;
684
Changli Gaod8d1f302010-06-10 23:31:35 -0700685 age = jiffies - rth->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700686 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
687 (age <= tmo2 && rt_valuable(rth)))
688 goto out;
689 ret = 1;
690out: return ret;
691}
692
693/* Bits of score are:
694 * 31: very valuable
695 * 30: not quite useless
696 * 29..0: usage counter
697 */
698static inline u32 rt_score(struct rtable *rt)
699{
Changli Gaod8d1f302010-06-10 23:31:35 -0700700 u32 score = jiffies - rt->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700701
702 score = ~score & ~(3<<30);
703
704 if (rt_valuable(rt))
705 score |= (1<<31);
706
David S. Millerc7537962010-11-11 17:07:48 -0800707 if (rt_is_output_route(rt) ||
Linus Torvalds1da177e2005-04-16 15:20:36 -0700708 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
709 score |= (1<<30);
710
711 return score;
712}
713
Neil Horman1080d702008-10-27 12:28:25 -0700714static inline bool rt_caching(const struct net *net)
715{
716 return net->ipv4.current_rt_cache_rebuild_count <=
717 net->ipv4.sysctl_rt_cache_rebuild_count;
718}
719
David S. Miller5e2b61f2011-03-04 21:47:09 -0800720static inline bool compare_hash_inputs(const struct rtable *rt1,
721 const struct rtable *rt2)
Neil Horman1080d702008-10-27 12:28:25 -0700722{
David S. Miller5e2b61f2011-03-04 21:47:09 -0800723 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
724 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
725 (rt1->rt_iif ^ rt2->rt_iif)) == 0);
Neil Horman1080d702008-10-27 12:28:25 -0700726}
727
David S. Miller5e2b61f2011-03-04 21:47:09 -0800728static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700729{
David S. Miller5e2b61f2011-03-04 21:47:09 -0800730 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
731 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
732 (rt1->rt_mark ^ rt2->rt_mark) |
David S. Miller475949d2011-05-03 19:45:15 -0700733 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
Julian Anastasovd547f722011-08-07 22:20:20 -0700734 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
David S. Miller5e2b61f2011-03-04 21:47:09 -0800735 (rt1->rt_oif ^ rt2->rt_oif) |
736 (rt1->rt_iif ^ rt2->rt_iif)) == 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700737}
738
Denis V. Lunevb5921912008-01-22 23:50:25 -0800739static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
740{
Changli Gaod8d1f302010-06-10 23:31:35 -0700741 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
Denis V. Lunevb5921912008-01-22 23:50:25 -0800742}
743
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700744static inline int rt_is_expired(struct rtable *rth)
745{
Changli Gaod8d1f302010-06-10 23:31:35 -0700746 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700747}
748
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800749/*
750 * Perform a full scan of hash table and free all entries.
751 * Can be called by a softirq or a process.
752 * In the later case, we want to be reschedule if necessary
753 */
David S. Miller6561a3b2010-12-19 21:11:20 -0800754static void rt_do_flush(struct net *net, int process_context)
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800755{
756 unsigned int i;
757 struct rtable *rth, *next;
758
759 for (i = 0; i <= rt_hash_mask; i++) {
David S. Miller6561a3b2010-12-19 21:11:20 -0800760 struct rtable __rcu **pprev;
761 struct rtable *list;
762
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800763 if (process_context && need_resched())
764 cond_resched();
Eric Dumazet1c317202010-10-25 21:02:07 +0000765 rth = rcu_dereference_raw(rt_hash_table[i].chain);
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800766 if (!rth)
767 continue;
768
769 spin_lock_bh(rt_hash_lock_addr(i));
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700770
David S. Miller6561a3b2010-12-19 21:11:20 -0800771 list = NULL;
772 pprev = &rt_hash_table[i].chain;
773 rth = rcu_dereference_protected(*pprev,
Eric Dumazet1c317202010-10-25 21:02:07 +0000774 lockdep_is_held(rt_hash_lock_addr(i)));
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700775
David S. Miller6561a3b2010-12-19 21:11:20 -0800776 while (rth) {
777 next = rcu_dereference_protected(rth->dst.rt_next,
778 lockdep_is_held(rt_hash_lock_addr(i)));
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700779
David S. Miller6561a3b2010-12-19 21:11:20 -0800780 if (!net ||
781 net_eq(dev_net(rth->dst.dev), net)) {
782 rcu_assign_pointer(*pprev, next);
783 rcu_assign_pointer(rth->dst.rt_next, list);
784 list = rth;
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700785 } else {
David S. Miller6561a3b2010-12-19 21:11:20 -0800786 pprev = &rth->dst.rt_next;
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700787 }
David S. Miller6561a3b2010-12-19 21:11:20 -0800788 rth = next;
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700789 }
David S. Miller6561a3b2010-12-19 21:11:20 -0800790
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800791 spin_unlock_bh(rt_hash_lock_addr(i));
792
David S. Miller6561a3b2010-12-19 21:11:20 -0800793 for (; list; list = next) {
794 next = rcu_dereference_protected(list->dst.rt_next, 1);
795 rt_free(list);
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800796 }
797 }
798}
799
/*
 * While freeing expired entries, we compute the average chain length
 * and its standard deviation, using fixed-point arithmetic.
 * This gives an estimate of rt_chain_length_max:
 *	rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
810
Eric Dumazet98376382010-03-08 03:20:00 +0000811/*
812 * Given a hash chain and an item in this hash chain,
813 * find if a previous entry has the same hash_inputs
814 * (but differs on tos, mark or oif)
815 * Returns 0 if an alias is found.
816 * Returns ONE if rth has no alias before itself.
817 */
818static int has_noalias(const struct rtable *head, const struct rtable *rth)
819{
820 const struct rtable *aux = head;
821
822 while (aux != rth) {
David S. Miller5e2b61f2011-03-04 21:47:09 -0800823 if (compare_hash_inputs(aux, rth))
Eric Dumazet98376382010-03-08 03:20:00 +0000824 return 0;
Eric Dumazet1c317202010-10-25 21:02:07 +0000825 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +0000826 }
827 return ONE;
828}
829
Eric Dumazet29e75252008-01-31 17:05:09 -0800830/*
Lucas De Marchi25985ed2011-03-30 22:57:33 -0300831 * Perturbation of rt_genid by a small quantity [1..256]
Eric Dumazet29e75252008-01-31 17:05:09 -0800832 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
833 * many times (2^24) without giving recent rt_genid.
834 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700835 */
Denis V. Lunev86c657f2008-07-05 19:03:31 -0700836static void rt_cache_invalidate(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700837{
Eric Dumazet29e75252008-01-31 17:05:09 -0800838 unsigned char shuffle;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700839
Eric Dumazet29e75252008-01-31 17:05:09 -0800840 get_random_bytes(&shuffle, sizeof(shuffle));
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700841 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700842}
843
/*
 * Invalidate the routing cache of @net and optionally flush it.
 *
 * delay <  0 : invalidate only (fast: entries will be deleted later)
 * delay >= 0 : invalidate and flush the cache now (can take long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);

	if (delay < 0)
		return;

	rt_do_flush(net, !in_softirq());
}
854
/*
 * Flush the entries of @net from the cache without bumping the generation
 * id; meant to reap entries left behind by an earlier invalidation.
 */
void rt_cache_flush_batch(struct net *net)
{
	int process_context = !in_softirq();

	rt_do_flush(net, process_context);
}
860
Neil Horman1080d702008-10-27 12:28:25 -0700861static void rt_emergency_hash_rebuild(struct net *net)
862{
Neil Horman3ee94372010-05-08 01:57:52 -0700863 if (net_ratelimit())
Neil Horman1080d702008-10-27 12:28:25 -0700864 printk(KERN_WARNING "Route hash chain too long!\n");
Neil Horman3ee94372010-05-08 01:57:52 -0700865 rt_cache_invalidate(net);
Neil Horman1080d702008-10-27 12:28:25 -0700866}
867
Linus Torvalds1da177e2005-04-16 15:20:36 -0700868/*
869 Short description of GC goals.
870
871 We want to build algorithm, which will keep routing cache
872 at some equilibrium point, when number of aged off entries
873 is kept approximately equal to newly generated ones.
874
875 Current expiration strength is variable "expire".
876 We try to adjust it dynamically, so that if networking
877 is idle expires is large enough to keep enough of warm entries,
878 and when load increases it reduces to limit cache size.
879 */
880
Daniel Lezcano569d3642008-01-18 03:56:57 -0800881static int rt_garbage_collect(struct dst_ops *ops)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700882{
883 static unsigned long expire = RT_GC_TIMEOUT;
884 static unsigned long last_gc;
885 static int rover;
886 static int equilibrium;
Eric Dumazet1c317202010-10-25 21:02:07 +0000887 struct rtable *rth;
888 struct rtable __rcu **rthp;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700889 unsigned long now = jiffies;
890 int goal;
Eric Dumazetfc66f952010-10-08 06:37:34 +0000891 int entries = dst_entries_get_fast(&ipv4_dst_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700892
893 /*
894 * Garbage collection is pretty expensive,
895 * do not make it too frequently.
896 */
897
898 RT_CACHE_STAT_INC(gc_total);
899
900 if (now - last_gc < ip_rt_gc_min_interval &&
Eric Dumazetfc66f952010-10-08 06:37:34 +0000901 entries < ip_rt_max_size) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700902 RT_CACHE_STAT_INC(gc_ignored);
903 goto out;
904 }
905
Eric Dumazetfc66f952010-10-08 06:37:34 +0000906 entries = dst_entries_get_slow(&ipv4_dst_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700907 /* Calculate number of entries, which we want to expire now. */
Eric Dumazetfc66f952010-10-08 06:37:34 +0000908 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700909 if (goal <= 0) {
910 if (equilibrium < ipv4_dst_ops.gc_thresh)
911 equilibrium = ipv4_dst_ops.gc_thresh;
Eric Dumazetfc66f952010-10-08 06:37:34 +0000912 goal = entries - equilibrium;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700913 if (goal > 0) {
Eric Dumazetb790ced2007-12-21 01:49:07 -0800914 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
Eric Dumazetfc66f952010-10-08 06:37:34 +0000915 goal = entries - equilibrium;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700916 }
917 } else {
918 /* We are in dangerous area. Try to reduce cache really
919 * aggressively.
920 */
Eric Dumazetb790ced2007-12-21 01:49:07 -0800921 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
Eric Dumazetfc66f952010-10-08 06:37:34 +0000922 equilibrium = entries - goal;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700923 }
924
925 if (now - last_gc >= ip_rt_gc_min_interval)
926 last_gc = now;
927
928 if (goal <= 0) {
929 equilibrium += goal;
930 goto work_done;
931 }
932
933 do {
934 int i, k;
935
936 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
937 unsigned long tmo = expire;
938
939 k = (k + 1) & rt_hash_mask;
940 rthp = &rt_hash_table[k].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700941 spin_lock_bh(rt_hash_lock_addr(k));
Eric Dumazet1c317202010-10-25 21:02:07 +0000942 while ((rth = rcu_dereference_protected(*rthp,
943 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700944 if (!rt_is_expired(rth) &&
Eric Dumazet29e75252008-01-31 17:05:09 -0800945 !rt_may_expire(rth, tmo, expire)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700946 tmo >>= 1;
Changli Gaod8d1f302010-06-10 23:31:35 -0700947 rthp = &rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700948 continue;
949 }
Changli Gaod8d1f302010-06-10 23:31:35 -0700950 *rthp = rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700951 rt_free(rth);
952 goal--;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700953 }
Eric Dumazet22c047c2005-07-05 14:55:24 -0700954 spin_unlock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700955 if (goal <= 0)
956 break;
957 }
958 rover = k;
959
960 if (goal <= 0)
961 goto work_done;
962
963 /* Goal is not achieved. We stop process if:
964
965 - if expire reduced to zero. Otherwise, expire is halfed.
966 - if table is not full.
967 - if we are called from interrupt.
968 - jiffies check is just fallback/debug loop breaker.
969 We will not spin here for long time in any case.
970 */
971
972 RT_CACHE_STAT_INC(gc_goal_miss);
973
974 if (expire == 0)
975 break;
976
977 expire >>= 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700978
Eric Dumazetfc66f952010-10-08 06:37:34 +0000979 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700980 goto out;
981 } while (!in_softirq() && time_before_eq(jiffies, now));
982
Eric Dumazetfc66f952010-10-08 06:37:34 +0000983 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
984 goto out;
985 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700986 goto out;
987 if (net_ratelimit())
988 printk(KERN_WARNING "dst cache overflow\n");
989 RT_CACHE_STAT_INC(gc_dst_overflow);
990 return 1;
991
992work_done:
993 expire += ip_rt_gc_min_interval;
994 if (expire > ip_rt_gc_timeout ||
Eric Dumazetfc66f952010-10-08 06:37:34 +0000995 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
996 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700997 expire = ip_rt_gc_timeout;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700998out: return 0;
999}
1000
Eric Dumazet98376382010-03-08 03:20:00 +00001001/*
1002 * Returns number of entries in a hash chain that have different hash_inputs
1003 */
1004static int slow_chain_length(const struct rtable *head)
1005{
1006 int length = 0;
1007 const struct rtable *rth = head;
1008
1009 while (rth) {
1010 length += has_noalias(head, rth);
Eric Dumazet1c317202010-10-25 21:02:07 +00001011 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +00001012 }
1013 return length >> FRACT_BITS;
1014}
1015
David S. Millerd3aaeb32011-07-18 00:40:17 -07001016static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
David Miller3769cff2011-07-11 22:44:24 +00001017{
David Miller3769cff2011-07-11 22:44:24 +00001018 struct neigh_table *tbl = &arp_tbl;
David S. Millerd3aaeb32011-07-18 00:40:17 -07001019 static const __be32 inaddr_any = 0;
1020 struct net_device *dev = dst->dev;
1021 const __be32 *pkey = daddr;
David Miller3769cff2011-07-11 22:44:24 +00001022 struct neighbour *n;
1023
1024#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1025 if (dev->type == ARPHRD_ATM)
1026 tbl = clip_tbl_hook;
1027#endif
David Miller3769cff2011-07-11 22:44:24 +00001028 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
David S. Millerd3aaeb32011-07-18 00:40:17 -07001029 pkey = &inaddr_any;
1030
1031 n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
1032 if (n)
1033 return n;
1034 return neigh_create(tbl, pkey, dev);
1035}
1036
1037static int rt_bind_neighbour(struct rtable *rt)
1038{
1039 struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
David Miller3769cff2011-07-11 22:44:24 +00001040 if (IS_ERR(n))
1041 return PTR_ERR(n);
David S. Miller69cce1d2011-07-17 23:09:49 -07001042 dst_set_neighbour(&rt->dst, n);
David Miller3769cff2011-07-11 22:44:24 +00001043
1044 return 0;
1045}
1046
David S. Millerb23dd4f2011-03-02 14:31:35 -08001047static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1048 struct sk_buff *skb, int ifindex)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001049{
Eric Dumazet1c317202010-10-25 21:02:07 +00001050 struct rtable *rth, *cand;
1051 struct rtable __rcu **rthp, **candp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001052 unsigned long now;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001053 u32 min_score;
1054 int chain_length;
1055 int attempts = !in_softirq();
1056
1057restart:
1058 chain_length = 0;
1059 min_score = ~(u32)0;
1060 cand = NULL;
1061 candp = NULL;
1062 now = jiffies;
1063
Changli Gaod8d1f302010-06-10 23:31:35 -07001064 if (!rt_caching(dev_net(rt->dst.dev))) {
Neil Horman73e42892009-06-20 01:15:16 -07001065 /*
1066 * If we're not caching, just tell the caller we
1067 * were successful and don't touch the route. The
1068 * caller hold the sole reference to the cache entry, and
1069 * it will be released when the caller is done with it.
1070 * If we drop it here, the callers have no way to resolve routes
1071 * when we're not caching. Instead, just point *rp at rt, so
1072 * the caller gets a single use out of the route
Neil Hormanb6280b42009-06-22 10:18:53 +00001073 * Note that we do rt_free on this new route entry, so that
1074 * once its refcount hits zero, we are still able to reap it
1075 * (Thanks Alexey)
Eric Dumazet27b75c92010-10-15 05:44:11 +00001076 * Note: To avoid expensive rcu stuff for this uncached dst,
1077 * we set DST_NOCACHE so that dst_release() can free dst without
1078 * waiting a grace period.
Neil Horman73e42892009-06-20 01:15:16 -07001079 */
Neil Hormanb6280b42009-06-22 10:18:53 +00001080
Eric Dumazetc7d44262010-10-03 22:17:54 -07001081 rt->dst.flags |= DST_NOCACHE;
David S. Millerc7537962010-11-11 17:07:48 -08001082 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
David Miller3769cff2011-07-11 22:44:24 +00001083 int err = rt_bind_neighbour(rt);
Neil Hormanb6280b42009-06-22 10:18:53 +00001084 if (err) {
1085 if (net_ratelimit())
1086 printk(KERN_WARNING
1087 "Neighbour table failure & not caching routes.\n");
Eric Dumazet27b75c92010-10-15 05:44:11 +00001088 ip_rt_put(rt);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001089 return ERR_PTR(err);
Neil Hormanb6280b42009-06-22 10:18:53 +00001090 }
1091 }
1092
Neil Hormanb6280b42009-06-22 10:18:53 +00001093 goto skip_hashing;
Neil Horman1080d702008-10-27 12:28:25 -07001094 }
1095
Linus Torvalds1da177e2005-04-16 15:20:36 -07001096 rthp = &rt_hash_table[hash].chain;
1097
Eric Dumazet22c047c2005-07-05 14:55:24 -07001098 spin_lock_bh(rt_hash_lock_addr(hash));
Eric Dumazet1c317202010-10-25 21:02:07 +00001099 while ((rth = rcu_dereference_protected(*rthp,
1100 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001101 if (rt_is_expired(rth)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001102 *rthp = rth->dst.rt_next;
Eric Dumazet29e75252008-01-31 17:05:09 -08001103 rt_free(rth);
1104 continue;
1105 }
David S. Miller5e2b61f2011-03-04 21:47:09 -08001106 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001107 /* Put it first */
Changli Gaod8d1f302010-06-10 23:31:35 -07001108 *rthp = rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001109 /*
1110 * Since lookup is lockfree, the deletion
1111 * must be visible to another weakly ordered CPU before
1112 * the insertion at the start of the hash chain.
1113 */
Changli Gaod8d1f302010-06-10 23:31:35 -07001114 rcu_assign_pointer(rth->dst.rt_next,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001115 rt_hash_table[hash].chain);
1116 /*
1117 * Since lookup is lockfree, the update writes
1118 * must be ordered for consistency on SMP.
1119 */
1120 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1121
Changli Gaod8d1f302010-06-10 23:31:35 -07001122 dst_use(&rth->dst, now);
Eric Dumazet22c047c2005-07-05 14:55:24 -07001123 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001124
1125 rt_drop(rt);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001126 if (skb)
Changli Gaod8d1f302010-06-10 23:31:35 -07001127 skb_dst_set(skb, &rth->dst);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001128 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001129 }
1130
Changli Gaod8d1f302010-06-10 23:31:35 -07001131 if (!atomic_read(&rth->dst.__refcnt)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001132 u32 score = rt_score(rth);
1133
1134 if (score <= min_score) {
1135 cand = rth;
1136 candp = rthp;
1137 min_score = score;
1138 }
1139 }
1140
1141 chain_length++;
1142
Changli Gaod8d1f302010-06-10 23:31:35 -07001143 rthp = &rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001144 }
1145
1146 if (cand) {
1147 /* ip_rt_gc_elasticity used to be average length of chain
1148 * length, when exceeded gc becomes really aggressive.
1149 *
1150 * The second limit is less certain. At the moment it allows
1151 * only 2 entries per bucket. We will see.
1152 */
1153 if (chain_length > ip_rt_gc_elasticity) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001154 *candp = cand->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001155 rt_free(cand);
1156 }
Neil Horman1080d702008-10-27 12:28:25 -07001157 } else {
Eric Dumazet98376382010-03-08 03:20:00 +00001158 if (chain_length > rt_chain_length_max &&
1159 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001160 struct net *net = dev_net(rt->dst.dev);
Neil Horman1080d702008-10-27 12:28:25 -07001161 int num = ++net->ipv4.current_rt_cache_rebuild_count;
Pavel Emelyanovb35ecb52010-03-24 07:43:17 +00001162 if (!rt_caching(net)) {
Neil Horman1080d702008-10-27 12:28:25 -07001163 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
Changli Gaod8d1f302010-06-10 23:31:35 -07001164 rt->dst.dev->name, num);
Neil Horman1080d702008-10-27 12:28:25 -07001165 }
Pavel Emelyanovb35ecb52010-03-24 07:43:17 +00001166 rt_emergency_hash_rebuild(net);
Pavel Emelyanov6a2bad72010-03-24 21:51:22 +00001167 spin_unlock_bh(rt_hash_lock_addr(hash));
1168
David S. Miller5e2b61f2011-03-04 21:47:09 -08001169 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
Pavel Emelyanov6a2bad72010-03-24 21:51:22 +00001170 ifindex, rt_genid(net));
1171 goto restart;
Neil Horman1080d702008-10-27 12:28:25 -07001172 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001173 }
1174
1175 /* Try to bind route to arp only if it is output
1176 route or unicast forwarding path.
1177 */
David S. Millerc7537962010-11-11 17:07:48 -08001178 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
David Miller3769cff2011-07-11 22:44:24 +00001179 int err = rt_bind_neighbour(rt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001180 if (err) {
Eric Dumazet22c047c2005-07-05 14:55:24 -07001181 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001182
1183 if (err != -ENOBUFS) {
1184 rt_drop(rt);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001185 return ERR_PTR(err);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001186 }
1187
1188 /* Neighbour tables are full and nothing
1189 can be released. Try to shrink route cache,
1190 it is most likely it holds some neighbour records.
1191 */
1192 if (attempts-- > 0) {
1193 int saved_elasticity = ip_rt_gc_elasticity;
1194 int saved_int = ip_rt_gc_min_interval;
1195 ip_rt_gc_elasticity = 1;
1196 ip_rt_gc_min_interval = 0;
Daniel Lezcano569d3642008-01-18 03:56:57 -08001197 rt_garbage_collect(&ipv4_dst_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001198 ip_rt_gc_min_interval = saved_int;
1199 ip_rt_gc_elasticity = saved_elasticity;
1200 goto restart;
1201 }
1202
1203 if (net_ratelimit())
Ulrich Weber7e1b33e2010-09-27 15:02:18 -07001204 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07001205 rt_drop(rt);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001206 return ERR_PTR(-ENOBUFS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001207 }
1208 }
1209
Changli Gaod8d1f302010-06-10 23:31:35 -07001210 rt->dst.rt_next = rt_hash_table[hash].chain;
Neil Horman1080d702008-10-27 12:28:25 -07001211
Eric Dumazet00269b52008-10-16 14:18:29 -07001212 /*
1213 * Since lookup is lockfree, we must make sure
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001214 * previous writes to rt are committed to memory
Eric Dumazet00269b52008-10-16 14:18:29 -07001215 * before making rt visible to other CPUS.
1216 */
Eric Dumazet1ddbcb02009-05-19 20:14:28 +00001217 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
Neil Horman1080d702008-10-27 12:28:25 -07001218
Eric Dumazet22c047c2005-07-05 14:55:24 -07001219 spin_unlock_bh(rt_hash_lock_addr(hash));
Neil Horman73e42892009-06-20 01:15:16 -07001220
Neil Hormanb6280b42009-06-22 10:18:53 +00001221skip_hashing:
David S. Millerb23dd4f2011-03-02 14:31:35 -08001222 if (skb)
Changli Gaod8d1f302010-06-10 23:31:35 -07001223 skb_dst_set(skb, &rt->dst);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001224 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001225}
1226
David S. Miller6431cbc2011-02-07 20:38:06 -08001227static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1228
1229static u32 rt_peer_genid(void)
1230{
1231 return atomic_read(&__rt_peer_genid);
1232}
1233
David S. Millera48eff12011-05-18 18:42:43 -04001234void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001235{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001236 struct inet_peer *peer;
1237
David S. Millera48eff12011-05-18 18:42:43 -04001238 peer = inet_getpeer_v4(daddr, create);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001239
Eric Dumazet49e8ab02010-08-19 06:10:45 +00001240 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001241 inet_putpeer(peer);
David S. Miller6431cbc2011-02-07 20:38:06 -08001242 else
1243 rt->rt_peer_genid = rt_peer_genid();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001244}
1245
1246/*
1247 * Peer allocation may fail only in serious out-of-memory conditions. However
1248 * we still can generate some output.
1249 * Random ID selection looks a bit dangerous because we have no chances to
1250 * select ID being unique in a reasonable period of time.
1251 * But broken packet identifier may be better than no packet at all.
1252 */
1253static void ip_select_fb_ident(struct iphdr *iph)
1254{
1255 static DEFINE_SPINLOCK(ip_fb_id_lock);
1256 static u32 ip_fallback_id;
1257 u32 salt;
1258
1259 spin_lock_bh(&ip_fb_id_lock);
Al Viroe4485152006-09-26 22:15:01 -07001260 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001261 iph->id = htons(salt & 0xFFFF);
1262 ip_fallback_id = salt;
1263 spin_unlock_bh(&ip_fb_id_lock);
1264}
1265
1266void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1267{
1268 struct rtable *rt = (struct rtable *) dst;
1269
1270 if (rt) {
1271 if (rt->peer == NULL)
David S. Millera48eff12011-05-18 18:42:43 -04001272 rt_bind_peer(rt, rt->rt_dst, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001273
1274 /* If peer is attached to destination, it is never detached,
1275 so that we need not to grab a lock to dereference it.
1276 */
1277 if (rt->peer) {
1278 iph->id = htons(inet_getid(rt->peer, more));
1279 return;
1280 }
1281 } else
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001282 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
Stephen Hemminger9c2b3322005-04-19 22:39:42 -07001283 __builtin_return_address(0));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001284
1285 ip_select_fb_ident(iph);
1286}
Eric Dumazet4bc2f182010-07-09 21:22:10 +00001287EXPORT_SYMBOL(__ip_select_ident);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001288
1289static void rt_del(unsigned hash, struct rtable *rt)
1290{
Eric Dumazet1c317202010-10-25 21:02:07 +00001291 struct rtable __rcu **rthp;
1292 struct rtable *aux;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001293
Eric Dumazet29e75252008-01-31 17:05:09 -08001294 rthp = &rt_hash_table[hash].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -07001295 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001296 ip_rt_put(rt);
Eric Dumazet1c317202010-10-25 21:02:07 +00001297 while ((aux = rcu_dereference_protected(*rthp,
1298 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001299 if (aux == rt || rt_is_expired(aux)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001300 *rthp = aux->dst.rt_next;
Eric Dumazet29e75252008-01-31 17:05:09 -08001301 rt_free(aux);
1302 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001303 }
Changli Gaod8d1f302010-06-10 23:31:35 -07001304 rthp = &aux->dst.rt_next;
Eric Dumazet29e75252008-01-31 17:05:09 -08001305 }
Eric Dumazet22c047c2005-07-05 14:55:24 -07001306 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001307}
1308
Eric Dumazeted7865a42010-06-07 21:49:44 -07001309/* called in rcu_read_lock() section */
Al Virof7655222006-09-26 21:25:43 -07001310void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1311 __be32 saddr, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001312{
Eric Dumazeted7865a42010-06-07 21:49:44 -07001313 struct in_device *in_dev = __in_dev_get_rcu(dev);
David S. Millerf39925d2011-02-09 22:00:16 -08001314 struct inet_peer *peer;
Denis V. Lunev317805b2008-02-28 20:50:06 -08001315 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001316
Linus Torvalds1da177e2005-04-16 15:20:36 -07001317 if (!in_dev)
1318 return;
1319
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001320 net = dev_net(dev);
Joe Perches9d4fb272009-11-23 10:41:23 -08001321 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1322 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1323 ipv4_is_zeronet(new_gw))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001324 goto reject_redirect;
1325
1326 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1327 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1328 goto reject_redirect;
1329 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1330 goto reject_redirect;
1331 } else {
Denis V. Lunev317805b2008-02-28 20:50:06 -08001332 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001333 goto reject_redirect;
1334 }
1335
David S. Millerf39925d2011-02-09 22:00:16 -08001336 peer = inet_getpeer_v4(daddr, 1);
1337 if (peer) {
1338 peer->redirect_learned.a4 = new_gw;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001339
David S. Millerf39925d2011-02-09 22:00:16 -08001340 inet_putpeer(peer);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001341
David S. Millerf39925d2011-02-09 22:00:16 -08001342 atomic_inc(&__rt_peer_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001343 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001344 return;
1345
1346reject_redirect:
1347#ifdef CONFIG_IP_ROUTE_VERBOSE
1348 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
Harvey Harrison673d57e2008-10-31 00:53:57 -07001349 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1350 " Advised path = %pI4 -> %pI4\n",
1351 &old_gw, dev->name, &new_gw,
1352 &saddr, &daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001353#endif
Eric Dumazeted7865a42010-06-07 21:49:44 -07001354 ;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001355}
1356
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001357static bool peer_pmtu_expired(struct inet_peer *peer)
1358{
1359 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1360
1361 return orig &&
1362 time_after_eq(jiffies, orig) &&
1363 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1364}
1365
1366static bool peer_pmtu_cleaned(struct inet_peer *peer)
1367{
1368 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1369
1370 return orig &&
1371 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1372}
1373
Linus Torvalds1da177e2005-04-16 15:20:36 -07001374static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1375{
Eric Dumazetee6b9672008-03-05 18:30:47 -08001376 struct rtable *rt = (struct rtable *)dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001377 struct dst_entry *ret = dst;
1378
1379 if (rt) {
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001380 if (dst->obsolete > 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001381 ip_rt_put(rt);
1382 ret = NULL;
David S. Miller2c8cec52011-02-09 20:42:07 -08001383 } else if (rt->rt_flags & RTCF_REDIRECTED) {
David S. Miller5e2b61f2011-03-04 21:47:09 -08001384 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1385 rt->rt_oif,
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001386 rt_genid(dev_net(dst->dev)));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001387 rt_del(hash, rt);
1388 ret = NULL;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001389 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1390 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001391 }
1392 }
1393 return ret;
1394}
1395
1396/*
1397 * Algorithm:
1398 * 1. The first ip_rt_redirect_number redirects are sent
1399 * with exponential backoff, then we stop sending them at all,
1400 * assuming that the host ignores our redirects.
1401 * 2. If we did not see packets requiring redirects
1402 * during ip_rt_redirect_silence, we assume that the host
1403 * forgot redirected route and start to send redirects again.
1404 *
1405 * This algorithm is much cheaper and more intelligent than dumb load limiting
1406 * in icmp.c.
1407 *
1408 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1409 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1410 */
1411
1412void ip_rt_send_redirect(struct sk_buff *skb)
1413{
Eric Dumazet511c3f92009-06-02 05:14:27 +00001414 struct rtable *rt = skb_rtable(skb);
Eric Dumazet30038fc2009-08-28 23:52:01 -07001415 struct in_device *in_dev;
David S. Miller92d86822011-02-04 15:55:25 -08001416 struct inet_peer *peer;
Eric Dumazet30038fc2009-08-28 23:52:01 -07001417 int log_martians;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001418
Eric Dumazet30038fc2009-08-28 23:52:01 -07001419 rcu_read_lock();
Changli Gaod8d1f302010-06-10 23:31:35 -07001420 in_dev = __in_dev_get_rcu(rt->dst.dev);
Eric Dumazet30038fc2009-08-28 23:52:01 -07001421 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1422 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001423 return;
Eric Dumazet30038fc2009-08-28 23:52:01 -07001424 }
1425 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1426 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001427
David S. Miller92d86822011-02-04 15:55:25 -08001428 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001429 rt_bind_peer(rt, rt->rt_dst, 1);
David S. Miller92d86822011-02-04 15:55:25 -08001430 peer = rt->peer;
1431 if (!peer) {
1432 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1433 return;
1434 }
1435
Linus Torvalds1da177e2005-04-16 15:20:36 -07001436 /* No redirected packets during ip_rt_redirect_silence;
1437 * reset the algorithm.
1438 */
David S. Miller92d86822011-02-04 15:55:25 -08001439 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1440 peer->rate_tokens = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001441
1442 /* Too many ignored redirects; do not send anything
Changli Gaod8d1f302010-06-10 23:31:35 -07001443 * set dst.rate_last to the last seen redirected packet.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001444 */
David S. Miller92d86822011-02-04 15:55:25 -08001445 if (peer->rate_tokens >= ip_rt_redirect_number) {
1446 peer->rate_last = jiffies;
Eric Dumazet30038fc2009-08-28 23:52:01 -07001447 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001448 }
1449
1450 /* Check for load limit; set rate_last to the latest sent
1451 * redirect.
1452 */
David S. Miller92d86822011-02-04 15:55:25 -08001453 if (peer->rate_tokens == 0 ||
Li Yewang14fb8a72006-12-18 00:26:35 -08001454 time_after(jiffies,
David S. Miller92d86822011-02-04 15:55:25 -08001455 (peer->rate_last +
1456 (ip_rt_redirect_load << peer->rate_tokens)))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001457 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
David S. Miller92d86822011-02-04 15:55:25 -08001458 peer->rate_last = jiffies;
1459 ++peer->rate_tokens;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001460#ifdef CONFIG_IP_ROUTE_VERBOSE
Eric Dumazet30038fc2009-08-28 23:52:01 -07001461 if (log_martians &&
David S. Miller92d86822011-02-04 15:55:25 -08001462 peer->rate_tokens == ip_rt_redirect_number &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001463 net_ratelimit())
Harvey Harrison673d57e2008-10-31 00:53:57 -07001464 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
David S. Millerc5be24f2011-05-13 18:01:21 -04001465 &ip_hdr(skb)->saddr, rt->rt_iif,
Harvey Harrison673d57e2008-10-31 00:53:57 -07001466 &rt->rt_dst, &rt->rt_gateway);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001467#endif
1468 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001469}
1470
1471static int ip_error(struct sk_buff *skb)
1472{
Eric Dumazet511c3f92009-06-02 05:14:27 +00001473 struct rtable *rt = skb_rtable(skb);
David S. Miller92d86822011-02-04 15:55:25 -08001474 struct inet_peer *peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001475 unsigned long now;
David S. Miller92d86822011-02-04 15:55:25 -08001476 bool send;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001477 int code;
1478
Changli Gaod8d1f302010-06-10 23:31:35 -07001479 switch (rt->dst.error) {
Joe Perches4500ebf2011-07-01 09:43:07 +00001480 case EINVAL:
1481 default:
1482 goto out;
1483 case EHOSTUNREACH:
1484 code = ICMP_HOST_UNREACH;
1485 break;
1486 case ENETUNREACH:
1487 code = ICMP_NET_UNREACH;
1488 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1489 IPSTATS_MIB_INNOROUTES);
1490 break;
1491 case EACCES:
1492 code = ICMP_PKT_FILTERED;
1493 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001494 }
1495
David S. Miller92d86822011-02-04 15:55:25 -08001496 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001497 rt_bind_peer(rt, rt->rt_dst, 1);
David S. Miller92d86822011-02-04 15:55:25 -08001498 peer = rt->peer;
1499
1500 send = true;
1501 if (peer) {
1502 now = jiffies;
1503 peer->rate_tokens += now - peer->rate_last;
1504 if (peer->rate_tokens > ip_rt_error_burst)
1505 peer->rate_tokens = ip_rt_error_burst;
1506 peer->rate_last = now;
1507 if (peer->rate_tokens >= ip_rt_error_cost)
1508 peer->rate_tokens -= ip_rt_error_cost;
1509 else
1510 send = false;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001511 }
David S. Miller92d86822011-02-04 15:55:25 -08001512 if (send)
1513 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001514
1515out: kfree_skb(skb);
1516 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001517}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001518
/*
 *	MTU plateau table, in strictly descending order.
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest plateau value strictly below @old_mtu, or 68 when
 * @old_mtu is not above the smallest plateau.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *p = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	/* Skip every plateau that is not smaller than old_mtu. */
	while (p < end && *p >= old_mtu)
		p++;

	return p < end ? *p : 68;
}
1536
Eric Dumazetb71d1d42011-04-22 04:53:02 +00001537unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
Timo Teras0010e462008-04-29 03:32:25 -07001538 unsigned short new_mtu,
1539 struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001540{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001541 unsigned short old_mtu = ntohs(iph->tot_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001542 unsigned short est_mtu = 0;
David S. Miller2c8cec52011-02-09 20:42:07 -08001543 struct inet_peer *peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001544
David S. Miller2c8cec52011-02-09 20:42:07 -08001545 peer = inet_getpeer_v4(iph->daddr, 1);
1546 if (peer) {
1547 unsigned short mtu = new_mtu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001548
David S. Miller2c8cec52011-02-09 20:42:07 -08001549 if (new_mtu < 68 || new_mtu >= old_mtu) {
1550 /* BSD 4.2 derived systems incorrectly adjust
1551 * tot_len by the IP header length, and report
1552 * a zero MTU in the ICMP message.
1553 */
1554 if (mtu == 0 &&
1555 old_mtu >= 68 + (iph->ihl << 2))
1556 old_mtu -= iph->ihl << 2;
1557 mtu = guess_mtu(old_mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001558 }
David S. Miller2c8cec52011-02-09 20:42:07 -08001559
1560 if (mtu < ip_rt_min_pmtu)
1561 mtu = ip_rt_min_pmtu;
1562 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001563 unsigned long pmtu_expires;
1564
1565 pmtu_expires = jiffies + ip_rt_mtu_expires;
1566 if (!pmtu_expires)
1567 pmtu_expires = 1UL;
1568
David S. Miller2c8cec52011-02-09 20:42:07 -08001569 est_mtu = mtu;
1570 peer->pmtu_learned = mtu;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001571 peer->pmtu_expires = pmtu_expires;
David S. Miller2c8cec52011-02-09 20:42:07 -08001572 }
1573
1574 inet_putpeer(peer);
1575
1576 atomic_inc(&__rt_peer_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001577 }
1578 return est_mtu ? : new_mtu;
1579}
1580
David S. Miller2c8cec52011-02-09 20:42:07 -08001581static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1582{
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001583 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
David S. Miller2c8cec52011-02-09 20:42:07 -08001584
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001585 if (!expires)
1586 return;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001587 if (time_before(jiffies, expires)) {
David S. Miller2c8cec52011-02-09 20:42:07 -08001588 u32 orig_dst_mtu = dst_mtu(dst);
1589 if (peer->pmtu_learned < orig_dst_mtu) {
1590 if (!peer->pmtu_orig)
1591 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1592 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1593 }
1594 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1595 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1596}
1597
Linus Torvalds1da177e2005-04-16 15:20:36 -07001598static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1599{
David S. Miller2c8cec52011-02-09 20:42:07 -08001600 struct rtable *rt = (struct rtable *) dst;
1601 struct inet_peer *peer;
1602
1603 dst_confirm(dst);
1604
1605 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001606 rt_bind_peer(rt, rt->rt_dst, 1);
David S. Miller2c8cec52011-02-09 20:42:07 -08001607 peer = rt->peer;
1608 if (peer) {
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001609 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1610
David S. Miller2c8cec52011-02-09 20:42:07 -08001611 if (mtu < ip_rt_min_pmtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001612 mtu = ip_rt_min_pmtu;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001613 if (!pmtu_expires || mtu < peer->pmtu_learned) {
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001614
1615 pmtu_expires = jiffies + ip_rt_mtu_expires;
1616 if (!pmtu_expires)
1617 pmtu_expires = 1UL;
1618
David S. Miller2c8cec52011-02-09 20:42:07 -08001619 peer->pmtu_learned = mtu;
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001620 peer->pmtu_expires = pmtu_expires;
David S. Miller2c8cec52011-02-09 20:42:07 -08001621
1622 atomic_inc(&__rt_peer_genid);
1623 rt->rt_peer_genid = rt_peer_genid();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001624 }
Hiroaki SHIMODA46af3182011-03-09 20:09:58 +00001625 check_peer_pmtu(dst, peer);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001626 }
1627}
1628
David S. Millerf39925d2011-02-09 22:00:16 -08001629static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1630{
1631 struct rtable *rt = (struct rtable *) dst;
1632 __be32 orig_gw = rt->rt_gateway;
Eric Dumazetf2c31e32011-07-29 19:00:53 +00001633 struct neighbour *n, *old_n;
David S. Millerf39925d2011-02-09 22:00:16 -08001634
1635 dst_confirm(&rt->dst);
1636
David S. Millerf39925d2011-02-09 22:00:16 -08001637 rt->rt_gateway = peer->redirect_learned.a4;
Eric Dumazetf2c31e32011-07-29 19:00:53 +00001638
1639 n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1640 if (IS_ERR(n))
1641 return PTR_ERR(n);
1642 old_n = xchg(&rt->dst._neighbour, n);
1643 if (old_n)
1644 neigh_release(old_n);
David S. Miller69cce1d2011-07-17 23:09:49 -07001645 if (!n || !(n->nud_state & NUD_VALID)) {
1646 if (n)
1647 neigh_event_send(n, NULL);
David S. Millerf39925d2011-02-09 22:00:16 -08001648 rt->rt_gateway = orig_gw;
1649 return -EAGAIN;
1650 } else {
1651 rt->rt_flags |= RTCF_REDIRECTED;
David S. Miller69cce1d2011-07-17 23:09:49 -07001652 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
David S. Millerf39925d2011-02-09 22:00:16 -08001653 }
1654 return 0;
1655}
1656
Linus Torvalds1da177e2005-04-16 15:20:36 -07001657static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1658{
David S. Miller6431cbc2011-02-07 20:38:06 -08001659 struct rtable *rt = (struct rtable *) dst;
1660
1661 if (rt_is_expired(rt))
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001662 return NULL;
David S. Miller6431cbc2011-02-07 20:38:06 -08001663 if (rt->rt_peer_genid != rt_peer_genid()) {
David S. Miller2c8cec52011-02-09 20:42:07 -08001664 struct inet_peer *peer;
1665
David S. Miller6431cbc2011-02-07 20:38:06 -08001666 if (!rt->peer)
David S. Millera48eff12011-05-18 18:42:43 -04001667 rt_bind_peer(rt, rt->rt_dst, 0);
David S. Miller6431cbc2011-02-07 20:38:06 -08001668
David S. Miller2c8cec52011-02-09 20:42:07 -08001669 peer = rt->peer;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001670 if (peer) {
David S. Miller2c8cec52011-02-09 20:42:07 -08001671 check_peer_pmtu(dst, peer);
1672
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001673 if (peer->redirect_learned.a4 &&
1674 peer->redirect_learned.a4 != rt->rt_gateway) {
1675 if (check_peer_redir(dst, peer))
1676 return NULL;
1677 }
David S. Millerf39925d2011-02-09 22:00:16 -08001678 }
1679
David S. Miller6431cbc2011-02-07 20:38:06 -08001680 rt->rt_peer_genid = rt_peer_genid();
1681 }
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001682 return dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001683}
1684
1685static void ipv4_dst_destroy(struct dst_entry *dst)
1686{
1687 struct rtable *rt = (struct rtable *) dst;
1688 struct inet_peer *peer = rt->peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001689
David S. Miller62fa8a82011-01-26 20:51:05 -08001690 if (rt->fi) {
1691 fib_info_put(rt->fi);
1692 rt->fi = NULL;
1693 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001694 if (peer) {
1695 rt->peer = NULL;
1696 inet_putpeer(peer);
1697 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001698}
1699
Linus Torvalds1da177e2005-04-16 15:20:36 -07001700
1701static void ipv4_link_failure(struct sk_buff *skb)
1702{
1703 struct rtable *rt;
1704
1705 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1706
Eric Dumazet511c3f92009-06-02 05:14:27 +00001707 rt = skb_rtable(skb);
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001708 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1709 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001710}
1711
1712static int ip_rt_bug(struct sk_buff *skb)
1713{
Harvey Harrison673d57e2008-10-31 00:53:57 -07001714 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1715 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001716 skb->dev ? skb->dev->name : "?");
1717 kfree_skb(skb);
Dave Jonesc378a9c2011-05-21 07:16:42 +00001718 WARN_ON(1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001719 return 0;
1720}
1721
1722/*
1723 We do not cache source address of outgoing interface,
1724 because it is used only by IP RR, TS and SRR options,
1725 so that it out of fast path.
1726
1727 BTW remember: "addr" is allowed to be not aligned
1728 in IP options!
1729 */
1730
David S. Miller8e363602011-05-13 17:29:41 -04001731void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001732{
Al Viroa61ced52006-09-26 21:27:54 -07001733 __be32 src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001734
David S. Millerc7537962010-11-11 17:07:48 -08001735 if (rt_is_output_route(rt))
David S. Millerc5be24f2011-05-13 18:01:21 -04001736 src = ip_hdr(skb)->saddr;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001737 else {
David S. Miller8e363602011-05-13 17:29:41 -04001738 struct fib_result res;
1739 struct flowi4 fl4;
1740 struct iphdr *iph;
1741
1742 iph = ip_hdr(skb);
1743
1744 memset(&fl4, 0, sizeof(fl4));
1745 fl4.daddr = iph->daddr;
1746 fl4.saddr = iph->saddr;
Julian Anastasovb0fe4a32011-07-23 02:00:41 +00001747 fl4.flowi4_tos = RT_TOS(iph->tos);
David S. Miller8e363602011-05-13 17:29:41 -04001748 fl4.flowi4_oif = rt->dst.dev->ifindex;
1749 fl4.flowi4_iif = skb->dev->ifindex;
1750 fl4.flowi4_mark = skb->mark;
David S. Miller5e2b61f2011-03-04 21:47:09 -08001751
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001752 rcu_read_lock();
David S. Miller68a5e3d2011-03-11 20:07:33 -05001753 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
David S. Miller436c3b62011-03-24 17:42:21 -07001754 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001755 else
1756 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001757 RT_SCOPE_UNIVERSE);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00001758 rcu_read_unlock();
1759 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001760 memcpy(addr, &src, 4);
1761}
1762
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Fill in each 16-bit half of rt->dst.tclassid from @tag, but only for
 * halves that are still zero; a half that is already set is kept.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 lo_mask = 0xFFFF;
	const u32 hi_mask = 0xFFFF0000;

	if ((rt->dst.tclassid & lo_mask) == 0)
		rt->dst.tclassid |= tag & lo_mask;
	if ((rt->dst.tclassid & hi_mask) == 0)
		rt->dst.tclassid |= tag & hi_mask;
}
#endif
1772
David S. Miller0dbaee32010-12-13 12:52:14 -08001773static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1774{
1775 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1776
1777 if (advmss == 0) {
1778 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1779 ip_rt_min_advmss);
1780 if (advmss > 65535 - 40)
1781 advmss = 65535 - 40;
1782 }
1783 return advmss;
1784}
1785
David S. Millerd33e4552010-12-14 13:01:14 -08001786static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1787{
1788 unsigned int mtu = dst->dev->mtu;
1789
1790 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1791 const struct rtable *rt = (const struct rtable *) dst;
1792
1793 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1794 mtu = 576;
1795 }
1796
1797 if (mtu > IP_MAX_MTU)
1798 mtu = IP_MAX_MTU;
1799
1800 return mtu;
1801}
1802
David S. Miller813b3b52011-04-28 14:48:42 -07001803static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
David S. Miller5e2b61f2011-03-04 21:47:09 -08001804 struct fib_info *fi)
David S. Millera4daad62011-01-27 22:01:53 -08001805{
David S. Miller0131ba42011-02-04 14:37:30 -08001806 struct inet_peer *peer;
1807 int create = 0;
1808
1809 /* If a peer entry exists for this destination, we must hook
1810 * it up in order to get at cached metrics.
1811 */
David S. Miller813b3b52011-04-28 14:48:42 -07001812 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
David S. Miller0131ba42011-02-04 14:37:30 -08001813 create = 1;
1814
David S. Miller3c0afdc2011-03-04 21:26:07 -08001815 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
David S. Miller0131ba42011-02-04 14:37:30 -08001816 if (peer) {
David S. Miller3c0afdc2011-03-04 21:26:07 -08001817 rt->rt_peer_genid = rt_peer_genid();
David S. Miller0131ba42011-02-04 14:37:30 -08001818 if (inet_metrics_new(peer))
1819 memcpy(peer->metrics, fi->fib_metrics,
1820 sizeof(u32) * RTAX_MAX);
1821 dst_init_metrics(&rt->dst, peer->metrics, false);
David S. Miller2c8cec52011-02-09 20:42:07 -08001822
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001823 check_peer_pmtu(&rt->dst, peer);
David S. Millerf39925d2011-02-09 22:00:16 -08001824 if (peer->redirect_learned.a4 &&
1825 peer->redirect_learned.a4 != rt->rt_gateway) {
1826 rt->rt_gateway = peer->redirect_learned.a4;
1827 rt->rt_flags |= RTCF_REDIRECTED;
1828 }
David S. Miller0131ba42011-02-04 14:37:30 -08001829 } else {
David S. Millerb8dad612011-01-28 14:07:16 -08001830 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1831 rt->fi = fi;
1832 atomic_inc(&fi->fib_clntref);
1833 }
David S. Millera4daad62011-01-27 22:01:53 -08001834 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
David S. Millera4daad62011-01-27 22:01:53 -08001835 }
1836}
1837
David S. Miller813b3b52011-04-28 14:48:42 -07001838static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
David S. Miller5e2b61f2011-03-04 21:47:09 -08001839 const struct fib_result *res,
David S. Miller982721f2011-02-16 21:44:24 -08001840 struct fib_info *fi, u16 type, u32 itag)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001841{
David S. Millerdefb3512010-12-08 21:16:57 -08001842 struct dst_entry *dst = &rt->dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001843
1844 if (fi) {
1845 if (FIB_RES_GW(*res) &&
1846 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1847 rt->rt_gateway = FIB_RES_GW(*res);
David S. Miller813b3b52011-04-28 14:48:42 -07001848 rt_init_metrics(rt, fl4, fi);
Patrick McHardyc7066f72011-01-14 13:36:42 +01001849#ifdef CONFIG_IP_ROUTE_CLASSID
David S. Millerdefb3512010-12-08 21:16:57 -08001850 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001851#endif
David S. Millerd33e4552010-12-14 13:01:14 -08001852 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001853
David S. Millerdefb3512010-12-08 21:16:57 -08001854 if (dst_mtu(dst) > IP_MAX_MTU)
1855 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
David S. Miller0dbaee32010-12-13 12:52:14 -08001856 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
David S. Millerdefb3512010-12-08 21:16:57 -08001857 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001858
Patrick McHardyc7066f72011-01-14 13:36:42 +01001859#ifdef CONFIG_IP_ROUTE_CLASSID
Linus Torvalds1da177e2005-04-16 15:20:36 -07001860#ifdef CONFIG_IP_MULTIPLE_TABLES
1861 set_class_tag(rt, fib_rules_tclass(res));
1862#endif
1863 set_class_tag(rt, itag);
1864#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001865}
1866
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001867static struct rtable *rt_dst_alloc(struct net_device *dev,
1868 bool nopolicy, bool noxfrm)
David S. Miller0c4dcd52011-02-17 15:42:37 -08001869{
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001870 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1871 DST_HOST |
1872 (nopolicy ? DST_NOPOLICY : 0) |
1873 (noxfrm ? DST_NOXFRM : 0));
David S. Miller0c4dcd52011-02-17 15:42:37 -08001874}
1875
Eric Dumazet96d36222010-06-02 19:21:31 +00001876/* called in rcu_read_lock() section */
Al Viro9e12bb22006-09-26 21:25:20 -07001877static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001878 u8 tos, struct net_device *dev, int our)
1879{
Eric Dumazet96d36222010-06-02 19:21:31 +00001880 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001881 struct rtable *rth;
Al Viroa61ced52006-09-26 21:27:54 -07001882 __be32 spec_dst;
Eric Dumazet96d36222010-06-02 19:21:31 +00001883 struct in_device *in_dev = __in_dev_get_rcu(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001884 u32 itag = 0;
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001885 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001886
1887 /* Primary sanity checks. */
1888
1889 if (in_dev == NULL)
1890 return -EINVAL;
1891
Jan Engelhardt1e637c72008-01-21 03:18:08 -08001892 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08001893 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001894 goto e_inval;
1895
Joe Perchesf97c1e02007-12-16 13:45:43 -08001896 if (ipv4_is_zeronet(saddr)) {
1897 if (!ipv4_is_local_multicast(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001898 goto e_inval;
1899 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001900 } else {
Michael Smith5c04c812011-04-07 04:51:50 +00001901 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1902 &itag);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001903 if (err < 0)
1904 goto e_err;
1905 }
David S. Miller5c1e6aa2011-04-28 14:13:38 -07001906 rth = rt_dst_alloc(init_net.loopback_dev,
1907 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001908 if (!rth)
1909 goto e_nobufs;
1910
Patrick McHardyc7066f72011-01-14 13:36:42 +01001911#ifdef CONFIG_IP_ROUTE_CLASSID
Changli Gaod8d1f302010-06-10 23:31:35 -07001912 rth->dst.tclassid = itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001913#endif
David S. Millercf911662011-04-28 14:31:47 -07001914 rth->dst.output = ip_rt_bug;
1915
1916 rth->rt_key_dst = daddr;
1917 rth->rt_key_src = saddr;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001918 rth->rt_genid = rt_genid(dev_net(dev));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001919 rth->rt_flags = RTCF_MULTICAST;
Eric Dumazet29e75252008-01-31 17:05:09 -08001920 rth->rt_type = RTN_MULTICAST;
David S. Miller475949d2011-05-03 19:45:15 -07001921 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07001922 rth->rt_dst = daddr;
1923 rth->rt_src = saddr;
1924 rth->rt_route_iif = dev->ifindex;
1925 rth->rt_iif = dev->ifindex;
1926 rth->rt_oif = 0;
1927 rth->rt_mark = skb->mark;
1928 rth->rt_gateway = daddr;
1929 rth->rt_spec_dst= spec_dst;
1930 rth->rt_peer_genid = 0;
1931 rth->peer = NULL;
1932 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001933 if (our) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001934 rth->dst.input= ip_local_deliver;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001935 rth->rt_flags |= RTCF_LOCAL;
1936 }
1937
1938#ifdef CONFIG_IP_MROUTE
Joe Perchesf97c1e02007-12-16 13:45:43 -08001939 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
Changli Gaod8d1f302010-06-10 23:31:35 -07001940 rth->dst.input = ip_mr_input;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001941#endif
1942 RT_CACHE_STAT_INC(in_slow_mc);
1943
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001944 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
David S. Millerb23dd4f2011-03-02 14:31:35 -08001945 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
Eric Dumazet9aa3c942011-06-18 11:59:18 -07001946 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001947
1948e_nobufs:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001949 return -ENOBUFS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001950e_inval:
Eric Dumazet96d36222010-06-02 19:21:31 +00001951 return -EINVAL;
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001952e_err:
Eric Dumazetb5f7e752010-06-02 12:05:27 +00001953 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001954}
1955
1956
1957static void ip_handle_martian_source(struct net_device *dev,
1958 struct in_device *in_dev,
1959 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07001960 __be32 daddr,
1961 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001962{
1963 RT_CACHE_STAT_INC(in_martian_src);
1964#ifdef CONFIG_IP_ROUTE_VERBOSE
1965 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1966 /*
1967 * RFC1812 recommendation, if source is martian,
1968 * the only hint is MAC header.
1969 */
Harvey Harrison673d57e2008-10-31 00:53:57 -07001970 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1971 &daddr, &saddr, dev->name);
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001972 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001973 int i;
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001974 const unsigned char *p = skb_mac_header(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001975 printk(KERN_WARNING "ll header: ");
1976 for (i = 0; i < dev->hard_header_len; i++, p++) {
1977 printk("%02x", *p);
1978 if (i < (dev->hard_header_len - 1))
1979 printk(":");
1980 }
1981 printk("\n");
1982 }
1983 }
1984#endif
1985}
1986
Eric Dumazet47360222010-06-03 04:13:21 +00001987/* called in rcu_read_lock() section */
Stephen Hemminger5969f712008-04-10 01:52:09 -07001988static int __mkroute_input(struct sk_buff *skb,
David S. Miller982721f2011-02-16 21:44:24 -08001989 const struct fib_result *res,
Stephen Hemminger5969f712008-04-10 01:52:09 -07001990 struct in_device *in_dev,
1991 __be32 daddr, __be32 saddr, u32 tos,
1992 struct rtable **result)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001993{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001994 struct rtable *rth;
1995 int err;
1996 struct in_device *out_dev;
Eric Dumazet47360222010-06-03 04:13:21 +00001997 unsigned int flags = 0;
Al Virod9c9df82006-09-26 21:28:14 -07001998 __be32 spec_dst;
1999 u32 itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002000
2001 /* get a working reference to the output device */
Eric Dumazet47360222010-06-03 04:13:21 +00002002 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002003 if (out_dev == NULL) {
2004 if (net_ratelimit())
2005 printk(KERN_CRIT "Bug in ip_route_input" \
2006 "_slow(). Please, report\n");
2007 return -EINVAL;
2008 }
2009
2010
Michael Smith5c04c812011-04-07 04:51:50 +00002011 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2012 in_dev->dev, &spec_dst, &itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002013 if (err < 0) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002014 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002015 saddr);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002016
Linus Torvalds1da177e2005-04-16 15:20:36 -07002017 goto cleanup;
2018 }
2019
2020 if (err)
2021 flags |= RTCF_DIRECTSRC;
2022
Thomas Graf51b77ca2008-06-03 16:36:01 -07002023 if (out_dev == in_dev && err &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002024 (IN_DEV_SHARED_MEDIA(out_dev) ||
2025 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2026 flags |= RTCF_DOREDIRECT;
2027
2028 if (skb->protocol != htons(ETH_P_IP)) {
2029 /* Not IP (i.e. ARP). Do not create route, if it is
2030 * invalid for proxy arp. DNAT routes are always valid.
Jesper Dangaard Brouer65324142010-01-05 05:50:47 +00002031 *
2032 * Proxy arp feature have been extended to allow, ARP
2033 * replies back to the same interface, to support
2034 * Private VLAN switch technologies. See arp.c.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002035 */
Jesper Dangaard Brouer65324142010-01-05 05:50:47 +00002036 if (out_dev == in_dev &&
2037 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002038 err = -EINVAL;
2039 goto cleanup;
2040 }
2041 }
2042
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002043 rth = rt_dst_alloc(out_dev->dev,
2044 IN_DEV_CONF_GET(in_dev, NOPOLICY),
David S. Miller0c4dcd52011-02-17 15:42:37 -08002045 IN_DEV_CONF_GET(out_dev, NOXFRM));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002046 if (!rth) {
2047 err = -ENOBUFS;
2048 goto cleanup;
2049 }
2050
David S. Miller5e2b61f2011-03-04 21:47:09 -08002051 rth->rt_key_dst = daddr;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002052 rth->rt_key_src = saddr;
David S. Millercf911662011-04-28 14:31:47 -07002053 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2054 rth->rt_flags = flags;
2055 rth->rt_type = res->type;
David S. Miller475949d2011-05-03 19:45:15 -07002056 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07002057 rth->rt_dst = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002058 rth->rt_src = saddr;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002059 rth->rt_route_iif = in_dev->dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002060 rth->rt_iif = in_dev->dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002061 rth->rt_oif = 0;
David S. Millercf911662011-04-28 14:31:47 -07002062 rth->rt_mark = skb->mark;
2063 rth->rt_gateway = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002064 rth->rt_spec_dst= spec_dst;
David S. Millercf911662011-04-28 14:31:47 -07002065 rth->rt_peer_genid = 0;
2066 rth->peer = NULL;
2067 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002068
Changli Gaod8d1f302010-06-10 23:31:35 -07002069 rth->dst.input = ip_forward;
2070 rth->dst.output = ip_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002071
David S. Miller5e2b61f2011-03-04 21:47:09 -08002072 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002073
Linus Torvalds1da177e2005-04-16 15:20:36 -07002074 *result = rth;
2075 err = 0;
2076 cleanup:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002077 return err;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002078}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002079
Stephen Hemminger5969f712008-04-10 01:52:09 -07002080static int ip_mkroute_input(struct sk_buff *skb,
2081 struct fib_result *res,
David S. Miller68a5e3d2011-03-11 20:07:33 -05002082 const struct flowi4 *fl4,
Stephen Hemminger5969f712008-04-10 01:52:09 -07002083 struct in_device *in_dev,
2084 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002085{
Chuck Short7abaa272005-06-22 22:10:23 -07002086 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002087 int err;
2088 unsigned hash;
2089
2090#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Millerff3fccb2011-03-10 16:23:24 -08002091 if (res->fi && res->fi->fib_nhs > 1)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002092 fib_select_multipath(res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002093#endif
2094
2095 /* create a routing cache entry */
2096 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2097 if (err)
2098 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002099
2100 /* put it into the cache */
David S. Miller68a5e3d2011-03-11 20:07:33 -05002101 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
Changli Gaod8d1f302010-06-10 23:31:35 -07002102 rt_genid(dev_net(rth->dst.dev)));
David S. Miller68a5e3d2011-03-11 20:07:33 -05002103 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002104 if (IS_ERR(rth))
2105 return PTR_ERR(rth);
2106 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002107}
2108
Linus Torvalds1da177e2005-04-16 15:20:36 -07002109/*
2110 * NOTE. We drop all the packets that has local source
2111 * addresses, because every properly looped back packet
2112 * must have correct destination already attached by output routine.
2113 *
2114 * Such approach solves two big problems:
2115 * 1. Not simplex devices are handled properly.
2116 * 2. IP spoofing attempts are filtered with 100% of guarantee.
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002117 * called with rcu_read_lock()
Linus Torvalds1da177e2005-04-16 15:20:36 -07002118 */
2119
Al Viro9e12bb22006-09-26 21:25:20 -07002120static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002121 u8 tos, struct net_device *dev)
2122{
2123 struct fib_result res;
Eric Dumazet96d36222010-06-02 19:21:31 +00002124 struct in_device *in_dev = __in_dev_get_rcu(dev);
David S. Miller68a5e3d2011-03-11 20:07:33 -05002125 struct flowi4 fl4;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002126 unsigned flags = 0;
2127 u32 itag = 0;
2128 struct rtable * rth;
2129 unsigned hash;
Al Viro9e12bb22006-09-26 21:25:20 -07002130 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002131 int err = -EINVAL;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002132 struct net * net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002133
2134 /* IP on this device is disabled. */
2135
2136 if (!in_dev)
2137 goto out;
2138
2139 /* Check for the most weird martians, which can be not detected
2140 by fib_lookup.
2141 */
2142
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002143 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002144 ipv4_is_loopback(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002145 goto martian_source;
2146
Andy Walls27a954b2010-10-17 15:11:22 +00002147 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002148 goto brd_input;
2149
2150 /* Accept zero addresses only to limited broadcast;
2151 * I even do not know to fix it or not. Waiting for complains :-)
2152 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002153 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002154 goto martian_source;
2155
Andy Walls27a954b2010-10-17 15:11:22 +00002156 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002157 goto martian_destination;
2158
2159 /*
2160 * Now we are ready to route packet.
2161 */
David S. Miller68a5e3d2011-03-11 20:07:33 -05002162 fl4.flowi4_oif = 0;
2163 fl4.flowi4_iif = dev->ifindex;
2164 fl4.flowi4_mark = skb->mark;
2165 fl4.flowi4_tos = tos;
2166 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2167 fl4.daddr = daddr;
2168 fl4.saddr = saddr;
2169 err = fib_lookup(net, &fl4, &res);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002170 if (err != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002171 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002172 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002173 goto no_route;
2174 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002175
2176 RT_CACHE_STAT_INC(in_slow_tot);
2177
2178 if (res.type == RTN_BROADCAST)
2179 goto brd_input;
2180
2181 if (res.type == RTN_LOCAL) {
Michael Smith5c04c812011-04-07 04:51:50 +00002182 err = fib_validate_source(skb, saddr, daddr, tos,
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002183 net->loopback_dev->ifindex,
Michael Smith5c04c812011-04-07 04:51:50 +00002184 dev, &spec_dst, &itag);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002185 if (err < 0)
2186 goto martian_source_keep_err;
2187 if (err)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002188 flags |= RTCF_DIRECTSRC;
2189 spec_dst = daddr;
2190 goto local_input;
2191 }
2192
2193 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002194 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002195 if (res.type != RTN_UNICAST)
2196 goto martian_destination;
2197
David S. Miller68a5e3d2011-03-11 20:07:33 -05002198 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002199out: return err;
2200
2201brd_input:
2202 if (skb->protocol != htons(ETH_P_IP))
2203 goto e_inval;
2204
Joe Perchesf97c1e02007-12-16 13:45:43 -08002205 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002206 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2207 else {
Michael Smith5c04c812011-04-07 04:51:50 +00002208 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2209 &itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002210 if (err < 0)
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002211 goto martian_source_keep_err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002212 if (err)
2213 flags |= RTCF_DIRECTSRC;
2214 }
2215 flags |= RTCF_BROADCAST;
2216 res.type = RTN_BROADCAST;
2217 RT_CACHE_STAT_INC(in_brd);
2218
2219local_input:
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002220 rth = rt_dst_alloc(net->loopback_dev,
2221 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002222 if (!rth)
2223 goto e_nobufs;
2224
David S. Millercf911662011-04-28 14:31:47 -07002225 rth->dst.input= ip_local_deliver;
Changli Gaod8d1f302010-06-10 23:31:35 -07002226 rth->dst.output= ip_rt_bug;
David S. Millercf911662011-04-28 14:31:47 -07002227#ifdef CONFIG_IP_ROUTE_CLASSID
2228 rth->dst.tclassid = itag;
2229#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002230
David S. Miller5e2b61f2011-03-04 21:47:09 -08002231 rth->rt_key_dst = daddr;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002232 rth->rt_key_src = saddr;
David S. Millercf911662011-04-28 14:31:47 -07002233 rth->rt_genid = rt_genid(net);
2234 rth->rt_flags = flags|RTCF_LOCAL;
2235 rth->rt_type = res.type;
David S. Miller475949d2011-05-03 19:45:15 -07002236 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07002237 rth->rt_dst = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002238 rth->rt_src = saddr;
Patrick McHardyc7066f72011-01-14 13:36:42 +01002239#ifdef CONFIG_IP_ROUTE_CLASSID
Changli Gaod8d1f302010-06-10 23:31:35 -07002240 rth->dst.tclassid = itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002241#endif
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002242 rth->rt_route_iif = dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002243 rth->rt_iif = dev->ifindex;
David S. Millercf911662011-04-28 14:31:47 -07002244 rth->rt_oif = 0;
2245 rth->rt_mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002246 rth->rt_gateway = daddr;
2247 rth->rt_spec_dst= spec_dst;
David S. Millercf911662011-04-28 14:31:47 -07002248 rth->rt_peer_genid = 0;
2249 rth->peer = NULL;
2250 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002251 if (res.type == RTN_UNREACHABLE) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002252 rth->dst.input= ip_error;
2253 rth->dst.error= -err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002254 rth->rt_flags &= ~RTCF_LOCAL;
2255 }
David S. Miller68a5e3d2011-03-11 20:07:33 -05002256 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2257 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002258 err = 0;
2259 if (IS_ERR(rth))
2260 err = PTR_ERR(rth);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002261 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002262
2263no_route:
2264 RT_CACHE_STAT_INC(in_no_route);
2265 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2266 res.type = RTN_UNREACHABLE;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08002267 if (err == -ESRCH)
2268 err = -ENETUNREACH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002269 goto local_input;
2270
2271 /*
2272 * Do not cache martian addresses: they should be logged (RFC1812)
2273 */
2274martian_destination:
2275 RT_CACHE_STAT_INC(in_martian_dst);
2276#ifdef CONFIG_IP_ROUTE_VERBOSE
2277 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
Harvey Harrison673d57e2008-10-31 00:53:57 -07002278 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2279 &daddr, &saddr, dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002280#endif
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002281
2282e_hostunreach:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002283 err = -EHOSTUNREACH;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002284 goto out;
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002285
Linus Torvalds1da177e2005-04-16 15:20:36 -07002286e_inval:
2287 err = -EINVAL;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002288 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002289
2290e_nobufs:
2291 err = -ENOBUFS;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002292 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002293
2294martian_source:
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002295 err = -EINVAL;
2296martian_source_keep_err:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002297 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002298 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002299}
2300
Eric Dumazet407eadd2010-05-10 11:32:55 +00002301int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2302 u8 tos, struct net_device *dev, bool noref)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002303{
2304 struct rtable * rth;
2305 unsigned hash;
2306 int iif = dev->ifindex;
Denis V. Lunevb5921912008-01-22 23:50:25 -08002307 struct net *net;
Eric Dumazet96d36222010-06-02 19:21:31 +00002308 int res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002309
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002310 net = dev_net(dev);
Neil Horman1080d702008-10-27 12:28:25 -07002311
Eric Dumazet96d36222010-06-02 19:21:31 +00002312 rcu_read_lock();
2313
Neil Horman1080d702008-10-27 12:28:25 -07002314 if (!rt_caching(net))
2315 goto skip_cache;
2316
Linus Torvalds1da177e2005-04-16 15:20:36 -07002317 tos &= IPTOS_RT_MASK;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002318 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002319
Linus Torvalds1da177e2005-04-16 15:20:36 -07002320 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Changli Gaod8d1f302010-06-10 23:31:35 -07002321 rth = rcu_dereference(rth->dst.rt_next)) {
David S. Miller5e2b61f2011-03-04 21:47:09 -08002322 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2323 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2324 (rth->rt_iif ^ iif) |
David S. Miller475949d2011-05-03 19:45:15 -07002325 (rth->rt_key_tos ^ tos)) == 0 &&
Julian Anastasovd547f722011-08-07 22:20:20 -07002326 rt_is_input_route(rth) &&
David S. Miller5e2b61f2011-03-04 21:47:09 -08002327 rth->rt_mark == skb->mark &&
Changli Gaod8d1f302010-06-10 23:31:35 -07002328 net_eq(dev_net(rth->dst.dev), net) &&
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002329 !rt_is_expired(rth)) {
Eric Dumazet407eadd2010-05-10 11:32:55 +00002330 if (noref) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002331 dst_use_noref(&rth->dst, jiffies);
2332 skb_dst_set_noref(skb, &rth->dst);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002333 } else {
Changli Gaod8d1f302010-06-10 23:31:35 -07002334 dst_use(&rth->dst, jiffies);
2335 skb_dst_set(skb, &rth->dst);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002336 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002337 RT_CACHE_STAT_INC(in_hit);
2338 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002339 return 0;
2340 }
2341 RT_CACHE_STAT_INC(in_hlist_search);
2342 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002343
Neil Horman1080d702008-10-27 12:28:25 -07002344skip_cache:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002345 /* Multicast recognition logic is moved from route cache to here.
2346 The problem was that too many Ethernet cards have broken/missing
2347 hardware multicast filters :-( As result the host on multicasting
2348 network acquires a lot of useless route cache entries, sort of
2349 SDR messages from all the world. Now we try to get rid of them.
2350 Really, provided software IP multicast filter is organized
2351 reasonably (at least, hashed), it does not result in a slowdown
2352 comparing with route cache reject entries.
2353 Note, that multicast routers are not affected, because
2354 route cache entry is created eventually.
2355 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002356 if (ipv4_is_multicast(daddr)) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002357 struct in_device *in_dev = __in_dev_get_rcu(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002358
Eric Dumazet96d36222010-06-02 19:21:31 +00002359 if (in_dev) {
David S. Millerdbdd9a52011-03-10 16:34:38 -08002360 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2361 ip_hdr(skb)->protocol);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002362 if (our
2363#ifdef CONFIG_IP_MROUTE
Joe Perches9d4fb272009-11-23 10:41:23 -08002364 ||
2365 (!ipv4_is_local_multicast(daddr) &&
2366 IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002367#endif
Joe Perches9d4fb272009-11-23 10:41:23 -08002368 ) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002369 int res = ip_route_input_mc(skb, daddr, saddr,
2370 tos, dev, our);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002371 rcu_read_unlock();
Eric Dumazet96d36222010-06-02 19:21:31 +00002372 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002373 }
2374 }
2375 rcu_read_unlock();
2376 return -EINVAL;
2377 }
Eric Dumazet96d36222010-06-02 19:21:31 +00002378 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2379 rcu_read_unlock();
2380 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002381}
Eric Dumazet407eadd2010-05-10 11:32:55 +00002382EXPORT_SYMBOL(ip_route_input_common);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002383
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002384/* called with rcu_read_lock() */
David S. Miller982721f2011-02-16 21:44:24 -08002385static struct rtable *__mkroute_output(const struct fib_result *res,
David S. Miller68a5e3d2011-03-11 20:07:33 -05002386 const struct flowi4 *fl4,
David S. Miller813b3b52011-04-28 14:48:42 -07002387 __be32 orig_daddr, __be32 orig_saddr,
2388 int orig_oif, struct net_device *dev_out,
David S. Miller5ada5522011-02-17 15:29:00 -08002389 unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002390{
David S. Miller982721f2011-02-16 21:44:24 -08002391 struct fib_info *fi = res->fi;
David S. Miller813b3b52011-04-28 14:48:42 -07002392 u32 tos = RT_FL_TOS(fl4);
David S. Miller5ada5522011-02-17 15:29:00 -08002393 struct in_device *in_dev;
David S. Miller982721f2011-02-16 21:44:24 -08002394 u16 type = res->type;
David S. Miller5ada5522011-02-17 15:29:00 -08002395 struct rtable *rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002396
David S. Miller68a5e3d2011-03-11 20:07:33 -05002397 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
David S. Miller5ada5522011-02-17 15:29:00 -08002398 return ERR_PTR(-EINVAL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002399
David S. Miller68a5e3d2011-03-11 20:07:33 -05002400 if (ipv4_is_lbcast(fl4->daddr))
David S. Miller982721f2011-02-16 21:44:24 -08002401 type = RTN_BROADCAST;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002402 else if (ipv4_is_multicast(fl4->daddr))
David S. Miller982721f2011-02-16 21:44:24 -08002403 type = RTN_MULTICAST;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002404 else if (ipv4_is_zeronet(fl4->daddr))
David S. Miller5ada5522011-02-17 15:29:00 -08002405 return ERR_PTR(-EINVAL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002406
2407 if (dev_out->flags & IFF_LOOPBACK)
2408 flags |= RTCF_LOCAL;
2409
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002410 in_dev = __in_dev_get_rcu(dev_out);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002411 if (!in_dev)
David S. Miller5ada5522011-02-17 15:29:00 -08002412 return ERR_PTR(-EINVAL);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002413
David S. Miller982721f2011-02-16 21:44:24 -08002414 if (type == RTN_BROADCAST) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002415 flags |= RTCF_BROADCAST | RTCF_LOCAL;
David S. Miller982721f2011-02-16 21:44:24 -08002416 fi = NULL;
2417 } else if (type == RTN_MULTICAST) {
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002418 flags |= RTCF_MULTICAST | RTCF_LOCAL;
David S. Miller813b3b52011-04-28 14:48:42 -07002419 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2420 fl4->flowi4_proto))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002421 flags &= ~RTCF_LOCAL;
2422 /* If multicast route do not exist use
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002423 * default one, but do not gateway in this case.
2424 * Yes, it is hack.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002425 */
David S. Miller982721f2011-02-16 21:44:24 -08002426 if (fi && res->prefixlen < 4)
2427 fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002428 }
2429
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002430 rth = rt_dst_alloc(dev_out,
2431 IN_DEV_CONF_GET(in_dev, NOPOLICY),
David S. Miller0c4dcd52011-02-17 15:42:37 -08002432 IN_DEV_CONF_GET(in_dev, NOXFRM));
Dimitris Michailidis8391d072010-10-07 14:48:38 +00002433 if (!rth)
David S. Miller5ada5522011-02-17 15:29:00 -08002434 return ERR_PTR(-ENOBUFS);
Dimitris Michailidis8391d072010-10-07 14:48:38 +00002435
David S. Millercf911662011-04-28 14:31:47 -07002436 rth->dst.output = ip_output;
2437
David S. Miller813b3b52011-04-28 14:48:42 -07002438 rth->rt_key_dst = orig_daddr;
2439 rth->rt_key_src = orig_saddr;
David S. Millercf911662011-04-28 14:31:47 -07002440 rth->rt_genid = rt_genid(dev_net(dev_out));
2441 rth->rt_flags = flags;
2442 rth->rt_type = type;
David S. Miller475949d2011-05-03 19:45:15 -07002443 rth->rt_key_tos = tos;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002444 rth->rt_dst = fl4->daddr;
2445 rth->rt_src = fl4->saddr;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002446 rth->rt_route_iif = 0;
David S. Miller813b3b52011-04-28 14:48:42 -07002447 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2448 rth->rt_oif = orig_oif;
2449 rth->rt_mark = fl4->flowi4_mark;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002450 rth->rt_gateway = fl4->daddr;
2451 rth->rt_spec_dst= fl4->saddr;
David S. Millercf911662011-04-28 14:31:47 -07002452 rth->rt_peer_genid = 0;
2453 rth->peer = NULL;
2454 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002455
2456 RT_CACHE_STAT_INC(out_slow_tot);
2457
2458 if (flags & RTCF_LOCAL) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002459 rth->dst.input = ip_local_deliver;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002460 rth->rt_spec_dst = fl4->daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002461 }
2462 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
David S. Miller68a5e3d2011-03-11 20:07:33 -05002463 rth->rt_spec_dst = fl4->saddr;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002464 if (flags & RTCF_LOCAL &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002465 !(dev_out->flags & IFF_LOOPBACK)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002466 rth->dst.output = ip_mc_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002467 RT_CACHE_STAT_INC(out_slow_mc);
2468 }
2469#ifdef CONFIG_IP_MROUTE
David S. Miller982721f2011-02-16 21:44:24 -08002470 if (type == RTN_MULTICAST) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002471 if (IN_DEV_MFORWARD(in_dev) &&
David S. Miller813b3b52011-04-28 14:48:42 -07002472 !ipv4_is_local_multicast(fl4->daddr)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002473 rth->dst.input = ip_mr_input;
2474 rth->dst.output = ip_mc_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002475 }
2476 }
2477#endif
2478 }
2479
David S. Miller813b3b52011-04-28 14:48:42 -07002480 rt_set_nexthop(rth, fl4, res, fi, type, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002481
David S. Miller5ada5522011-02-17 15:29:00 -08002482 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002483}
2484
Linus Torvalds1da177e2005-04-16 15:20:36 -07002485/*
2486 * Major route resolver routine.
Eric Dumazet0197aa32010-09-30 03:33:58 +00002487 * called with rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002488 */
2489
David S. Miller813b3b52011-04-28 14:48:42 -07002490static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002491{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002492 struct net_device *dev_out = NULL;
David S. Miller813b3b52011-04-28 14:48:42 -07002493 u32 tos = RT_FL_TOS(fl4);
2494 unsigned int flags = 0;
2495 struct fib_result res;
David S. Miller5ada5522011-02-17 15:29:00 -08002496 struct rtable *rth;
David S. Miller813b3b52011-04-28 14:48:42 -07002497 __be32 orig_daddr;
2498 __be32 orig_saddr;
2499 int orig_oif;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002500
2501 res.fi = NULL;
2502#ifdef CONFIG_IP_MULTIPLE_TABLES
2503 res.r = NULL;
2504#endif
2505
David S. Miller813b3b52011-04-28 14:48:42 -07002506 orig_daddr = fl4->daddr;
2507 orig_saddr = fl4->saddr;
2508 orig_oif = fl4->flowi4_oif;
2509
2510 fl4->flowi4_iif = net->loopback_dev->ifindex;
2511 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2512 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2513 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
David S. Miller44713b62011-03-04 21:24:47 -08002514
David S. Miller010c2702011-02-17 15:37:09 -08002515 rcu_read_lock();
David S. Miller813b3b52011-04-28 14:48:42 -07002516 if (fl4->saddr) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002517 rth = ERR_PTR(-EINVAL);
David S. Miller813b3b52011-04-28 14:48:42 -07002518 if (ipv4_is_multicast(fl4->saddr) ||
2519 ipv4_is_lbcast(fl4->saddr) ||
2520 ipv4_is_zeronet(fl4->saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002521 goto out;
2522
Linus Torvalds1da177e2005-04-16 15:20:36 -07002523 /* I removed check for oif == dev_out->oif here.
2524 It was wrong for two reasons:
Denis V. Lunev1ab35272008-01-22 22:04:30 -08002525 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2526 is assigned to multiple interfaces.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002527 2. Moreover, we are allowed to send packets with saddr
2528 of another iface. --ANK
2529 */
2530
David S. Miller813b3b52011-04-28 14:48:42 -07002531 if (fl4->flowi4_oif == 0 &&
2532 (ipv4_is_multicast(fl4->daddr) ||
2533 ipv4_is_lbcast(fl4->daddr))) {
Julian Anastasova210d012008-10-01 07:28:28 -07002534 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002535 dev_out = __ip_dev_find(net, fl4->saddr, false);
Julian Anastasova210d012008-10-01 07:28:28 -07002536 if (dev_out == NULL)
2537 goto out;
2538
Linus Torvalds1da177e2005-04-16 15:20:36 -07002539 /* Special hack: user can direct multicasts
2540 and limited broadcast via necessary interface
2541 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2542 This hack is not just for fun, it allows
2543 vic,vat and friends to work.
2544 They bind socket to loopback, set ttl to zero
2545 and expect that it will work.
2546 From the viewpoint of routing cache they are broken,
2547 because we are not allowed to build multicast path
2548 with loopback source addr (look, routing cache
2549 cannot know, that ttl is zero, so that packet
2550 will not leave this host and route is valid).
2551 Luckily, this hack is good workaround.
2552 */
2553
David S. Miller813b3b52011-04-28 14:48:42 -07002554 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002555 goto make_route;
2556 }
Julian Anastasova210d012008-10-01 07:28:28 -07002557
David S. Miller813b3b52011-04-28 14:48:42 -07002558 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
Julian Anastasova210d012008-10-01 07:28:28 -07002559 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002560 if (!__ip_dev_find(net, fl4->saddr, false))
Julian Anastasova210d012008-10-01 07:28:28 -07002561 goto out;
Julian Anastasova210d012008-10-01 07:28:28 -07002562 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002563 }
2564
2565
David S. Miller813b3b52011-04-28 14:48:42 -07002566 if (fl4->flowi4_oif) {
2567 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002568 rth = ERR_PTR(-ENODEV);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002569 if (dev_out == NULL)
2570 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002571
2572 /* RACE: Check return value of inet_select_addr instead. */
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002573 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002574 rth = ERR_PTR(-ENETUNREACH);
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002575 goto out;
2576 }
David S. Miller813b3b52011-04-28 14:48:42 -07002577 if (ipv4_is_local_multicast(fl4->daddr) ||
2578 ipv4_is_lbcast(fl4->daddr)) {
2579 if (!fl4->saddr)
2580 fl4->saddr = inet_select_addr(dev_out, 0,
2581 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002582 goto make_route;
2583 }
David S. Miller813b3b52011-04-28 14:48:42 -07002584 if (fl4->saddr) {
2585 if (ipv4_is_multicast(fl4->daddr))
2586 fl4->saddr = inet_select_addr(dev_out, 0,
2587 fl4->flowi4_scope);
2588 else if (!fl4->daddr)
2589 fl4->saddr = inet_select_addr(dev_out, 0,
2590 RT_SCOPE_HOST);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002591 }
2592 }
2593
David S. Miller813b3b52011-04-28 14:48:42 -07002594 if (!fl4->daddr) {
2595 fl4->daddr = fl4->saddr;
2596 if (!fl4->daddr)
2597 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002598 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002599 fl4->flowi4_oif = net->loopback_dev->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002600 res.type = RTN_LOCAL;
2601 flags |= RTCF_LOCAL;
2602 goto make_route;
2603 }
2604
David S. Miller813b3b52011-04-28 14:48:42 -07002605 if (fib_lookup(net, fl4, &res)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002606 res.fi = NULL;
David S. Miller813b3b52011-04-28 14:48:42 -07002607 if (fl4->flowi4_oif) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002608 /* Apparently, routing tables are wrong. Assume,
2609 that the destination is on link.
2610
2611 WHY? DW.
2612 Because we are allowed to send to iface
2613 even if it has NO routes and NO assigned
2614 addresses. When oif is specified, routing
2615 tables are looked up with only one purpose:
2616 to catch if destination is gatewayed, rather than
2617 direct. Moreover, if MSG_DONTROUTE is set,
2618 we send packet, ignoring both routing tables
2619 and ifaddr state. --ANK
2620
2621
2622 We could make it even if oif is unknown,
2623 likely IPv6, but we do not.
2624 */
2625
David S. Miller813b3b52011-04-28 14:48:42 -07002626 if (fl4->saddr == 0)
2627 fl4->saddr = inet_select_addr(dev_out, 0,
2628 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002629 res.type = RTN_UNICAST;
2630 goto make_route;
2631 }
David S. Millerb23dd4f2011-03-02 14:31:35 -08002632 rth = ERR_PTR(-ENETUNREACH);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002633 goto out;
2634 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002635
2636 if (res.type == RTN_LOCAL) {
David S. Miller813b3b52011-04-28 14:48:42 -07002637 if (!fl4->saddr) {
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002638 if (res.fi->fib_prefsrc)
David S. Miller813b3b52011-04-28 14:48:42 -07002639 fl4->saddr = res.fi->fib_prefsrc;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002640 else
David S. Miller813b3b52011-04-28 14:48:42 -07002641 fl4->saddr = fl4->daddr;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002642 }
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002643 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002644 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002645 res.fi = NULL;
2646 flags |= RTCF_LOCAL;
2647 goto make_route;
2648 }
2649
2650#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Miller813b3b52011-04-28 14:48:42 -07002651 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002652 fib_select_multipath(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002653 else
2654#endif
David S. Miller21d8c492011-04-14 14:49:37 -07002655 if (!res.prefixlen &&
2656 res.table->tb_num_default > 1 &&
David S. Miller813b3b52011-04-28 14:48:42 -07002657 res.type == RTN_UNICAST && !fl4->flowi4_oif)
David S. Miller0c838ff2011-01-31 16:16:50 -08002658 fib_select_default(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002659
David S. Miller813b3b52011-04-28 14:48:42 -07002660 if (!fl4->saddr)
2661 fl4->saddr = FIB_RES_PREFSRC(net, res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002662
Linus Torvalds1da177e2005-04-16 15:20:36 -07002663 dev_out = FIB_RES_DEV(res);
David S. Miller813b3b52011-04-28 14:48:42 -07002664 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002665
2666
2667make_route:
David S. Miller813b3b52011-04-28 14:48:42 -07002668 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2669 dev_out, flags);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002670 if (!IS_ERR(rth)) {
David S. Miller5ada5522011-02-17 15:29:00 -08002671 unsigned int hash;
2672
David S. Miller813b3b52011-04-28 14:48:42 -07002673 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
David S. Miller5ada5522011-02-17 15:29:00 -08002674 rt_genid(dev_net(dev_out)));
David S. Miller813b3b52011-04-28 14:48:42 -07002675 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
David S. Miller5ada5522011-02-17 15:29:00 -08002676 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002677
David S. Miller010c2702011-02-17 15:37:09 -08002678out:
2679 rcu_read_unlock();
David S. Millerb23dd4f2011-03-02 14:31:35 -08002680 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002681}
2682
David S. Miller813b3b52011-04-28 14:48:42 -07002683struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002684{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002685 struct rtable *rth;
David S. Miller010c2702011-02-17 15:37:09 -08002686 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002687
Neil Horman1080d702008-10-27 12:28:25 -07002688 if (!rt_caching(net))
2689 goto slow_output;
2690
David S. Miller9d6ec932011-03-12 01:12:47 -05002691 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002692
2693 rcu_read_lock_bh();
Paul E. McKenneya898def2010-02-22 17:04:49 -08002694 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
Changli Gaod8d1f302010-06-10 23:31:35 -07002695 rth = rcu_dereference_bh(rth->dst.rt_next)) {
David S. Miller9d6ec932011-03-12 01:12:47 -05002696 if (rth->rt_key_dst == flp4->daddr &&
2697 rth->rt_key_src == flp4->saddr &&
David S. Millerc7537962010-11-11 17:07:48 -08002698 rt_is_output_route(rth) &&
David S. Miller9d6ec932011-03-12 01:12:47 -05002699 rth->rt_oif == flp4->flowi4_oif &&
2700 rth->rt_mark == flp4->flowi4_mark &&
David S. Miller475949d2011-05-03 19:45:15 -07002701 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
Denis V. Lunevb5921912008-01-22 23:50:25 -08002702 (IPTOS_RT_MASK | RTO_ONLINK)) &&
Changli Gaod8d1f302010-06-10 23:31:35 -07002703 net_eq(dev_net(rth->dst.dev), net) &&
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002704 !rt_is_expired(rth)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002705 dst_use(&rth->dst, jiffies);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002706 RT_CACHE_STAT_INC(out_hit);
2707 rcu_read_unlock_bh();
David S. Miller56157872011-05-02 14:37:45 -07002708 if (!flp4->saddr)
2709 flp4->saddr = rth->rt_src;
2710 if (!flp4->daddr)
2711 flp4->daddr = rth->rt_dst;
David S. Millerb23dd4f2011-03-02 14:31:35 -08002712 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002713 }
2714 RT_CACHE_STAT_INC(out_hlist_search);
2715 }
2716 rcu_read_unlock_bh();
2717
Neil Horman1080d702008-10-27 12:28:25 -07002718slow_output:
David S. Miller9d6ec932011-03-12 01:12:47 -05002719 return ip_route_output_slow(net, flp4);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002720}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002721EXPORT_SYMBOL_GPL(__ip_route_output_key);
2722
Jianzhao Wangae2688d2010-09-08 14:35:43 -07002723static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2724{
2725 return NULL;
2726}
2727
/*
 * .default_mtu callback of ipv4_dst_blackhole_ops: always reports 0,
 * regardless of @dst.
 */
static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
{
	return 0;
}
2732
David S. Miller14e50e52007-05-24 18:17:54 -07002733static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2734{
2735}
2736
Held Bernhard0972ddb2011-04-24 22:07:32 +00002737static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2738 unsigned long old)
2739{
2740 return NULL;
2741}
2742
David S. Miller14e50e52007-05-24 18:17:54 -07002743static struct dst_ops ipv4_dst_blackhole_ops = {
2744 .family = AF_INET,
Harvey Harrison09640e632009-02-01 00:45:17 -08002745 .protocol = cpu_to_be16(ETH_P_IP),
David S. Miller14e50e52007-05-24 18:17:54 -07002746 .destroy = ipv4_dst_destroy,
Jianzhao Wangae2688d2010-09-08 14:35:43 -07002747 .check = ipv4_blackhole_dst_check,
Roland Dreierec831ea2011-01-31 13:16:00 -08002748 .default_mtu = ipv4_blackhole_default_mtu,
Eric Dumazet214f45c2011-02-18 11:39:01 -08002749 .default_advmss = ipv4_default_advmss,
David S. Miller14e50e52007-05-24 18:17:54 -07002750 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
Held Bernhard0972ddb2011-04-24 22:07:32 +00002751 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
David S. Millerd3aaeb32011-07-18 00:40:17 -07002752 .neigh_lookup = ipv4_neigh_lookup,
David S. Miller14e50e52007-05-24 18:17:54 -07002753};
2754
David S. Miller2774c132011-03-01 14:59:04 -08002755struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
David S. Miller14e50e52007-05-24 18:17:54 -07002756{
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002757 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
David S. Miller2774c132011-03-01 14:59:04 -08002758 struct rtable *ort = (struct rtable *) dst_orig;
David S. Miller14e50e52007-05-24 18:17:54 -07002759
2760 if (rt) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002761 struct dst_entry *new = &rt->dst;
David S. Miller14e50e52007-05-24 18:17:54 -07002762
David S. Miller14e50e52007-05-24 18:17:54 -07002763 new->__use = 1;
Herbert Xu352e5122007-11-13 21:34:06 -08002764 new->input = dst_discard;
2765 new->output = dst_discard;
David S. Millerdefb3512010-12-08 21:16:57 -08002766 dst_copy_metrics(new, &ort->dst);
David S. Miller14e50e52007-05-24 18:17:54 -07002767
Changli Gaod8d1f302010-06-10 23:31:35 -07002768 new->dev = ort->dst.dev;
David S. Miller14e50e52007-05-24 18:17:54 -07002769 if (new->dev)
2770 dev_hold(new->dev);
2771
David S. Miller5e2b61f2011-03-04 21:47:09 -08002772 rt->rt_key_dst = ort->rt_key_dst;
2773 rt->rt_key_src = ort->rt_key_src;
David S. Miller475949d2011-05-03 19:45:15 -07002774 rt->rt_key_tos = ort->rt_key_tos;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002775 rt->rt_route_iif = ort->rt_route_iif;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002776 rt->rt_iif = ort->rt_iif;
2777 rt->rt_oif = ort->rt_oif;
2778 rt->rt_mark = ort->rt_mark;
David S. Miller14e50e52007-05-24 18:17:54 -07002779
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002780 rt->rt_genid = rt_genid(net);
David S. Miller14e50e52007-05-24 18:17:54 -07002781 rt->rt_flags = ort->rt_flags;
2782 rt->rt_type = ort->rt_type;
2783 rt->rt_dst = ort->rt_dst;
2784 rt->rt_src = ort->rt_src;
David S. Miller14e50e52007-05-24 18:17:54 -07002785 rt->rt_gateway = ort->rt_gateway;
2786 rt->rt_spec_dst = ort->rt_spec_dst;
2787 rt->peer = ort->peer;
2788 if (rt->peer)
2789 atomic_inc(&rt->peer->refcnt);
David S. Miller62fa8a82011-01-26 20:51:05 -08002790 rt->fi = ort->fi;
2791 if (rt->fi)
2792 atomic_inc(&rt->fi->fib_clntref);
David S. Miller14e50e52007-05-24 18:17:54 -07002793
2794 dst_free(new);
2795 }
2796
David S. Miller2774c132011-03-01 14:59:04 -08002797 dst_release(dst_orig);
2798
2799 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
David S. Miller14e50e52007-05-24 18:17:54 -07002800}
2801
David S. Miller9d6ec932011-03-12 01:12:47 -05002802struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
David S. Millerb23dd4f2011-03-02 14:31:35 -08002803 struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002804{
David S. Miller9d6ec932011-03-12 01:12:47 -05002805 struct rtable *rt = __ip_route_output_key(net, flp4);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002806
David S. Millerb23dd4f2011-03-02 14:31:35 -08002807 if (IS_ERR(rt))
2808 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002809
David S. Miller56157872011-05-02 14:37:45 -07002810 if (flp4->flowi4_proto)
David S. Miller9d6ec932011-03-12 01:12:47 -05002811 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2812 flowi4_to_flowi(flp4),
2813 sk, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002814
David S. Millerb23dd4f2011-03-02 14:31:35 -08002815 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002816}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002817EXPORT_SYMBOL_GPL(ip_route_output_flow);
2818
Benjamin Thery4feb88e2009-01-22 04:56:23 +00002819static int rt_fill_info(struct net *net,
2820 struct sk_buff *skb, u32 pid, u32 seq, int event,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002821 int nowait, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002822{
Eric Dumazet511c3f92009-06-02 05:14:27 +00002823 struct rtable *rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002824 struct rtmsg *r;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002825 struct nlmsghdr *nlh;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00002826 long expires = 0;
2827 const struct inet_peer *peer = rt->peer;
Thomas Grafe3703b32006-11-27 09:27:07 -08002828 u32 id = 0, ts = 0, tsage = 0, error;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002829
2830 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2831 if (nlh == NULL)
Patrick McHardy26932562007-01-31 23:16:40 -08002832 return -EMSGSIZE;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002833
2834 r = nlmsg_data(nlh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002835 r->rtm_family = AF_INET;
2836 r->rtm_dst_len = 32;
2837 r->rtm_src_len = 0;
David S. Miller475949d2011-05-03 19:45:15 -07002838 r->rtm_tos = rt->rt_key_tos;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002839 r->rtm_table = RT_TABLE_MAIN;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002840 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002841 r->rtm_type = rt->rt_type;
2842 r->rtm_scope = RT_SCOPE_UNIVERSE;
2843 r->rtm_protocol = RTPROT_UNSPEC;
2844 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2845 if (rt->rt_flags & RTCF_NOTIFY)
2846 r->rtm_flags |= RTM_F_NOTIFY;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002847
Al Viro17fb2c62006-09-26 22:15:25 -07002848 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002849
David S. Miller5e2b61f2011-03-04 21:47:09 -08002850 if (rt->rt_key_src) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002851 r->rtm_src_len = 32;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002852 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002853 }
Changli Gaod8d1f302010-06-10 23:31:35 -07002854 if (rt->dst.dev)
2855 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
Patrick McHardyc7066f72011-01-14 13:36:42 +01002856#ifdef CONFIG_IP_ROUTE_CLASSID
Changli Gaod8d1f302010-06-10 23:31:35 -07002857 if (rt->dst.tclassid)
2858 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002859#endif
David S. Millerc7537962010-11-11 17:07:48 -08002860 if (rt_is_input_route(rt))
Al Viro17fb2c62006-09-26 22:15:25 -07002861 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
David S. Miller5e2b61f2011-03-04 21:47:09 -08002862 else if (rt->rt_src != rt->rt_key_src)
Al Viro17fb2c62006-09-26 22:15:25 -07002863 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002864
Linus Torvalds1da177e2005-04-16 15:20:36 -07002865 if (rt->rt_dst != rt->rt_gateway)
Al Viro17fb2c62006-09-26 22:15:25 -07002866 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002867
David S. Millerdefb3512010-12-08 21:16:57 -08002868 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002869 goto nla_put_failure;
2870
David S. Miller5e2b61f2011-03-04 21:47:09 -08002871 if (rt->rt_mark)
2872 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
Eric Dumazet963bfee2010-07-20 22:03:14 +00002873
Changli Gaod8d1f302010-06-10 23:31:35 -07002874 error = rt->dst.error;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00002875 if (peer) {
Eric Dumazet317fe0e2010-06-16 04:52:13 +00002876 inet_peer_refcheck(rt->peer);
Eric Dumazetfe6fe792011-06-08 06:07:07 +00002877 id = atomic_read(&peer->ip_id_count) & 0xffff;
2878 if (peer->tcp_ts_stamp) {
2879 ts = peer->tcp_ts;
2880 tsage = get_seconds() - peer->tcp_ts_stamp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002881 }
Eric Dumazetfe6fe792011-06-08 06:07:07 +00002882 expires = ACCESS_ONCE(peer->pmtu_expires);
2883 if (expires)
2884 expires -= jiffies;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002885 }
Thomas Grafbe403ea2006-08-17 18:15:17 -07002886
David S. Millerc7537962010-11-11 17:07:48 -08002887 if (rt_is_input_route(rt)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002888#ifdef CONFIG_IP_MROUTE
Al Viroe4485152006-09-26 22:15:01 -07002889 __be32 dst = rt->rt_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002890
Joe Perchesf97c1e02007-12-16 13:45:43 -08002891 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
Benjamin Thery4feb88e2009-01-22 04:56:23 +00002892 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
David S. Miller9a1b9492011-05-04 12:18:54 -07002893 int err = ipmr_get_route(net, skb,
2894 rt->rt_src, rt->rt_dst,
2895 r, nowait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002896 if (err <= 0) {
2897 if (!nowait) {
2898 if (err == 0)
2899 return 0;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002900 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002901 } else {
2902 if (err == -EMSGSIZE)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002903 goto nla_put_failure;
Thomas Grafe3703b32006-11-27 09:27:07 -08002904 error = err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002905 }
2906 }
2907 } else
2908#endif
David S. Miller5e2b61f2011-03-04 21:47:09 -08002909 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002910 }
2911
Changli Gaod8d1f302010-06-10 23:31:35 -07002912 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
Thomas Grafe3703b32006-11-27 09:27:07 -08002913 expires, error) < 0)
2914 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002915
Thomas Grafbe403ea2006-08-17 18:15:17 -07002916 return nlmsg_end(skb, nlh);
2917
2918nla_put_failure:
Patrick McHardy26932562007-01-31 23:16:40 -08002919 nlmsg_cancel(skb, nlh);
2920 return -EMSGSIZE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002921}
2922
Thomas Graf63f34442007-03-22 11:55:17 -07002923static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002924{
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09002925 struct net *net = sock_net(in_skb->sk);
Thomas Grafd889ce32006-08-17 18:15:44 -07002926 struct rtmsg *rtm;
2927 struct nlattr *tb[RTA_MAX+1];
Linus Torvalds1da177e2005-04-16 15:20:36 -07002928 struct rtable *rt = NULL;
Al Viro9e12bb22006-09-26 21:25:20 -07002929 __be32 dst = 0;
2930 __be32 src = 0;
2931 u32 iif;
Thomas Grafd889ce32006-08-17 18:15:44 -07002932 int err;
Eric Dumazet963bfee2010-07-20 22:03:14 +00002933 int mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002934 struct sk_buff *skb;
2935
Thomas Grafd889ce32006-08-17 18:15:44 -07002936 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2937 if (err < 0)
2938 goto errout;
2939
2940 rtm = nlmsg_data(nlh);
2941
Linus Torvalds1da177e2005-04-16 15:20:36 -07002942 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
Thomas Grafd889ce32006-08-17 18:15:44 -07002943 if (skb == NULL) {
2944 err = -ENOBUFS;
2945 goto errout;
2946 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002947
2948 /* Reserve room for dummy headers, this skb can pass
2949 through good chunk of routing engine.
2950 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07002951 skb_reset_mac_header(skb);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07002952 skb_reset_network_header(skb);
Stephen Hemmingerd2c962b2006-04-17 17:27:11 -07002953
2954 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07002955 ip_hdr(skb)->protocol = IPPROTO_ICMP;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002956 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2957
Al Viro17fb2c62006-09-26 22:15:25 -07002958 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2959 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
Thomas Grafd889ce32006-08-17 18:15:44 -07002960 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
Eric Dumazet963bfee2010-07-20 22:03:14 +00002961 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002962
2963 if (iif) {
Thomas Grafd889ce32006-08-17 18:15:44 -07002964 struct net_device *dev;
2965
Denis V. Lunev19375042008-02-28 20:52:04 -08002966 dev = __dev_get_by_index(net, iif);
Thomas Grafd889ce32006-08-17 18:15:44 -07002967 if (dev == NULL) {
2968 err = -ENODEV;
2969 goto errout_free;
2970 }
2971
Linus Torvalds1da177e2005-04-16 15:20:36 -07002972 skb->protocol = htons(ETH_P_IP);
2973 skb->dev = dev;
Eric Dumazet963bfee2010-07-20 22:03:14 +00002974 skb->mark = mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002975 local_bh_disable();
2976 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2977 local_bh_enable();
Thomas Grafd889ce32006-08-17 18:15:44 -07002978
Eric Dumazet511c3f92009-06-02 05:14:27 +00002979 rt = skb_rtable(skb);
Changli Gaod8d1f302010-06-10 23:31:35 -07002980 if (err == 0 && rt->dst.error)
2981 err = -rt->dst.error;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002982 } else {
David S. Miller68a5e3d2011-03-11 20:07:33 -05002983 struct flowi4 fl4 = {
2984 .daddr = dst,
2985 .saddr = src,
2986 .flowi4_tos = rtm->rtm_tos,
2987 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2988 .flowi4_mark = mark,
Thomas Grafd889ce32006-08-17 18:15:44 -07002989 };
David S. Miller9d6ec932011-03-12 01:12:47 -05002990 rt = ip_route_output_key(net, &fl4);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002991
2992 err = 0;
2993 if (IS_ERR(rt))
2994 err = PTR_ERR(rt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002995 }
Thomas Grafd889ce32006-08-17 18:15:44 -07002996
Linus Torvalds1da177e2005-04-16 15:20:36 -07002997 if (err)
Thomas Grafd889ce32006-08-17 18:15:44 -07002998 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002999
Changli Gaod8d1f302010-06-10 23:31:35 -07003000 skb_dst_set(skb, &rt->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003001 if (rtm->rtm_flags & RTM_F_NOTIFY)
3002 rt->rt_flags |= RTCF_NOTIFY;
3003
Benjamin Thery4feb88e2009-01-22 04:56:23 +00003004 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
Denis V. Lunev19375042008-02-28 20:52:04 -08003005 RTM_NEWROUTE, 0, 0);
Thomas Grafd889ce32006-08-17 18:15:44 -07003006 if (err <= 0)
3007 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003008
Denis V. Lunev19375042008-02-28 20:52:04 -08003009 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
Thomas Grafd889ce32006-08-17 18:15:44 -07003010errout:
Thomas Graf2942e902006-08-15 00:30:25 -07003011 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003012
Thomas Grafd889ce32006-08-17 18:15:44 -07003013errout_free:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003014 kfree_skb(skb);
Thomas Grafd889ce32006-08-17 18:15:44 -07003015 goto errout;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003016}
3017
3018int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3019{
3020 struct rtable *rt;
3021 int h, s_h;
3022 int idx, s_idx;
Denis V. Lunev19375042008-02-28 20:52:04 -08003023 struct net *net;
3024
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09003025 net = sock_net(skb->sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003026
3027 s_h = cb->args[0];
Eric Dumazetd8c92832008-01-07 21:52:14 -08003028 if (s_h < 0)
3029 s_h = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003030 s_idx = idx = cb->args[1];
Eric Dumazeta6272662008-08-28 01:11:25 -07003031 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3032 if (!rt_hash_table[h].chain)
3033 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003034 rcu_read_lock_bh();
Paul E. McKenneya898def2010-02-22 17:04:49 -08003035 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
Changli Gaod8d1f302010-06-10 23:31:35 -07003036 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3037 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003038 continue;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07003039 if (rt_is_expired(rt))
Eric Dumazet29e75252008-01-31 17:05:09 -08003040 continue;
Changli Gaod8d1f302010-06-10 23:31:35 -07003041 skb_dst_set_noref(skb, &rt->dst);
Benjamin Thery4feb88e2009-01-22 04:56:23 +00003042 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003043 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07003044 1, NLM_F_MULTI) <= 0) {
Eric Dumazetadf30902009-06-02 05:19:30 +00003045 skb_dst_drop(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003046 rcu_read_unlock_bh();
3047 goto done;
3048 }
Eric Dumazetadf30902009-06-02 05:19:30 +00003049 skb_dst_drop(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003050 }
3051 rcu_read_unlock_bh();
3052 }
3053
3054done:
3055 cb->args[0] = h;
3056 cb->args[1] = idx;
3057 return skb->len;
3058}
3059
3060void ip_rt_multicast_event(struct in_device *in_dev)
3061{
Denis V. Lunev76e6ebf2008-07-05 19:00:44 -07003062 rt_cache_flush(dev_net(in_dev->dev), 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003063}
3064
3065#ifdef CONFIG_SYSCTL
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003066static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003067 void __user *buffer,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003068 size_t *lenp, loff_t *ppos)
3069{
3070 if (write) {
Denis V. Lunev639e1042008-07-05 19:02:06 -07003071 int flush_delay;
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003072 ctl_table ctl;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003073 struct net *net;
Denis V. Lunev639e1042008-07-05 19:02:06 -07003074
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003075 memcpy(&ctl, __ctl, sizeof(ctl));
3076 ctl.data = &flush_delay;
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003077 proc_dointvec(&ctl, write, buffer, lenp, ppos);
Denis V. Lunev639e1042008-07-05 19:02:06 -07003078
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003079 net = (struct net *)__ctl->extra1;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003080 rt_cache_flush(net, flush_delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003081 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003082 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003083
3084 return -EINVAL;
3085}
3086
Al Viroeeb61f72008-07-27 08:59:33 +01003087static ctl_table ipv4_route_table[] = {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003088 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003089 .procname = "gc_thresh",
3090 .data = &ipv4_dst_ops.gc_thresh,
3091 .maxlen = sizeof(int),
3092 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003093 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003094 },
3095 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003096 .procname = "max_size",
3097 .data = &ip_rt_max_size,
3098 .maxlen = sizeof(int),
3099 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003100 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003101 },
3102 {
3103 /* Deprecated. Use gc_min_interval_ms */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003104
Linus Torvalds1da177e2005-04-16 15:20:36 -07003105 .procname = "gc_min_interval",
3106 .data = &ip_rt_gc_min_interval,
3107 .maxlen = sizeof(int),
3108 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003109 .proc_handler = proc_dointvec_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003110 },
3111 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003112 .procname = "gc_min_interval_ms",
3113 .data = &ip_rt_gc_min_interval,
3114 .maxlen = sizeof(int),
3115 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003116 .proc_handler = proc_dointvec_ms_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003117 },
3118 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003119 .procname = "gc_timeout",
3120 .data = &ip_rt_gc_timeout,
3121 .maxlen = sizeof(int),
3122 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003123 .proc_handler = proc_dointvec_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003124 },
3125 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003126 .procname = "gc_interval",
3127 .data = &ip_rt_gc_interval,
3128 .maxlen = sizeof(int),
3129 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003130 .proc_handler = proc_dointvec_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003131 },
3132 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003133 .procname = "redirect_load",
3134 .data = &ip_rt_redirect_load,
3135 .maxlen = sizeof(int),
3136 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003137 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003138 },
3139 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003140 .procname = "redirect_number",
3141 .data = &ip_rt_redirect_number,
3142 .maxlen = sizeof(int),
3143 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003144 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003145 },
3146 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003147 .procname = "redirect_silence",
3148 .data = &ip_rt_redirect_silence,
3149 .maxlen = sizeof(int),
3150 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003151 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003152 },
3153 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003154 .procname = "error_cost",
3155 .data = &ip_rt_error_cost,
3156 .maxlen = sizeof(int),
3157 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003158 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003159 },
3160 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003161 .procname = "error_burst",
3162 .data = &ip_rt_error_burst,
3163 .maxlen = sizeof(int),
3164 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003165 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003166 },
3167 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003168 .procname = "gc_elasticity",
3169 .data = &ip_rt_gc_elasticity,
3170 .maxlen = sizeof(int),
3171 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003172 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003173 },
3174 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003175 .procname = "mtu_expires",
3176 .data = &ip_rt_mtu_expires,
3177 .maxlen = sizeof(int),
3178 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003179 .proc_handler = proc_dointvec_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003180 },
3181 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003182 .procname = "min_pmtu",
3183 .data = &ip_rt_min_pmtu,
3184 .maxlen = sizeof(int),
3185 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003186 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003187 },
3188 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003189 .procname = "min_adv_mss",
3190 .data = &ip_rt_min_advmss,
3191 .maxlen = sizeof(int),
3192 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003193 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003194 },
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003195 { }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003196};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003197
Al Viro2f4520d2008-08-25 15:17:44 -07003198static struct ctl_table empty[1];
3199
3200static struct ctl_table ipv4_skeleton[] =
3201{
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003202 { .procname = "route",
Hugh Dickinsd994af02008-08-27 02:35:18 -07003203 .mode = 0555, .child = ipv4_route_table},
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003204 { .procname = "neigh",
Hugh Dickinsd994af02008-08-27 02:35:18 -07003205 .mode = 0555, .child = empty},
Al Viro2f4520d2008-08-25 15:17:44 -07003206 { }
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003207};
3208
Al Viro2f4520d2008-08-25 15:17:44 -07003209static __net_initdata struct ctl_path ipv4_path[] = {
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003210 { .procname = "net", },
3211 { .procname = "ipv4", },
Al Viro2f4520d2008-08-25 15:17:44 -07003212 { },
3213};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003214
3215static struct ctl_table ipv4_route_flush_table[] = {
3216 {
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003217 .procname = "flush",
3218 .maxlen = sizeof(int),
3219 .mode = 0200,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003220 .proc_handler = ipv4_sysctl_rtcache_flush,
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003221 },
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003222 { },
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003223};
3224
Al Viro2f4520d2008-08-25 15:17:44 -07003225static __net_initdata struct ctl_path ipv4_route_path[] = {
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003226 { .procname = "net", },
3227 { .procname = "ipv4", },
3228 { .procname = "route", },
Al Viro2f4520d2008-08-25 15:17:44 -07003229 { },
3230};
3231
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003232static __net_init int sysctl_route_net_init(struct net *net)
3233{
3234 struct ctl_table *tbl;
3235
3236 tbl = ipv4_route_flush_table;
Octavian Purdila09ad9bc2009-11-25 15:14:13 -08003237 if (!net_eq(net, &init_net)) {
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003238 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3239 if (tbl == NULL)
3240 goto err_dup;
3241 }
3242 tbl[0].extra1 = net;
3243
3244 net->ipv4.route_hdr =
3245 register_net_sysctl_table(net, ipv4_route_path, tbl);
3246 if (net->ipv4.route_hdr == NULL)
3247 goto err_reg;
3248 return 0;
3249
3250err_reg:
3251 if (tbl != ipv4_route_flush_table)
3252 kfree(tbl);
3253err_dup:
3254 return -ENOMEM;
3255}
3256
3257static __net_exit void sysctl_route_net_exit(struct net *net)
3258{
3259 struct ctl_table *tbl;
3260
3261 tbl = net->ipv4.route_hdr->ctl_table_arg;
3262 unregister_net_sysctl_table(net->ipv4.route_hdr);
3263 BUG_ON(tbl == ipv4_route_flush_table);
3264 kfree(tbl);
3265}
3266
3267static __net_initdata struct pernet_operations sysctl_route_ops = {
3268 .init = sysctl_route_net_init,
3269 .exit = sysctl_route_net_exit,
3270};
Linus Torvalds1da177e2005-04-16 15:20:36 -07003271#endif
3272
Neil Horman3ee94372010-05-08 01:57:52 -07003273static __net_init int rt_genid_init(struct net *net)
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003274{
Neil Horman3ee94372010-05-08 01:57:52 -07003275 get_random_bytes(&net->ipv4.rt_genid,
3276 sizeof(net->ipv4.rt_genid));
David S. Miller436c3b62011-03-24 17:42:21 -07003277 get_random_bytes(&net->ipv4.dev_addr_genid,
3278 sizeof(net->ipv4.dev_addr_genid));
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003279 return 0;
3280}
3281
Neil Horman3ee94372010-05-08 01:57:52 -07003282static __net_initdata struct pernet_operations rt_genid_ops = {
3283 .init = rt_genid_init,
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003284};
3285
3286
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu accounting area; ip_rt_init() allocates 256 entries per cpu. */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003290
3291static __initdata unsigned long rhash_entries;
3292static int __init set_rhash_entries(char *str)
3293{
3294 if (!str)
3295 return 0;
3296 rhash_entries = simple_strtoul(str, &str, 0);
3297 return 1;
3298}
3299__setup("rhash_entries=", set_rhash_entries);
3300
3301int __init ip_rt_init(void)
3302{
Eric Dumazet424c4b72005-07-05 14:58:19 -07003303 int rc = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003304
Patrick McHardyc7066f72011-01-14 13:36:42 +01003305#ifdef CONFIG_IP_ROUTE_CLASSID
Ingo Molnar0dcec8c2009-02-25 14:07:33 +01003306 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003307 if (!ip_rt_acct)
3308 panic("IP: failed to allocate ip_rt_acct\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003309#endif
3310
Alexey Dobriyane5d679f332006-08-26 19:25:52 -07003311 ipv4_dst_ops.kmem_cachep =
3312 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
Paul Mundt20c2df82007-07-20 10:11:58 +09003313 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003314
David S. Miller14e50e52007-05-24 18:17:54 -07003315 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3316
Eric Dumazetfc66f952010-10-08 06:37:34 +00003317 if (dst_entries_init(&ipv4_dst_ops) < 0)
3318 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3319
3320 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3321 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3322
Eric Dumazet424c4b72005-07-05 14:58:19 -07003323 rt_hash_table = (struct rt_hash_bucket *)
3324 alloc_large_system_hash("IP route cache",
3325 sizeof(struct rt_hash_bucket),
3326 rhash_entries,
Jan Beulich44813742009-09-21 17:03:05 -07003327 (totalram_pages >= 128 * 1024) ?
Mike Stroyan18955cf2005-11-29 16:12:55 -08003328 15 : 17,
Kirill Korotaev8d1502d2006-08-07 20:44:22 -07003329 0,
Eric Dumazet424c4b72005-07-05 14:58:19 -07003330 &rt_hash_log,
3331 &rt_hash_mask,
Anton Blanchardc9503e02009-04-27 05:42:24 -07003332 rhash_entries ? 0 : 512 * 1024);
Eric Dumazet22c047c2005-07-05 14:55:24 -07003333 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3334 rt_hash_lock_init();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003335
3336 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3337 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3338
Linus Torvalds1da177e2005-04-16 15:20:36 -07003339 devinet_init();
3340 ip_fib_init();
3341
Denis V. Lunev73b38712008-02-28 20:51:18 -08003342 if (ip_rt_proc_init())
Pavel Emelyanov107f1632007-12-05 21:14:28 -08003343 printk(KERN_ERR "Unable to create route proc files\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003344#ifdef CONFIG_XFRM
3345 xfrm_init();
Neil Hormana33bc5c2009-07-30 18:52:15 -07003346 xfrm4_init(ip_rt_max_size);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003347#endif
Greg Rosec7ac8672011-06-10 01:27:09 +00003348 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
Thomas Graf63f34442007-03-22 11:55:17 -07003349
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003350#ifdef CONFIG_SYSCTL
3351 register_pernet_subsys(&sysctl_route_ops);
3352#endif
Neil Horman3ee94372010-05-08 01:57:52 -07003353 register_pernet_subsys(&rt_genid_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003354 return rc;
3355}
3356
#ifdef CONFIG_SYSCTL
/*
 * Register the ipv4_skeleton sysctl table under ipv4_path.  The value
 * returned by register_sysctl_paths() is not kept.
 *
 * This separate hook exists only because of the current ipv4 init
 * ordering; once that ordering is sanitized it can go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif /* CONFIG_SYSCTL */