blob: 980030d4e4ae9161b3a1a9edd971e57e341e0338 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
Jesper Juhl02c30a82005-05-05 16:16:16 -07008 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -07009 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090021 * Alan Cox : Super /proc >4K
Linus Torvalds1da177e2005-04-16 15:20:36 -070022 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090039 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070055 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080056 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070058 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
Linus Torvalds1da177e2005-04-16 15:20:36 -070065#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070071#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070072#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070073#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
Eric Dumazet39c90ec2007-09-15 10:55:54 -070082#include <linux/workqueue.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070083#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070084#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090093#include <linux/slab.h>
Herbert Xu352e5122007-11-13 21:34:06 -080094#include <net/dst.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020095#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070096#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700106#include <net/netevent.h>
Thomas Graf63f34442007-03-22 11:55:17 -0700107#include <net/rtnetlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700108#ifdef CONFIG_SYSCTL
109#include <linux/sysctl.h>
110#endif
111
112#define RT_FL_TOS(oldflp) \
113 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114
115#define IP_MAX_MTU 0xFFF0
116
117#define RT_GC_TIMEOUT (300*HZ)
118
Linus Torvalds1da177e2005-04-16 15:20:36 -0700119static int ip_rt_max_size;
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700120static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
121static int ip_rt_gc_interval __read_mostly = 60 * HZ;
122static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
123static int ip_rt_redirect_number __read_mostly = 9;
124static int ip_rt_redirect_load __read_mostly = HZ / 50;
125static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126static int ip_rt_error_cost __read_mostly = HZ;
127static int ip_rt_error_burst __read_mostly = 5 * HZ;
128static int ip_rt_gc_elasticity __read_mostly = 8;
129static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
130static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131static int ip_rt_min_advmss __read_mostly = 256;
Neil Horman1080d702008-10-27 12:28:25 -0700132static int rt_chain_length_max __read_mostly = 20;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700133
Eric Dumazet125bb8f2009-06-11 20:10:07 +0000134static struct delayed_work expires_work;
135static unsigned long expires_ljiffies;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700136
137/*
138 * Interface to generic destination cache.
139 */
140
141static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
David S. Miller0dbaee32010-12-13 12:52:14 -0800142static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
David S. Millerd33e4552010-12-14 13:01:14 -0800143static unsigned int ipv4_default_mtu(const struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700144static void ipv4_dst_destroy(struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700145static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146static void ipv4_link_failure(struct sk_buff *skb);
147static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
Daniel Lezcano569d3642008-01-18 03:56:57 -0800148static int rt_garbage_collect(struct dst_ops *ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700149
/* dst_ops->ifdown hook: nothing to do for IPv4 routes when a device goes
 * away; the generic dst layer handles reference counting, so this hook is
 * intentionally empty. */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700154
/*
 * Copy-on-write the (possibly shared, read-only) metrics array of @dst.
 *
 * Allocates a private RTAX_MAX-sized copy of the array at @old and installs
 * it with cmpxchg() so concurrent writers race safely.  Returns a pointer to
 * a writable metrics array, or NULL if allocation failed or another writer
 * already installed a read-only array.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);

	if (p) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			/* Lost the race: drop our copy and use whatever the
			 * winner installed, unless it is marked read-only. */
			kfree(p);
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			/* We installed a private copy; release the fib_info
			 * that backed the previously shared metrics. */
			struct rtable *rt = (struct rtable *) dst;

			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}
184
/* Generic destination-cache operations for IPv4 routes. */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.default_mtu =		ipv4_default_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
};
200
201#define ECN_OR_COST(class) TC_PRIO_##class
202
/* Map the upper 4 TOS bits of an IP header to a packet scheduler priority
 * band; odd entries handle the ECN/cost variant of the preceding class. */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
221
222
223/*
224 * Route cache.
225 */
226
227/* The locking scheme is rather straight forward:
228 *
229 * 1) Read-Copy Update protects the buckets of the central route hash.
230 * 2) Only writers remove entries, and they hold the lock
231 * as they look at rtable reference counts.
232 * 3) Only readers acquire references to rtable entries,
233 * they do so with atomic increments and with the
234 * lock held.
235 */
236
237struct rt_hash_bucket {
Eric Dumazet1c317202010-10-25 21:02:07 +0000238 struct rtable __rcu *chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700239};
Neil Horman1080d702008-10-27 12:28:25 -0700240
Ingo Molnar8a25d5d2006-07-03 00:24:54 -0700241#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
242 defined(CONFIG_PROVE_LOCKING)
Eric Dumazet22c047c2005-07-05 14:55:24 -0700243/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
Ingo Molnar62051202006-07-03 00:24:59 -0700246 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
Eric Dumazet22c047c2005-07-05 14:55:24 -0700247 */
Ingo Molnar62051202006-07-03 00:24:59 -0700248#ifdef CONFIG_LOCKDEP
249# define RT_HASH_LOCK_SZ 256
Eric Dumazet22c047c2005-07-05 14:55:24 -0700250#else
Ingo Molnar62051202006-07-03 00:24:59 -0700251# if NR_CPUS >= 32
252# define RT_HASH_LOCK_SZ 4096
253# elif NR_CPUS >= 16
254# define RT_HASH_LOCK_SZ 2048
255# elif NR_CPUS >= 8
256# define RT_HASH_LOCK_SZ 1024
257# elif NR_CPUS >= 4
258# define RT_HASH_LOCK_SZ 512
259# else
260# define RT_HASH_LOCK_SZ 256
261# endif
Eric Dumazet22c047c2005-07-05 14:55:24 -0700262#endif
263
264static spinlock_t *rt_hash_locks;
265# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
Pavel Emelyanov1ff1cc22007-12-05 21:15:05 -0800266
267static __init void rt_hash_lock_init(void)
268{
269 int i;
270
271 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
272 GFP_KERNEL);
273 if (!rt_hash_locks)
274 panic("IP: failed to allocate rt_hash_locks\n");
275
276 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
277 spin_lock_init(&rt_hash_locks[i]);
278}
Eric Dumazet22c047c2005-07-05 14:55:24 -0700279#else
280# define rt_hash_lock_addr(slot) NULL
Pavel Emelyanov1ff1cc22007-12-05 21:15:05 -0800281
282static inline void rt_hash_lock_init(void)
283{
284}
Eric Dumazet22c047c2005-07-05 14:55:24 -0700285#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700286
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700287static struct rt_hash_bucket *rt_hash_table __read_mostly;
288static unsigned rt_hash_mask __read_mostly;
289static unsigned int rt_hash_log __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700290
Eric Dumazet2f970d82006-01-17 02:54:36 -0800291static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
Eric Dumazet27f39c73e2010-05-19 22:07:23 +0000292#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700293
Denis V. Lunevb00180d2008-07-05 19:04:09 -0700294static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700295 int genid)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700296{
Eric Dumazet0eae88f2010-04-20 19:06:52 -0700297 return jhash_3words((__force u32)daddr, (__force u32)saddr,
Denis V. Lunevb00180d2008-07-05 19:04:09 -0700298 idx, genid)
Eric Dumazet29e75252008-01-31 17:05:09 -0800299 & rt_hash_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700300}
301
/* Current generation id of @net's route cache; entries carrying an older
 * generation are treated as stale. */
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
306
Linus Torvalds1da177e2005-04-16 15:20:36 -0700307#ifdef CONFIG_PROC_FS
308struct rt_cache_iter_state {
Denis V. Luneva75e9362008-02-28 20:50:55 -0800309 struct seq_net_private p;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700310 int bucket;
Eric Dumazet29e75252008-01-31 17:05:09 -0800311 int genid;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700312};
313
/*
 * /proc/net/rt_cache iteration: find the first live entry, scanning hash
 * buckets from the highest index down.  On success returns with
 * rcu_read_lock_bh() held (released by rt_cache_seq_stop() or when the
 * walk advances past the last entry).
 */
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		/* Cheap unlocked peek to skip empty buckets. */
		if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			/* Report only entries of this netns and generation. */
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}
334
/*
 * Advance to the next cache entry, crossing into lower buckets as chains
 * are exhausted.  Drops and re-takes the RCU-bh lock between buckets;
 * returns NULL (with the lock released) when the walk is finished.
 * Does no netns/generation filtering — callers do that.
 */
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}
352
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900353static struct rtable *rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800354 struct rtable *r)
355{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900356 struct rt_cache_iter_state *st = seq->private;
357 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
Changli Gaod8d1f302010-06-10 23:31:35 -0700358 if (dev_net(r->dst.dev) != seq_file_net(seq))
Denis V. Luneva75e9362008-02-28 20:50:55 -0800359 continue;
Denis V. Lunev642d6312008-02-28 20:50:33 -0800360 if (r->rt_genid == st->genid)
361 break;
362 }
363 return r;
364}
365
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900366static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700367{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900368 struct rtable *r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700369
370 if (r)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900371 while (pos && (r = rt_cache_get_next(seq, r)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700372 --pos;
373 return pos ? NULL : r;
374}
375
376static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
377{
Eric Dumazet29e75252008-01-31 17:05:09 -0800378 struct rt_cache_iter_state *st = seq->private;
Eric Dumazet29e75252008-01-31 17:05:09 -0800379 if (*pos)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900380 return rt_cache_get_idx(seq, *pos - 1);
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700381 st->genid = rt_genid(seq_file_net(seq));
Eric Dumazet29e75252008-01-31 17:05:09 -0800382 return SEQ_START_TOKEN;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700383}
384
385static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
386{
Eric Dumazet29e75252008-01-31 17:05:09 -0800387 struct rtable *r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700388
389 if (v == SEQ_START_TOKEN)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900390 r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700391 else
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900392 r = rt_cache_get_next(seq, v);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700393 ++*pos;
394 return r;
395}
396
/* seq_file ->stop: drop the RCU-bh lock taken by rt_cache_get_first(),
 * unless the walk never reached a real entry. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
402
/*
 * seq_file ->show: emit one /proc/net/rt_cache line — the column header for
 * the start token, otherwise one route entry.  Each record is padded to a
 * fixed 127-character width (the trailing seq_printf) so userspace can seek
 * by record.
 */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;	/* characters written so far, via %n */

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			/* advmss + 40 approximates the MTU column */
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
			r->dst.hh ? (r->dst.hh->hh_output ==
				     dev_queue_xmit) : 0,
			r->rt_spec_dst, &len);

		/* Pad the record out to exactly 127 columns. */
		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
435
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700436static const struct seq_operations rt_cache_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700437 .start = rt_cache_seq_start,
438 .next = rt_cache_seq_next,
439 .stop = rt_cache_seq_stop,
440 .show = rt_cache_seq_show,
441};
442
/* Open /proc/net/rt_cache with per-netns iterator state. */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}
448
Arjan van de Ven9a321442007-02-12 00:55:35 -0800449static const struct file_operations rt_cache_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700450 .owner = THIS_MODULE,
451 .open = rt_cache_seq_open,
452 .read = seq_read,
453 .llseek = seq_lseek,
Denis V. Luneva75e9362008-02-28 20:50:55 -0800454 .release = seq_release_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700455};
456
457
458static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
459{
460 int cpu;
461
462 if (*pos == 0)
463 return SEQ_START_TOKEN;
464
Rusty Russell0f23174a2008-12-29 12:23:42 +0000465 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700466 if (!cpu_possible(cpu))
467 continue;
468 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800469 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700470 }
471 return NULL;
472}
473
474static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
475{
476 int cpu;
477
Rusty Russell0f23174a2008-12-29 12:23:42 +0000478 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700479 if (!cpu_possible(cpu))
480 continue;
481 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800482 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700483 }
484 return NULL;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900485
Linus Torvalds1da177e2005-04-16 15:20:36 -0700486}
487
/* seq_file ->stop: nothing to release — the per-CPU walk takes no locks. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
492
/*
 * seq_file ->show for /proc/net/stat/rt_cache: one header line for the
 * start token, then one line of hex counters per possible CPU.  The entry
 * count (first column) is global, not per-CPU.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
526
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700527static const struct seq_operations rt_cpu_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700528 .start = rt_cpu_seq_start,
529 .next = rt_cpu_seq_next,
530 .stop = rt_cpu_seq_stop,
531 .show = rt_cpu_seq_show,
532};
533
534
/* Open /proc/net/stat/rt_cache (per-CPU statistics; no private state). */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}
539
Arjan van de Ven9a321442007-02-12 00:55:35 -0800540static const struct file_operations rt_cpu_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700541 .owner = THIS_MODULE,
542 .open = rt_cpu_seq_open,
543 .read = seq_read,
544 .llseek = seq_lseek,
545 .release = seq_release,
546};
547
Patrick McHardyc7066f72011-01-14 13:36:42 +0100548#ifdef CONFIG_IP_ROUTE_CLASSID
Alexey Dobriyana661c412009-11-25 15:40:35 -0800549static int rt_acct_proc_show(struct seq_file *m, void *v)
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800550{
Alexey Dobriyana661c412009-11-25 15:40:35 -0800551 struct ip_rt_acct *dst, *src;
552 unsigned int i, j;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800553
Alexey Dobriyana661c412009-11-25 15:40:35 -0800554 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
555 if (!dst)
556 return -ENOMEM;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800557
Alexey Dobriyana661c412009-11-25 15:40:35 -0800558 for_each_possible_cpu(i) {
559 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
560 for (j = 0; j < 256; j++) {
561 dst[j].o_bytes += src[j].o_bytes;
562 dst[j].o_packets += src[j].o_packets;
563 dst[j].i_bytes += src[j].i_bytes;
564 dst[j].i_packets += src[j].i_packets;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800565 }
566 }
Alexey Dobriyana661c412009-11-25 15:40:35 -0800567
568 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
569 kfree(dst);
570 return 0;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800571}
Alexey Dobriyana661c412009-11-25 15:40:35 -0800572
/* Open /proc/net/rt_acct; the whole snapshot is produced in one show call. */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}
577
578static const struct file_operations rt_acct_proc_fops = {
579 .owner = THIS_MODULE,
580 .open = rt_acct_proc_open,
581 .read = seq_read,
582 .llseek = seq_lseek,
583 .release = single_release,
584};
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800585#endif
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800586
/*
 * Per-netns /proc setup: create /proc/net/rt_cache, /proc/net/stat/rt_cache
 * and (with CONFIG_IP_ROUTE_CLASSID) /proc/net/rt_acct.  Uses goto-based
 * unwinding so a failure removes exactly the entries created so far.
 * Returns 0 on success, -ENOMEM on any allocation failure.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
Denis V. Lunev73b38712008-02-28 20:51:18 -0800617
/* Per-netns teardown: remove everything ip_rt_do_proc_init() created. */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
626
627static struct pernet_operations ip_rt_proc_ops __net_initdata = {
628 .init = ip_rt_do_proc_init,
629 .exit = ip_rt_do_proc_exit,
630};
631
/* Register the per-netns /proc init/exit hooks at boot. */
static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
636
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800637#else
/* CONFIG_PROC_FS disabled: nothing to register. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700642#endif /* CONFIG_PROC_FS */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900643
/* Free @rt after the current RCU-bh grace period has elapsed. */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
648
/* Release our reference to @rt and schedule it for RCU-bh freeing. */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
654
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}
662
/* An entry is worth keeping if it was created by redirect/notify, or if it
 * carries an expiry time (e.g. a learned PMTU). */
static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->dst.expires;
}
668
669static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
670{
671 unsigned long age;
672 int ret = 0;
673
Changli Gaod8d1f302010-06-10 23:31:35 -0700674 if (atomic_read(&rth->dst.__refcnt))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700675 goto out;
676
677 ret = 1;
Changli Gaod8d1f302010-06-10 23:31:35 -0700678 if (rth->dst.expires &&
679 time_after_eq(jiffies, rth->dst.expires))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700680 goto out;
681
Changli Gaod8d1f302010-06-10 23:31:35 -0700682 age = jiffies - rth->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700683 ret = 0;
684 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
685 (age <= tmo2 && rt_valuable(rth)))
686 goto out;
687 ret = 1;
688out: return ret;
689}
690
691/* Bits of score are:
692 * 31: very valuable
693 * 30: not quite useless
694 * 29..0: usage counter
695 */
static inline u32 rt_score(struct rtable *rt)
{
	/* Base score is the inverted age (recently used => higher), with the
	 * top two bits masked off and reserved for the flags below. */
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);	/* bit 31: very valuable */

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);	/* bit 30: not quite useless */

	return score;
}
711
/* True while route caching is still enabled for @net: caching is turned off
 * once the cache has been rebuilt more times than the sysctl limit allows. */
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}
717
718static inline bool compare_hash_inputs(const struct flowi *fl1,
719 const struct flowi *fl2)
720{
Changli Gao58116622010-11-12 18:43:55 +0000721 return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
722 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
Neil Horman1080d702008-10-27 12:28:25 -0700723 (fl1->iif ^ fl2->iif)) == 0);
724}
725
/*
 * Full flow-key comparison used on cache lookup; branchless XOR/OR form.
 * NOTE(review): the u16 load at &fl4_tos compares the TOS byte together
 * with the byte that follows it in struct flowi — presumably intentional
 * to fold a neighbouring field into one compare; confirm against the
 * struct flowi layout before touching this.
 */
static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
		((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}
735
Denis V. Lunevb5921912008-01-22 23:50:25 -0800736static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
737{
Changli Gaod8d1f302010-06-10 23:31:35 -0700738 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
Denis V. Lunevb5921912008-01-22 23:50:25 -0800739}
740
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700741static inline int rt_is_expired(struct rtable *rth)
742{
Changli Gaod8d1f302010-06-10 23:31:35 -0700743 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700744}
745
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800746/*
747 * Perform a full scan of hash table and free all entries.
748 * Can be called by a softirq or a process.
 * In the latter case, we want to be rescheduled if necessary
750 */
/*
 * Flush the whole hash table, freeing every entry (or, when @net is
 * non-NULL, only the entries belonging to that namespace).
 *
 * @net:             namespace filter, or NULL for "all namespaces"
 * @process_context: nonzero when called from process context, in which
 *                   case we may cond_resched() between buckets
 *
 * Per bucket: under the bucket spinlock the matching entries are
 * unlinked onto a private singly-linked list; the actual rt_free()
 * calls happen after the lock is dropped, keeping lock hold times
 * short.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		/* Cheap unlocked peek: skip empty buckets entirely. */
		rth = rcu_dereference_raw(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				/* Unlink from the bucket and push onto
				 * the private free list; lockfree
				 * readers may still be traversing, so
				 * use rcu_assign_pointer for both
				 * updates.
				 */
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				/* Entry stays: advance the link cursor. */
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		/* Free the unlinked entries outside the bucket lock. */
		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
796
Neil Horman1080d702008-10-27 12:28:25 -0700797/*
798 * While freeing expired entries, we compute average chain length
799 * and standard deviation, using fixed-point arithmetic.
800 * This to have an estimation of rt_chain_length_max
801 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
803 */
804
805#define FRACT_BITS 3
806#define ONE (1UL << FRACT_BITS)
807
Eric Dumazet98376382010-03-08 03:20:00 +0000808/*
809 * Given a hash chain and an item in this hash chain,
810 * find if a previous entry has the same hash_inputs
811 * (but differs on tos, mark or oif)
812 * Returns 0 if an alias is found.
813 * Returns ONE if rth has no alias before itself.
814 */
815static int has_noalias(const struct rtable *head, const struct rtable *rth)
816{
817 const struct rtable *aux = head;
818
819 while (aux != rth) {
820 if (compare_hash_inputs(&aux->fl, &rth->fl))
821 return 0;
Eric Dumazet1c317202010-10-25 21:02:07 +0000822 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +0000823 }
824 return ONE;
825}
826
/*
 * Periodic cache ager.  Scans a slice of the hash table sized by the
 * time elapsed since the previous run (so the whole table is covered
 * roughly once per ip_rt_gc_timeout), frees expired/aged entries, and
 * recomputes rt_chain_length_max from the observed chain-length
 * average and standard deviation (fixed-point, FRACT_BITS).
 * Called from the rt_worker_func() workqueue.
 */
static void rt_check_expire(void)
{
	static unsigned int rover;	/* resume point across invocations */
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;	/* sum and sum of squares of lengths */
	unsigned long delta;
	u64 mult;

	/* goal = buckets to scan = elapsed_time * table_size / gc_timeout */
	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		/* Unlocked peek: empty bucket, nothing to do. */
		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				/* Stale generation: unlink and free. */
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					/* Keep the entry; halve the aging
					 * budget so deep chains age faster.
					 */
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		/* rt_chain_length_max = max(elasticity, avg + 4*sd) */
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					ip_rt_gc_elasticity,
					(avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
907
908/*
909 * rt_worker_func() is run in process context.
Eric Dumazet29e75252008-01-31 17:05:09 -0800910 * we call rt_check_expire() to scan part of the hash table
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800911 */
912static void rt_worker_func(struct work_struct *work)
913{
Eric Dumazet29e75252008-01-31 17:05:09 -0800914 rt_check_expire();
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700915 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700916}
917
Eric Dumazet29e75252008-01-31 17:05:09 -0800918/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700923 */
Denis V. Lunev86c657f2008-07-05 19:03:31 -0700924static void rt_cache_invalidate(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700925{
Eric Dumazet29e75252008-01-31 17:05:09 -0800926 unsigned char shuffle;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700927
Eric Dumazet29e75252008-01-31 17:05:09 -0800928 get_random_bytes(&shuffle, sizeof(shuffle));
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700929 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700930}
931
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800932/*
Eric Dumazet29e75252008-01-31 17:05:09 -0800933 * delay < 0 : invalidate cache (fast : entries will be deleted later)
934 * delay >= 0 : invalidate & flush cache (can be long)
935 */
/*
 * Flush the routing cache for @net.
 * delay < 0  : invalidate only (entries are reaped lazily — fast)
 * delay >= 0 : invalidate and walk the whole table freeing entries
 *              (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay < 0)
		return;
	rt_do_flush(net, !in_softirq());
}
942
Eric W. Biedermana5ee1552009-11-29 15:45:58 +0000943/* Flush previous cache invalidated entries from the cache */
/* Flush previously-invalidated entries for @net out of the cache. */
void rt_cache_flush_batch(struct net *net)
{
	int process_context = !in_softirq();

	rt_do_flush(net, process_context);
}
948
Neil Horman1080d702008-10-27 12:28:25 -0700949static void rt_emergency_hash_rebuild(struct net *net)
950{
Neil Horman3ee94372010-05-08 01:57:52 -0700951 if (net_ratelimit())
Neil Horman1080d702008-10-27 12:28:25 -0700952 printk(KERN_WARNING "Route hash chain too long!\n");
Neil Horman3ee94372010-05-08 01:57:52 -0700953 rt_cache_invalidate(net);
Neil Horman1080d702008-10-27 12:28:25 -0700954}
955
Linus Torvalds1da177e2005-04-16 15:20:36 -0700956/*
957 Short description of GC goals.
958
959 We want to build algorithm, which will keep routing cache
960 at some equilibrium point, when number of aged off entries
961 is kept approximately equal to newly generated ones.
962
963 Current expiration strength is variable "expire".
964 We try to adjust it dynamically, so that if networking
965 is idle expires is large enough to keep enough of warm entries,
966 and when load increases it reduces to limit cache size.
967 */
968
Daniel Lezcano569d3642008-01-18 03:56:57 -0800969static int rt_garbage_collect(struct dst_ops *ops)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700970{
971 static unsigned long expire = RT_GC_TIMEOUT;
972 static unsigned long last_gc;
973 static int rover;
974 static int equilibrium;
Eric Dumazet1c317202010-10-25 21:02:07 +0000975 struct rtable *rth;
976 struct rtable __rcu **rthp;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700977 unsigned long now = jiffies;
978 int goal;
Eric Dumazetfc66f952010-10-08 06:37:34 +0000979 int entries = dst_entries_get_fast(&ipv4_dst_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700980
981 /*
982 * Garbage collection is pretty expensive,
983 * do not make it too frequently.
984 */
985
986 RT_CACHE_STAT_INC(gc_total);
987
988 if (now - last_gc < ip_rt_gc_min_interval &&
Eric Dumazetfc66f952010-10-08 06:37:34 +0000989 entries < ip_rt_max_size) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700990 RT_CACHE_STAT_INC(gc_ignored);
991 goto out;
992 }
993
Eric Dumazetfc66f952010-10-08 06:37:34 +0000994 entries = dst_entries_get_slow(&ipv4_dst_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700995 /* Calculate number of entries, which we want to expire now. */
Eric Dumazetfc66f952010-10-08 06:37:34 +0000996 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700997 if (goal <= 0) {
998 if (equilibrium < ipv4_dst_ops.gc_thresh)
999 equilibrium = ipv4_dst_ops.gc_thresh;
Eric Dumazetfc66f952010-10-08 06:37:34 +00001000 goal = entries - equilibrium;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001001 if (goal > 0) {
Eric Dumazetb790ced2007-12-21 01:49:07 -08001002 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
Eric Dumazetfc66f952010-10-08 06:37:34 +00001003 goal = entries - equilibrium;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001004 }
1005 } else {
1006 /* We are in dangerous area. Try to reduce cache really
1007 * aggressively.
1008 */
Eric Dumazetb790ced2007-12-21 01:49:07 -08001009 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
Eric Dumazetfc66f952010-10-08 06:37:34 +00001010 equilibrium = entries - goal;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001011 }
1012
1013 if (now - last_gc >= ip_rt_gc_min_interval)
1014 last_gc = now;
1015
1016 if (goal <= 0) {
1017 equilibrium += goal;
1018 goto work_done;
1019 }
1020
1021 do {
1022 int i, k;
1023
1024 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1025 unsigned long tmo = expire;
1026
1027 k = (k + 1) & rt_hash_mask;
1028 rthp = &rt_hash_table[k].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -07001029 spin_lock_bh(rt_hash_lock_addr(k));
Eric Dumazet1c317202010-10-25 21:02:07 +00001030 while ((rth = rcu_dereference_protected(*rthp,
1031 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001032 if (!rt_is_expired(rth) &&
Eric Dumazet29e75252008-01-31 17:05:09 -08001033 !rt_may_expire(rth, tmo, expire)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001034 tmo >>= 1;
Changli Gaod8d1f302010-06-10 23:31:35 -07001035 rthp = &rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001036 continue;
1037 }
Changli Gaod8d1f302010-06-10 23:31:35 -07001038 *rthp = rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001039 rt_free(rth);
1040 goal--;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001041 }
Eric Dumazet22c047c2005-07-05 14:55:24 -07001042 spin_unlock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001043 if (goal <= 0)
1044 break;
1045 }
1046 rover = k;
1047
1048 if (goal <= 0)
1049 goto work_done;
1050
1051 /* Goal is not achieved. We stop process if:
1052
1053 - if expire reduced to zero. Otherwise, expire is halfed.
1054 - if table is not full.
1055 - if we are called from interrupt.
1056 - jiffies check is just fallback/debug loop breaker.
1057 We will not spin here for long time in any case.
1058 */
1059
1060 RT_CACHE_STAT_INC(gc_goal_miss);
1061
1062 if (expire == 0)
1063 break;
1064
1065 expire >>= 1;
1066#if RT_CACHE_DEBUG >= 2
1067 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
Eric Dumazetfc66f952010-10-08 06:37:34 +00001068 dst_entries_get_fast(&ipv4_dst_ops), goal, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001069#endif
1070
Eric Dumazetfc66f952010-10-08 06:37:34 +00001071 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001072 goto out;
1073 } while (!in_softirq() && time_before_eq(jiffies, now));
1074
Eric Dumazetfc66f952010-10-08 06:37:34 +00001075 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1076 goto out;
1077 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001078 goto out;
1079 if (net_ratelimit())
1080 printk(KERN_WARNING "dst cache overflow\n");
1081 RT_CACHE_STAT_INC(gc_dst_overflow);
1082 return 1;
1083
1084work_done:
1085 expire += ip_rt_gc_min_interval;
1086 if (expire > ip_rt_gc_timeout ||
Eric Dumazetfc66f952010-10-08 06:37:34 +00001087 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1088 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001089 expire = ip_rt_gc_timeout;
1090#if RT_CACHE_DEBUG >= 2
1091 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
Eric Dumazetfc66f952010-10-08 06:37:34 +00001092 dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001093#endif
1094out: return 0;
1095}
1096
Eric Dumazet98376382010-03-08 03:20:00 +00001097/*
1098 * Returns number of entries in a hash chain that have different hash_inputs
1099 */
1100static int slow_chain_length(const struct rtable *head)
1101{
1102 int length = 0;
1103 const struct rtable *rth = head;
1104
1105 while (rth) {
1106 length += has_noalias(head, rth);
Eric Dumazet1c317202010-10-25 21:02:07 +00001107 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +00001108 }
1109 return length >> FRACT_BITS;
1110}
1111
/*
 * Insert @rt into hash bucket @hash, handing the resulting (possibly
 * pre-existing) entry back through *@rp or by attaching it to @skb.
 *
 * Paths through this function:
 *  - caching disabled:     mark DST_NOCACHE, bind ARP neighbour for
 *                          output/unicast routes, skip the table;
 *  - duplicate key found:  move the existing entry to the chain head,
 *                          drop @rt, return the existing entry;
 *  - chain over limit:     trigger an emergency rebuild and restart
 *                          with a fresh hash;
 *  - neighbour table full: force an aggressive GC pass and retry once
 *                          (attempts), else fail with -ENOBUFS;
 *  - otherwise:            link @rt at the head of the bucket.
 *
 * Returns 0 on success, a negative errno otherwise.  On success the
 * caller's reference is transferred to *@rp / @skb.
 */
static int rt_intern_hash(unsigned hash, struct rtable *rt,
			  struct rtable **rp, struct sk_buff *skb, int ifindex)
{
	struct rtable *rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long now;
	u32 min_score;
	int chain_length;
	int attempts = !in_softirq();	/* allow one GC retry in process context */

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;		/* best eviction candidate on this chain */
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route. The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching. Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = arp_bind_neighbour(&rt->dst);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return err;
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			/* Stale generation: unlink and free in passing. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			/* Duplicate: hand back the existing entry, drop ours. */
			rt_drop(rt);
			if (rp)
				*rp = rth;
			else
				skb_dst_set(skb, &rth->dst);
			return 0;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			/* Unreferenced entry: remember the lowest-scoring
			 * one as an eviction candidate.
			 */
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		/* No evictable entry and the chain is suspiciously long:
		 * possible hash attack — rebuild and retry with the new
		 * generation's hash.
		 */
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = arp_bind_neighbour(&rt->dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				/* Temporarily make GC maximally aggressive,
				 * run it once, then restore the tunables.
				 */
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity = 1;
				ip_rt_gc_min_interval = 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval = saved_int;
				ip_rt_gc_elasticity = saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	/* Link the new entry at the head of the bucket. */
	rt->dst.rt_next = rt_hash_table[hash].chain;

#if RT_CACHE_DEBUG >= 2
	if (rt->dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
		       hash, &rt->rt_dst);
		for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
			printk(" . %pI4", &trt->rt_dst);
		printk("\n");
	}
#endif
	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (rp)
		*rp = rt;
	else
		skb_dst_set(skb, &rt->dst);
	return 0;
}
1305
1306void rt_bind_peer(struct rtable *rt, int create)
1307{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001308 struct inet_peer *peer;
1309
David S. Millerb534ecf2010-11-30 11:54:19 -08001310 peer = inet_getpeer_v4(rt->rt_dst, create);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001311
Eric Dumazet49e8ab02010-08-19 06:10:45 +00001312 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001313 inet_putpeer(peer);
1314}
1315
1316/*
1317 * Peer allocation may fail only in serious out-of-memory conditions. However
1318 * we still can generate some output.
1319 * Random ID selection looks a bit dangerous because we have no chances to
1320 * select ID being unique in a reasonable period of time.
1321 * But broken packet identifier may be better than no packet at all.
1322 */
/*
 * Fallback IP-ID generator used when no inet_peer is available.
 * Derives a new id by hashing the previous fallback value with the
 * packet's destination; the static state is serialized by a local
 * spinlock (BH-safe, as callers may run in softirq context).
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);	/* low 16 bits become the IP ID */
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
1335
/*
 * Select the IP identification field for @iph.  Prefers the per-peer
 * id counter (binding a peer on demand); falls back to the global
 * hashed generator when no peer can be obtained.  A NULL @dst is a
 * caller bug and is logged with the return address.
 */
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);	/* create peer if missing */

		/* If peer is attached to destination, it is never detached,
		   so that we need not to grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
Eric Dumazet4bc2f182010-07-09 21:22:10 +00001357EXPORT_SYMBOL(__ip_select_ident);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001358
/*
 * Remove @rt from hash bucket @hash and drop the caller's reference.
 * While walking the chain under the bucket lock, also reaps any
 * generation-expired entries encountered.  Safe against concurrent
 * lockfree readers: unlinking rewrites a single next pointer.
 */
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);	/* release the caller's reference */
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
1378
/* called in rcu_read_lock() section */
/*
 * ip_rt_redirect - handle a received ICMP redirect for daddr via new_gw.
 *
 * After sanity-checking the advertised gateway, every matching entry in
 * the routing cache (for both the exact source key and the wildcard key,
 * and for both the exact ifindex and the wildcard 0) is replaced by a
 * clone that points at the new gateway and carries RTCF_REDIRECTED.
 */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct rtable *rth;
	struct rtable __rcu **rthp;
	__be32 skeys[2] = { saddr, 0 };		/* exact source and wildcard */
	int ikeys[2] = { dev->ifindex, 0 };	/* exact ifindex and wildcard */
	struct netevent_redirect netevent;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	/* Reject redirects that are no-ops, administratively disabled,
	 * or that advertise an unusable gateway address.
	 */
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!rt_caching(net))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		/* Non-shared media: the new gateway must be on-link. */
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				/* Skip entries that cannot match this key. */
				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rt_is_input_route(rth) ||
				    rt_is_expired(rth) ||
				    !net_eq(dev_net(rth->dst.dev), net)) {
					rthp = &rth->dst.rt_next;
					continue;
				}

				/* Entry matched the key but does not go
				 * through old_gw on this device: nothing
				 * further on this chain can match.
				 */
				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->dst.dev != dev)
					break;

				dst_hold(&rth->dst);

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				rt->dst.__use		= 1;
				atomic_set(&rt->dst.__refcnt, 1);
				rt->dst.child		= NULL;
				if (rt->dst.dev)
					dev_hold(rt->dst.dev);
				rt->dst.obsolete	= -1;
				rt->dst.lastuse	= jiffies;
				rt->dst.path		= &rt->dst;
				rt->dst.neighbour	= NULL;
				rt->dst.hh		= NULL;
#ifdef CONFIG_XFRM
				rt->dst.xfrm		= NULL;
#endif
				rt->rt_genid		= rt_genid(net);
				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway = new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->dst);

				/* The struct copy above duplicated the peer
				 * and fib_info pointers; take our own refs.
				 */
				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);
				if (rt->fi)
					atomic_inc(&rt->fi->fib_clntref);

				/* The clone is only usable once it has a
				 * valid neighbour for the new gateway.
				 */
				if (arp_bind_neighbour(&rt->dst) ||
				    !(rt->dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->dst.neighbour)
						neigh_event_send(rt->dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->dst;
				netevent.new = &rt->dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				/* Swap the old entry for the clone. */
				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
					ip_rt_put(rt);
				goto do_next;
			}
		do_next:
			;
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}
1514
/*
 * ipv4_negative_advice - dst_ops->negative_advice for IPv4.
 *
 * Called when an upper layer reports trouble with a cached route.
 * Returns the dst to keep using, or NULL after releasing/removing it:
 *  - obsolete entries are simply released;
 *  - redirected or expired entries are unhashed from the route cache.
 */
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			/* Already obsolete: just drop our reference. */
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   (rt->dst.expires &&
			    time_after_eq(jiffies, rt->dst.expires))) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif,
						rt_genid(dev_net(dst->dev)));
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
				&rt->rt_dst, rt->fl.fl4_tos);
#endif
			/* Remove the stale entry from the cache. */
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}
1540
1541/*
1542 * Algorithm:
1543 * 1. The first ip_rt_redirect_number redirects are sent
1544 * with exponential backoff, then we stop sending them at all,
1545 * assuming that the host ignores our redirects.
1546 * 2. If we did not see packets requiring redirects
1547 * during ip_rt_redirect_silence, we assume that the host
1548 * forgot redirected route and start to send redirects again.
1549 *
1550 * This algorithm is much cheaper and more intelligent than dumb load limiting
1551 * in icmp.c.
1552 *
1553 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1554 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1555 */
1556
/*
 * ip_rt_send_redirect - possibly emit an ICMP host redirect for skb.
 *
 * Implements the exponential-backoff scheme described in the comment
 * above: dst.rate_tokens counts redirects already sent for this route,
 * and each subsequent redirect requires a delay of
 * ip_rt_redirect_load << rate_tokens since dst.rate_last.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	int log_martians;

	/* Sample the device configuration under RCU, then drop the
	 * lock before doing any real work.
	 */
	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
		rt->dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
		rt->dst.rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (rt->dst.rate_tokens == 0 ||
	    time_after(jiffies,
		       (rt->dst.rate_last +
			(ip_rt_redirect_load << rt->dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->dst.rate_last = jiffies;
		++rt->dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		/* Log once, when we give up on this host. */
		if (log_martians &&
		    rt->dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
				&rt->rt_src, rt->rt_iif,
				&rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
1606
/*
 * ip_error - dst input handler for routes carrying an error code.
 *
 * Maps the route's dst.error to an ICMP "destination unreachable"
 * code and sends it, subject to a token-bucket rate limit
 * (ip_rt_error_burst / ip_rt_error_cost).  Always consumes skb and
 * returns 0.
 */
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	unsigned long now;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		/* No ICMP equivalent: silently drop. */
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	/* Refill tokens by elapsed time, capped at the burst size;
	 * each ICMP error costs ip_rt_error_cost tokens.
	 */
	now = jiffies;
	rt->dst.rate_tokens += now - rt->dst.rate_last;
	if (rt->dst.rate_tokens > ip_rt_error_burst)
		rt->dst.rate_tokens = ip_rt_error_burst;
	rt->dst.rate_last = now;
	if (rt->dst.rate_tokens >= ip_rt_error_cost) {
		rt->dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001643
/*
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128,
};

/*
 * guess_mtu - pick the next-lower RFC 1191 MTU plateau.
 *
 * Given an MTU that proved too large, return the first table entry
 * strictly below it; the table is sorted in descending order.  If no
 * plateau is small enough, fall back to 68, the minimum IPv4 MTU.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	unsigned int idx;
	const unsigned int nr = sizeof(mtu_plateau) / sizeof(mtu_plateau[0]);

	for (idx = 0; idx < nr; idx++) {
		if (mtu_plateau[idx] < old_mtu)
			return mtu_plateau[idx];
	}
	return 68;
}
1661
/*
 * ip_rt_frag_needed - handle an ICMP "fragmentation needed" message.
 *
 * Updates the path MTU of every matching route-cache entry (exact and
 * wildcard source/ifindex keys) whose MTU metric is not locked.  If the
 * router did not supply a next-hop MTU (new_mtu == 0) or supplied a
 * bogus one, the next RFC 1191 plateau below the offending packet's
 * size is used instead.  Returns the MTU actually installed, or
 * new_mtu if no cache entry matched.
 */
unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	int i, k;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	int ikeys[2] = { dev->ifindex, 0 };
	__be32 skeys[2] = { iph->saddr, 0, };
	__be32 daddr = iph->daddr;
	unsigned short est_mtu = 0;

	for (k = 0; k < 2; k++) {
		for (i = 0; i < 2; i++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rcu_read_lock();
			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
			     rth = rcu_dereference(rth->dst.rt_next)) {
				unsigned short mtu = new_mtu;

				/* Only output routes in this netns that
				 * match the key exactly and whose MTU is
				 * not administratively locked.
				 */
				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->rt_dst != daddr ||
				    rth->rt_src != iph->saddr ||
				    rth->fl.oif != ikeys[k] ||
				    rt_is_input_route(rth) ||
				    dst_metric_locked(&rth->dst, RTAX_MTU) ||
				    !net_eq(dev_net(rth->dst.dev), net) ||
				    rt_is_expired(rth))
					continue;

				/* Untrustworthy next-hop MTU: derive one
				 * from the size of the dropped packet.
				 */
				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= dst_mtu(&rth->dst) &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= dst_mtu(&rth->dst)) {
					if (mtu < dst_mtu(&rth->dst)) {
						dst_confirm(&rth->dst);
						/* Never go below the
						 * configured floor; lock
						 * the metric when clamped.
						 */
						if (mtu < ip_rt_min_pmtu) {
							u32 lock = dst_metric(&rth->dst,
									      RTAX_LOCK);
							mtu = ip_rt_min_pmtu;
							lock |= (1 << RTAX_MTU);
							dst_metric_set(&rth->dst, RTAX_LOCK,
								       lock);
						}
						dst_metric_set(&rth->dst, RTAX_MTU, mtu);
						dst_set_expires(&rth->dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
			rcu_read_unlock();
		}
	}
	return est_mtu ? : new_mtu;
}
1728
/*
 * ip_rt_update_pmtu - dst_ops->update_pmtu for IPv4.
 *
 * Lowers the route's MTU metric to mtu if that is an actual decrease,
 * mtu is at least the IPv4 minimum (68), and the metric is not locked.
 * MTUs below ip_rt_min_pmtu are clamped to it and the metric is locked
 * so later PMTU updates cannot shrink it further.
 */
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst_mtu(dst) > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			u32 lock = dst_metric(dst, RTAX_LOCK);
			mtu = ip_rt_min_pmtu;
			dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
		}
		dst_metric_set(dst, RTAX_MTU, mtu);
		/* Entry must be revalidated after the PMTU expires. */
		dst_set_expires(dst, ip_rt_mtu_expires);
		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
	}
}
1743
1744static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1745{
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001746 if (rt_is_expired((struct rtable *)dst))
1747 return NULL;
1748 return dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001749}
1750
1751static void ipv4_dst_destroy(struct dst_entry *dst)
1752{
1753 struct rtable *rt = (struct rtable *) dst;
1754 struct inet_peer *peer = rt->peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001755
David S. Miller62fa8a82011-01-26 20:51:05 -08001756 dst_destroy_metrics_generic(dst);
1757 if (rt->fi) {
1758 fib_info_put(rt->fi);
1759 rt->fi = NULL;
1760 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001761 if (peer) {
1762 rt->peer = NULL;
1763 inet_putpeer(peer);
1764 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001765}
1766
Linus Torvalds1da177e2005-04-16 15:20:36 -07001767
1768static void ipv4_link_failure(struct sk_buff *skb)
1769{
1770 struct rtable *rt;
1771
1772 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1773
Eric Dumazet511c3f92009-06-02 05:14:27 +00001774 rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001775 if (rt)
Changli Gaod8d1f302010-06-10 23:31:35 -07001776 dst_set_expires(&rt->dst, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001777}
1778
1779static int ip_rt_bug(struct sk_buff *skb)
1780{
Harvey Harrison673d57e2008-10-31 00:53:57 -07001781 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1782 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001783 skb->dev ? skb->dev->name : "?");
1784 kfree_skb(skb);
1785 return 0;
1786}
1787
/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by the IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */
1796
/*
 * ip_rt_get_source - copy the route's preferred source address to addr.
 *
 * For output routes the cached rt_src is used directly; for input
 * routes a FIB lookup (under RCU) supplies the preferred source, or
 * failing that an address on the output device is selected.  The
 * result is written with memcpy because addr may be unaligned (it
 * points into IP options — see the comment above).
 */
void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	__be32 src;
	struct fib_result res;

	if (rt_is_output_route(rt))
		src = rt->rt_src;
	else {
		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
			src = FIB_RES_PREFSRC(res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
1815
Patrick McHardyc7066f72011-01-14 13:36:42 +01001816#ifdef CONFIG_IP_ROUTE_CLASSID
Linus Torvalds1da177e2005-04-16 15:20:36 -07001817static void set_class_tag(struct rtable *rt, u32 tag)
1818{
Changli Gaod8d1f302010-06-10 23:31:35 -07001819 if (!(rt->dst.tclassid & 0xFFFF))
1820 rt->dst.tclassid |= tag & 0xFFFF;
1821 if (!(rt->dst.tclassid & 0xFFFF0000))
1822 rt->dst.tclassid |= tag & 0xFFFF0000;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001823}
1824#endif
1825
David S. Miller0dbaee32010-12-13 12:52:14 -08001826static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1827{
1828 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1829
1830 if (advmss == 0) {
1831 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1832 ip_rt_min_advmss);
1833 if (advmss > 65535 - 40)
1834 advmss = 65535 - 40;
1835 }
1836 return advmss;
1837}
1838
David S. Millerd33e4552010-12-14 13:01:14 -08001839static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1840{
1841 unsigned int mtu = dst->dev->mtu;
1842
1843 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1844 const struct rtable *rt = (const struct rtable *) dst;
1845
1846 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1847 mtu = 576;
1848 }
1849
1850 if (mtu > IP_MAX_MTU)
1851 mtu = IP_MAX_MTU;
1852
1853 return mtu;
1854}
1855
/*
 * rt_set_nexthop - finish a route-cache entry from a FIB lookup result.
 *
 * Copies the gateway, fib_info reference, metrics and (when enabled)
 * classid from res into rt, clamps the MTU/ADVMSS metrics to protocol
 * limits, and sets the route type.
 */
static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_info *fi = res->fi;

	if (fi) {
		/* Only adopt the FIB gateway for link-scope next hops. */
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		/* Share the fib_info's metrics; hold a reference to it. */
		rt->fi = fi;
		atomic_inc(&fi->fib_clntref);
		dst_init_metrics(dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	/* Clamp metrics to what IPv4/TCP can actually carry. */
	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}
1886
/* called in rcu_read_lock() section */
/*
 * ip_route_input_mc - build and cache an input route for a multicast
 * destination.
 *
 * Validates the source address, allocates a route-cache entry marked
 * RTCF_MULTICAST (plus RTCF_LOCAL when @our is set, i.e. we are a
 * member of the group), wires its input handler to local delivery or
 * multicast forwarding, and interns it in the cache.  Returns 0 or a
 * negative errno.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	/* Multicast, broadcast or loopback sources are never valid,
	 * and only IP packets may create multicast routes.
	 */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		/* A zero source (e.g. DHCP discovery) is only allowed
		 * for link-local multicast groups.
		 */
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag, 0);
		if (err < 0)
			goto e_err;
	}
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	/* Multicast input routes must never be used for output. */
	rth->dst.output = ip_rt_bug;
	rth->dst.obsolete = -1;

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark    = skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->dst.dev	= init_net.loopback_dev;
	dev_hold(rth->dst.dev);
	rth->fl.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	if (our) {
		/* We are a member of this group: deliver locally. */
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
1968
1969
/*
 * ip_handle_martian_source - account for and optionally log a packet
 * with a martian (invalid) source address.
 *
 * Always bumps the in_martian_src statistic; when martian logging is
 * enabled it prints the addresses and, if available, a hex dump of the
 * link-layer header (the only hint to the real sender, per RFC 1812).
 */
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			/* Dump the MAC header as colon-separated hex. */
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
1999
/* called in rcu_read_lock() section */
/*
 * __mkroute_input - build a forwarding route-cache entry.
 *
 * Validates the packet's source against the FIB, decides whether the
 * sender should be redirected (RTCF_DOREDIRECT), allocates and fills
 * a cache entry whose input handler forwards and whose output handler
 * transmits, and returns it via *result.  Returns 0 or a negative
 * errno; on success the caller owns the reference in *result.
 */
static int __mkroute_input(struct sk_buff *skb,
			   struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		/* The FIB result should always carry an IP-capable
		 * output device; anything else is a kernel bug.
		 */
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}


	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag, skb->mark);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	/* err > 0 means the source was reached via another interface. */
	if (err)
		flags |= RTCF_DIRECTSRC;

	/* Packet would leave on the interface it arrived on: advise
	 * the sender to go direct, if the media permits it.
	 */
	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark    = skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
	rth->rt_iif 	=
		rth->fl.iif	= in_dev->dev->ifindex;
	rth->dst.dev	= (out_dev)->dev;
	dev_hold(rth->dst.dev);
	rth->fl.oif 	= 0;
	rth->rt_spec_dst= spec_dst;

	rth->dst.obsolete = -1;
	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002097
/*
 * ip_mkroute_input - create and cache a forwarding route for skb.
 *
 * Optionally picks a next hop among multipath alternatives, builds the
 * cache entry via __mkroute_input(), and interns it in the hash table.
 * Returns 0 or a negative errno.
 */
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi *fl,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable* rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	/* Multiple next hops and no forced output device: let the
	 * multipath selector choose one.
	 */
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl->iif,
		       rt_genid(dev_net(rth->dst.dev)));
	return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
}
2123
/*
 * NOTE. We drop all packets that have a local source address, because
 * every properly looped-back packet must have the correct destination
 * already attached by the output routine.
 *
 * Such an approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with 100% guarantee.
 *
 * called with rcu_read_lock()
 */

/*
 * Slow-path input route resolution: validate the addresses, consult the
 * FIB, and build + cache a route for a received packet.  Dispatches to
 * one of three outcomes: a forwarding route (via ip_mkroute_input()),
 * a local/broadcast delivery route (built inline at local_input), or an
 * error (martian source/destination, no route, etc.).
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi fl = { .fl4_dst	= daddr,
			    .fl4_src	= saddr,
			    .fl4_tos	= tos,
			    .fl4_scope	= RT_SCOPE_UNIVERSE,
			    .mark = skb->mark,
			    .iif = dev->ifindex };
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	struct net    * net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	err = fib_lookup(net, &fl, &res);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		/* Destination is one of our own addresses: verify the
		 * source is reachable via the loopback "interface" of the
		 * reverse path before accepting for local delivery. */
		err = fib_validate_source(saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag, skb->mark);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	/* Broadcast delivery: only plain IPv4 frames are accepted. */
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag, skb->mark);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	/* Build a local-delivery cache entry (also reused by no_route to
	 * cache an ip_error entry with rt_flags stripped of RTCF_LOCAL). */
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->dst.output= ip_rt_bug;	/* output on an input route is a bug */
	rth->dst.obsolete = -1;
	rth->rt_genid = rt_genid(net);

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark    = skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->dst.dev	= net->loopback_dev;
	dev_hold(rth->dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->dst.input= ip_local_deliver;
	rth->rt_flags 	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
	err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	/* keep the errno set by fib_validate_source() when entering via
	 * martian_source_keep_err */
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
2309
Eric Dumazet407eadd2010-05-10 11:32:55 +00002310int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2311 u8 tos, struct net_device *dev, bool noref)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002312{
2313 struct rtable * rth;
2314 unsigned hash;
2315 int iif = dev->ifindex;
Denis V. Lunevb5921912008-01-22 23:50:25 -08002316 struct net *net;
Eric Dumazet96d36222010-06-02 19:21:31 +00002317 int res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002318
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002319 net = dev_net(dev);
Neil Horman1080d702008-10-27 12:28:25 -07002320
Eric Dumazet96d36222010-06-02 19:21:31 +00002321 rcu_read_lock();
2322
Neil Horman1080d702008-10-27 12:28:25 -07002323 if (!rt_caching(net))
2324 goto skip_cache;
2325
Linus Torvalds1da177e2005-04-16 15:20:36 -07002326 tos &= IPTOS_RT_MASK;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002327 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002328
Linus Torvalds1da177e2005-04-16 15:20:36 -07002329 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Changli Gaod8d1f302010-06-10 23:31:35 -07002330 rth = rcu_dereference(rth->dst.rt_next)) {
Eric Dumazet0eae88f2010-04-20 19:06:52 -07002331 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2332 ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
Stephen Hemmingerc0b8c322008-04-10 04:00:28 -07002333 (rth->fl.iif ^ iif) |
2334 rth->fl.oif |
2335 (rth->fl.fl4_tos ^ tos)) == 0 &&
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002336 rth->fl.mark == skb->mark &&
Changli Gaod8d1f302010-06-10 23:31:35 -07002337 net_eq(dev_net(rth->dst.dev), net) &&
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002338 !rt_is_expired(rth)) {
Eric Dumazet407eadd2010-05-10 11:32:55 +00002339 if (noref) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002340 dst_use_noref(&rth->dst, jiffies);
2341 skb_dst_set_noref(skb, &rth->dst);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002342 } else {
Changli Gaod8d1f302010-06-10 23:31:35 -07002343 dst_use(&rth->dst, jiffies);
2344 skb_dst_set(skb, &rth->dst);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002345 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002346 RT_CACHE_STAT_INC(in_hit);
2347 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002348 return 0;
2349 }
2350 RT_CACHE_STAT_INC(in_hlist_search);
2351 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002352
Neil Horman1080d702008-10-27 12:28:25 -07002353skip_cache:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002354 /* Multicast recognition logic is moved from route cache to here.
2355 The problem was that too many Ethernet cards have broken/missing
2356 hardware multicast filters :-( As result the host on multicasting
2357 network acquires a lot of useless route cache entries, sort of
2358 SDR messages from all the world. Now we try to get rid of them.
2359 Really, provided software IP multicast filter is organized
2360 reasonably (at least, hashed), it does not result in a slowdown
2361 comparing with route cache reject entries.
2362 Note, that multicast routers are not affected, because
2363 route cache entry is created eventually.
2364 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002365 if (ipv4_is_multicast(daddr)) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002366 struct in_device *in_dev = __in_dev_get_rcu(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002367
Eric Dumazet96d36222010-06-02 19:21:31 +00002368 if (in_dev) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002369 int our = ip_check_mc(in_dev, daddr, saddr,
Eric Dumazet96d36222010-06-02 19:21:31 +00002370 ip_hdr(skb)->protocol);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002371 if (our
2372#ifdef CONFIG_IP_MROUTE
Joe Perches9d4fb272009-11-23 10:41:23 -08002373 ||
2374 (!ipv4_is_local_multicast(daddr) &&
2375 IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002376#endif
Joe Perches9d4fb272009-11-23 10:41:23 -08002377 ) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002378 int res = ip_route_input_mc(skb, daddr, saddr,
2379 tos, dev, our);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002380 rcu_read_unlock();
Eric Dumazet96d36222010-06-02 19:21:31 +00002381 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002382 }
2383 }
2384 rcu_read_unlock();
2385 return -EINVAL;
2386 }
Eric Dumazet96d36222010-06-02 19:21:31 +00002387 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2388 rcu_read_unlock();
2389 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002390}
Eric Dumazet407eadd2010-05-10 11:32:55 +00002391EXPORT_SYMBOL(ip_route_input_common);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002392
/* called with rcu_read_lock() */
/*
 * Allocate and populate an output route cache entry for the resolved
 * flow @fl (the original, caller-supplied key is @oldflp).
 *
 * Classifies the destination (broadcast / multicast / unicast), rejects
 * invalid combinations (loopback source on a non-loopback device,
 * zeronet destination), then fills in the new rtable's flow key, device
 * reference, handlers (ip_output / ip_local_deliver / ip_mc_output /
 * ip_mr_input) and next hop.
 *
 * On success stores the new entry in *@result and returns 0; otherwise
 * returns -EINVAL or -ENOBUFS.  The entry is NOT inserted into the
 * cache here — that is ip_mkroute_output()'s job.
 */
static int __mkroute_output(struct rtable **result,
			    struct fib_result *res,
			    const struct flowi *fl,
			    const struct flowi *oldflp,
			    struct net_device *dev_out,
			    unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);

	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
		return -EINVAL;

	if (ipv4_is_lbcast(fl->fl4_dst))
		res->type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		res->fi = NULL;
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		/* Deliver locally only if this host is a member of the
		 * destination group on dev_out. */
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If multicast route do not exist use
		 * default one, but do not gateway in this case.
		 * Yes, it is hack.
		 */
		if (res->fi && res->prefixlen < 4)
			res->fi = NULL;
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		return -ENOBUFS;

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
		rth->dst.flags |= DST_NOXFRM;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;

	/* Cache key fields come from the caller's original flow (oldflp);
	 * resolved addresses come from the post-lookup flow (fl). */
	rth->fl.fl4_dst	= oldflp->fl4_dst;
	rth->fl.fl4_tos	= tos;
	rth->fl.fl4_src	= oldflp->fl4_src;
	rth->fl.oif	= oldflp->oif;
	rth->fl.mark    = oldflp->mark;
	rth->rt_dst	= fl->fl4_dst;
	rth->rt_src	= fl->fl4_src;
	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be hold by the routing
	   cache entry */
	rth->dst.dev	= dev_out;
	dev_hold(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst= fl->fl4_src;

	rth->dst.output=ip_output;
	rth->dst.obsolete = -1;
	rth->rt_genid = rt_genid(dev_net(dev_out));

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;
	*result = rth;
	return 0;
}
2499
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002500/* called with rcu_read_lock() */
Stephen Hemminger5969f712008-04-10 01:52:09 -07002501static int ip_mkroute_output(struct rtable **rp,
2502 struct fib_result *res,
2503 const struct flowi *fl,
2504 const struct flowi *oldflp,
2505 struct net_device *dev_out,
2506 unsigned flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002507{
Chuck Short7abaa272005-06-22 22:10:23 -07002508 struct rtable *rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002509 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2510 unsigned hash;
2511 if (err == 0) {
Denis V. Lunevb00180d2008-07-05 19:04:09 -07002512 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002513 rt_genid(dev_net(dev_out)));
Pavel Emelyanov6a2bad72010-03-24 21:51:22 +00002514 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002515 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002516
Linus Torvalds1da177e2005-04-16 15:20:36 -07002517 return err;
2518}
2519
/*
 * Major route resolver routine: slow-path output route lookup.
 * Validates the source address, selects an output device and source
 * address when unspecified, consults the FIB, and finally builds and
 * caches the route via ip_mkroute_output().
 * called with rcu_read_lock();
 */

static int ip_route_output_slow(struct net *net, struct rtable **rp,
				const struct flowi *oldflp)
{
	u32 tos	= RT_FL_TOS(oldflp);
	struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
			    .fl4_src = oldflp->fl4_src,
			    .fl4_tos = tos & IPTOS_RT_MASK,
			    .fl4_scope = ((tos & RTO_ONLINK) ?
					  RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
			    .mark = oldflp->mark,
			    .iif = net->loopback_dev->ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned int flags = 0;
	struct net_device *dev_out = NULL;
	int err;


	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	if (oldflp->fl4_src) {
		/* A caller-supplied source must be a plausible unicast
		 * address: reject multicast, limited broadcast, zeronet. */
		err = -EINVAL;
		if (ipv4_is_multicast(oldflp->fl4_src) ||
		    ipv4_is_lbcast(oldflp->fl4_src) ||
		    ipv4_is_zeronet(oldflp->fl4_src))
			goto out;

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (oldflp->oif == 0 &&
		    (ipv4_is_multicast(oldflp->fl4_dst) ||
		     ipv4_is_lbcast(oldflp->fl4_dst))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic,vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is good workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, oldflp->fl4_src, false))
				goto out;
		}
	}


	if (oldflp->oif) {
		dev_out = dev_get_by_index_rcu(net, oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			err = -ENETUNREACH;
			goto out;
		}
		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
		    ipv4_is_lbcast(oldflp->fl4_dst)) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		/* Pick a source address appropriate for the scope. */
		if (!fl.fl4_src) {
			if (ipv4_is_multicast(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		/* No destination at all: route to ourselves via loopback. */
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl.oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, &fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, routing tables are wrong. Assume,
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		err = -ENETUNREACH;
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src) {
			/* Prefer the route's preferred source if set. */
			if (res.fi->fib_prefsrc)
				fl.fl4_src = res.fi->fib_prefsrc;
			else
				fl.fl4_src = fl.fl4_dst;
		}
		dev_out = net->loopback_dev;
		fl.oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(net, &fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	dev_out = FIB_RES_DEV(res);
	fl.oif = dev_out->ifindex;


make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);

out:	return err;
}
2702
/*
 * Output route lookup: probe the route cache under rcu_read_lock_bh()
 * and fall back to ip_route_output_slow() (under plain rcu_read_lock())
 * on a miss or when caching is disabled.
 * On success *rp holds a referenced route and 0 is returned; otherwise
 * a negative errno from the slow path.
 */
int __ip_route_output_key(struct net *net, struct rtable **rp,
			  const struct flowi *flp)
{
	unsigned int hash;
	int res;
	struct rtable *rth;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rt_is_output_route(rth) &&
		    rth->fl.oif == flp->oif &&
		    rth->fl.mark == flp->mark &&
		    /* TOS matches modulo bits outside the routing mask;
		     * RTO_ONLINK is part of the comparison too. */
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	rcu_read_lock();
	res = ip_route_output_slow(net, rp, flp);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
2744
/* dst_ops.check for blackhole routes: always report the entry as
 * invalid so callers never keep using a blackhole dst. */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2749
/* dst_ops.update_pmtu for blackhole routes: deliberately a no-op —
 * PMTU updates are meaningless on a route that discards traffic. */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
2753
/* dst_ops for entries created by ipv4_dst_blackhole(): shares
 * ipv4_dst_destroy with normal routes, but .check always fails and
 * .update_pmtu is a no-op (see the two stubs above). */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_blackhole_dst_check,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
};
2761
2762
/*
 * Replace the route *rp with a "blackhole" copy: a dst whose input and
 * output handlers silently discard packets (dst_discard) but which
 * carries over the original's metrics, device and identity fields.
 * The original route's reference is dropped; on allocation failure *rp
 * is set to NULL and -ENOMEM is returned, otherwise 0.
 */
static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
{
	struct rtable *ort = *rp;
	struct rtable *rt = (struct rtable *)
		dst_alloc(&ipv4_dst_blackhole_ops);

	if (rt) {
		struct dst_entry *new = &rt->dst;

		atomic_set(&new->__refcnt, 1);
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->fl = ort->fl;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_iif = ort->rt_iif;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		/* Take our own references on the shared peer and fib_info. */
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		/* Mark the entry for freeing once the refcount drops;
		 * it is never interned in the route cache. */
		dst_free(new);
	}

	dst_release(&(*rp)->dst);
	*rp = rt;
	return rt ? 0 : -ENOMEM;
}
2806
Denis V. Lunevf1b050b2008-01-22 22:07:10 -08002807int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2808 struct sock *sk, int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002809{
2810 int err;
2811
Denis V. Lunevf1b050b2008-01-22 22:07:10 -08002812 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002813 return err;
2814
2815 if (flp->proto) {
2816 if (!flp->fl4_src)
2817 flp->fl4_src = (*rp)->rt_src;
2818 if (!flp->fl4_dst)
2819 flp->fl4_dst = (*rp)->rt_dst;
Alexey Dobriyan52479b62008-11-25 17:35:18 -08002820 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
Herbert Xubb728452007-12-12 18:48:58 -08002821 flags ? XFRM_LOOKUP_WAIT : 0);
David S. Miller14e50e52007-05-24 18:17:54 -07002822 if (err == -EREMOTE)
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002823 err = ipv4_dst_blackhole(net, rp, flp);
David S. Miller14e50e52007-05-24 18:17:54 -07002824
2825 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002826 }
2827
2828 return 0;
2829}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002830EXPORT_SYMBOL_GPL(ip_route_output_flow);
2831
/*
 * Plain output-route lookup by flow key: no socket context, no flags,
 * no transform-layer wait.
 */
int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(net, rp, flp, NULL, 0);
}
EXPORT_SYMBOL(ip_route_output_key);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002837
/*
 * Serialize the route attached to @skb into an RTM_NEWROUTE netlink
 * message appended to @skb itself (the skb doubles as the dump buffer).
 *
 * Returns the result of nlmsg_end() on success, 0 when an mroute
 * lookup was queued instead of answered (!nowait path), or -EMSGSIZE
 * when the skb ran out of tailroom.  The NLA_PUT_* macros jump to
 * nla_put_failure on lack of space — that is why the label exists
 * with no visible goto for most attributes.
 */
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	/* Fixed-size rtmsg header describing the route as a whole. */
	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->fl.fl4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	/* Preferred source: the spec_dst for input routes, or the chosen
	 * source when it differs from what the flow asked for. */
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->fl.mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);

	error = rt->dst.error;
	expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
	if (rt->peer) {
		/* Cacheinfo fields come from the shared inet_peer entry. */
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		/* Multicast forwarding entries live in the mroute code;
		 * delegate, and let it decide whether to answer now. */
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2936
/*
 * Netlink RTM_GETROUTE handler: answer a userspace "what route would
 * this packet take?" query.  With RTA_IIF set the query simulates an
 * incoming packet (ip_route_input under BH-off, as the real input path
 * would run); otherwise it performs an ordinary output lookup.  The
 * resulting route is serialized with rt_fill_info() and unicast back
 * to the requester.  Returns 0 or a negative errno.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	/* This skb serves two purposes: dummy packet for the input-path
	 * simulation, and buffer for the netlink reply. */
	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		/* The input path runs in softirq context normally; mimic
		 * that by disabling bottom halves around the lookup. */
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		/* A "successful" lookup may still carry a dst error
		 * (e.g. an unreachable route); surface it as -errno. */
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi fl = {
			.fl4_dst = dst,
			.fl4_src = src,
			.fl4_tos = rtm->rtm_tos,
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.mark = mark,
		};
		err = ip_route_output_key(net, &rt, &fl);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	/* rtnl_unicast consumes skb regardless of outcome. */
	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
3027
/*
 * Netlink dump callback: walk the whole route-cache hash table and
 * emit one RTM_NEWROUTE message per live entry belonging to the
 * requester's namespace.  cb->args[0]/[1] persist the (bucket, index)
 * cursor between invocations so a dump interrupted by a full skb
 * resumes where it stopped.  Returns skb->len (netlink convention).
 */
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		/* Chains are RCU-protected and modified from softirq
		 * context, hence the _bh variants. */
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			/* noref: rt_fill_info only borrows the dst while
			 * we hold the RCU read lock. */
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				/* skb full: record cursor and stop. */
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
3069
/*
 * Multicast configuration changed on @in_dev: flush the route cache of
 * its namespace so stale multicast routing decisions are discarded.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
3074
3075#ifdef CONFIG_SYSCTL
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003076static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003077 void __user *buffer,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003078 size_t *lenp, loff_t *ppos)
3079{
3080 if (write) {
Denis V. Lunev639e1042008-07-05 19:02:06 -07003081 int flush_delay;
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003082 ctl_table ctl;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003083 struct net *net;
Denis V. Lunev639e1042008-07-05 19:02:06 -07003084
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003085 memcpy(&ctl, __ctl, sizeof(ctl));
3086 ctl.data = &flush_delay;
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003087 proc_dointvec(&ctl, write, buffer, lenp, ppos);
Denis V. Lunev639e1042008-07-05 19:02:06 -07003088
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003089 net = (struct net *)__ctl->extra1;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003090 rt_cache_flush(net, flush_delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003091 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003092 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003093
3094 return -EINVAL;
3095}
3096
/*
 * Global (not per-namespace) routing-cache tunables, exported under
 * /proc/sys/net/ipv4/route/.  Entries using the *_jiffies handlers
 * convert between user-visible seconds (or milliseconds) and ticks.
 */
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Same variable as above, millisecond granularity. */
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003207
/* Sentinel-only table: an empty sysctl directory placeholder. */
static struct ctl_table empty[1];

/* Skeleton registered early under /proc/sys/net/ipv4: the "route"
 * directory (backed by ipv4_route_table) and an empty "neigh"
 * directory for the neighbour code to populate later. */
static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route", 
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh", 
	  .mode = 0555, .child = empty},
	{ }
};

/* Path prefix "net/ipv4" for registering the skeleton above. */
static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003224
/* Per-namespace /proc/sys/net/ipv4/route/flush entry (write-only);
 * ->extra1 is filled with the owning struct net at registration time. */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

/* Path prefix "net/ipv4/route" for the flush table above. */
static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
3241
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003242static __net_init int sysctl_route_net_init(struct net *net)
3243{
3244 struct ctl_table *tbl;
3245
3246 tbl = ipv4_route_flush_table;
Octavian Purdila09ad9bc2009-11-25 15:14:13 -08003247 if (!net_eq(net, &init_net)) {
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003248 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3249 if (tbl == NULL)
3250 goto err_dup;
3251 }
3252 tbl[0].extra1 = net;
3253
3254 net->ipv4.route_hdr =
3255 register_net_sysctl_table(net, ipv4_route_path, tbl);
3256 if (net->ipv4.route_hdr == NULL)
3257 goto err_reg;
3258 return 0;
3259
3260err_reg:
3261 if (tbl != ipv4_route_flush_table)
3262 kfree(tbl);
3263err_dup:
3264 return -ENOMEM;
3265}
3266
/*
 * Per-namespace teardown: unregister the flush sysctl and free the
 * duplicated table.  The static table belongs to init_net and must
 * never reach kfree() — hence the BUG_ON.
 */
static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}
3276
/* pernet hooks wiring the flush-sysctl setup/teardown above. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
Linus Torvalds1da177e2005-04-16 15:20:36 -07003281#endif
3282
/*
 * Give each new namespace a random initial route-cache generation id
 * (entries whose genid no longer matches are treated as expired —
 * see rt_is_expired()).
 */
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	return 0;
}
3289
/* pernet hook: seed the generation id at namespace creation. */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
3293
3294
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu accounting area allocated in ip_rt_init(); used by the
 * route-classid accounting code (presumably read via /proc — confirm
 * against the rt_acct proc code, not visible here). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003298
3299static __initdata unsigned long rhash_entries;
3300static int __init set_rhash_entries(char *str)
3301{
3302 if (!str)
3303 return 0;
3304 rhash_entries = simple_strtoul(str, &str, 0);
3305 return 1;
3306}
3307__setup("rhash_entries=", set_rhash_entries);
3308
/*
 * Boot-time initialization of the IPv4 routing subsystem: dst caches,
 * the route-cache hash table, FIB/devinet, the periodic expiry worker,
 * proc files, xfrm hooks, the RTM_GETROUTE handler and the pernet
 * sysctl/genid subsystems.  Ordering matters: sizes derived from the
 * hash table (gc_thresh, ip_rt_max_size) must be set before anything
 * consuming them (e.g. xfrm4_init).
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole clones are ordinary rtables; share the slab. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/* Size the route-cache hash from available memory unless the
	 * "rhash_entries=" boot parameter dictated a size. */
	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	/* GC thresholds scale with the table size. */
	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* All the timers, started at system startup tend
	   to synchronize. Perturb it a bit.
	 */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	/* proc failure is non-fatal: routing works without the files. */
	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}
3372
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
/* Register the static /proc/sys/net/ipv4 skeleton ("route", "neigh")
 * early in boot, before the regular ipv4 sysctl registration runs. */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif