/* net/ipv4/route.c */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 * 	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
64
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
110
/* Routing-relevant TOS bits of a flow, plus the RTO_ONLINK flag which is
 * encoded in the same byte.
 */
#define RT_FL_TOS(oldflp) \
    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

/* Upper bound on any MTU this code will report for a route. */
#define IP_MAX_MTU	0xFFF0

/* Default expiry timeout for unused cache entries (5 minutes). */
#define RT_GC_TIMEOUT	(300*HZ)

/* Tunables below are exported through sysctl elsewhere in this file;
 * __read_mostly keeps them out of write-hot cache lines.
 */
static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
/* Longest hash chain tolerated before an emergency rebuild is triggered. */
static int rt_chain_length_max __read_mostly	= 20;

/* Deferred work and timestamp used by the periodic expiry scan. */
static struct delayed_work expires_work;
static unsigned long expires_ljiffies;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700136
/*
 * Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);
static void rt_emergency_hash_rebuild(struct net *net);


/* IPv4 instantiation of the generic dst_entry operations table. */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.entries =		ATOMIC_INIT(0),
};
165
#define ECN_OR_COST(class)	TC_PRIO_##class

/* Map the 4 routing-relevant TOS bits to a traffic-control priority.
 * Odd indices correspond to TOS values with the low (ECN/cost) bit set.
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};


/*
 * Route cache.
 */

/* The locking scheme is rather straight forward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

/* One bucket of the central route cache: an RCU-protected singly linked
 * chain of rtable entries.
 */
struct rt_hash_bucket {
	struct rtable	*chain;
};
Neil Horman1080d702008-10-27 12:28:25 -0700205
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
/* Many hash buckets share one lock; RT_HASH_LOCK_SZ is a power of two. */
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

/* Allocate and initialize the shared bucket-lock table; boot-time only,
 * so failure is fatal.
 */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
/* UP without lock debugging: spin_lock_bh(NULL) only disables BH. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

/* The central route cache table; sized at boot, mask/log cached here. */
static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700259
/* Hash a (daddr, saddr, ifindex) triple into a bucket index; genid is mixed
 * in so that a cache flush (genid bump) implicitly rehashes everything.
 */
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)(__be32)(daddr),
			    (__force u32)(__be32)(saddr),
			    idx, genid)
		& rt_hash_mask;
}

/* Current route-cache generation for a namespace; entries whose genid
 * differs are considered flushed.
 */
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
273
#ifdef CONFIG_PROC_FS
/* Iterator state for the /proc/net/rt_cache seq_file walk. */
struct rt_cache_iter_state {
	struct seq_net_private p;	/* must be first for seq_file_net() */
	int bucket;			/* current hash bucket, walked downward */
	int genid;			/* generation snapshot taken at seq_start */
};
280
/* Find the first cache entry (highest bucket downward) belonging to this
 * seq_file's namespace and generation.  On success we return with
 * rcu_read_lock_bh() held; it is released by rt_cache_seq_stop() or while
 * advancing across empty buckets.
 */
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		/* Lockless peek first to avoid taking the RCU BH lock
		 * for empty buckets. */
		if (!rt_hash_table[st->bucket].chain)
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;	/* BH-RCU lock stays held */
			r = rcu_dereference(r->u.dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

/* Advance to the next entry in chain/bucket order, crossing bucket
 * boundaries as needed.  No namespace/genid filtering here; callers use
 * rt_cache_get_next() for that.
 */
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = r->u.dst.rt_next;
	while (!r) {
		/* Drop the lock while skipping empty buckets, reacquire
		 * before touching a non-empty chain. */
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rt_hash_table[st->bucket].chain);
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return rcu_dereference(r);
}

/* Next entry matching this seq_file's namespace and generation snapshot. */
static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

/* Entry at position 'pos' (0-based) in iteration order, or NULL. */
static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}
342
/* seq_file .start: snapshot the generation on a fresh walk (*pos == 0)
 * and emit the header token; otherwise reposition to entry *pos - 1.
 */
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

/* seq_file .next: step from the header token to the first entry, or from
 * one entry to the next.
 */
static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

/* seq_file .stop: release the BH-RCU lock the iterators hold while
 * positioned on a real entry.
 */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
369
/* seq_file .show: print one /proc/net/rt_cache row (or the header).
 * %n captures the row length so the line can be space-padded to the fixed
 * 127-column record width userspace tools expect.
 */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
402
/* seq_file plumbing for /proc/net/rt_cache. */
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

/* Per-net open: allocates rt_cache_iter_state with the net pointer first. */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
423
424
/* seq_file .start for /proc/net/stat/rt_cache: position 0 is the header
 * token; positions >= 1 map to possible CPUs (pos encodes cpu + 1).
 */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

/* seq_file .next: advance to the next possible CPU's stat block. */
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

/* seq_file .stop: nothing to release — iteration holds no locks. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
459
/* seq_file .show: one row of per-cpu route-cache statistics, preceded by
 * a header row.  The first column is the global dst entry count, repeated
 * on every row.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
493
/* seq_file plumbing for /proc/net/stat/rt_cache (global, not per-net). */
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
514
#ifdef CONFIG_NET_CLS_ROUTE
/* /proc/net/rt_acct: sum the per-cpu ip_rt_acct counters for all 256
 * route realms into a temporary array and emit it as raw binary.
 */
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800553
/* Per-namespace proc setup: /proc/net/rt_cache, /proc/net/stat/rt_cache
 * and (with CONFIG_NET_CLS_ROUTE) /proc/net/rt_acct.  Unwinds already
 * created entries on failure.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_NET_CLS_ROUTE
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_NET_CLS_ROUTE
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
Denis V. Lunev73b38712008-02-28 20:51:18 -0800584
585static void __net_exit ip_rt_do_proc_exit(struct net *net)
586{
587 remove_proc_entry("rt_cache", net->proc_net_stat);
588 remove_proc_entry("rt_cache", net->proc_net);
589 remove_proc_entry("rt_acct", net->proc_net);
590}
591
/* Register the proc entries above for every network namespace. */
static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
/* No procfs: nothing to register. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900608
/* Free a cache entry after an RCU-BH grace period. */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

/* Drop our reference and schedule the entry for RCU-deferred freeing. */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}
619
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggresively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.dst.rt_next;
}

/* Entries worth keeping: redirected/notify routes or ones with a
 * pending expiry stamp.
 */
static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}
633
634static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
635{
636 unsigned long age;
637 int ret = 0;
638
639 if (atomic_read(&rth->u.dst.__refcnt))
640 goto out;
641
642 ret = 1;
643 if (rth->u.dst.expires &&
644 time_after_eq(jiffies, rth->u.dst.expires))
645 goto out;
646
647 age = jiffies - rth->u.dst.lastuse;
648 ret = 0;
649 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
650 (age <= tmo2 && rt_valuable(rth)))
651 goto out;
652 ret = 1;
653out: return ret;
654}
655
656/* Bits of score are:
657 * 31: very valuable
658 * 30: not quite useless
659 * 29..0: usage counter
660 */
661static inline u32 rt_score(struct rtable *rt)
662{
663 u32 score = jiffies - rt->u.dst.lastuse;
664
665 score = ~score & ~(3<<30);
666
667 if (rt_valuable(rt))
668 score |= (1<<31);
669
670 if (!rt->fl.iif ||
671 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
672 score |= (1<<30);
673
674 return score;
675}
676
/* True while the namespace has not exceeded its rebuild budget and the
 * cache may still be used.
 */
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

/* Compare only the fields that feed rt_hash(): daddr, saddr, iif. */
static inline bool compare_hash_inputs(const struct flowi *fl1,
					const struct flowi *fl2)
{
	return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
		(fl1->iif ^ fl2->iif)) == 0);
}

/* Full flow-key equality: addresses, mark, tos+scope (compared as one
 * u16 — assumes the two bytes are adjacent in struct flowi), oif, iif.
 * Branch-free: XOR all fields, OR the results, compare with zero.
 */
static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}

/* Do both entries belong to the same network namespace? */
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
}

/* An entry is expired when its genid no longer matches its namespace's
 * current generation (i.e. a flush happened since it was created).
 */
static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
}
711
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the later case, we want to be reschedule if necessary
 */
static void rt_do_flush(int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;
	struct rtable * tail;

	for (i = 0; i <= rt_hash_mask; i++) {
		if (process_context && need_resched())
			cond_resched();
		/* Lockless peek: skip empty buckets without locking. */
		rth = rt_hash_table[i].chain;
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));
#ifdef CONFIG_NET_NS
		{
		struct rtable ** prev, * p;

		/* Re-read under the lock; only entries whose genid is
		 * stale (rt_is_expired) belong to the flushing namespace
		 * and must go; others are kept on the chain.
		 */
		rth = rt_hash_table[i].chain;

		/* defer releasing the head of the list after spin_unlock */
		for (tail = rth; tail; tail = tail->u.dst.rt_next)
			if (!rt_is_expired(tail))
				break;
		if (rth != tail)
			rt_hash_table[i].chain = tail;

		/* call rt_free on entries after the tail requiring flush */
		prev = &rt_hash_table[i].chain;
		for (p = *prev; p; p = next) {
			next = p->u.dst.rt_next;
			if (!rt_is_expired(p)) {
				prev = &p->u.dst.rt_next;
			} else {
				*prev = next;
				rt_free(p);
			}
		}
		}
#else
		/* Single namespace: detach the whole chain and free it
		 * below, outside the lock.
		 */
		rth = rt_hash_table[i].chain;
		rt_hash_table[i].chain = NULL;
		tail = NULL;
#endif
		spin_unlock_bh(rt_hash_lock_addr(i));

		/* Free the detached head segment [rth, tail). */
		for (; rth != tail; rth = next) {
			next = rth->u.dst.rt_next;
			rt_free(rth);
		}
	}
}
769
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This to have an estimation of rt_chain_length_max
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
780
/*
 * Incremental scan of the route hash table: evict expired entries and
 * maintain the chain-length statistics used to compute
 * rt_chain_length_max (see the FRACT_BITS comment above).  The static
 * "rover" remembers where the previous scan stopped so successive
 * calls eventually cover the whole table.
 */
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, *aux, **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;	/* chain lengths: sum, sum of squares */
	unsigned long delta;
	u64 mult;

	/* Scan a number of buckets proportional to the time elapsed since
	 * the previous scan, so the full table is covered roughly once per
	 * ip_rt_gc_timeout (capped at one full pass). */
	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (*rthp == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			prefetch(rth->u.dst.rt_next);
			if (rt_is_expired(rth)) {
				/* Invalidated entry: unlink and free. */
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->u.dst.expires)) {
nofree:
					/* Keep this entry; halve the timeout so
					 * deeper entries expire more readily. */
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					for (aux = rt_hash_table[i].chain;;) {
						if (aux == rth) {
							length += ONE;
							break;
						}
						if (compare_hash_inputs(&aux->fl, &rth->fl))
							break;
						aux = aux->u.dst.rt_next;
					}
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		/* rt_chain_length_max = max(elasticity, avg + 4*sd),
		 * computed in FRACT_BITS fixed point (see comment above). */
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					ip_rt_gc_elasticity,
					(avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
867
/*
 * rt_worker_func() is run in process context: scan part of the hash
 * table via rt_check_expire(), then re-arm the delayed work so we run
 * again after ip_rt_gc_interval.
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
877
Eric Dumazet29e75252008-01-31 17:05:09 -0800878/*
879 * Pertubation of rt_genid by a small quantity [1..256]
880 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
881 * many times (2^24) without giving recent rt_genid.
882 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700883 */
Denis V. Lunev86c657f2008-07-05 19:03:31 -0700884static void rt_cache_invalidate(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700885{
Eric Dumazet29e75252008-01-31 17:05:09 -0800886 unsigned char shuffle;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700887
Eric Dumazet29e75252008-01-31 17:05:09 -0800888 get_random_bytes(&shuffle, sizeof(shuffle));
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700889 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700890}
891
/*
 * delay < 0 : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay < 0)
		return;
	rt_do_flush(!in_softirq());
}
902
/* Flush previous cache invalidated entries from the cache.
 * May cond_resched() (via rt_do_flush) when not called from softirq. */
void rt_cache_flush_batch(void)
{
	rt_do_flush(!in_softirq());
}
908
Eric Dumazet29e75252008-01-31 17:05:09 -0800909/*
910 * We change rt_genid and let gc do the cleanup
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800911 */
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -0700912static void rt_secret_rebuild(unsigned long __net)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700913{
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -0700914 struct net *net = (struct net *)__net;
Denis V. Lunev86c657f2008-07-05 19:03:31 -0700915 rt_cache_invalidate(net);
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -0700916 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700917}
918
Neil Horman1080d702008-10-27 12:28:25 -0700919static void rt_secret_rebuild_oneshot(struct net *net)
920{
921 del_timer_sync(&net->ipv4.rt_secret_timer);
922 rt_cache_invalidate(net);
923 if (ip_rt_secret_interval) {
924 net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
925 add_timer(&net->ipv4.rt_secret_timer);
926 }
927}
928
/*
 * Called from rt_intern_hash() when a chain grows beyond
 * rt_chain_length_max: warn (rate-limited) and trigger an immediate
 * secret rebuild, which invalidates the whole cache.
 */
static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit()) {
		printk(KERN_WARNING "Route hash chain too long!\n");
		printk(KERN_WARNING "Adjust your secret_interval!\n");
	}

	rt_secret_rebuild_oneshot(net);
}
938
/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, when number of aged off entries
   is kept approximately equal to newly generated ones.

   Current expiration strength is variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expires is large enough to keep enough of warm entries,
   and when load increases it reduces to limit cache size.

   Returns 0 normally; returns 1 when the cache could not be shrunk
   below ip_rt_max_size ("dst cache overflow").
 */

static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;	/* current expiration strength */
	static unsigned long last_gc;
	static int rover;				/* resume point in the hash table */
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;					/* number of entries to evict */

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate number of entries, which we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		/* One pass over all buckets, starting after the previous
		 * rover position, evicting until goal is met. */
		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					/* Keep; halve tmo so deeper chain
					 * entries expire more readily. */
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire reduced to zero. Otherwise, expire is halved.
		   - table is not full.
		   - we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		   We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	/* Relax the expiration strength again after a successful run. */
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}
1073
/*
 * Insert @rt into hash bucket @hash, or reuse an equivalent entry that
 * is already cached.  On success the resulting route is handed back
 * either through @rp (when non-NULL) or attached to @skb via
 * skb_dst_set().  Returns 0 on success or a negative errno from
 * neighbour binding.
 */
static int rt_intern_hash(unsigned hash, struct rtable *rt,
			  struct rtable **rp, struct sk_buff *skb)
{
	struct rtable *rth, **rthp;
	unsigned long now;
	struct rtable *cand, **candp;	/* best unreferenced eviction candidate */
	u32 min_score;
	int chain_length;
	int attempts = !in_softirq();	/* GC retries allowed only in process context */

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->u.dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route. The
		 * caller hold the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching. Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note also the rt_free uses call_rcu. We don't actually
		 * need rcu protection here, this is just our path to get
		 * on the route gc list.
		 */

		if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
			int err = arp_bind_neighbour(&rt->u.dst);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				rt_drop(rt);
				return err;
			}
		}

		rt_free(rt);
		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
		if (rt_is_expired(rth)) {
			/* Opportunistically reap invalidated entries. */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->u.dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->u.dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			/* Existing entry wins; the new one is dropped. */
			rt_drop(rt);
			if (rp)
				*rp = rth;
			else
				skb_dst_set(skb, &rth->u.dst);
			return 0;
		}

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			/* Track the lowest-scoring unreferenced entry as an
			 * eviction candidate in case the chain is too long. */
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max) {
			/* No evictable entry and the chain is over limit:
			 * escalate to an emergency rebuild. */
			struct net *net = dev_net(rt->u.dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(dev_net(rt->u.dst.dev))) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->u.dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				/* Temporarily force aggressive GC, then
				 * restore the tunables and retry. */
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity = 1;
				ip_rt_gc_min_interval = 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval = saved_int;
				ip_rt_gc_elasticity = saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.dst.rt_next = rt_hash_table[hash].chain;

#if RT_CACHE_DEBUG >= 2
	if (rt->u.dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
		       hash, &rt->rt_dst);
		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
			printk(" . %pI4", &trt->rt_dst);
		printk("\n");
	}
#endif
	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (rp)
		*rp = rt;
	else
		skb_dst_set(skb, &rt->u.dst);
	return 0;
}
1260
1261void rt_bind_peer(struct rtable *rt, int create)
1262{
1263 static DEFINE_SPINLOCK(rt_peer_lock);
1264 struct inet_peer *peer;
1265
1266 peer = inet_getpeer(rt->rt_dst, create);
1267
1268 spin_lock_bh(&rt_peer_lock);
1269 if (rt->peer == NULL) {
1270 rt->peer = peer;
1271 peer = NULL;
1272 }
1273 spin_unlock_bh(&rt_peer_lock);
1274 if (peer)
1275 inet_putpeer(peer);
1276}
1277
1278/*
1279 * Peer allocation may fail only in serious out-of-memory conditions. However
1280 * we still can generate some output.
1281 * Random ID selection looks a bit dangerous because we have no chances to
1282 * select ID being unique in a reasonable period of time.
1283 * But broken packet identifier may be better than no packet at all.
1284 */
1285static void ip_select_fb_ident(struct iphdr *iph)
1286{
1287 static DEFINE_SPINLOCK(ip_fb_id_lock);
1288 static u32 ip_fallback_id;
1289 u32 salt;
1290
1291 spin_lock_bh(&ip_fb_id_lock);
Al Viroe4485152006-09-26 22:15:01 -07001292 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001293 iph->id = htons(salt & 0xFFFF);
1294 ip_fallback_id = salt;
1295 spin_unlock_bh(&ip_fb_id_lock);
1296}
1297
1298void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1299{
1300 struct rtable *rt = (struct rtable *) dst;
1301
1302 if (rt) {
1303 if (rt->peer == NULL)
1304 rt_bind_peer(rt, 1);
1305
1306 /* If peer is attached to destination, it is never detached,
1307 so that we need not to grab a lock to dereference it.
1308 */
1309 if (rt->peer) {
1310 iph->id = htons(inet_getid(rt->peer, more));
1311 return;
1312 }
1313 } else
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001314 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
Stephen Hemminger9c2b3322005-04-19 22:39:42 -07001315 __builtin_return_address(0));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001316
1317 ip_select_fb_ident(iph);
1318}
1319
1320static void rt_del(unsigned hash, struct rtable *rt)
1321{
Eric Dumazet29e75252008-01-31 17:05:09 -08001322 struct rtable **rthp, *aux;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001323
Eric Dumazet29e75252008-01-31 17:05:09 -08001324 rthp = &rt_hash_table[hash].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -07001325 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001326 ip_rt_put(rt);
Eric Dumazet29e75252008-01-31 17:05:09 -08001327 while ((aux = *rthp) != NULL) {
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001328 if (aux == rt || rt_is_expired(aux)) {
Eric Dumazet29e75252008-01-31 17:05:09 -08001329 *rthp = aux->u.dst.rt_next;
1330 rt_free(aux);
1331 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001332 }
Eric Dumazet29e75252008-01-31 17:05:09 -08001333 rthp = &aux->u.dst.rt_next;
1334 }
Eric Dumazet22c047c2005-07-05 14:55:24 -07001335 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001336}
1337
/*
 * Handle an ICMP redirect: for each cached route to @daddr via @old_gw
 * on @dev (matched for saddr/oif and their wildcard 0 variants), clone
 * the entry with its gateway replaced by @new_gw and swap the clone
 * into the hash table.  Implausible redirects (gateway unchanged,
 * multicast/broadcast/zero gateway, not on-link, ...) are rejected,
 * optionally with a rate-limited log message.
 */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	__be32 skeys[2] = { saddr, 0 };		/* exact source, then wildcard */
	int ikeys[2] = { dev->ifindex, 0 };	/* exact oif, then wildcard */
	struct netevent_redirect netevent;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	/* Basic sanity checks on the advertised gateway. */
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!rt_caching(net))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	/* Try all four (source, oif) key combinations. */
	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rthp=&rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    rt_is_expired(rth) ||
				    !net_eq(dev_net(rth->u.dst.dev), net)) {
					rthp = &rth->u.dst.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				/* Hold the entry so it survives outside the
				 * RCU read section while we build the clone. */
				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				rt->u.dst.__use = 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child = NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete = 0;
				rt->u.dst.lastuse = jiffies;
				rt->u.dst.path = &rt->u.dst;
				rt->u.dst.neighbour = NULL;
				rt->u.dst.hh = NULL;
#ifdef CONFIG_XFRM
				rt->u.dst.xfrm = NULL;
#endif
				rt->rt_genid = rt_genid(net);
				rt->rt_flags |= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway = new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				/* Clone is only usable with a valid neighbour
				 * for the new gateway; otherwise drop it and
				 * kick neighbour resolution. */
				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->u.dst;
				netevent.new = &rt->u.dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				/* Replace the old entry with the clone. */
				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt, NULL))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			" Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	in_dev_put(in_dev);
}
1476
/*
 * dst_ops->negative_advice callback: the caller suspects this route is
 * bad.  Obsolete entries are simply released; entries that came from a
 * redirect, or that carry an expiry, are deleted from the cache.
 * Returns the dst when it is kept, or NULL when it has been dropped.
 */
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->u.dst.expires) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif,
						rt_genid(dev_net(dst->dev)));
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
				&rt->rt_dst, rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}
1501
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	int log_martians;

	/* Snapshot the device flags we need under RCU, then drop the
	 * read lock before doing any work. */
	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->u.dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.  The delay doubles with each redirect already sent
	 * (rate_tokens), giving the exponential backoff described above.
	 */
	if (rt->u.dst.rate_tokens == 0 ||
	    time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
				&rt->rt_src, rt->rt_iif,
				&rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
1567
1568static int ip_error(struct sk_buff *skb)
1569{
Eric Dumazet511c3f92009-06-02 05:14:27 +00001570 struct rtable *rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001571 unsigned long now;
1572 int code;
1573
1574 switch (rt->u.dst.error) {
1575 case EINVAL:
1576 default:
1577 goto out;
1578 case EHOSTUNREACH:
1579 code = ICMP_HOST_UNREACH;
1580 break;
1581 case ENETUNREACH:
1582 code = ICMP_NET_UNREACH;
Pavel Emelyanov7c73a6f2008-07-16 20:20:11 -07001583 IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1584 IPSTATS_MIB_INNOROUTES);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001585 break;
1586 case EACCES:
1587 code = ICMP_PKT_FILTERED;
1588 break;
1589 }
1590
1591 now = jiffies;
1592 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1593 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1594 rt->u.dst.rate_tokens = ip_rt_error_burst;
1595 rt->u.dst.rate_last = now;
1596 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1597 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1598 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1599 }
1600
1601out: kfree_skb(skb);
1602 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001603}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001604
/*
 * MTU "plateau" values used by guess_mtu() when estimating a path MTU
 * from a too-big datagram size.
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1612
Stephen Hemminger5969f712008-04-10 01:52:09 -07001613static inline unsigned short guess_mtu(unsigned short old_mtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001614{
1615 int i;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001616
Linus Torvalds1da177e2005-04-16 15:20:36 -07001617 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1618 if (old_mtu > mtu_plateau[i])
1619 return mtu_plateau[i];
1620 return 68;
1621}
1622
/*
 * Apply a reported path-MTU reduction for the flow described by @iph:
 * scan the route cache for matching output entries (exact and wildcard
 * source / ifindex keys) and lower their cached MTU metric to @new_mtu,
 * or to a guessed plateau value when @new_mtu is absent or bogus.
 *
 * Returns the MTU that was recorded, or @new_mtu when no cache entry
 * matched.
 */
unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	int i, k;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	/* Try both the exact output ifindex and the wildcard (0), and
	 * both the exact source address and the wildcard (0).
	 */
	int ikeys[2] = { dev->ifindex, 0 };
	__be32 skeys[2] = { iph->saddr, 0, };
	__be32 daddr = iph->daddr;
	unsigned short est_mtu = 0;

	for (k = 0; k < 2; k++) {
		for (i = 0; i < 2; i++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			/* Walk the hash chain under RCU; only output routes
			 * (iif == 0) of this netns with an unlocked MTU
			 * metric are eligible.
			 */
			rcu_read_lock();
			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
			     rth = rcu_dereference(rth->u.dst.rt_next)) {
				unsigned short mtu = new_mtu;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->rt_dst != daddr ||
				    rth->rt_src != iph->saddr ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
				    !net_eq(dev_net(rth->u.dst.dev), net) ||
				    rt_is_expired(rth))
					continue;

				/* Router gave no usable MTU: derive one from
				 * the offending datagram's size via the
				 * plateau table.
				 */
				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= dst_mtu(&rth->u.dst) &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= dst_mtu(&rth->u.dst)) {
					if (mtu < dst_mtu(&rth->u.dst)) {
						dst_confirm(&rth->u.dst);
						/* Clamp to the configured
						 * floor and lock the metric
						 * so it cannot shrink more.
						 */
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
			rcu_read_unlock();
		}
	}
	return est_mtu ? : new_mtu;
}
1686
1687static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1688{
Rami Rosen6d273f82008-08-06 02:33:49 -07001689 if (dst_mtu(dst) > mtu && mtu >= 68 &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001690 !(dst_metric_locked(dst, RTAX_MTU))) {
1691 if (mtu < ip_rt_min_pmtu) {
1692 mtu = ip_rt_min_pmtu;
1693 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1694 }
1695 dst->metrics[RTAX_MTU-1] = mtu;
1696 dst_set_expires(dst, ip_rt_mtu_expires);
Tom Tucker8d717402006-07-30 20:43:36 -07001697 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001698 }
1699}
1700
1701static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1702{
1703 return NULL;
1704}
1705
1706static void ipv4_dst_destroy(struct dst_entry *dst)
1707{
1708 struct rtable *rt = (struct rtable *) dst;
1709 struct inet_peer *peer = rt->peer;
1710 struct in_device *idev = rt->idev;
1711
1712 if (peer) {
1713 rt->peer = NULL;
1714 inet_putpeer(peer);
1715 }
1716
1717 if (idev) {
1718 rt->idev = NULL;
1719 in_dev_put(idev);
1720 }
1721}
1722
1723static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1724 int how)
1725{
1726 struct rtable *rt = (struct rtable *) dst;
1727 struct in_device *idev = rt->idev;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001728 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
Denis V. Lunev5a3e55d2007-12-07 00:38:10 -08001729 struct in_device *loopback_idev =
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001730 in_dev_get(dev_net(dev)->loopback_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001731 if (loopback_idev) {
1732 rt->idev = loopback_idev;
1733 in_dev_put(idev);
1734 }
1735 }
1736}
1737
1738static void ipv4_link_failure(struct sk_buff *skb)
1739{
1740 struct rtable *rt;
1741
1742 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1743
Eric Dumazet511c3f92009-06-02 05:14:27 +00001744 rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001745 if (rt)
1746 dst_set_expires(&rt->u.dst, 0);
1747}
1748
1749static int ip_rt_bug(struct sk_buff *skb)
1750{
Harvey Harrison673d57e2008-10-31 00:53:57 -07001751 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1752 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001753 skb->dev ? skb->dev->name : "?");
1754 kfree_skb(skb);
1755 return 0;
1756}
1757
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1766
1767void ip_rt_get_source(u8 *addr, struct rtable *rt)
1768{
Al Viroa61ced52006-09-26 21:27:54 -07001769 __be32 src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001770 struct fib_result res;
1771
1772 if (rt->fl.iif == 0)
1773 src = rt->rt_src;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001774 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001775 src = FIB_RES_PREFSRC(res);
1776 fib_res_put(&res);
1777 } else
1778 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1779 RT_SCOPE_UNIVERSE);
1780 memcpy(addr, &src, 4);
1781}
1782
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's traffic-class id: each 16-bit half of
 * tclassid is filled from @tag only if it is still zero, so values
 * already present are never overwritten.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 tclassid = rt->u.dst.tclassid;

	if (!(tclassid & 0xFFFF))
		tclassid |= tag & 0xFFFF;
	if (!(tclassid & 0xFFFF0000))
		tclassid |= tag & 0xFFFF0000;
	rt->u.dst.tclassid = tclassid;
}
#endif
1792
/*
 * Finalize a freshly built rtable from the FIB lookup result @res:
 * pick up the gateway, copy/derive the metrics array, clamp MTU and
 * advertised MSS into valid ranges, apply class tags, and set the
 * route type.  @itag is the incoming class tag from source validation.
 */
static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		/* Use the FIB next hop as gateway only for link-scope hops. */
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->u.dst.metrics, fi->fib_metrics,
		       sizeof(rt->u.dst.metrics));
		if (fi->fib_mtu == 0) {
			/* No MTU configured on the FIB entry: inherit the
			 * device MTU, but fall back to 576 for locked-MTU
			 * gatewayed routes on larger-MTU devices.
			 */
			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.dev->mtu > 576)
				rt->u.dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;

	/* Clamp the derived metrics into sane ranges. */
	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
				       ip_rt_min_advmss);
	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}
1834
/*
 * Build and cache an input route for a multicast packet arriving on
 * @dev.  @our is nonzero when the local host is a member of the group,
 * in which case the route also delivers locally.  Returns 0 or a
 * negative errno.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = in_dev_get(dev);
	u32 itag = 0;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	/* Multicast, broadcast or loopback sources are never valid,
	 * and only IP packets are routed here.
	 */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		/* A zero source is acceptable only for link-local groups. */
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0,
					dev, &spec_dst, &itag, 0) < 0)
		goto e_inval;

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	/* Output through this route would be a bug; trap it. */
	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark    = skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= init_net.loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	if (our) {
		/* We are a group member: deliver locally as well. */
		rth->u.dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	/* Non-link-local groups on a multicast-forwarding device go
	 * through the multicast routing engine.
	 */
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	in_dev_put(in_dev);
	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	return rt_intern_hash(hash, rth, NULL, skb);

e_nobufs:
	in_dev_put(in_dev);
	return -ENOBUFS;

e_inval:
	in_dev_put(in_dev);
	return -EINVAL;
}
1914
1915
/*
 * Account and (optionally, rate-limited) log a packet whose source
 * address failed validation, dumping the link-layer header as a
 * colon-separated hex string since that is the only trustworthy hint.
 */
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			/* Bytes separated by ':', no separator after the last. */
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
1945
/*
 * Build (but do not cache) a forwarding rtable for an input packet,
 * based on the FIB lookup result @res.  On success *@result holds the
 * new entry with one reference taken; returns 0 or a negative errno.
 */
static int __mkroute_input(struct sk_buff *skb,
			   struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{

	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}


	/* Validate the source address; a positive return means the source
	 * is directly reachable (used for the redirect decision below).
	 */
	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag, skb->mark);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		err = -EINVAL;
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	/* Packet would leave on the interface it arrived on from a
	 * directly connected source: candidate for an ICMP redirect.
	 */
	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark    = skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
	rth->rt_iif 	=
		rth->fl.iif	= in_dev->dev->ifindex;
	rth->u.dst.dev	= (out_dev)->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif 	= 0;
	rth->rt_spec_dst= spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;
	rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));

	/* Fill gateway, metrics, class tags and route type from the FIB. */
	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002046
Stephen Hemminger5969f712008-04-10 01:52:09 -07002047static int ip_mkroute_input(struct sk_buff *skb,
2048 struct fib_result *res,
2049 const struct flowi *fl,
2050 struct in_device *in_dev,
2051 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002052{
Chuck Short7abaa272005-06-22 22:10:23 -07002053 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002054 int err;
2055 unsigned hash;
2056
2057#ifdef CONFIG_IP_ROUTE_MULTIPATH
2058 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2059 fib_select_multipath(fl, res);
2060#endif
2061
2062 /* create a routing cache entry */
2063 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2064 if (err)
2065 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002066
2067 /* put it into the cache */
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002068 hash = rt_hash(daddr, saddr, fl->iif,
2069 rt_genid(dev_net(rth->u.dst.dev)));
Eric Dumazet511c3f92009-06-02 05:14:27 +00002070 return rt_intern_hash(hash, rth, NULL, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002071}
2072
/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must already have the correct destination attached by the output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */
2082
/*
 * Slow-path input routing: validate the addresses, consult the FIB,
 * and build + cache an rtable for local delivery, broadcast, or
 * forwarding (via ip_mkroute_input).  Returns 0 or a negative errno.
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
				      } },
			    .mark = skb->mark,
			    .iif = dev->ifindex };
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	int		free_res = 0;	/* res holds a fib reference to drop */
	struct net    * net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	/* Limited broadcast (or all-zero src+dst) short-circuits the FIB. */
	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
	    ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	if ((err = fib_lookup(net, &fl, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos,
					     net->loopback_dev->ifindex,
					     dev, &spec_dst, &itag, skb->mark);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	/* Unicast forwarding: build and cache the route. */
	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
done:
	in_dev_put(in_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

brd_input:
	/* Only real IP packets may be broadcast-delivered. */
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag, skb->mark);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	/* Build a local-delivery (or error-reporting) cache entry. */
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;
	rth->rt_genid = rt_genid(net);

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark    = skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= net->loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input= ip_local_deliver;
	rth->rt_flags 	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		/* Cache a negative entry that emits ICMP errors on input. */
		rth->u.dst.input= ip_error;
		rth->u.dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
	err = rt_intern_hash(hash, rth, NULL, skb);
	goto done;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto done;

e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}
2264
/*
 * Input routing entry point: try the route cache first (attaching the
 * cached dst to the skb on a hit), then fall back to the multicast
 * path or the full slow path.  Returns 0 or a negative errno.
 */
int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable * rth;
	unsigned	hash;
	int iif = dev->ifindex;
	struct net *net;

	net = dev_net(dev);

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		/* Branch-free key compare: OR together the XOR deltas of
		 * all flow fields; zero means every field matched (oif
		 * must itself be 0 for an input route).
		 */
		if (((rth->fl.fl4_dst ^ daddr) |
		     (rth->fl.fl4_src ^ saddr) |
		     (rth->fl.iif ^ iif) |
		     rth->fl.oif |
		     (rth->fl.fl4_tos ^ tos)) == 0 &&
		    rth->fl.mark == skb->mark &&
		    net_eq(dev_net(rth->u.dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->u.dst, jiffies);
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			skb_dst_set(skb, &rth->u.dst);
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}
	rcu_read_unlock();

skip_cache:
	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As result the host on multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   comparing with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev;

		rcu_read_lock();
		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
			int our = ip_check_mc(in_dev, daddr, saddr,
				ip_hdr(skb)->protocol);
			/* Route it when we are a member, or (with MROUTE)
			 * when the device forwards non-link-local groups.
			 */
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				rcu_read_unlock();
				return ip_route_input_mc(skb, daddr, saddr,
							 tos, dev, our);
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
2338
/*
 * __mkroute_output - build a new output routing cache entry.
 * @result:  on success, receives the newly allocated rtable (refcnt 1)
 * @res:     FIB lookup result; res->fi may be dropped (see below)
 * @fl:      resolved flow key (addresses already filled in)
 * @oldflp:  the caller's original flow key
 * @dev_out: output device (caller holds a reference)
 * @flags:   initial RTCF_* flags
 *
 * Returns 0 and sets *result, or a negative errno (-EINVAL for
 * invalid src/dst combinations, -ENOBUFS if dst_alloc fails).
 * Takes its own temporary reference on dev_out's in_device for the
 * duration of the call, and a persistent one stored in rth->idev.
 */
static int __mkroute_output(struct rtable **result,
			    struct fib_result *res,
			    const struct flowi *fl,
			    const struct flowi *oldflp,
			    struct net_device *dev_out,
			    unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);
	int err = 0;

	/* Loopback source addresses may only leave via a loopback device. */
	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
		return -EINVAL;

	/* Classify destination: 255.255.255.255 is broadcast, multicast
	 * ranges are multicast; other link-broadcast or zeronet dsts are
	 * rejected outright. */
	if (fl->fl4_dst == htonl(0xFFFFFFFF))
		res->type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	/* get work reference to inet device */
	in_dev = in_dev_get(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		/* Broadcast routes never use FIB nexthop info. */
		if (res->fi) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST|RTCF_LOCAL;
		/* Only deliver locally if we are a member of the group. */
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If multicast route do not exist use
		   default one, but do not gateway in this case.
		   Yes, it is hack.
		 */
		if (res->fi && res->prefixlen < 4) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	/* Entry is born with one reference for the caller. */
	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;

	/* Cache key is the caller's original flow; rt_dst/rt_src are the
	 * resolved addresses. */
	rth->fl.fl4_dst	= oldflp->fl4_dst;
	rth->fl.fl4_tos	= tos;
	rth->fl.fl4_src	= oldflp->fl4_src;
	rth->fl.oif	= oldflp->oif;
	rth->fl.mark    = oldflp->mark;
	rth->rt_dst	= fl->fl4_dst;
	rth->rt_src	= fl->fl4_src;
	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be hold by the routing
	   cache entry */
	rth->u.dst.dev	= dev_out;
	dev_hold(dev_out);
	rth->idev	= in_dev_get(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst= fl->fl4_src;

	rth->u.dst.output=ip_output;
	rth->rt_genid = rt_genid(dev_net(dev_out));

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		/* Locally-delivered bcast/mcast on a real device still needs
		 * the multicast output path for the on-wire copy. */
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->u.dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;

	*result = rth;
 cleanup:
	/* release work reference to inet device */
	in_dev_put(in_dev);

	return err;
}
2458
Stephen Hemminger5969f712008-04-10 01:52:09 -07002459static int ip_mkroute_output(struct rtable **rp,
2460 struct fib_result *res,
2461 const struct flowi *fl,
2462 const struct flowi *oldflp,
2463 struct net_device *dev_out,
2464 unsigned flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002465{
Chuck Short7abaa272005-06-22 22:10:23 -07002466 struct rtable *rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002467 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2468 unsigned hash;
2469 if (err == 0) {
Denis V. Lunevb00180d2008-07-05 19:04:09 -07002470 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002471 rt_genid(dev_net(dev_out)));
Eric Dumazet511c3f92009-06-02 05:14:27 +00002472 err = rt_intern_hash(hash, rth, rp, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002473 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002474
Linus Torvalds1da177e2005-04-16 15:20:36 -07002475 return err;
2476}
2477
Linus Torvalds1da177e2005-04-16 15:20:36 -07002478/*
2479 * Major route resolver routine.
2480 */
2481
/*
 * ip_route_output_slow - major output route resolver (cache-miss path).
 * @net:    network namespace to resolve in
 * @rp:     on success, receives the cached rtable
 * @oldflp: caller's flow key; zero saddr/daddr mean "choose for me"
 *
 * Resolves source address, output device and route type from the FIB,
 * then builds and caches the entry via ip_mkroute_output().
 * Returns 0 or a negative errno (-EINVAL, -ENODEV, -ENETUNREACH, ...).
 * dev_out reference counting is hand-rolled: every path that assigns
 * dev_out first drops any previous reference.
 */
static int ip_route_output_slow(struct net *net, struct rtable **rp,
				const struct flowi *oldflp)
{
	u32 tos	= RT_FL_TOS(oldflp);
	/* RTO_ONLINK in the tos forces link scope (no gateway). */
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = oldflp->fl4_dst,
					.saddr = oldflp->fl4_src,
					.tos = tos & IPTOS_RT_MASK,
					.scope = ((tos & RTO_ONLINK) ?
						  RT_SCOPE_LINK :
						  RT_SCOPE_UNIVERSE),
				      } },
			    .mark = oldflp->mark,
			    .iif = net->loopback_dev->ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned flags = 0;
	struct net_device *dev_out = NULL;
	int free_res = 0;	/* set once res holds a FIB reference */
	int err;


	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		/* Source must be a sensible unicast address. */
		if (ipv4_is_multicast(oldflp->fl4_src) ||
		    ipv4_is_lbcast(oldflp->fl4_src) ||
		    ipv4_is_zeronet(oldflp->fl4_src))
			goto out;

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (oldflp->oif == 0 &&
		    (ipv4_is_multicast(oldflp->fl4_dst) ||
		     oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = ip_dev_find(net, oldflp->fl4_src);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic,vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is good workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = ip_dev_find(net, oldflp->fl4_src);
			if (dev_out == NULL)
				goto out;
			/* Only validating the address is local; drop ref. */
			dev_put(dev_out);
			dev_out = NULL;
		}
	}


	if (oldflp->oif) {
		dev_out = dev_get_by_index(net, oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (__in_dev_get_rtnl(dev_out) == NULL) {
			dev_put(dev_out);
			goto out;	/* Wrong error code */
		}

		/* Link-local mcast / limited bcast: route on-link without
		 * a FIB lookup, picking a link-scope source if needed. */
		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (ipv4_is_multicast(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	/* No destination at all: route to ourselves via loopback. */
	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		if (dev_out)
			dev_put(dev_out);
		dev_out = net->loopback_dev;
		dev_hold(dev_out);
		fl.oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, &fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, routing tables are wrong. Assume,
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		err = -ENETUNREACH;
		goto out;
	}
	/* From here on res holds FIB state that must be released. */
	free_res = 1;

	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src)
			fl.fl4_src = fl.fl4_dst;
		if (dev_out)
			dev_put(dev_out);
		dev_out = net->loopback_dev;
		dev_hold(dev_out);
		fl.oif = dev_out->ifindex;
		if (res.fi)
			fib_info_put(res.fi);
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	/* Default route (prefixlen 0) with no oif: let FIB pick one. */
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(net, &fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	if (dev_out)
		dev_put(dev_out);
	dev_out = FIB_RES_DEV(res);
	dev_hold(dev_out);
	fl.oif = dev_out->ifindex;


make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);


	if (free_res)
		fib_res_put(&res);
	if (dev_out)
		dev_put(dev_out);
out:	return err;
}
2682
/*
 * __ip_route_output_key - output route lookup, cache first.
 * @net: network namespace
 * @rp:  on success, receives a referenced rtable
 * @flp: flow key to match
 *
 * Walks the per-hash-bucket chain under rcu_read_lock_bh(), matching
 * on dst/src/oif/mark/tos, namespace and generation.  Falls back to
 * ip_route_output_slow() on a miss or when caching is disabled.
 * Returns 0 or a negative errno from the slow path.
 */
int __ip_route_output_key(struct net *net, struct rtable **rp,
			  const struct flowi *flp)
{
	unsigned hash;
	struct rtable *rth;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference(rth->u.dst.rt_next)) {
		/* iif == 0 restricts the match to output routes. */
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rth->fl.iif == 0 &&
		    rth->fl.oif == flp->oif &&
		    rth->fl.mark == flp->mark &&
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->u.dst.dev), net) &&
		    !rt_is_expired(rth)) {
			/* dst_use bumps refcnt and freshness before we
			 * leave the RCU section. */
			dst_use(&rth->u.dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, rp, flp);
}

EXPORT_SYMBOL_GPL(__ip_route_output_key);
2721
/* Blackhole dst entries intentionally ignore PMTU updates: they exist
 * only to absorb packets, so there is no path state to adjust. */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
2725
/* dst_ops for blackhole routes (see ipv4_dst_blackhole()): shares the
 * regular IPv4 destroy/check hooks but swallows PMTU updates. */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_dst_check,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.entries		=	ATOMIC_INIT(0),
};
2734
2735
/*
 * ipv4_dst_blackhole - replace *rp with a blackhole copy of itself.
 *
 * Clones the route's metadata into a new dst whose input/output both
 * discard packets (used when an xfrm lookup returns -EREMOTE).  The
 * original route's reference is always released; on allocation failure
 * *rp becomes NULL and -ENOMEM is returned, otherwise 0.
 */
static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
{
	struct rtable *ort = *rp;
	struct rtable *rt = (struct rtable *)
		dst_alloc(&ipv4_dst_blackhole_ops);

	if (rt) {
		struct dst_entry *new = &rt->u.dst;

		atomic_set(&new->__refcnt, 1);
		new->__use = 1;
		/* Both directions silently drop. */
		new->input = dst_discard;
		new->output = dst_discard;
		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));

		new->dev = ort->u.dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->fl = ort->fl;

		rt->idev = ort->idev;
		if (rt->idev)
			in_dev_hold(rt->idev);
		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_iif = ort->rt_iif;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);

		/* Not in the routing cache: mark for direct freeing once
		 * the last reference goes away. */
		dst_free(new);
	}

	dst_release(&(*rp)->u.dst);
	*rp = rt;
	return (rt ? 0 : -ENOMEM);
}
2779
Denis V. Lunevf1b050b2008-01-22 22:07:10 -08002780int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2781 struct sock *sk, int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002782{
2783 int err;
2784
Denis V. Lunevf1b050b2008-01-22 22:07:10 -08002785 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002786 return err;
2787
2788 if (flp->proto) {
2789 if (!flp->fl4_src)
2790 flp->fl4_src = (*rp)->rt_src;
2791 if (!flp->fl4_dst)
2792 flp->fl4_dst = (*rp)->rt_dst;
Alexey Dobriyan52479b62008-11-25 17:35:18 -08002793 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
Herbert Xubb728452007-12-12 18:48:58 -08002794 flags ? XFRM_LOOKUP_WAIT : 0);
David S. Miller14e50e52007-05-24 18:17:54 -07002795 if (err == -EREMOTE)
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002796 err = ipv4_dst_blackhole(net, rp, flp);
David S. Miller14e50e52007-05-24 18:17:54 -07002797
2798 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002799 }
2800
2801 return 0;
2802}
2803
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002804EXPORT_SYMBOL_GPL(ip_route_output_flow);
2805
/* Convenience wrapper: route lookup with no socket and no xfrm flags. */
int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(net, rp, flp, NULL, 0);
}
2810
/*
 * rt_fill_info - encode skb's route as an RTM_* netlink message.
 * @net:    namespace the route belongs to
 * @skb:    message buffer; its dst (skb_rtable) is the route to dump
 * @pid/@seq/@event/@flags: netlink header fields
 * @nowait: input-route multicast case: do not block in ipmr_get_route
 *
 * Returns the message length on success, 0 if an unresolved mroute is
 * queued for later (nowait == 0), or -EMSGSIZE when skb has no room
 * (the partial message is cancelled via the NLA_PUT goto label).
 */
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->fl.fl4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	/* Cache entries are always reported as cloned routes. */
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
#endif
	/* Preferred source: spec_dst for input routes, resolved source
	 * for output routes when it differs from the flow's. */
	if (rt->fl.iif)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto nla_put_failure;

	error = rt->u.dst.error;
	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
	if (rt->peer) {
		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		/* Forwardable multicast: ask the mroute code to fill in
		 * the real input interface (may sleep unless nowait). */
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2905
/*
 * inet_rtm_getroute - RTM_GETROUTE handler: resolve one route on demand.
 *
 * Builds a dummy skb carrying just enough of an IP header, performs
 * either an input-route lookup (RTA_IIF set) or an output-route lookup,
 * encodes the result with rt_fill_info() and unicasts the reply to the
 * requesting socket.  Returns 0 or a negative errno; the reply skb is
 * freed on all error paths via errout_free.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;

	if (iif) {
		/* Input-route query: simulate reception on device iif. */
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		/* ip_route_input expects softirq context. */
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		/* Output-route query. */
		struct flowi fl = {
			.nl_u = {
				.ip4_u = {
					.daddr = dst,
					.saddr = src,
					.tos = rtm->rtm_tos,
				},
			},
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
		};
		err = ip_route_output_key(net, &rt, &fl);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->u.dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
2996
/*
 * ip_rt_dump - netlink dump callback for the routing cache.
 *
 * Iterates every hash bucket and chain entry under rcu_read_lock_bh(),
 * emitting one RTM_NEWROUTE per live entry belonging to this netns.
 * Resumption state (bucket, index) is kept in cb->args[0..1] so a
 * filled skb can be continued on the next call.  Returns skb->len.
 */
int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
			/* Skip other namespaces and already-dumped entries. */
			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			/* rt_fill_info reads the route via skb's dst; take a
			 * clone reference for the duration of the fill. */
			skb_dst_set(skb, dst_clone(&rt->u.dst));
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
3038
/* Multicast group membership changed on in_dev: flush the namespace's
 * route cache immediately (delay 0) so stale mcast entries disappear. */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
3043
3044#ifdef CONFIG_SYSCTL
/*
 * ipv4_sysctl_rtcache_flush - handler for net.ipv4.route.flush.
 *
 * Write-only sysctl: the written integer is the flush delay passed to
 * rt_cache_flush() for the namespace stashed in ->extra1.  A copy of
 * the ctl_table is used so ->data can point at the local flush_delay.
 * Reads return -EINVAL.
 *
 * NOTE(review): the proc_dointvec() return value is ignored, so a
 * malformed write still triggers a flush with an uninitialized delay —
 * confirm whether this is intentional best-effort behavior.
 */
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
3065
/*
 * rt_secret_reschedule - apply a changed ip_rt_secret_interval to every
 * namespace's secret-rehash timer.
 * @old: the previous interval value (jiffies)
 *
 * Runs under RTNL so the net list is stable.  For each namespace the
 * pending timer is stopped; if the new interval is nonzero the timer is
 * re-armed — shifted by the interval delta when it was pending, or set
 * a full new interval out when it was not.  A new interval of zero
 * leaves all timers disabled.
 */
static void rt_secret_reschedule(int old)
{
	struct net *net;
	int new = ip_rt_secret_interval;
	int diff = new - old;

	if (!diff)
		return;

	rtnl_lock();
	for_each_net(net) {
		int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);

		if (!new)
			continue;

		if (deleted) {
			/* Preserve the remaining time, adjusted by the
			 * interval change; clamp at "fire now". */
			long time = net->ipv4.rt_secret_timer.expires - jiffies;

			if (time <= 0 || (time += diff) <= 0)
				time = 0;

			net->ipv4.rt_secret_timer.expires = time;
		} else
			net->ipv4.rt_secret_timer.expires = new;

		net->ipv4.rt_secret_timer.expires += jiffies;
		add_timer(&net->ipv4.rt_secret_timer);
	}
	rtnl_unlock();
}
3097
3098static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
Herbert Xuc6153b52008-08-15 13:44:31 -07003099 void __user *buffer, size_t *lenp,
3100 loff_t *ppos)
3101{
3102 int old = ip_rt_secret_interval;
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003103 int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
Herbert Xuc6153b52008-08-15 13:44:31 -07003104
3105 rt_secret_reschedule(old);
3106
3107 return ret;
3108}
3109
/*
 * Tunables exposed under /proc/sys/net/ipv4/route/ (global, not
 * per-namespace; the per-namespace "flush" entry lives in
 * ipv4_route_flush_table below).
 */
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Same variable as above, exposed in milliseconds. */
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Custom handler: reschedules the rekey timers on change. */
		.procname	= "secret_interval",
		.data		= &ip_rt_secret_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= ipv4_sysctl_rt_secret_interval,
	},
	{ }
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003227
/* Placeholder child table: "neigh" gets populated elsewhere at runtime. */
static struct ctl_table empty[1];

/* Skeleton registered early so net.ipv4.{route,neigh} directories exist
 * before the rest of ipv4 init runs (see ip_static_sysctl_init()). */
static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};
3238
/* Mount point for ipv4_skeleton: /proc/sys/net/ipv4/. */
static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003244
/* Per-namespace write-only "flush" entry; ->extra1 is filled in with the
 * owning struct net by sysctl_route_net_init(). */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
3254
/* Mount point for the flush table: /proc/sys/net/ipv4/route/. */
static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
3261
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003262static __net_init int sysctl_route_net_init(struct net *net)
3263{
3264 struct ctl_table *tbl;
3265
3266 tbl = ipv4_route_flush_table;
Octavian Purdila09ad9bc2009-11-25 15:14:13 -08003267 if (!net_eq(net, &init_net)) {
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003268 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3269 if (tbl == NULL)
3270 goto err_dup;
3271 }
3272 tbl[0].extra1 = net;
3273
3274 net->ipv4.route_hdr =
3275 register_net_sysctl_table(net, ipv4_route_path, tbl);
3276 if (net->ipv4.route_hdr == NULL)
3277 goto err_reg;
3278 return 0;
3279
3280err_reg:
3281 if (tbl != ipv4_route_flush_table)
3282 kfree(tbl);
3283err_dup:
3284 return -ENOMEM;
3285}
3286
3287static __net_exit void sysctl_route_net_exit(struct net *net)
3288{
3289 struct ctl_table *tbl;
3290
3291 tbl = net->ipv4.route_hdr->ctl_table_arg;
3292 unregister_net_sysctl_table(net->ipv4.route_hdr);
3293 BUG_ON(tbl == ipv4_route_flush_table);
3294 kfree(tbl);
3295}
3296
/* Pernet hooks for the per-namespace route sysctls. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
Linus Torvalds1da177e2005-04-16 15:20:36 -07003301#endif
3302
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003303
3304static __net_init int rt_secret_timer_init(struct net *net)
3305{
Denis V. Luneve84f84f2008-07-05 19:04:32 -07003306 atomic_set(&net->ipv4.rt_genid,
3307 (int) ((num_physpages ^ (num_physpages>>8)) ^
3308 (jiffies ^ (jiffies >> 7))));
3309
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003310 net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3311 net->ipv4.rt_secret_timer.data = (unsigned long)net;
3312 init_timer_deferrable(&net->ipv4.rt_secret_timer);
3313
Herbert Xuc6153b52008-08-15 13:44:31 -07003314 if (ip_rt_secret_interval) {
3315 net->ipv4.rt_secret_timer.expires =
3316 jiffies + net_random() % ip_rt_secret_interval +
3317 ip_rt_secret_interval;
3318 add_timer(&net->ipv4.rt_secret_timer);
3319 }
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003320 return 0;
3321}
3322
/* Per-namespace teardown: make sure the rekey timer is stopped (and any
 * running handler finished) before the namespace goes away. */
static __net_exit void rt_secret_timer_exit(struct net *net)
{
	del_timer_sync(&net->ipv4.rt_secret_timer);
}

/* Pernet hooks for the per-namespace secret-rekey timer. */
static __net_initdata struct pernet_operations rt_secret_timer_ops = {
	.init = rt_secret_timer_init,
	.exit = rt_secret_timer_exit,
};
3332
3333
#ifdef CONFIG_NET_CLS_ROUTE
/* Per-CPU route classifier accounting; allocated in ip_rt_init(). */
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
3337
/* Route-cache hash size override from the "rhash_entries=" boot option;
 * 0 means "auto-size" (see alloc_large_system_hash() in ip_rt_init()). */
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;	/* consumed the option */
}
__setup("rhash_entries=", set_rhash_entries);
3347
/*
 * Boot-time initialization of the IPv4 routing layer: dst caches, the
 * route-cache hash table, devinet/fib, the periodic GC work, the rekey
 * timer, procfs, xfrm and the sysctls.  NOTE(review): ordering here is
 * deliberate and fragile (see ip_static_sysctl_init's comment below) —
 * do not reorder without care.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_NET_CLS_ROUTE
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole dsts share the same slab as regular routes. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	/* Size the route cache hash from available memory unless the
	 * "rhash_entries=" boot option forced a size. */
	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	/* Derive GC thresholds from the actual hash size. */
	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* All the timers, started at system startup tend
	   to synchronize. Perturb it a bit.
	 */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
			net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	/* Non-fatal failures below: log and keep booting. */
	if (register_pernet_subsys(&rt_secret_timer_ops))
		printk(KERN_ERR "Unable to setup rt_secret_timer\n");

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	return rc;
}
3407
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
/* Register the net.ipv4.{route,neigh} sysctl skeleton early in boot so
 * later code can attach entries under those directories. */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif
Al Viroeeb61f72008-07-27 08:59:33 +01003418
/* Exports for modules (e.g. tunnels) that resolve and use IPv4 routes. */
EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);