blob: f7ff2a64a7f054488cdaba6a32e0c986287642d3 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 * IPv4 specific functions
11 *
12 *
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
17 *
18 * See tcp.c for author information
19 *
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
24 */
25
26/*
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
36 * ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -070039 * request_sock handling and moved
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
Stephen Hemmingercaa20d9a2005-11-10 17:13:47 -080042 * Added new listen semantics.
Linus Torvalds1da177e2005-04-16 15:20:36 -070043 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
47 * coma.
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
53 */
54
Linus Torvalds1da177e2005-04-16 15:20:36 -070055
56#include <linux/types.h>
57#include <linux/fcntl.h>
58#include <linux/module.h>
59#include <linux/random.h>
60#include <linux/cache.h>
61#include <linux/jhash.h>
62#include <linux/init.h>
63#include <linux/times.h>
64
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020065#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070066#include <net/icmp.h>
Arnaldo Carvalho de Melo304a1612005-08-09 19:59:20 -070067#include <net/inet_hashtables.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070068#include <net/tcp.h>
Arnaldo Carvalho de Melo20380732005-08-16 02:18:02 -030069#include <net/transp_v6.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070070#include <net/ipv6.h>
71#include <net/inet_common.h>
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -080072#include <net/timewait_sock.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070073#include <net/xfrm.h>
Chris Leech1a2449a2006-05-23 18:05:53 -070074#include <net/netdma.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070075
76#include <linux/inet.h>
77#include <linux/ipv6.h>
78#include <linux/stddef.h>
79#include <linux/proc_fs.h>
80#include <linux/seq_file.h>
81
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -080082#include <linux/crypto.h>
83#include <linux/scatterlist.h>
84
Brian Haleyab32ea52006-09-22 14:15:41 -070085int sysctl_tcp_tw_reuse __read_mostly;
86int sysctl_tcp_low_latency __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070087
88/* Check TCP sequence numbers in ICMP packets. */
89#define ICMP_MIN_LENGTH 8
90
Arnaldo Carvalho de Melo8292a172005-12-13 23:15:52 -080091void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -070092
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -080093#ifdef CONFIG_TCP_MD5SIG
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -020094static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
95 __be32 addr);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -080096static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -020097 __be32 saddr, __be32 daddr,
YOSHIFUJI Hideaki076fb722008-04-17 12:48:12 +090098 struct tcphdr *th, unsigned int tcplen);
YOSHIFUJI Hideaki9501f972008-04-18 12:45:16 +090099#else
100static inline
101struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
102{
103 return NULL;
104}
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800105#endif
106
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -0700107struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200108 .lhash_lock = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
109 .lhash_users = ATOMIC_INIT(0),
110 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700111};
112
Gerrit Renkera94f7232006-11-10 14:06:49 -0800113static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700114{
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700115 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
116 ip_hdr(skb)->saddr,
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -0700117 tcp_hdr(skb)->dest,
118 tcp_hdr(skb)->source);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700119}
120
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -0800121int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
122{
123 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
124 struct tcp_sock *tp = tcp_sk(sk);
125
126 /* With PAWS, it is safe from the viewpoint
127 of data integrity. Even without PAWS it is safe provided sequence
128 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
129
130 Actually, the idea is close to VJ's one, only timestamp cache is
131 held not per host, but per port pair and TW bucket is used as state
132 holder.
133
134 If TW bucket has been already destroyed we fall back to VJ's scheme
135 and use initial timestamp retrieved from peer table.
136 */
137 if (tcptw->tw_ts_recent_stamp &&
138 (twp == NULL || (sysctl_tcp_tw_reuse &&
James Morris9d729f72007-03-04 16:12:44 -0800139 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -0800140 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
141 if (tp->write_seq == 0)
142 tp->write_seq = 1;
143 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
144 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
145 sock_hold(sktw);
146 return 1;
147 }
148
149 return 0;
150}
151
152EXPORT_SYMBOL_GPL(tcp_twsk_unique);
153
Linus Torvalds1da177e2005-04-16 15:20:36 -0700154/* This will initiate an outgoing connection. */
155int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
156{
157 struct inet_sock *inet = inet_sk(sk);
158 struct tcp_sock *tp = tcp_sk(sk);
159 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
160 struct rtable *rt;
Al Virobada8ad2006-09-26 21:27:15 -0700161 __be32 daddr, nexthop;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700162 int tmp;
163 int err;
164
165 if (addr_len < sizeof(struct sockaddr_in))
166 return -EINVAL;
167
168 if (usin->sin_family != AF_INET)
169 return -EAFNOSUPPORT;
170
171 nexthop = daddr = usin->sin_addr.s_addr;
172 if (inet->opt && inet->opt->srr) {
173 if (!daddr)
174 return -EINVAL;
175 nexthop = inet->opt->faddr;
176 }
177
178 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
179 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
180 IPPROTO_TCP,
David S. Miller8eb90862007-02-08 02:09:21 -0800181 inet->sport, usin->sin_port, sk, 1);
Wei Dong584bdf82007-05-31 22:49:28 -0700182 if (tmp < 0) {
183 if (tmp == -ENETUNREACH)
184 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700185 return tmp;
Wei Dong584bdf82007-05-31 22:49:28 -0700186 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700187
188 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
189 ip_rt_put(rt);
190 return -ENETUNREACH;
191 }
192
193 if (!inet->opt || !inet->opt->srr)
194 daddr = rt->rt_dst;
195
196 if (!inet->saddr)
197 inet->saddr = rt->rt_src;
198 inet->rcv_saddr = inet->saddr;
199
200 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
201 /* Reset inherited state */
202 tp->rx_opt.ts_recent = 0;
203 tp->rx_opt.ts_recent_stamp = 0;
204 tp->write_seq = 0;
205 }
206
Arnaldo Carvalho de Melo295ff7e2005-08-09 20:44:40 -0700207 if (tcp_death_row.sysctl_tw_recycle &&
Linus Torvalds1da177e2005-04-16 15:20:36 -0700208 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
209 struct inet_peer *peer = rt_get_peer(rt);
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200210 /*
211 * VJ's idea. We save last timestamp seen from
212 * the destination in peer table, when entering state
213 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
214 * when trying new connection.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700215 */
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200216 if (peer != NULL &&
James Morris9d729f72007-03-04 16:12:44 -0800217 peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700218 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
219 tp->rx_opt.ts_recent = peer->tcp_ts;
220 }
221 }
222
223 inet->dport = usin->sin_port;
224 inet->daddr = daddr;
225
Arnaldo Carvalho de Melod83d8462005-12-13 23:26:10 -0800226 inet_csk(sk)->icsk_ext_hdr_len = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700227 if (inet->opt)
Arnaldo Carvalho de Melod83d8462005-12-13 23:26:10 -0800228 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700229
230 tp->rx_opt.mss_clamp = 536;
231
232 /* Socket identity is still unknown (sport may be zero).
233 * However we set state to SYN-SENT and not releasing socket
234 * lock select source port, enter ourselves into the hash tables and
235 * complete initialization after this.
236 */
237 tcp_set_state(sk, TCP_SYN_SENT);
Arnaldo Carvalho de Meloa7f5e7f2005-12-13 23:25:31 -0800238 err = inet_hash_connect(&tcp_death_row, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700239 if (err)
240 goto failure;
241
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200242 err = ip_route_newports(&rt, IPPROTO_TCP,
243 inet->sport, inet->dport, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700244 if (err)
245 goto failure;
246
247 /* OK, now commit destination to socket. */
Herbert Xubcd76112006-06-30 13:36:35 -0700248 sk->sk_gso_type = SKB_GSO_TCPV4;
Arnaldo Carvalho de Melo6cbb0df2005-08-09 19:49:02 -0700249 sk_setup_caps(sk, &rt->u.dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700250
251 if (!tp->write_seq)
252 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
253 inet->daddr,
254 inet->sport,
255 usin->sin_port);
256
257 inet->id = tp->write_seq ^ jiffies;
258
259 err = tcp_connect(sk);
260 rt = NULL;
261 if (err)
262 goto failure;
263
264 return 0;
265
266failure:
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200267 /*
268 * This unhashes the socket and releases the local port,
269 * if necessary.
270 */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700271 tcp_set_state(sk, TCP_CLOSE);
272 ip_rt_put(rt);
273 sk->sk_route_caps = 0;
274 inet->dport = 0;
275 return err;
276}
277
Linus Torvalds1da177e2005-04-16 15:20:36 -0700278/*
279 * This routine does path mtu discovery as defined in RFC1191.
280 */
Stephen Hemminger40efc6f2006-01-03 16:03:49 -0800281static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700282{
283 struct dst_entry *dst;
284 struct inet_sock *inet = inet_sk(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700285
286 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
287 * send out by Linux are always <576bytes so they should go through
288 * unfragmented).
289 */
290 if (sk->sk_state == TCP_LISTEN)
291 return;
292
293 /* We don't check in the destentry if pmtu discovery is forbidden
294 * on this route. We just assume that no packet_to_big packets
295 * are send back when pmtu discovery is not active.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900296 * There is a small race when the user changes this flag in the
Linus Torvalds1da177e2005-04-16 15:20:36 -0700297 * route, but I think that's acceptable.
298 */
299 if ((dst = __sk_dst_check(sk, 0)) == NULL)
300 return;
301
302 dst->ops->update_pmtu(dst, mtu);
303
304 /* Something is about to be wrong... Remember soft error
305 * for the case, if this connection will not able to recover.
306 */
307 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
308 sk->sk_err_soft = EMSGSIZE;
309
310 mtu = dst_mtu(dst);
311
312 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
Arnaldo Carvalho de Melod83d8462005-12-13 23:26:10 -0800313 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700314 tcp_sync_mss(sk, mtu);
315
316 /* Resend the TCP packet because it's
317 * clear that the old packet has been
318 * dropped. This is the new "fast" path mtu
319 * discovery.
320 */
321 tcp_simple_retransmit(sk);
322 } /* else let the usual retransmit timer handle it */
323}
324
325/*
326 * This routine is called by the ICMP module when it gets some
327 * sort of error condition. If err < 0 then the socket should
328 * be closed and the error returned to the user. If err > 0
329 * it's just the icmp type << 8 | icmp code. After adjustment
330 * header points to the first 8 bytes of the tcp header. We need
331 * to find the appropriate port.
332 *
333 * The locking strategy used here is very "optimistic". When
334 * someone else accesses the socket the ICMP is just dropped
335 * and for some paths there is no check at all.
336 * A more general error queue to queue errors for later handling
337 * is probably better.
338 *
339 */
340
341void tcp_v4_err(struct sk_buff *skb, u32 info)
342{
343 struct iphdr *iph = (struct iphdr *)skb->data;
344 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
345 struct tcp_sock *tp;
346 struct inet_sock *inet;
Arnaldo Carvalho de Melo88c76642007-03-13 14:43:18 -0300347 const int type = icmp_hdr(skb)->type;
348 const int code = icmp_hdr(skb)->code;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700349 struct sock *sk;
350 __u32 seq;
351 int err;
352
353 if (skb->len < (iph->ihl << 2) + 8) {
354 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
355 return;
356 }
357
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +0900358 sk = inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest,
Pavel Emelyanovc67499c2008-01-31 05:06:40 -0800359 iph->saddr, th->source, inet_iif(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700360 if (!sk) {
361 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
362 return;
363 }
364 if (sk->sk_state == TCP_TIME_WAIT) {
YOSHIFUJI Hideaki9469c7b2006-10-10 19:41:46 -0700365 inet_twsk_put(inet_twsk(sk));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700366 return;
367 }
368
369 bh_lock_sock(sk);
370 /* If too many ICMPs get dropped on busy
371 * servers this needs to be solved differently.
372 */
373 if (sock_owned_by_user(sk))
374 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
375
376 if (sk->sk_state == TCP_CLOSE)
377 goto out;
378
379 tp = tcp_sk(sk);
380 seq = ntohl(th->seq);
381 if (sk->sk_state != TCP_LISTEN &&
382 !between(seq, tp->snd_una, tp->snd_nxt)) {
Eric Dumazet06ca7192006-10-20 00:22:25 -0700383 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700384 goto out;
385 }
386
387 switch (type) {
388 case ICMP_SOURCE_QUENCH:
389 /* Just silently ignore these. */
390 goto out;
391 case ICMP_PARAMETERPROB:
392 err = EPROTO;
393 break;
394 case ICMP_DEST_UNREACH:
395 if (code > NR_ICMP_UNREACH)
396 goto out;
397
398 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
399 if (!sock_owned_by_user(sk))
400 do_pmtu_discovery(sk, iph, info);
401 goto out;
402 }
403
404 err = icmp_err_convert[code].errno;
405 break;
406 case ICMP_TIME_EXCEEDED:
407 err = EHOSTUNREACH;
408 break;
409 default:
410 goto out;
411 }
412
413 switch (sk->sk_state) {
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700414 struct request_sock *req, **prev;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700415 case TCP_LISTEN:
416 if (sock_owned_by_user(sk))
417 goto out;
418
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -0700419 req = inet_csk_search_req(sk, &prev, th->dest,
420 iph->daddr, iph->saddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700421 if (!req)
422 goto out;
423
424 /* ICMPs are not backlogged, hence we cannot get
425 an established socket here.
426 */
427 BUG_TRAP(!req->sk);
428
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700429 if (seq != tcp_rsk(req)->snt_isn) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700430 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
431 goto out;
432 }
433
434 /*
435 * Still in SYN_RECV, just remove it silently.
436 * There is no good way to pass the error to the newly
437 * created socket, and POSIX does not want network
438 * errors returned from accept().
439 */
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -0700440 inet_csk_reqsk_queue_drop(sk, req, prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700441 goto out;
442
443 case TCP_SYN_SENT:
444 case TCP_SYN_RECV: /* Cannot happen.
445 It can f.e. if SYNs crossed.
446 */
447 if (!sock_owned_by_user(sk)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700448 sk->sk_err = err;
449
450 sk->sk_error_report(sk);
451
452 tcp_done(sk);
453 } else {
454 sk->sk_err_soft = err;
455 }
456 goto out;
457 }
458
459 /* If we've already connected we will keep trying
460 * until we time out, or the user gives up.
461 *
462 * rfc1122 4.2.3.9 allows to consider as hard errors
463 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
464 * but it is obsoleted by pmtu discovery).
465 *
466 * Note, that in modern internet, where routing is unreliable
467 * and in each dark corner broken firewalls sit, sending random
468 * errors ordered by their masters even this two messages finally lose
469 * their original sense (even Linux sends invalid PORT_UNREACHs)
470 *
471 * Now we are in compliance with RFCs.
472 * --ANK (980905)
473 */
474
475 inet = inet_sk(sk);
476 if (!sock_owned_by_user(sk) && inet->recverr) {
477 sk->sk_err = err;
478 sk->sk_error_report(sk);
479 } else { /* Only an error on timeout */
480 sk->sk_err_soft = err;
481 }
482
483out:
484 bh_unlock_sock(sk);
485 sock_put(sk);
486}
487
488/* This routine computes an IPv4 TCP checksum. */
Arnaldo Carvalho de Melo8292a172005-12-13 23:15:52 -0800489void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700490{
491 struct inet_sock *inet = inet_sk(sk);
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -0700492 struct tcphdr *th = tcp_hdr(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700493
Patrick McHardy84fa7932006-08-29 16:44:56 -0700494 if (skb->ip_summed == CHECKSUM_PARTIAL) {
Frederik Deweerdtba7808e2007-02-04 20:15:27 -0800495 th->check = ~tcp_v4_check(len, inet->saddr,
496 inet->daddr, 0);
Herbert Xu663ead32007-04-09 11:59:07 -0700497 skb->csum_start = skb_transport_header(skb) - skb->head;
Al Viroff1dcad2006-11-20 18:07:29 -0800498 skb->csum_offset = offsetof(struct tcphdr, check);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700499 } else {
Frederik Deweerdtba7808e2007-02-04 20:15:27 -0800500 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700501 csum_partial((char *)th,
502 th->doff << 2,
503 skb->csum));
504 }
505}
506
Herbert Xua430a432006-07-08 13:34:56 -0700507int tcp_v4_gso_send_check(struct sk_buff *skb)
508{
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700509 const struct iphdr *iph;
Herbert Xua430a432006-07-08 13:34:56 -0700510 struct tcphdr *th;
511
512 if (!pskb_may_pull(skb, sizeof(*th)))
513 return -EINVAL;
514
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700515 iph = ip_hdr(skb);
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -0700516 th = tcp_hdr(skb);
Herbert Xua430a432006-07-08 13:34:56 -0700517
518 th->check = 0;
Frederik Deweerdtba7808e2007-02-04 20:15:27 -0800519 th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
Herbert Xu663ead32007-04-09 11:59:07 -0700520 skb->csum_start = skb_transport_header(skb) - skb->head;
Al Viroff1dcad2006-11-20 18:07:29 -0800521 skb->csum_offset = offsetof(struct tcphdr, check);
Patrick McHardy84fa7932006-08-29 16:44:56 -0700522 skb->ip_summed = CHECKSUM_PARTIAL;
Herbert Xua430a432006-07-08 13:34:56 -0700523 return 0;
524}
525
Linus Torvalds1da177e2005-04-16 15:20:36 -0700526/*
527 * This routine will send an RST to the other tcp.
528 *
529 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
530 * for reset.
531 * Answer: if a packet caused RST, it is not for a socket
532 * existing in our system, if it is matched to a socket,
533 * it is just duplicate segment or bug in other side's TCP.
534 * So that we build reply only basing on parameters
535 * arrived with segment.
536 * Exception: precedence violation. We do not implement it in any case.
537 */
538
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800539static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700540{
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -0700541 struct tcphdr *th = tcp_hdr(skb);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800542 struct {
543 struct tcphdr th;
544#ifdef CONFIG_TCP_MD5SIG
Al Viro714e85b2006-11-14 20:51:49 -0800545 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800546#endif
547 } rep;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700548 struct ip_reply_arg arg;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800549#ifdef CONFIG_TCP_MD5SIG
550 struct tcp_md5sig_key *key;
551#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700552
553 /* Never send a reset in response to a reset. */
554 if (th->rst)
555 return;
556
Eric Dumazetee6b9672008-03-05 18:30:47 -0800557 if (skb->rtable->rt_type != RTN_LOCAL)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700558 return;
559
560 /* Swap the send and the receive. */
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800561 memset(&rep, 0, sizeof(rep));
562 rep.th.dest = th->source;
563 rep.th.source = th->dest;
564 rep.th.doff = sizeof(struct tcphdr) / 4;
565 rep.th.rst = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700566
567 if (th->ack) {
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800568 rep.th.seq = th->ack_seq;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700569 } else {
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800570 rep.th.ack = 1;
571 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
572 skb->len - (th->doff << 2));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700573 }
574
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200575 memset(&arg, 0, sizeof(arg));
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800576 arg.iov[0].iov_base = (unsigned char *)&rep;
577 arg.iov[0].iov_len = sizeof(rep.th);
578
579#ifdef CONFIG_TCP_MD5SIG
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700580 key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800581 if (key) {
582 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
583 (TCPOPT_NOP << 16) |
584 (TCPOPT_MD5SIG << 8) |
585 TCPOLEN_MD5SIG);
586 /* Update length and the length the header thinks exists */
587 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
588 rep.th.doff = arg.iov[0].iov_len / 4;
589
590 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
591 key,
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700592 ip_hdr(skb)->daddr,
593 ip_hdr(skb)->saddr,
YOSHIFUJI Hideaki076fb722008-04-17 12:48:12 +0900594 &rep.th, arg.iov[0].iov_len);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800595 }
596#endif
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700597 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
598 ip_hdr(skb)->saddr, /* XXX */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700599 sizeof(struct tcphdr), IPPROTO_TCP, 0);
600 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
601
Denis V. Lunev7feb49c2008-04-03 14:32:00 -0700602 ip_send_reply(dev_net(skb->dst->dev)->ipv4.tcp_sock, skb,
603 &arg, arg.iov[0].iov_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700604
605 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
606 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
607}
608
609/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
610 outside socket context is ugly, certainly. What can I do?
611 */
612
YOSHIFUJI Hideaki9501f972008-04-18 12:45:16 +0900613static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
614 u32 win, u32 ts, int oif,
615 struct tcp_md5sig_key *key)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700616{
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -0700617 struct tcphdr *th = tcp_hdr(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700618 struct {
619 struct tcphdr th;
Al Viro714e85b2006-11-14 20:51:49 -0800620 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800621#ifdef CONFIG_TCP_MD5SIG
Al Viro714e85b2006-11-14 20:51:49 -0800622 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800623#endif
624 ];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700625 } rep;
626 struct ip_reply_arg arg;
627
628 memset(&rep.th, 0, sizeof(struct tcphdr));
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200629 memset(&arg, 0, sizeof(arg));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700630
631 arg.iov[0].iov_base = (unsigned char *)&rep;
632 arg.iov[0].iov_len = sizeof(rep.th);
633 if (ts) {
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800634 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
635 (TCPOPT_TIMESTAMP << 8) |
636 TCPOLEN_TIMESTAMP);
637 rep.opt[1] = htonl(tcp_time_stamp);
638 rep.opt[2] = htonl(ts);
Craig Schlentercb48cfe2007-01-09 00:11:15 -0800639 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700640 }
641
642 /* Swap the send and the receive. */
643 rep.th.dest = th->source;
644 rep.th.source = th->dest;
645 rep.th.doff = arg.iov[0].iov_len / 4;
646 rep.th.seq = htonl(seq);
647 rep.th.ack_seq = htonl(ack);
648 rep.th.ack = 1;
649 rep.th.window = htons(win);
650
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800651#ifdef CONFIG_TCP_MD5SIG
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800652 if (key) {
653 int offset = (ts) ? 3 : 0;
654
655 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
656 (TCPOPT_NOP << 16) |
657 (TCPOPT_MD5SIG << 8) |
658 TCPOLEN_MD5SIG);
659 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
660 rep.th.doff = arg.iov[0].iov_len/4;
661
662 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
663 key,
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700664 ip_hdr(skb)->daddr,
665 ip_hdr(skb)->saddr,
YOSHIFUJI Hideaki076fb722008-04-17 12:48:12 +0900666 &rep.th, arg.iov[0].iov_len);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800667 }
668#endif
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700669 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
670 ip_hdr(skb)->saddr, /* XXX */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700671 arg.iov[0].iov_len, IPPROTO_TCP, 0);
672 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
YOSHIFUJI Hideaki9501f972008-04-18 12:45:16 +0900673 if (oif)
674 arg.bound_dev_if = oif;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700675
Denis V. Lunev7feb49c2008-04-03 14:32:00 -0700676 ip_send_reply(dev_net(skb->dev)->ipv4.tcp_sock, skb,
677 &arg, arg.iov[0].iov_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700678
679 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
680}
681
682static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
683{
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -0700684 struct inet_timewait_sock *tw = inet_twsk(sk);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800685 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700686
YOSHIFUJI Hideaki9501f972008-04-18 12:45:16 +0900687 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200688 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
YOSHIFUJI Hideaki9501f972008-04-18 12:45:16 +0900689 tcptw->tw_ts_recent,
690 tw->tw_bound_dev_if,
691 tcp_twsk_md5_key(tcptw)
692 );
Linus Torvalds1da177e2005-04-16 15:20:36 -0700693
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -0700694 inet_twsk_put(tw);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700695}
696
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200697static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
698 struct request_sock *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700699{
YOSHIFUJI Hideaki9501f972008-04-18 12:45:16 +0900700 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800701 tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
YOSHIFUJI Hideaki9501f972008-04-18 12:45:16 +0900702 req->ts_recent,
703 0,
704 tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700705}
706
Linus Torvalds1da177e2005-04-16 15:20:36 -0700707/*
Kris Katterjohn9bf1d832008-02-17 22:29:19 -0800708 * Send a SYN-ACK after having received a SYN.
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700709 * This still operates on a request_sock only, not on a big
Linus Torvalds1da177e2005-04-16 15:20:36 -0700710 * socket.
711 */
Denis V. Lunevfd80eb92008-02-29 11:43:03 -0800712static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
713 struct dst_entry *dst)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700714{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700715 const struct inet_request_sock *ireq = inet_rsk(req);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700716 int err = -1;
717 struct sk_buff * skb;
718
719 /* First, grab a route. */
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -0700720 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
Denis V. Lunevfd80eb92008-02-29 11:43:03 -0800721 return -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700722
723 skb = tcp_make_synack(sk, dst, req);
724
725 if (skb) {
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -0700726 struct tcphdr *th = tcp_hdr(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700727
Frederik Deweerdtba7808e2007-02-04 20:15:27 -0800728 th->check = tcp_v4_check(skb->len,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700729 ireq->loc_addr,
730 ireq->rmt_addr,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700731 csum_partial((char *)th, skb->len,
732 skb->csum));
733
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700734 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
735 ireq->rmt_addr,
736 ireq->opt);
Gerrit Renkerb9df3cb2006-11-14 11:21:36 -0200737 err = net_xmit_eval(err);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700738 }
739
Linus Torvalds1da177e2005-04-16 15:20:36 -0700740 dst_release(dst);
741 return err;
742}
743
Denis V. Lunevfd80eb92008-02-29 11:43:03 -0800744static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
745{
746 return __tcp_v4_send_synack(sk, req, NULL);
747}
748
Linus Torvalds1da177e2005-04-16 15:20:36 -0700749/*
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700750 * IPv4 request_sock destructor.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700751 */
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700752static void tcp_v4_reqsk_destructor(struct request_sock *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700753{
Jesper Juhla51482b2005-11-08 09:41:34 -0800754 kfree(inet_rsk(req)->opt);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700755}
756
#ifdef CONFIG_SYN_COOKIES
/*
 * Log a notice that SYN cookies are being sent for the port @skb was
 * addressed to.  A function-local timestamp keeps the message from being
 * repeated until more than 60 seconds' worth of jiffies have passed.
 */
static void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (!time_after(jiffies, (warntime + HZ * 60)))
		return;

	warntime = jiffies;
	printk(KERN_INFO
	       "possible SYN flooding on port %d. Sending cookies.\n",
	       ntohs(tcp_hdr(skb)->dest));
}
#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700770
771/*
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700772 * Save and compile IPv4 options into the request_sock if needed.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700773 */
Stephen Hemminger40efc6f2006-01-03 16:03:49 -0800774static struct ip_options *tcp_v4_save_options(struct sock *sk,
775 struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700776{
777 struct ip_options *opt = &(IPCB(skb)->opt);
778 struct ip_options *dopt = NULL;
779
780 if (opt && opt->optlen) {
781 int opt_size = optlength(opt);
782 dopt = kmalloc(opt_size, GFP_ATOMIC);
783 if (dopt) {
784 if (ip_options_echo(dopt, skb)) {
785 kfree(dopt);
786 dopt = NULL;
787 }
788 }
789 }
790 return dopt;
791}
792
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800793#ifdef CONFIG_TCP_MD5SIG
794/*
795 * RFC2385 MD5 checksumming requires a mapping of
796 * IP address->MD5 Key.
797 * We need to maintain these in the sk structure.
798 */
799
800/* Find the Key structure for an address. */
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200801static struct tcp_md5sig_key *
802 tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800803{
804 struct tcp_sock *tp = tcp_sk(sk);
805 int i;
806
807 if (!tp->md5sig_info || !tp->md5sig_info->entries4)
808 return NULL;
809 for (i = 0; i < tp->md5sig_info->entries4; i++) {
810 if (tp->md5sig_info->keys4[i].addr == addr)
David S. Millerf8ab18d2007-09-28 15:18:35 -0700811 return &tp->md5sig_info->keys4[i].base;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800812 }
813 return NULL;
814}
815
816struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
817 struct sock *addr_sk)
818{
819 return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
820}
821
822EXPORT_SYMBOL(tcp_v4_md5_lookup);
823
Adrian Bunkf5b99bc2006-11-30 17:22:29 -0800824static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
825 struct request_sock *req)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800826{
827 return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
828}
829
830/* This can be called on a newly created socket, from other files */
831int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
832 u8 *newkey, u8 newkeylen)
833{
834 /* Add Key to the list */
Matthias M. Dellwegb0a713e2007-10-29 20:55:27 -0700835 struct tcp_md5sig_key *key;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800836 struct tcp_sock *tp = tcp_sk(sk);
837 struct tcp4_md5sig_key *keys;
838
Matthias M. Dellwegb0a713e2007-10-29 20:55:27 -0700839 key = tcp_v4_md5_do_lookup(sk, addr);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800840 if (key) {
841 /* Pre-existing entry - just update that one. */
Matthias M. Dellwegb0a713e2007-10-29 20:55:27 -0700842 kfree(key->key);
843 key->key = newkey;
844 key->keylen = newkeylen;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800845 } else {
Arnaldo Carvalho de Melof6685932006-11-17 11:06:01 -0200846 struct tcp_md5sig_info *md5sig;
847
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800848 if (!tp->md5sig_info) {
Arnaldo Carvalho de Melof6685932006-11-17 11:06:01 -0200849 tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
850 GFP_ATOMIC);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800851 if (!tp->md5sig_info) {
852 kfree(newkey);
853 return -ENOMEM;
854 }
David S. Miller3d7dbea2007-06-12 14:36:42 -0700855 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800856 }
857 if (tcp_alloc_md5sig_pool() == NULL) {
858 kfree(newkey);
859 return -ENOMEM;
860 }
Arnaldo Carvalho de Melof6685932006-11-17 11:06:01 -0200861 md5sig = tp->md5sig_info;
862
863 if (md5sig->alloced4 == md5sig->entries4) {
864 keys = kmalloc((sizeof(*keys) *
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900865 (md5sig->entries4 + 1)), GFP_ATOMIC);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800866 if (!keys) {
867 kfree(newkey);
868 tcp_free_md5sig_pool();
869 return -ENOMEM;
870 }
871
Arnaldo Carvalho de Melof6685932006-11-17 11:06:01 -0200872 if (md5sig->entries4)
873 memcpy(keys, md5sig->keys4,
874 sizeof(*keys) * md5sig->entries4);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800875
876 /* Free old key list, and reference new one */
YOSHIFUJI Hideakia80cc202007-11-20 17:30:06 -0800877 kfree(md5sig->keys4);
Arnaldo Carvalho de Melof6685932006-11-17 11:06:01 -0200878 md5sig->keys4 = keys;
879 md5sig->alloced4++;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800880 }
Arnaldo Carvalho de Melof6685932006-11-17 11:06:01 -0200881 md5sig->entries4++;
David S. Millerf8ab18d2007-09-28 15:18:35 -0700882 md5sig->keys4[md5sig->entries4 - 1].addr = addr;
883 md5sig->keys4[md5sig->entries4 - 1].base.key = newkey;
884 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800885 }
886 return 0;
887}
888
889EXPORT_SYMBOL(tcp_v4_md5_do_add);
890
891static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
892 u8 *newkey, u8 newkeylen)
893{
894 return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
895 newkey, newkeylen);
896}
897
898int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
899{
900 struct tcp_sock *tp = tcp_sk(sk);
901 int i;
902
903 for (i = 0; i < tp->md5sig_info->entries4; i++) {
904 if (tp->md5sig_info->keys4[i].addr == addr) {
905 /* Free the key */
David S. Millerf8ab18d2007-09-28 15:18:35 -0700906 kfree(tp->md5sig_info->keys4[i].base.key);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800907 tp->md5sig_info->entries4--;
908
909 if (tp->md5sig_info->entries4 == 0) {
910 kfree(tp->md5sig_info->keys4);
911 tp->md5sig_info->keys4 = NULL;
Leigh Brown8228a18d2006-12-17 17:12:30 -0800912 tp->md5sig_info->alloced4 = 0;
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200913 } else if (tp->md5sig_info->entries4 != i) {
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800914 /* Need to do some manipulation */
YOSHIFUJI Hideaki354faf02007-11-20 17:30:31 -0800915 memmove(&tp->md5sig_info->keys4[i],
916 &tp->md5sig_info->keys4[i+1],
917 (tp->md5sig_info->entries4 - i) *
918 sizeof(struct tcp4_md5sig_key));
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800919 }
920 tcp_free_md5sig_pool();
921 return 0;
922 }
923 }
924 return -ENOENT;
925}
926
927EXPORT_SYMBOL(tcp_v4_md5_do_del);
928
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200929static void tcp_v4_clear_md5_list(struct sock *sk)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800930{
931 struct tcp_sock *tp = tcp_sk(sk);
932
933 /* Free each key, then the set of key keys,
934 * the crypto element, and then decrement our
935 * hold on the last resort crypto.
936 */
937 if (tp->md5sig_info->entries4) {
938 int i;
939 for (i = 0; i < tp->md5sig_info->entries4; i++)
David S. Millerf8ab18d2007-09-28 15:18:35 -0700940 kfree(tp->md5sig_info->keys4[i].base.key);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800941 tp->md5sig_info->entries4 = 0;
942 tcp_free_md5sig_pool();
943 }
944 if (tp->md5sig_info->keys4) {
945 kfree(tp->md5sig_info->keys4);
946 tp->md5sig_info->keys4 = NULL;
947 tp->md5sig_info->alloced4 = 0;
948 }
949}
950
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200951static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
952 int optlen)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800953{
954 struct tcp_md5sig cmd;
955 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
956 u8 *newkey;
957
958 if (optlen < sizeof(cmd))
959 return -EINVAL;
960
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200961 if (copy_from_user(&cmd, optval, sizeof(cmd)))
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800962 return -EFAULT;
963
964 if (sin->sin_family != AF_INET)
965 return -EINVAL;
966
967 if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
968 if (!tcp_sk(sk)->md5sig_info)
969 return -ENOENT;
970 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
971 }
972
973 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
974 return -EINVAL;
975
976 if (!tcp_sk(sk)->md5sig_info) {
977 struct tcp_sock *tp = tcp_sk(sk);
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200978 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800979
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800980 if (!p)
981 return -EINVAL;
982
983 tp->md5sig_info = p;
David S. Miller3d7dbea2007-06-12 14:36:42 -0700984 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800985 }
986
Arnaldo Carvalho de Melof6685932006-11-17 11:06:01 -0200987 newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800988 if (!newkey)
989 return -ENOMEM;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800990 return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
991 newkey, cmd.tcpm_keylen);
992}
993
994static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
995 __be32 saddr, __be32 daddr,
YOSHIFUJI Hideaki076fb722008-04-17 12:48:12 +0900996 struct tcphdr *th,
YOSHIFUJI Hideaki9cb57342008-01-12 02:16:03 -0800997 unsigned int tcplen)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800998{
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800999 struct tcp_md5sig_pool *hp;
1000 struct tcp4_pseudohdr *bp;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001001 int err;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001002
1003 /*
1004 * Okay, so RFC2385 is turned on for this connection,
1005 * so we need to generate the MD5 hash for the packet now.
1006 */
1007
1008 hp = tcp_get_md5sig_pool();
1009 if (!hp)
1010 goto clear_hash_noput;
1011
1012 bp = &hp->md5_blk.ip4;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001013
1014 /*
YOSHIFUJI Hideaki8d26d762008-04-17 13:19:16 +09001015 * The TCP pseudo-header (in the order: source IP address,
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001016 * destination IP address, zero-padded protocol number, and
1017 * segment length)
1018 */
1019 bp->saddr = saddr;
1020 bp->daddr = daddr;
1021 bp->pad = 0;
YOSHIFUJI Hideaki076fb722008-04-17 12:48:12 +09001022 bp->protocol = IPPROTO_TCP;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001023 bp->len = htons(tcplen);
David S. Millerc7da57a2007-10-26 00:41:21 -07001024
YOSHIFUJI Hideaki8d26d762008-04-17 13:19:16 +09001025 err = tcp_calc_md5_hash(md5_hash, key, sizeof(*bp),
1026 th, tcplen, hp);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001027 if (err)
1028 goto clear_hash;
1029
YOSHIFUJI Hideaki8d26d762008-04-17 13:19:16 +09001030 /* Free up the crypto pool */
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001031 tcp_put_md5sig_pool();
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001032out:
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001033 return 0;
1034clear_hash:
1035 tcp_put_md5sig_pool();
1036clear_hash_noput:
1037 memset(md5_hash, 0, 16);
1038 goto out;
1039}
1040
1041int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1042 struct sock *sk,
1043 struct dst_entry *dst,
1044 struct request_sock *req,
YOSHIFUJI Hideaki076fb722008-04-17 12:48:12 +09001045 struct tcphdr *th,
YOSHIFUJI Hideaki9cb57342008-01-12 02:16:03 -08001046 unsigned int tcplen)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001047{
1048 __be32 saddr, daddr;
1049
1050 if (sk) {
1051 saddr = inet_sk(sk)->saddr;
1052 daddr = inet_sk(sk)->daddr;
1053 } else {
1054 struct rtable *rt = (struct rtable *)dst;
1055 BUG_ON(!rt);
1056 saddr = rt->rt_src;
1057 daddr = rt->rt_dst;
1058 }
1059 return tcp_v4_do_calc_md5_hash(md5_hash, key,
1060 saddr, daddr,
YOSHIFUJI Hideaki076fb722008-04-17 12:48:12 +09001061 th, tcplen);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001062}
1063
1064EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1065
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001066static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001067{
1068 /*
1069 * This gets called for each TCP segment that arrives
1070 * so we want to be efficient.
1071 * We have 3 drop cases:
1072 * o No MD5 hash and one expected.
1073 * o MD5 hash and we're not expecting one.
1074 * o MD5 hash and its wrong.
1075 */
1076 __u8 *hash_location = NULL;
1077 struct tcp_md5sig_key *hash_expected;
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001078 const struct iphdr *iph = ip_hdr(skb);
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -07001079 struct tcphdr *th = tcp_hdr(skb);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001080 int genhash;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001081 unsigned char newhash[16];
1082
1083 hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
YOSHIFUJI Hideaki7d5d5522008-04-17 12:29:53 +09001084 hash_location = tcp_parse_md5sig_option(th);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001085
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001086 /* We've parsed the options - do we have a hash? */
1087 if (!hash_expected && !hash_location)
1088 return 0;
1089
1090 if (hash_expected && !hash_location) {
Leigh Browna9fc00c2006-12-17 17:13:10 -08001091 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001092 "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001093 NIPQUAD(iph->saddr), ntohs(th->source),
1094 NIPQUAD(iph->daddr), ntohs(th->dest));
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001095 return 1;
1096 }
1097
1098 if (!hash_expected && hash_location) {
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001099 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001100 "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001101 NIPQUAD(iph->saddr), ntohs(th->source),
1102 NIPQUAD(iph->daddr), ntohs(th->dest));
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001103 return 1;
1104 }
1105
1106 /* Okay, so this is hash_expected and hash_location -
1107 * so we need to calculate the checksum.
1108 */
1109 genhash = tcp_v4_do_calc_md5_hash(newhash,
1110 hash_expected,
1111 iph->saddr, iph->daddr,
YOSHIFUJI Hideaki076fb722008-04-17 12:48:12 +09001112 th, skb->len);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001113
1114 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1115 if (net_ratelimit()) {
1116 printk(KERN_INFO "MD5 Hash failed for "
1117 "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001118 NIPQUAD(iph->saddr), ntohs(th->source),
1119 NIPQUAD(iph->daddr), ntohs(th->dest),
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001120 genhash ? " tcp_v4_calc_md5_hash failed" : "");
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001121 }
1122 return 1;
1123 }
1124 return 0;
1125}
1126
1127#endif
1128
Eric Dumazet72a3eff2006-11-16 02:30:37 -08001129struct request_sock_ops tcp_request_sock_ops __read_mostly = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001130 .family = PF_INET,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001131 .obj_size = sizeof(struct tcp_request_sock),
Linus Torvalds1da177e2005-04-16 15:20:36 -07001132 .rtx_syn_ack = tcp_v4_send_synack,
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001133 .send_ack = tcp_v4_reqsk_send_ack,
1134 .destructor = tcp_v4_reqsk_destructor,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001135 .send_reset = tcp_v4_send_reset,
1136};
1137
#ifdef CONFIG_TCP_MD5SIG
/* IPv4-specific request_sock hooks; only the MD5 key lookup is set. */
static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
};
#endif
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001143
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08001144static struct timewait_sock_ops tcp_timewait_sock_ops = {
1145 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1146 .twsk_unique = tcp_twsk_unique,
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001147 .twsk_destructor= tcp_twsk_destructor,
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08001148};
1149
Linus Torvalds1da177e2005-04-16 15:20:36 -07001150int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1151{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001152 struct inet_request_sock *ireq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001153 struct tcp_options_received tmp_opt;
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001154 struct request_sock *req;
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001155 __be32 saddr = ip_hdr(skb)->saddr;
1156 __be32 daddr = ip_hdr(skb)->daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001157 __u32 isn = TCP_SKB_CB(skb)->when;
1158 struct dst_entry *dst = NULL;
1159#ifdef CONFIG_SYN_COOKIES
1160 int want_cookie = 0;
1161#else
1162#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1163#endif
1164
1165 /* Never answer to SYNs send to broadcast or multicast */
Eric Dumazetee6b9672008-03-05 18:30:47 -08001166 if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001167 goto drop;
1168
1169 /* TW buckets are converted to open requests without
1170 * limitations, they conserve resources and peer is
1171 * evidently real one.
1172 */
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001173 if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001174#ifdef CONFIG_SYN_COOKIES
1175 if (sysctl_tcp_syncookies) {
1176 want_cookie = 1;
1177 } else
1178#endif
1179 goto drop;
1180 }
1181
1182 /* Accept backlog is full. If we have already queued enough
1183 * of warm entries in syn queue, drop request. It is better than
1184 * clogging syn queue with openreqs with exponentially increasing
1185 * timeout.
1186 */
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001187 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001188 goto drop;
1189
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001190 req = reqsk_alloc(&tcp_request_sock_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001191 if (!req)
1192 goto drop;
1193
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001194#ifdef CONFIG_TCP_MD5SIG
1195 tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1196#endif
1197
Linus Torvalds1da177e2005-04-16 15:20:36 -07001198 tcp_clear_options(&tmp_opt);
1199 tmp_opt.mss_clamp = 536;
1200 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1201
1202 tcp_parse_options(skb, &tmp_opt, 0);
1203
Florian Westphal4dfc2812008-04-10 03:12:40 -07001204 if (want_cookie && !tmp_opt.saw_tstamp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001205 tcp_clear_options(&tmp_opt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001206
1207 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1208 /* Some OSes (unknown ones, but I see them on web server, which
1209 * contains information interesting only for windows'
1210 * users) do not send their stamp in SYN. It is easy case.
1211 * We simply do not advertise TS support.
1212 */
1213 tmp_opt.saw_tstamp = 0;
1214 tmp_opt.tstamp_ok = 0;
1215 }
1216 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1217
1218 tcp_openreq_init(req, &tmp_opt, skb);
1219
Venkat Yekkirala4237c752006-07-24 23:32:50 -07001220 if (security_inet_conn_request(sk, skb, req))
1221 goto drop_and_free;
1222
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001223 ireq = inet_rsk(req);
1224 ireq->loc_addr = daddr;
1225 ireq->rmt_addr = saddr;
1226 ireq->opt = tcp_v4_save_options(sk, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001227 if (!want_cookie)
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -07001228 TCP_ECN_create_request(req, tcp_hdr(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001229
1230 if (want_cookie) {
1231#ifdef CONFIG_SYN_COOKIES
1232 syn_flood_warning(skb);
Florian Westphal4dfc2812008-04-10 03:12:40 -07001233 req->cookie_ts = tmp_opt.tstamp_ok;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001234#endif
1235 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1236 } else if (!isn) {
1237 struct inet_peer *peer = NULL;
1238
1239 /* VJ's idea. We save last timestamp seen
1240 * from the destination in peer table, when entering
1241 * state TIME-WAIT, and check against it before
1242 * accepting new connection request.
1243 *
1244 * If "isn" is not zero, this request hit alive
1245 * timewait bucket, so that all the necessary checks
1246 * are made in the function processing timewait state.
1247 */
1248 if (tmp_opt.saw_tstamp &&
Arnaldo Carvalho de Melo295ff7e2005-08-09 20:44:40 -07001249 tcp_death_row.sysctl_tw_recycle &&
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001250 (dst = inet_csk_route_req(sk, req)) != NULL &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001251 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1252 peer->v4daddr == saddr) {
James Morris9d729f72007-03-04 16:12:44 -08001253 if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001254 (s32)(peer->tcp_ts - req->ts_recent) >
1255 TCP_PAWS_WINDOW) {
1256 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
Denis V. Lunev7cd04fa2008-03-03 11:59:32 -08001257 goto drop_and_release;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001258 }
1259 }
1260 /* Kill the following clause, if you dislike this way. */
1261 else if (!sysctl_tcp_syncookies &&
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001262 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
Linus Torvalds1da177e2005-04-16 15:20:36 -07001263 (sysctl_max_syn_backlog >> 2)) &&
1264 (!peer || !peer->tcp_ts_stamp) &&
1265 (!dst || !dst_metric(dst, RTAX_RTT))) {
1266 /* Without syncookies last quarter of
1267 * backlog is filled with destinations,
1268 * proven to be alive.
1269 * It means that we continue to communicate
1270 * to destinations, already remembered
1271 * to the moment of synflood.
1272 */
Patrick McHardy64ce2072005-08-09 20:50:53 -07001273 LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
YOSHIFUJI Hideakia7d632b2008-04-14 04:09:00 -07001274 "request from " NIPQUAD_FMT "/%u\n",
Patrick McHardy64ce2072005-08-09 20:50:53 -07001275 NIPQUAD(saddr),
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -07001276 ntohs(tcp_hdr(skb)->source));
Denis V. Lunev7cd04fa2008-03-03 11:59:32 -08001277 goto drop_and_release;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001278 }
1279
Gerrit Renkera94f7232006-11-10 14:06:49 -08001280 isn = tcp_v4_init_sequence(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001281 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001282 tcp_rsk(req)->snt_isn = isn;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001283
Denis V. Lunev7cd04fa2008-03-03 11:59:32 -08001284 if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001285 goto drop_and_free;
1286
Denis V. Lunev7cd04fa2008-03-03 11:59:32 -08001287 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001288 return 0;
1289
Denis V. Lunev7cd04fa2008-03-03 11:59:32 -08001290drop_and_release:
1291 dst_release(dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001292drop_and_free:
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001293 reqsk_free(req);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001294drop:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001295 return 0;
1296}
1297
1298
1299/*
1300 * The three way handshake has completed - we got a valid synack -
1301 * now create the new socket.
1302 */
1303struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001304 struct request_sock *req,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001305 struct dst_entry *dst)
1306{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001307 struct inet_request_sock *ireq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001308 struct inet_sock *newinet;
1309 struct tcp_sock *newtp;
1310 struct sock *newsk;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001311#ifdef CONFIG_TCP_MD5SIG
1312 struct tcp_md5sig_key *key;
1313#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001314
1315 if (sk_acceptq_is_full(sk))
1316 goto exit_overflow;
1317
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001318 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001319 goto exit;
1320
1321 newsk = tcp_create_openreq_child(sk, req, skb);
1322 if (!newsk)
1323 goto exit;
1324
Herbert Xubcd76112006-06-30 13:36:35 -07001325 newsk->sk_gso_type = SKB_GSO_TCPV4;
Arnaldo Carvalho de Melo6cbb0df2005-08-09 19:49:02 -07001326 sk_setup_caps(newsk, dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001327
1328 newtp = tcp_sk(newsk);
1329 newinet = inet_sk(newsk);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001330 ireq = inet_rsk(req);
1331 newinet->daddr = ireq->rmt_addr;
1332 newinet->rcv_saddr = ireq->loc_addr;
1333 newinet->saddr = ireq->loc_addr;
1334 newinet->opt = ireq->opt;
1335 ireq->opt = NULL;
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001336 newinet->mc_index = inet_iif(skb);
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001337 newinet->mc_ttl = ip_hdr(skb)->ttl;
Arnaldo Carvalho de Melod83d8462005-12-13 23:26:10 -08001338 inet_csk(newsk)->icsk_ext_hdr_len = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001339 if (newinet->opt)
Arnaldo Carvalho de Melod83d8462005-12-13 23:26:10 -08001340 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001341 newinet->id = newtp->write_seq ^ jiffies;
1342
John Heffner5d424d52006-03-20 17:53:41 -08001343 tcp_mtup_init(newsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001344 tcp_sync_mss(newsk, dst_mtu(dst));
1345 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1346 tcp_initialize_rcv_mss(newsk);
1347
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001348#ifdef CONFIG_TCP_MD5SIG
1349 /* Copy over the MD5 key from the original socket */
1350 if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1351 /*
1352 * We're using one, so create a matching key
1353 * on the newsk structure. If we fail to get
1354 * memory, then we end up not copying the key
1355 * across. Shucks.
1356 */
Arnaldo Carvalho de Melof6685932006-11-17 11:06:01 -02001357 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1358 if (newkey != NULL)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001359 tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1360 newkey, key->keylen);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001361 }
1362#endif
1363
Arnaldo Carvalho de Meloab1e0a12008-02-03 04:06:04 -08001364 __inet_hash_nolisten(newsk);
1365 __inet_inherit_port(sk, newsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001366
1367 return newsk;
1368
1369exit_overflow:
1370 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1371exit:
1372 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1373 dst_release(dst);
1374 return NULL;
1375}
1376
1377static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1378{
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -07001379 struct tcphdr *th = tcp_hdr(skb);
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001380 const struct iphdr *iph = ip_hdr(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001381 struct sock *nsk;
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001382 struct request_sock **prev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001383 /* Find possible connection requests. */
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001384 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1385 iph->saddr, iph->daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001386 if (req)
1387 return tcp_check_req(sk, skb, req, prev);
1388
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09001389 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
Pavel Emelyanovc67499c2008-01-31 05:06:40 -08001390 th->source, iph->daddr, th->dest, inet_iif(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001391
1392 if (nsk) {
1393 if (nsk->sk_state != TCP_TIME_WAIT) {
1394 bh_lock_sock(nsk);
1395 return nsk;
1396 }
YOSHIFUJI Hideaki9469c7b2006-10-10 19:41:46 -07001397 inet_twsk_put(inet_twsk(nsk));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001398 return NULL;
1399 }
1400
1401#ifdef CONFIG_SYN_COOKIES
1402 if (!th->rst && !th->syn && th->ack)
1403 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1404#endif
1405 return sk;
1406}
1407
Al Virob51655b2006-11-14 21:40:42 -08001408static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001409{
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001410 const struct iphdr *iph = ip_hdr(skb);
1411
Patrick McHardy84fa7932006-08-29 16:44:56 -07001412 if (skb->ip_summed == CHECKSUM_COMPLETE) {
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001413 if (!tcp_v4_check(skb->len, iph->saddr,
1414 iph->daddr, skb->csum)) {
Herbert Xufb286bb2005-11-10 13:01:24 -08001415 skb->ip_summed = CHECKSUM_UNNECESSARY;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001416 return 0;
Herbert Xufb286bb2005-11-10 13:01:24 -08001417 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001418 }
Herbert Xufb286bb2005-11-10 13:01:24 -08001419
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001420 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
Herbert Xufb286bb2005-11-10 13:01:24 -08001421 skb->len, IPPROTO_TCP, 0);
1422
Linus Torvalds1da177e2005-04-16 15:20:36 -07001423 if (skb->len <= 76) {
Herbert Xufb286bb2005-11-10 13:01:24 -08001424 return __skb_checksum_complete(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001425 }
1426 return 0;
1427}
1428
1429
1430/* The socket must have it's spinlock held when we get
1431 * here.
1432 *
1433 * We have a potential double-lock case here, so even when
1434 * doing backlog processing we use the BH locking scheme.
1435 * This is because we cannot sleep with the original spinlock
1436 * held.
1437 */
1438int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1439{
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001440 struct sock *rsk;
1441#ifdef CONFIG_TCP_MD5SIG
1442 /*
1443 * We really want to reject the packet as early as possible
1444 * if:
1445 * o We're expecting an MD5'd packet and this is no MD5 tcp option
1446 * o There is an MD5 option and we're not expecting one
1447 */
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001448 if (tcp_v4_inbound_md5_hash(sk, skb))
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001449 goto discard;
1450#endif
1451
Linus Torvalds1da177e2005-04-16 15:20:36 -07001452 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1453 TCP_CHECK_TIMER(sk);
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -07001454 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001455 rsk = sk;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001456 goto reset;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001457 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001458 TCP_CHECK_TIMER(sk);
1459 return 0;
1460 }
1461
Arnaldo Carvalho de Meloab6a5bb2007-03-18 17:43:48 -07001462 if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001463 goto csum_err;
1464
1465 if (sk->sk_state == TCP_LISTEN) {
1466 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1467 if (!nsk)
1468 goto discard;
1469
1470 if (nsk != sk) {
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001471 if (tcp_child_process(sk, nsk, skb)) {
1472 rsk = nsk;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001473 goto reset;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001474 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001475 return 0;
1476 }
1477 }
1478
1479 TCP_CHECK_TIMER(sk);
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -07001480 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001481 rsk = sk;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001482 goto reset;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001483 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001484 TCP_CHECK_TIMER(sk);
1485 return 0;
1486
1487reset:
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001488 tcp_v4_send_reset(rsk, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001489discard:
1490 kfree_skb(skb);
1491 /* Be careful here. If this function gets more complicated and
1492 * gcc suffers from register pressure on the x86, sk (in %ebx)
1493 * might be destroyed here. This current version compiles correctly,
1494 * but you have been warned.
1495 */
1496 return 0;
1497
1498csum_err:
1499 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1500 goto discard;
1501}
1502
1503/*
1504 * From tcp_input.c
1505 */
1506
1507int tcp_v4_rcv(struct sk_buff *skb)
1508{
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001509 const struct iphdr *iph;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001510 struct tcphdr *th;
1511 struct sock *sk;
1512 int ret;
1513
1514 if (skb->pkt_type != PACKET_HOST)
1515 goto discard_it;
1516
1517 /* Count it even if it's bad */
1518 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1519
1520 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1521 goto discard_it;
1522
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -07001523 th = tcp_hdr(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001524
1525 if (th->doff < sizeof(struct tcphdr) / 4)
1526 goto bad_packet;
1527 if (!pskb_may_pull(skb, th->doff * 4))
1528 goto discard_it;
1529
1530 /* An explanation is required here, I think.
1531 * Packet length and doff are validated by header prediction,
Stephen Hemmingercaa20d9a2005-11-10 17:13:47 -08001532 * provided case of th->doff==0 is eliminated.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001533 * So, we defer the checks. */
Herbert Xu60476372007-04-09 11:59:39 -07001534 if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001535 goto bad_packet;
1536
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -07001537 th = tcp_hdr(skb);
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001538 iph = ip_hdr(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001539 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1540 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1541 skb->len - th->doff * 4);
1542 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1543 TCP_SKB_CB(skb)->when = 0;
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001544 TCP_SKB_CB(skb)->flags = iph->tos;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001545 TCP_SKB_CB(skb)->sacked = 0;
1546
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001547 sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr,
Pavel Emelyanovc67499c2008-01-31 05:06:40 -08001548 th->source, iph->daddr, th->dest, inet_iif(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001549 if (!sk)
1550 goto no_tcp_socket;
1551
1552process:
1553 if (sk->sk_state == TCP_TIME_WAIT)
1554 goto do_time_wait;
1555
1556 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1557 goto discard_and_relse;
Patrick McHardyb59c2702006-01-06 23:06:10 -08001558 nf_reset(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001559
Dmitry Mishinfda9ef52006-08-31 15:28:39 -07001560 if (sk_filter(sk, skb))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001561 goto discard_and_relse;
1562
1563 skb->dev = NULL;
1564
Ingo Molnarc6366182006-07-03 00:25:13 -07001565 bh_lock_sock_nested(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001566 ret = 0;
1567 if (!sock_owned_by_user(sk)) {
Chris Leech1a2449a2006-05-23 18:05:53 -07001568#ifdef CONFIG_NET_DMA
1569 struct tcp_sock *tp = tcp_sk(sk);
1570 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1571 tp->ucopy.dma_chan = get_softnet_dma();
1572 if (tp->ucopy.dma_chan)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001573 ret = tcp_v4_do_rcv(sk, skb);
Chris Leech1a2449a2006-05-23 18:05:53 -07001574 else
1575#endif
1576 {
1577 if (!tcp_prequeue(sk, skb))
1578 ret = tcp_v4_do_rcv(sk, skb);
1579 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001580 } else
1581 sk_add_backlog(sk, skb);
1582 bh_unlock_sock(sk);
1583
1584 sock_put(sk);
1585
1586 return ret;
1587
1588no_tcp_socket:
1589 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1590 goto discard_it;
1591
1592 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1593bad_packet:
1594 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1595 } else {
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001596 tcp_v4_send_reset(NULL, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001597 }
1598
1599discard_it:
1600 /* Discard frame. */
1601 kfree_skb(skb);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001602 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001603
1604discard_and_relse:
1605 sock_put(sk);
1606 goto discard_it;
1607
1608do_time_wait:
1609 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
YOSHIFUJI Hideaki9469c7b2006-10-10 19:41:46 -07001610 inet_twsk_put(inet_twsk(sk));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001611 goto discard_it;
1612 }
1613
1614 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1615 TCP_INC_STATS_BH(TCP_MIB_INERRS);
YOSHIFUJI Hideaki9469c7b2006-10-10 19:41:46 -07001616 inet_twsk_put(inet_twsk(sk));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001617 goto discard_it;
1618 }
YOSHIFUJI Hideaki9469c7b2006-10-10 19:41:46 -07001619 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001620 case TCP_TW_SYN: {
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001621 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
Pavel Emelyanovc67499c2008-01-31 05:06:40 -08001622 &tcp_hashinfo,
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001623 iph->daddr, th->dest,
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001624 inet_iif(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001625 if (sk2) {
YOSHIFUJI Hideaki9469c7b2006-10-10 19:41:46 -07001626 inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1627 inet_twsk_put(inet_twsk(sk));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001628 sk = sk2;
1629 goto process;
1630 }
1631 /* Fall through to ACK */
1632 }
1633 case TCP_TW_ACK:
1634 tcp_v4_timewait_ack(sk, skb);
1635 break;
1636 case TCP_TW_RST:
1637 goto no_tcp_socket;
1638 case TCP_TW_SUCCESS:;
1639 }
1640 goto discard_it;
1641}
1642
Linus Torvalds1da177e2005-04-16 15:20:36 -07001643/* VJ's idea. Save last timestamp seen from this destination
1644 * and hold it at least for normal timewait interval to use for duplicate
1645 * segment detection in subsequent connections, before they enter synchronized
1646 * state.
1647 */
1648
1649int tcp_v4_remember_stamp(struct sock *sk)
1650{
1651 struct inet_sock *inet = inet_sk(sk);
1652 struct tcp_sock *tp = tcp_sk(sk);
1653 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1654 struct inet_peer *peer = NULL;
1655 int release_it = 0;
1656
1657 if (!rt || rt->rt_dst != inet->daddr) {
1658 peer = inet_getpeer(inet->daddr, 1);
1659 release_it = 1;
1660 } else {
1661 if (!rt->peer)
1662 rt_bind_peer(rt, 1);
1663 peer = rt->peer;
1664 }
1665
1666 if (peer) {
1667 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
James Morris9d729f72007-03-04 16:12:44 -08001668 (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001669 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1670 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1671 peer->tcp_ts = tp->rx_opt.ts_recent;
1672 }
1673 if (release_it)
1674 inet_putpeer(peer);
1675 return 1;
1676 }
1677
1678 return 0;
1679}
1680
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001681int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001682{
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001683 struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001684
1685 if (peer) {
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001686 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1687
1688 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
James Morris9d729f72007-03-04 16:12:44 -08001689 (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001690 peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1691 peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1692 peer->tcp_ts = tcptw->tw_ts_recent;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001693 }
1694 inet_putpeer(peer);
1695 return 1;
1696 }
1697
1698 return 0;
1699}
1700
Arnaldo Carvalho de Melo8292a172005-12-13 23:15:52 -08001701struct inet_connection_sock_af_ops ipv4_specific = {
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08001702 .queue_xmit = ip_queue_xmit,
1703 .send_check = tcp_v4_send_check,
1704 .rebuild_header = inet_sk_rebuild_header,
1705 .conn_request = tcp_v4_conn_request,
1706 .syn_recv_sock = tcp_v4_syn_recv_sock,
1707 .remember_stamp = tcp_v4_remember_stamp,
1708 .net_header_len = sizeof(struct iphdr),
1709 .setsockopt = ip_setsockopt,
1710 .getsockopt = ip_getsockopt,
1711 .addr2sockaddr = inet_csk_addr2sockaddr,
1712 .sockaddr_len = sizeof(struct sockaddr_in),
Arnaldo Carvalho de Meloab1e0a12008-02-03 04:06:04 -08001713 .bind_conflict = inet_csk_bind_conflict,
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08001714#ifdef CONFIG_COMPAT
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08001715 .compat_setsockopt = compat_ip_setsockopt,
1716 .compat_getsockopt = compat_ip_getsockopt,
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08001717#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001718};
1719
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 MD5 signature operations.
 * tcp_v4_init_sock() installs this table as tp->af_specific.
 */
static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_calc_md5_hash,
	.md5_add	= tcp_v4_md5_add_func,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001728
Linus Torvalds1da177e2005-04-16 15:20:36 -07001729/* NOTE: A lot of things set to zero explicitly by call to
1730 * sk_alloc() so need not be done here.
1731 */
1732static int tcp_v4_init_sock(struct sock *sk)
1733{
Arnaldo Carvalho de Melo6687e982005-08-10 04:03:31 -03001734 struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001735 struct tcp_sock *tp = tcp_sk(sk);
1736
1737 skb_queue_head_init(&tp->out_of_order_queue);
1738 tcp_init_xmit_timers(sk);
1739 tcp_prequeue_init(tp);
1740
Arnaldo Carvalho de Melo6687e982005-08-10 04:03:31 -03001741 icsk->icsk_rto = TCP_TIMEOUT_INIT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001742 tp->mdev = TCP_TIMEOUT_INIT;
1743
1744 /* So many TCP implementations out there (incorrectly) count the
1745 * initial SYN frame in their delayed-ACK and congestion control
1746 * algorithms that we must have the following bandaid to talk
1747 * efficiently to them. -DaveM
1748 */
1749 tp->snd_cwnd = 2;
1750
1751 /* See draft-stevens-tcpca-spec-01 for discussion of the
1752 * initialization of these values.
1753 */
1754 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1755 tp->snd_cwnd_clamp = ~0;
David S. Millerc1b4a7e2005-07-05 15:24:38 -07001756 tp->mss_cache = 536;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001757
1758 tp->reordering = sysctl_tcp_reordering;
Arnaldo Carvalho de Melo6687e982005-08-10 04:03:31 -03001759 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001760
1761 sk->sk_state = TCP_CLOSE;
1762
1763 sk->sk_write_space = sk_stream_write_space;
1764 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1765
Arnaldo Carvalho de Melo8292a172005-12-13 23:15:52 -08001766 icsk->icsk_af_ops = &ipv4_specific;
Arnaldo Carvalho de Melod83d8462005-12-13 23:26:10 -08001767 icsk->icsk_sync_mss = tcp_sync_mss;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001768#ifdef CONFIG_TCP_MD5SIG
1769 tp->af_specific = &tcp_sock_ipv4_specific;
1770#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001771
1772 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1773 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1774
1775 atomic_inc(&tcp_sockets_allocated);
1776
1777 return 0;
1778}
1779
1780int tcp_v4_destroy_sock(struct sock *sk)
1781{
1782 struct tcp_sock *tp = tcp_sk(sk);
1783
1784 tcp_clear_xmit_timers(sk);
1785
Arnaldo Carvalho de Melo6687e982005-08-10 04:03:31 -03001786 tcp_cleanup_congestion_control(sk);
Stephen Hemminger317a76f2005-06-23 12:19:55 -07001787
Linus Torvalds1da177e2005-04-16 15:20:36 -07001788 /* Cleanup up the write buffer. */
David S. Millerfe067e82007-03-07 12:12:44 -08001789 tcp_write_queue_purge(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001790
1791 /* Cleans up our, hopefully empty, out_of_order_queue. */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001792 __skb_queue_purge(&tp->out_of_order_queue);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001793
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001794#ifdef CONFIG_TCP_MD5SIG
1795 /* Clean up the MD5 key list, if any */
1796 if (tp->md5sig_info) {
1797 tcp_v4_clear_md5_list(sk);
1798 kfree(tp->md5sig_info);
1799 tp->md5sig_info = NULL;
1800 }
1801#endif
1802
Chris Leech1a2449a2006-05-23 18:05:53 -07001803#ifdef CONFIG_NET_DMA
1804 /* Cleans up our sk_async_wait_queue */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001805 __skb_queue_purge(&sk->sk_async_wait_queue);
Chris Leech1a2449a2006-05-23 18:05:53 -07001806#endif
1807
Linus Torvalds1da177e2005-04-16 15:20:36 -07001808 /* Clean prequeue, it must be empty really */
1809 __skb_queue_purge(&tp->ucopy.prequeue);
1810
1811 /* Clean up a referenced TCP bind bucket. */
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001812 if (inet_csk(sk)->icsk_bind_hash)
Arnaldo Carvalho de Meloab1e0a12008-02-03 04:06:04 -08001813 inet_put_port(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001814
1815 /*
1816 * If sendmsg cached page exists, toss it.
1817 */
1818 if (sk->sk_sndmsg_page) {
1819 __free_page(sk->sk_sndmsg_page);
1820 sk->sk_sndmsg_page = NULL;
1821 }
1822
Patrick McManusec3c0982008-03-21 16:33:01 -07001823 if (tp->defer_tcp_accept.request) {
1824 reqsk_free(tp->defer_tcp_accept.request);
1825 sock_put(tp->defer_tcp_accept.listen_sk);
1826 sock_put(sk);
1827 tp->defer_tcp_accept.listen_sk = NULL;
1828 tp->defer_tcp_accept.request = NULL;
1829 }
1830
Linus Torvalds1da177e2005-04-16 15:20:36 -07001831 atomic_dec(&tcp_sockets_allocated);
1832
1833 return 0;
1834}
1835
1836EXPORT_SYMBOL(tcp_v4_destroy_sock);
1837
1838#ifdef CONFIG_PROC_FS
1839/* Proc filesystem TCP sock list dumping. */
1840
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001841static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001842{
1843 return hlist_empty(head) ? NULL :
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001844 list_entry(head->first, struct inet_timewait_sock, tw_node);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001845}
1846
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001847static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001848{
1849 return tw->tw_node.next ?
1850 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1851}
1852
1853static void *listening_get_next(struct seq_file *seq, void *cur)
1854{
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001855 struct inet_connection_sock *icsk;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001856 struct hlist_node *node;
1857 struct sock *sk = cur;
1858 struct tcp_iter_state* st = seq->private;
Denis V. Luneva4146b12008-04-13 22:11:14 -07001859 struct net *net = seq_file_net(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001860
1861 if (!sk) {
1862 st->bucket = 0;
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07001863 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001864 goto get_sk;
1865 }
1866
1867 ++st->num;
1868
1869 if (st->state == TCP_SEQ_STATE_OPENREQ) {
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001870 struct request_sock *req = cur;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001871
Eric Dumazet72a3eff2006-11-16 02:30:37 -08001872 icsk = inet_csk(st->syn_wait_sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001873 req = req->dl_next;
1874 while (1) {
1875 while (req) {
Daniel Lezcanof40c8172008-03-21 04:13:54 -07001876 if (req->rsk_ops->family == st->family &&
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09001877 net_eq(sock_net(req->sk), net)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001878 cur = req;
1879 goto out;
1880 }
1881 req = req->dl_next;
1882 }
Eric Dumazet72a3eff2006-11-16 02:30:37 -08001883 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001884 break;
1885get_req:
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001886 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
Linus Torvalds1da177e2005-04-16 15:20:36 -07001887 }
1888 sk = sk_next(st->syn_wait_sk);
1889 st->state = TCP_SEQ_STATE_LISTENING;
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001890 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001891 } else {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001892 icsk = inet_csk(sk);
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001893 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1894 if (reqsk_queue_len(&icsk->icsk_accept_queue))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001895 goto start_req;
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001896 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001897 sk = sk_next(sk);
1898 }
1899get_sk:
1900 sk_for_each_from(sk, node) {
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09001901 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001902 cur = sk;
1903 goto out;
1904 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001905 icsk = inet_csk(sk);
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001906 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1907 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001908start_req:
1909 st->uid = sock_i_uid(sk);
1910 st->syn_wait_sk = sk;
1911 st->state = TCP_SEQ_STATE_OPENREQ;
1912 st->sbucket = 0;
1913 goto get_req;
1914 }
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001915 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001916 }
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -07001917 if (++st->bucket < INET_LHTABLE_SIZE) {
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07001918 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001919 goto get_sk;
1920 }
1921 cur = NULL;
1922out:
1923 return cur;
1924}
1925
1926static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1927{
1928 void *rc = listening_get_next(seq, NULL);
1929
1930 while (rc && *pos) {
1931 rc = listening_get_next(seq, rc);
1932 --*pos;
1933 }
1934 return rc;
1935}
1936
1937static void *established_get_first(struct seq_file *seq)
1938{
1939 struct tcp_iter_state* st = seq->private;
Denis V. Luneva4146b12008-04-13 22:11:14 -07001940 struct net *net = seq_file_net(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001941 void *rc = NULL;
1942
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07001943 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001944 struct sock *sk;
1945 struct hlist_node *node;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001946 struct inet_timewait_sock *tw;
Eric Dumazet230140c2007-11-07 02:40:20 -08001947 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001948
Eric Dumazet230140c2007-11-07 02:40:20 -08001949 read_lock_bh(lock);
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07001950 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
Daniel Lezcanof40c8172008-03-21 04:13:54 -07001951 if (sk->sk_family != st->family ||
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09001952 !net_eq(sock_net(sk), net)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001953 continue;
1954 }
1955 rc = sk;
1956 goto out;
1957 }
1958 st->state = TCP_SEQ_STATE_TIME_WAIT;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001959 inet_twsk_for_each(tw, node,
Eric Dumazetdbca9b2752007-02-08 14:16:46 -08001960 &tcp_hashinfo.ehash[st->bucket].twchain) {
Pavel Emelyanov28518fc2008-03-21 15:52:00 -07001961 if (tw->tw_family != st->family ||
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09001962 !net_eq(twsk_net(tw), net)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001963 continue;
1964 }
1965 rc = tw;
1966 goto out;
1967 }
Eric Dumazet230140c2007-11-07 02:40:20 -08001968 read_unlock_bh(lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001969 st->state = TCP_SEQ_STATE_ESTABLISHED;
1970 }
1971out:
1972 return rc;
1973}
1974
1975static void *established_get_next(struct seq_file *seq, void *cur)
1976{
1977 struct sock *sk = cur;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001978 struct inet_timewait_sock *tw;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001979 struct hlist_node *node;
1980 struct tcp_iter_state* st = seq->private;
Denis V. Luneva4146b12008-04-13 22:11:14 -07001981 struct net *net = seq_file_net(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001982
1983 ++st->num;
1984
1985 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1986 tw = cur;
1987 tw = tw_next(tw);
1988get_tw:
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09001989 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001990 tw = tw_next(tw);
1991 }
1992 if (tw) {
1993 cur = tw;
1994 goto out;
1995 }
Eric Dumazet230140c2007-11-07 02:40:20 -08001996 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001997 st->state = TCP_SEQ_STATE_ESTABLISHED;
1998
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07001999 if (++st->bucket < tcp_hashinfo.ehash_size) {
Eric Dumazet230140c2007-11-07 02:40:20 -08002000 read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07002001 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002002 } else {
2003 cur = NULL;
2004 goto out;
2005 }
2006 } else
2007 sk = sk_next(sk);
2008
2009 sk_for_each_from(sk, node) {
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09002010 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002011 goto found;
2012 }
2013
2014 st->state = TCP_SEQ_STATE_TIME_WAIT;
Eric Dumazetdbca9b2752007-02-08 14:16:46 -08002015 tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002016 goto get_tw;
2017found:
2018 cur = sk;
2019out:
2020 return cur;
2021}
2022
2023static void *established_get_idx(struct seq_file *seq, loff_t pos)
2024{
2025 void *rc = established_get_first(seq);
2026
2027 while (rc && pos) {
2028 rc = established_get_next(seq, rc);
2029 --pos;
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02002030 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002031 return rc;
2032}
2033
2034static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2035{
2036 void *rc;
2037 struct tcp_iter_state* st = seq->private;
2038
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -07002039 inet_listen_lock(&tcp_hashinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002040 st->state = TCP_SEQ_STATE_LISTENING;
2041 rc = listening_get_idx(seq, &pos);
2042
2043 if (!rc) {
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -07002044 inet_listen_unlock(&tcp_hashinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002045 st->state = TCP_SEQ_STATE_ESTABLISHED;
2046 rc = established_get_idx(seq, pos);
2047 }
2048
2049 return rc;
2050}
2051
2052static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2053{
2054 struct tcp_iter_state* st = seq->private;
2055 st->state = TCP_SEQ_STATE_LISTENING;
2056 st->num = 0;
2057 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2058}
2059
2060static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2061{
2062 void *rc = NULL;
2063 struct tcp_iter_state* st;
2064
2065 if (v == SEQ_START_TOKEN) {
2066 rc = tcp_get_idx(seq, 0);
2067 goto out;
2068 }
2069 st = seq->private;
2070
2071 switch (st->state) {
2072 case TCP_SEQ_STATE_OPENREQ:
2073 case TCP_SEQ_STATE_LISTENING:
2074 rc = listening_get_next(seq, v);
2075 if (!rc) {
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -07002076 inet_listen_unlock(&tcp_hashinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002077 st->state = TCP_SEQ_STATE_ESTABLISHED;
2078 rc = established_get_first(seq);
2079 }
2080 break;
2081 case TCP_SEQ_STATE_ESTABLISHED:
2082 case TCP_SEQ_STATE_TIME_WAIT:
2083 rc = established_get_next(seq, v);
2084 break;
2085 }
2086out:
2087 ++*pos;
2088 return rc;
2089}
2090
2091static void tcp_seq_stop(struct seq_file *seq, void *v)
2092{
2093 struct tcp_iter_state* st = seq->private;
2094
2095 switch (st->state) {
2096 case TCP_SEQ_STATE_OPENREQ:
2097 if (v) {
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07002098 struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2099 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002100 }
2101 case TCP_SEQ_STATE_LISTENING:
2102 if (v != SEQ_START_TOKEN)
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -07002103 inet_listen_unlock(&tcp_hashinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002104 break;
2105 case TCP_SEQ_STATE_TIME_WAIT:
2106 case TCP_SEQ_STATE_ESTABLISHED:
2107 if (v)
Eric Dumazet230140c2007-11-07 02:40:20 -08002108 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002109 break;
2110 }
2111}
2112
2113static int tcp_seq_open(struct inode *inode, struct file *file)
2114{
2115 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002116 struct tcp_iter_state *s;
Denis V. Lunev52d6f3f2008-04-13 22:12:41 -07002117 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002118
Denis V. Lunev52d6f3f2008-04-13 22:12:41 -07002119 err = seq_open_net(inode, file, &afinfo->seq_ops,
2120 sizeof(struct tcp_iter_state));
2121 if (err < 0)
2122 return err;
Daniel Lezcanof40c8172008-03-21 04:13:54 -07002123
Denis V. Lunev52d6f3f2008-04-13 22:12:41 -07002124 s = ((struct seq_file *)file->private_data)->private;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002125 s->family = afinfo->family;
Daniel Lezcanof40c8172008-03-21 04:13:54 -07002126 return 0;
2127}
2128
Daniel Lezcano6f8b13b2008-03-21 04:14:45 -07002129int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002130{
2131 int rc = 0;
2132 struct proc_dir_entry *p;
2133
Denis V. Lunev68fcadd2008-04-13 22:13:30 -07002134 afinfo->seq_fops.open = tcp_seq_open;
2135 afinfo->seq_fops.read = seq_read;
2136 afinfo->seq_fops.llseek = seq_lseek;
2137 afinfo->seq_fops.release = seq_release_net;
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02002138
Denis V. Lunev9427c4b2008-04-13 22:12:13 -07002139 afinfo->seq_ops.start = tcp_seq_start;
2140 afinfo->seq_ops.next = tcp_seq_next;
2141 afinfo->seq_ops.stop = tcp_seq_stop;
2142
Denis V. Lunev84841c32008-05-02 04:10:08 -07002143 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2144 &afinfo->seq_fops, afinfo);
2145 if (!p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002146 rc = -ENOMEM;
2147 return rc;
2148}
2149
Daniel Lezcano6f8b13b2008-03-21 04:14:45 -07002150void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002151{
Daniel Lezcano6f8b13b2008-03-21 04:14:45 -07002152 proc_net_remove(net, afinfo->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002153}
2154
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07002155static void get_openreq4(struct sock *sk, struct request_sock *req,
Pavel Emelyanov5e659e42008-04-24 01:02:16 -07002156 struct seq_file *f, int i, int uid, int *len)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002157{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002158 const struct inet_request_sock *ireq = inet_rsk(req);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002159 int ttd = req->expires - jiffies;
2160
Pavel Emelyanov5e659e42008-04-24 01:02:16 -07002161 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2162 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07002163 i,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002164 ireq->loc_addr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002165 ntohs(inet_sk(sk)->sport),
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002166 ireq->rmt_addr,
2167 ntohs(ireq->rmt_port),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002168 TCP_SYN_RECV,
2169 0, 0, /* could print option size, but that is af dependent. */
2170 1, /* timers active (only the expire timer) */
2171 jiffies_to_clock_t(ttd),
2172 req->retrans,
2173 uid,
2174 0, /* non standard timer */
2175 0, /* open_requests have no inode */
2176 atomic_read(&sk->sk_refcnt),
Pavel Emelyanov5e659e42008-04-24 01:02:16 -07002177 req,
2178 len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002179}
2180
Pavel Emelyanov5e659e42008-04-24 01:02:16 -07002181static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002182{
2183 int timer_active;
2184 unsigned long timer_expires;
Ilpo Järvinencf4c6bf2007-02-22 01:13:58 -08002185 struct tcp_sock *tp = tcp_sk(sk);
2186 const struct inet_connection_sock *icsk = inet_csk(sk);
2187 struct inet_sock *inet = inet_sk(sk);
Al Viro714e85b2006-11-14 20:51:49 -08002188 __be32 dest = inet->daddr;
2189 __be32 src = inet->rcv_saddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002190 __u16 destp = ntohs(inet->dport);
2191 __u16 srcp = ntohs(inet->sport);
2192
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07002193 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002194 timer_active = 1;
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07002195 timer_expires = icsk->icsk_timeout;
2196 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002197 timer_active = 4;
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07002198 timer_expires = icsk->icsk_timeout;
Ilpo Järvinencf4c6bf2007-02-22 01:13:58 -08002199 } else if (timer_pending(&sk->sk_timer)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002200 timer_active = 2;
Ilpo Järvinencf4c6bf2007-02-22 01:13:58 -08002201 timer_expires = sk->sk_timer.expires;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002202 } else {
2203 timer_active = 0;
2204 timer_expires = jiffies;
2205 }
2206
Pavel Emelyanov5e659e42008-04-24 01:02:16 -07002207 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2208 "%08X %5d %8d %lu %d %p %u %u %u %u %d%n",
Ilpo Järvinencf4c6bf2007-02-22 01:13:58 -08002209 i, src, srcp, dest, destp, sk->sk_state,
Sridhar Samudrala47da8ee2006-06-27 13:29:00 -07002210 tp->write_seq - tp->snd_una,
Ilpo Järvinencf4c6bf2007-02-22 01:13:58 -08002211 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02002212 (tp->rcv_nxt - tp->copied_seq),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002213 timer_active,
2214 jiffies_to_clock_t(timer_expires - jiffies),
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07002215 icsk->icsk_retransmits,
Ilpo Järvinencf4c6bf2007-02-22 01:13:58 -08002216 sock_i_uid(sk),
Arnaldo Carvalho de Melo6687e982005-08-10 04:03:31 -03002217 icsk->icsk_probes_out,
Ilpo Järvinencf4c6bf2007-02-22 01:13:58 -08002218 sock_i_ino(sk),
2219 atomic_read(&sk->sk_refcnt), sk,
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07002220 icsk->icsk_rto,
2221 icsk->icsk_ack.ato,
2222 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002223 tp->snd_cwnd,
Pavel Emelyanov5e659e42008-04-24 01:02:16 -07002224 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
2225 len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002226}
2227
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02002228static void get_timewait4_sock(struct inet_timewait_sock *tw,
Pavel Emelyanov5e659e42008-04-24 01:02:16 -07002229 struct seq_file *f, int i, int *len)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002230{
Al Viro23f33c22006-09-27 18:43:50 -07002231 __be32 dest, src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002232 __u16 destp, srcp;
2233 int ttd = tw->tw_ttd - jiffies;
2234
2235 if (ttd < 0)
2236 ttd = 0;
2237
2238 dest = tw->tw_daddr;
2239 src = tw->tw_rcv_saddr;
2240 destp = ntohs(tw->tw_dport);
2241 srcp = ntohs(tw->tw_sport);
2242
Pavel Emelyanov5e659e42008-04-24 01:02:16 -07002243 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2244 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07002245 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2246 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
Pavel Emelyanov5e659e42008-04-24 01:02:16 -07002247 atomic_read(&tw->tw_refcnt), tw, len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002248}
2249
2250#define TMPSZ 150
2251
2252static int tcp4_seq_show(struct seq_file *seq, void *v)
2253{
2254 struct tcp_iter_state* st;
Pavel Emelyanov5e659e42008-04-24 01:02:16 -07002255 int len;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002256
2257 if (v == SEQ_START_TOKEN) {
2258 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2259 " sl local_address rem_address st tx_queue "
2260 "rx_queue tr tm->when retrnsmt uid timeout "
2261 "inode");
2262 goto out;
2263 }
2264 st = seq->private;
2265
2266 switch (st->state) {
2267 case TCP_SEQ_STATE_LISTENING:
2268 case TCP_SEQ_STATE_ESTABLISHED:
Pavel Emelyanov5e659e42008-04-24 01:02:16 -07002269 get_tcp4_sock(v, seq, st->num, &len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002270 break;
2271 case TCP_SEQ_STATE_OPENREQ:
Pavel Emelyanov5e659e42008-04-24 01:02:16 -07002272 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002273 break;
2274 case TCP_SEQ_STATE_TIME_WAIT:
Pavel Emelyanov5e659e42008-04-24 01:02:16 -07002275 get_timewait4_sock(v, seq, st->num, &len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002276 break;
2277 }
Pavel Emelyanov5e659e42008-04-24 01:02:16 -07002278 seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
Linus Torvalds1da177e2005-04-16 15:20:36 -07002279out:
2280 return 0;
2281}
2282
Linus Torvalds1da177e2005-04-16 15:20:36 -07002283static struct tcp_seq_afinfo tcp4_seq_afinfo = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002284 .name = "tcp",
2285 .family = AF_INET,
Denis V. Lunev5f4472c2008-04-13 22:13:53 -07002286 .seq_fops = {
2287 .owner = THIS_MODULE,
2288 },
Denis V. Lunev9427c4b2008-04-13 22:12:13 -07002289 .seq_ops = {
2290 .show = tcp4_seq_show,
2291 },
Linus Torvalds1da177e2005-04-16 15:20:36 -07002292};
2293
Pavel Emelyanov757764f2008-03-24 14:56:02 -07002294static int tcp4_proc_init_net(struct net *net)
2295{
2296 return tcp_proc_register(net, &tcp4_seq_afinfo);
2297}
2298
2299static void tcp4_proc_exit_net(struct net *net)
2300{
2301 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2302}
2303
2304static struct pernet_operations tcp4_net_ops = {
2305 .init = tcp4_proc_init_net,
2306 .exit = tcp4_proc_exit_net,
2307};
2308
Linus Torvalds1da177e2005-04-16 15:20:36 -07002309int __init tcp4_proc_init(void)
2310{
Pavel Emelyanov757764f2008-03-24 14:56:02 -07002311 return register_pernet_subsys(&tcp4_net_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002312}
2313
2314void tcp4_proc_exit(void)
2315{
Pavel Emelyanov757764f2008-03-24 14:56:02 -07002316 unregister_pernet_subsys(&tcp4_net_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002317}
2318#endif /* CONFIG_PROC_FS */
2319
2320struct proto tcp_prot = {
2321 .name = "TCP",
2322 .owner = THIS_MODULE,
2323 .close = tcp_close,
2324 .connect = tcp_v4_connect,
2325 .disconnect = tcp_disconnect,
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07002326 .accept = inet_csk_accept,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002327 .ioctl = tcp_ioctl,
2328 .init = tcp_v4_init_sock,
2329 .destroy = tcp_v4_destroy_sock,
2330 .shutdown = tcp_shutdown,
2331 .setsockopt = tcp_setsockopt,
2332 .getsockopt = tcp_getsockopt,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002333 .recvmsg = tcp_recvmsg,
2334 .backlog_rcv = tcp_v4_do_rcv,
Arnaldo Carvalho de Meloab1e0a12008-02-03 04:06:04 -08002335 .hash = inet_hash,
2336 .unhash = inet_unhash,
2337 .get_port = inet_csk_get_port,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002338 .enter_memory_pressure = tcp_enter_memory_pressure,
2339 .sockets_allocated = &tcp_sockets_allocated,
Arnaldo Carvalho de Melo0a5578c2005-08-09 20:11:41 -07002340 .orphan_count = &tcp_orphan_count,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002341 .memory_allocated = &tcp_memory_allocated,
2342 .memory_pressure = &tcp_memory_pressure,
2343 .sysctl_mem = sysctl_tcp_mem,
2344 .sysctl_wmem = sysctl_tcp_wmem,
2345 .sysctl_rmem = sysctl_tcp_rmem,
2346 .max_header = MAX_TCP_HEADER,
2347 .obj_size = sizeof(struct tcp_sock),
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002348 .twsk_prot = &tcp_timewait_sock_ops,
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07002349 .rsk_prot = &tcp_request_sock_ops,
Pavel Emelyanov39d8cda2008-03-22 16:50:58 -07002350 .h.hashinfo = &tcp_hashinfo,
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08002351#ifdef CONFIG_COMPAT
2352 .compat_setsockopt = compat_tcp_setsockopt,
2353 .compat_getsockopt = compat_tcp_getsockopt,
2354#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002355};
2356
Denis V. Lunev046ee902008-04-03 14:31:33 -07002357
2358static int __net_init tcp_sk_init(struct net *net)
2359{
2360 return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2361 PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2362}
2363
2364static void __net_exit tcp_sk_exit(struct net *net)
2365{
2366 inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2367}
2368
2369static struct pernet_operations __net_initdata tcp_sk_ops = {
2370 .init = tcp_sk_init,
2371 .exit = tcp_sk_exit,
2372};
2373
Denis V. Lunev9b0f9762008-02-29 11:13:15 -08002374void __init tcp_v4_init(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002375{
Denis V. Lunev046ee902008-04-03 14:31:33 -07002376 if (register_pernet_device(&tcp_sk_ops))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002377 panic("Failed to create the TCP control socket.\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07002378}
2379
/* Symbols exported from this file. */
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

/* proc registration helpers only exist when procfs is built in */
#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_tcp_low_latency);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002395