blob: 3696c83aec19520861762b94472e546748e3e4c2 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 * IPv4 specific functions
11 *
12 *
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
17 *
18 * See tcp.c for author information
19 *
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
24 */
25
26/*
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
36 * ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -070039 * request_sock handling and moved
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
Stephen Hemmingercaa20d9a2005-11-10 17:13:47 -080042 * Added new listen semantics.
Linus Torvalds1da177e2005-04-16 15:20:36 -070043 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
47 * coma.
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
53 */
54
Linus Torvalds1da177e2005-04-16 15:20:36 -070055
56#include <linux/types.h>
57#include <linux/fcntl.h>
58#include <linux/module.h>
59#include <linux/random.h>
60#include <linux/cache.h>
61#include <linux/jhash.h>
62#include <linux/init.h>
63#include <linux/times.h>
64
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020065#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070066#include <net/icmp.h>
Arnaldo Carvalho de Melo304a1612005-08-09 19:59:20 -070067#include <net/inet_hashtables.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070068#include <net/tcp.h>
Arnaldo Carvalho de Melo20380732005-08-16 02:18:02 -030069#include <net/transp_v6.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070070#include <net/ipv6.h>
71#include <net/inet_common.h>
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -080072#include <net/timewait_sock.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070073#include <net/xfrm.h>
Chris Leech1a2449a2006-05-23 18:05:53 -070074#include <net/netdma.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070075
76#include <linux/inet.h>
77#include <linux/ipv6.h>
78#include <linux/stddef.h>
79#include <linux/proc_fs.h>
80#include <linux/seq_file.h>
81
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -080082#include <linux/crypto.h>
83#include <linux/scatterlist.h>
84
/*
 * Tunables with file-scope storage (zero until something sets them).
 * sysctl_tcp_tw_reuse is consulted by tcp_twsk_unique() below;
 * sysctl_tcp_low_latency is not read in this part of the file.
 */
int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Forward declaration; the definition appears further down in this file. */
void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);

#ifdef CONFIG_TCP_MD5SIG
/*
 * Forward declarations of the MD5 signature helpers used by the RST/ACK
 * senders below when CONFIG_TCP_MD5SIG is enabled.
 */
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
						   __be32 addr);
static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
				   __be32 saddr, __be32 daddr,
				   struct tcphdr *th, int protocol,
				   unsigned int tcplen);
#endif

/*
 * The TCP socket hash tables.  Only the listening-hash lock, user count
 * and wait queue are initialised statically here; the remaining members
 * are left zeroed by this initialiser.
 */
struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
	.lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
	.lhash_users = ATOMIC_INIT(0),
	.lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};
107
Gerrit Renkera94f7232006-11-10 14:06:49 -0800108static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700109{
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700110 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
111 ip_hdr(skb)->saddr,
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -0700112 tcp_hdr(skb)->dest,
113 tcp_hdr(skb)->source);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700114}
115
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -0800116int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
117{
118 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
119 struct tcp_sock *tp = tcp_sk(sk);
120
121 /* With PAWS, it is safe from the viewpoint
122 of data integrity. Even without PAWS it is safe provided sequence
123 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
124
125 Actually, the idea is close to VJ's one, only timestamp cache is
126 held not per host, but per port pair and TW bucket is used as state
127 holder.
128
129 If TW bucket has been already destroyed we fall back to VJ's scheme
130 and use initial timestamp retrieved from peer table.
131 */
132 if (tcptw->tw_ts_recent_stamp &&
133 (twp == NULL || (sysctl_tcp_tw_reuse &&
James Morris9d729f72007-03-04 16:12:44 -0800134 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -0800135 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
136 if (tp->write_seq == 0)
137 tp->write_seq = 1;
138 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
139 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
140 sock_hold(sktw);
141 return 1;
142 }
143
144 return 0;
145}
146
147EXPORT_SYMBOL_GPL(tcp_twsk_unique);
148
/*
 * This will initiate an outgoing connection.
 *
 * @sk:       socket to connect
 * @uaddr:    destination, interpreted as a struct sockaddr_in
 * @addr_len: size of the buffer behind @uaddr
 *
 * Returns 0 on success or a negative errno.  Failures before the state
 * change to SYN-SENT return directly; failures afterwards go through the
 * "failure" label, which moves the socket back to TCP_CLOSE and clears
 * the destination port and route capabilities.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	__be32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	/*
	 * With a source-route option set, route towards the option's first
	 * hop (faddr) instead of the final destination.
	 */
	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk, 1);
	if (tmp < 0) {
		if (tmp == -ENETUNREACH)
			IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return tmp;
	}

	/* Refuse routes flagged as multicast or broadcast. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	/* Without source routing, adopt the destination the route resolved. */
	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	/* Fill in the source address from the route if none is set yet. */
	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);
		/*
		 * VJ's idea. We save last timestamp seen from
		 * the destination in peer table, when entering state
		 * TIME-WAIT, and initialize rx_opt.ts_recent from it,
		 * when trying new connection.
		 */
		if (peer != NULL &&
		    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
			tp->rx_opt.ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	/* Account for the length of any IP options in the extension header size. */
	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet->opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

	/* NOTE(review): 536 is presumably the default MSS (576 - 40) - confirm. */
	tp->rx_opt.mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	/* Re-resolve the route now that the ports have been chosen. */
	err = ip_route_newports(&rt, IPPROTO_TCP,
				inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->u.dst);

	/* Keep a write_seq inherited earlier; otherwise generate a fresh one. */
	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	/*
	 * Clear the local pointer so the failure path's ip_rt_put() receives
	 * NULL.  NOTE(review): presumably the route reference now belongs to
	 * the socket via sk_setup_caps() - confirm.
	 */
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	return err;
}
272
Linus Torvalds1da177e2005-04-16 15:20:36 -0700273/*
274 * This routine does path mtu discovery as defined in RFC1191.
275 */
Stephen Hemminger40efc6f2006-01-03 16:03:49 -0800276static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700277{
278 struct dst_entry *dst;
279 struct inet_sock *inet = inet_sk(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700280
281 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
282 * send out by Linux are always <576bytes so they should go through
283 * unfragmented).
284 */
285 if (sk->sk_state == TCP_LISTEN)
286 return;
287
288 /* We don't check in the destentry if pmtu discovery is forbidden
289 * on this route. We just assume that no packet_to_big packets
290 * are send back when pmtu discovery is not active.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900291 * There is a small race when the user changes this flag in the
Linus Torvalds1da177e2005-04-16 15:20:36 -0700292 * route, but I think that's acceptable.
293 */
294 if ((dst = __sk_dst_check(sk, 0)) == NULL)
295 return;
296
297 dst->ops->update_pmtu(dst, mtu);
298
299 /* Something is about to be wrong... Remember soft error
300 * for the case, if this connection will not able to recover.
301 */
302 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
303 sk->sk_err_soft = EMSGSIZE;
304
305 mtu = dst_mtu(dst);
306
307 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
Arnaldo Carvalho de Melod83d8462005-12-13 23:26:10 -0800308 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700309 tcp_sync_mss(sk, mtu);
310
311 /* Resend the TCP packet because it's
312 * clear that the old packet has been
313 * dropped. This is the new "fast" path mtu
314 * discovery.
315 */
316 tcp_simple_retransmit(sk);
317 } /* else let the usual retransmit timer handle it */
318}
319
/*
 * ICMP error handler for TCP over IPv4.
 *
 * @skb:  buffer whose data starts at the IP header of the offending
 *        packet quoted in the ICMP message, followed by at least the
 *        first 8 bytes of its TCP header
 * @info: extra value from the ICMP message; it is only used here as the
 *        MTU handed to do_pmtu_discovery() for ICMP_FRAG_NEEDED
 *
 * The quoted addresses and ports are used to find the socket the error
 * belongs to.  Depending on ICMP type/code and socket state the error is
 * ignored, triggers PMTU discovery, drops a pending connection request,
 * or is recorded on the socket as a hard (sk_err) or soft (sk_err_soft)
 * error.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	__u32 seq;
	int err;

	/* Need the full quoted IP header plus 8 bytes of TCP header. */
	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest,
			iph->saddr, th->source, inet_iif(skb));
	if (!sk) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}
	/* TIME-WAIT sockets: just drop the reference the lookup returned. */
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 *
	 * Note that this only bumps the counter; processing continues and
	 * the individual paths below re-check sock_owned_by_user().
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	/*
	 * Except for listening sockets, the quoted sequence number must lie
	 * within [snd_una, snd_nxt]; otherwise the ICMP is ignored.
	 */
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	/* Translate the ICMP type/code into an errno, or bail out. */
	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	/*
	 * States handled inside the switch all end in "goto out"; every
	 * other state falls through to the generic handling after it.
	 */
	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		BUG_TRAP(!req->sk);

		/* The quoted sequence must be the ISN we sent for this request. */
		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Normally cannot happen, but it can,
			       for example, if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			/* Report a hard error and finish the socket. */
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	/* Release the lock and drop the reference taken by the lookup. */
	bh_unlock_sock(sk);
	sock_put(sk);
}
482
483/* This routine computes an IPv4 TCP checksum. */
Arnaldo Carvalho de Melo8292a172005-12-13 23:15:52 -0800484void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700485{
486 struct inet_sock *inet = inet_sk(sk);
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -0700487 struct tcphdr *th = tcp_hdr(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700488
Patrick McHardy84fa7932006-08-29 16:44:56 -0700489 if (skb->ip_summed == CHECKSUM_PARTIAL) {
Frederik Deweerdtba7808e2007-02-04 20:15:27 -0800490 th->check = ~tcp_v4_check(len, inet->saddr,
491 inet->daddr, 0);
Herbert Xu663ead32007-04-09 11:59:07 -0700492 skb->csum_start = skb_transport_header(skb) - skb->head;
Al Viroff1dcad2006-11-20 18:07:29 -0800493 skb->csum_offset = offsetof(struct tcphdr, check);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700494 } else {
Frederik Deweerdtba7808e2007-02-04 20:15:27 -0800495 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700496 csum_partial((char *)th,
497 th->doff << 2,
498 skb->csum));
499 }
500}
501
Herbert Xua430a432006-07-08 13:34:56 -0700502int tcp_v4_gso_send_check(struct sk_buff *skb)
503{
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700504 const struct iphdr *iph;
Herbert Xua430a432006-07-08 13:34:56 -0700505 struct tcphdr *th;
506
507 if (!pskb_may_pull(skb, sizeof(*th)))
508 return -EINVAL;
509
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700510 iph = ip_hdr(skb);
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -0700511 th = tcp_hdr(skb);
Herbert Xua430a432006-07-08 13:34:56 -0700512
513 th->check = 0;
Frederik Deweerdtba7808e2007-02-04 20:15:27 -0800514 th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
Herbert Xu663ead32007-04-09 11:59:07 -0700515 skb->csum_start = skb_transport_header(skb) - skb->head;
Al Viroff1dcad2006-11-20 18:07:29 -0800516 skb->csum_offset = offsetof(struct tcphdr, check);
Patrick McHardy84fa7932006-08-29 16:44:56 -0700517 skb->ip_summed = CHECKSUM_PARTIAL;
Herbert Xua430a432006-07-08 13:34:56 -0700518 return 0;
519}
520
Linus Torvalds1da177e2005-04-16 15:20:36 -0700521/*
522 * This routine will send an RST to the other tcp.
523 *
524 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
525 * for reset.
526 * Answer: if a packet caused RST, it is not for a socket
527 * existing in our system, if it is matched to a socket,
528 * it is just duplicate segment or bug in other side's TCP.
529 * So that we build reply only basing on parameters
530 * arrived with segment.
531 * Exception: precedence violation. We do not implement it in any case.
532 */
533
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800534static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700535{
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -0700536 struct tcphdr *th = tcp_hdr(skb);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800537 struct {
538 struct tcphdr th;
539#ifdef CONFIG_TCP_MD5SIG
Al Viro714e85b2006-11-14 20:51:49 -0800540 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800541#endif
542 } rep;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700543 struct ip_reply_arg arg;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800544#ifdef CONFIG_TCP_MD5SIG
545 struct tcp_md5sig_key *key;
546#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700547
548 /* Never send a reset in response to a reset. */
549 if (th->rst)
550 return;
551
Eric Dumazetee6b9672008-03-05 18:30:47 -0800552 if (skb->rtable->rt_type != RTN_LOCAL)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700553 return;
554
555 /* Swap the send and the receive. */
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800556 memset(&rep, 0, sizeof(rep));
557 rep.th.dest = th->source;
558 rep.th.source = th->dest;
559 rep.th.doff = sizeof(struct tcphdr) / 4;
560 rep.th.rst = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700561
562 if (th->ack) {
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800563 rep.th.seq = th->ack_seq;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700564 } else {
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800565 rep.th.ack = 1;
566 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
567 skb->len - (th->doff << 2));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700568 }
569
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200570 memset(&arg, 0, sizeof(arg));
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800571 arg.iov[0].iov_base = (unsigned char *)&rep;
572 arg.iov[0].iov_len = sizeof(rep.th);
573
574#ifdef CONFIG_TCP_MD5SIG
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700575 key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800576 if (key) {
577 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
578 (TCPOPT_NOP << 16) |
579 (TCPOPT_MD5SIG << 8) |
580 TCPOLEN_MD5SIG);
581 /* Update length and the length the header thinks exists */
582 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
583 rep.th.doff = arg.iov[0].iov_len / 4;
584
585 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
586 key,
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700587 ip_hdr(skb)->daddr,
588 ip_hdr(skb)->saddr,
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800589 &rep.th, IPPROTO_TCP,
590 arg.iov[0].iov_len);
591 }
592#endif
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700593 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
594 ip_hdr(skb)->saddr, /* XXX */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700595 sizeof(struct tcphdr), IPPROTO_TCP, 0);
596 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
597
Denis V. Lunev7feb49c2008-04-03 14:32:00 -0700598 ip_send_reply(dev_net(skb->dst->dev)->ipv4.tcp_sock, skb,
599 &arg, arg.iov[0].iov_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700600
601 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
602 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
603}
604
/* The code below, which sends ACKs in the SYN-RECV and TIME-WAIT states
   from outside socket context, is admittedly ugly. What can I do?
 */
608
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800609static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
610 struct sk_buff *skb, u32 seq, u32 ack,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700611 u32 win, u32 ts)
612{
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -0700613 struct tcphdr *th = tcp_hdr(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700614 struct {
615 struct tcphdr th;
Al Viro714e85b2006-11-14 20:51:49 -0800616 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800617#ifdef CONFIG_TCP_MD5SIG
Al Viro714e85b2006-11-14 20:51:49 -0800618 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800619#endif
620 ];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700621 } rep;
622 struct ip_reply_arg arg;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800623#ifdef CONFIG_TCP_MD5SIG
624 struct tcp_md5sig_key *key;
625 struct tcp_md5sig_key tw_key;
626#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700627
628 memset(&rep.th, 0, sizeof(struct tcphdr));
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200629 memset(&arg, 0, sizeof(arg));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700630
631 arg.iov[0].iov_base = (unsigned char *)&rep;
632 arg.iov[0].iov_len = sizeof(rep.th);
633 if (ts) {
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800634 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
635 (TCPOPT_TIMESTAMP << 8) |
636 TCPOLEN_TIMESTAMP);
637 rep.opt[1] = htonl(tcp_time_stamp);
638 rep.opt[2] = htonl(ts);
Craig Schlentercb48cfe2007-01-09 00:11:15 -0800639 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700640 }
641
642 /* Swap the send and the receive. */
643 rep.th.dest = th->source;
644 rep.th.source = th->dest;
645 rep.th.doff = arg.iov[0].iov_len / 4;
646 rep.th.seq = htonl(seq);
647 rep.th.ack_seq = htonl(ack);
648 rep.th.ack = 1;
649 rep.th.window = htons(win);
650
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800651#ifdef CONFIG_TCP_MD5SIG
652 /*
653 * The SKB holds an imcoming packet, but may not have a valid ->sk
654 * pointer. This is especially the case when we're dealing with a
655 * TIME_WAIT ack, because the sk structure is long gone, and only
656 * the tcp_timewait_sock remains. So the md5 key is stashed in that
657 * structure, and we use it in preference. I believe that (twsk ||
658 * skb->sk) holds true, but we program defensively.
659 */
660 if (!twsk && skb->sk) {
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700661 key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800662 } else if (twsk && twsk->tw_md5_keylen) {
663 tw_key.key = twsk->tw_md5_key;
664 tw_key.keylen = twsk->tw_md5_keylen;
665 key = &tw_key;
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200666 } else
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800667 key = NULL;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800668
669 if (key) {
670 int offset = (ts) ? 3 : 0;
671
672 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
673 (TCPOPT_NOP << 16) |
674 (TCPOPT_MD5SIG << 8) |
675 TCPOLEN_MD5SIG);
676 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
677 rep.th.doff = arg.iov[0].iov_len/4;
678
679 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
680 key,
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700681 ip_hdr(skb)->daddr,
682 ip_hdr(skb)->saddr,
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800683 &rep.th, IPPROTO_TCP,
684 arg.iov[0].iov_len);
685 }
686#endif
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700687 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
688 ip_hdr(skb)->saddr, /* XXX */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700689 arg.iov[0].iov_len, IPPROTO_TCP, 0);
690 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
Patrick McHardyf0e48db2007-06-04 21:32:46 -0700691 if (twsk)
692 arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700693
Denis V. Lunev7feb49c2008-04-03 14:32:00 -0700694 ip_send_reply(dev_net(skb->dev)->ipv4.tcp_sock, skb,
695 &arg, arg.iov[0].iov_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700696
697 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
698}
699
700static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
701{
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -0700702 struct inet_timewait_sock *tw = inet_twsk(sk);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800703 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700704
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800705 tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200706 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
707 tcptw->tw_ts_recent);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700708
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -0700709 inet_twsk_put(tw);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700710}
711
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200712static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
713 struct request_sock *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700714{
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800715 tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
716 tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700717 req->ts_recent);
718}
719
Linus Torvalds1da177e2005-04-16 15:20:36 -0700720/*
Kris Katterjohn9bf1d832008-02-17 22:29:19 -0800721 * Send a SYN-ACK after having received a SYN.
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700722 * This still operates on a request_sock only, not on a big
Linus Torvalds1da177e2005-04-16 15:20:36 -0700723 * socket.
724 */
Denis V. Lunevfd80eb92008-02-29 11:43:03 -0800725static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
726 struct dst_entry *dst)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700727{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700728 const struct inet_request_sock *ireq = inet_rsk(req);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700729 int err = -1;
730 struct sk_buff * skb;
731
732 /* First, grab a route. */
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -0700733 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
Denis V. Lunevfd80eb92008-02-29 11:43:03 -0800734 return -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700735
736 skb = tcp_make_synack(sk, dst, req);
737
738 if (skb) {
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -0700739 struct tcphdr *th = tcp_hdr(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700740
Frederik Deweerdtba7808e2007-02-04 20:15:27 -0800741 th->check = tcp_v4_check(skb->len,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700742 ireq->loc_addr,
743 ireq->rmt_addr,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700744 csum_partial((char *)th, skb->len,
745 skb->csum));
746
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700747 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
748 ireq->rmt_addr,
749 ireq->opt);
Gerrit Renkerb9df3cb2006-11-14 11:21:36 -0200750 err = net_xmit_eval(err);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700751 }
752
Linus Torvalds1da177e2005-04-16 15:20:36 -0700753 dst_release(dst);
754 return err;
755}
756
Denis V. Lunevfd80eb92008-02-29 11:43:03 -0800757static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
758{
759 return __tcp_v4_send_synack(sk, req, NULL);
760}
761
Linus Torvalds1da177e2005-04-16 15:20:36 -0700762/*
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700763 * IPv4 request_sock destructor.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700764 */
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700765static void tcp_v4_reqsk_destructor(struct request_sock *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700766{
Jesper Juhla51482b2005-11-08 09:41:34 -0800767 kfree(inet_rsk(req)->opt);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700768}
769
#ifdef CONFIG_SYN_COOKIES
/*
 * Log a "possible SYN flooding" notice for the destination port of @skb,
 * at most once per 60 seconds (HZ * 60 jiffies).
 */
static void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	/* Still inside the quiet period since the last message. */
	if (!time_after(jiffies, (warntime + HZ * 60)))
		return;

	warntime = jiffies;
	printk(KERN_INFO
	       "possible SYN flooding on port %d. Sending cookies.\n",
	       ntohs(tcp_hdr(skb)->dest));
}
#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700783
784/*
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700785 * Save and compile IPv4 options into the request_sock if needed.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700786 */
Stephen Hemminger40efc6f2006-01-03 16:03:49 -0800787static struct ip_options *tcp_v4_save_options(struct sock *sk,
788 struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700789{
790 struct ip_options *opt = &(IPCB(skb)->opt);
791 struct ip_options *dopt = NULL;
792
793 if (opt && opt->optlen) {
794 int opt_size = optlength(opt);
795 dopt = kmalloc(opt_size, GFP_ATOMIC);
796 if (dopt) {
797 if (ip_options_echo(dopt, skb)) {
798 kfree(dopt);
799 dopt = NULL;
800 }
801 }
802 }
803 return dopt;
804}
805
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800806#ifdef CONFIG_TCP_MD5SIG
807/*
808 * RFC2385 MD5 checksumming requires a mapping of
809 * IP address->MD5 Key.
810 * We need to maintain these in the sk structure.
811 */
812
813/* Find the Key structure for an address. */
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200814static struct tcp_md5sig_key *
815 tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800816{
817 struct tcp_sock *tp = tcp_sk(sk);
818 int i;
819
820 if (!tp->md5sig_info || !tp->md5sig_info->entries4)
821 return NULL;
822 for (i = 0; i < tp->md5sig_info->entries4; i++) {
823 if (tp->md5sig_info->keys4[i].addr == addr)
David S. Millerf8ab18d2007-09-28 15:18:35 -0700824 return &tp->md5sig_info->keys4[i].base;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800825 }
826 return NULL;
827}
828
829struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
830 struct sock *addr_sk)
831{
832 return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
833}
834
835EXPORT_SYMBOL(tcp_v4_md5_lookup);
836
Adrian Bunkf5b99bc2006-11-30 17:22:29 -0800837static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
838 struct request_sock *req)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800839{
840 return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
841}
842
843/* This can be called on a newly created socket, from other files */
844int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
845 u8 *newkey, u8 newkeylen)
846{
847 /* Add Key to the list */
Matthias M. Dellwegb0a713e2007-10-29 20:55:27 -0700848 struct tcp_md5sig_key *key;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800849 struct tcp_sock *tp = tcp_sk(sk);
850 struct tcp4_md5sig_key *keys;
851
Matthias M. Dellwegb0a713e2007-10-29 20:55:27 -0700852 key = tcp_v4_md5_do_lookup(sk, addr);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800853 if (key) {
854 /* Pre-existing entry - just update that one. */
Matthias M. Dellwegb0a713e2007-10-29 20:55:27 -0700855 kfree(key->key);
856 key->key = newkey;
857 key->keylen = newkeylen;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800858 } else {
Arnaldo Carvalho de Melof6685932006-11-17 11:06:01 -0200859 struct tcp_md5sig_info *md5sig;
860
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800861 if (!tp->md5sig_info) {
Arnaldo Carvalho de Melof6685932006-11-17 11:06:01 -0200862 tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
863 GFP_ATOMIC);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800864 if (!tp->md5sig_info) {
865 kfree(newkey);
866 return -ENOMEM;
867 }
David S. Miller3d7dbea2007-06-12 14:36:42 -0700868 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800869 }
870 if (tcp_alloc_md5sig_pool() == NULL) {
871 kfree(newkey);
872 return -ENOMEM;
873 }
Arnaldo Carvalho de Melof6685932006-11-17 11:06:01 -0200874 md5sig = tp->md5sig_info;
875
876 if (md5sig->alloced4 == md5sig->entries4) {
877 keys = kmalloc((sizeof(*keys) *
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900878 (md5sig->entries4 + 1)), GFP_ATOMIC);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800879 if (!keys) {
880 kfree(newkey);
881 tcp_free_md5sig_pool();
882 return -ENOMEM;
883 }
884
Arnaldo Carvalho de Melof6685932006-11-17 11:06:01 -0200885 if (md5sig->entries4)
886 memcpy(keys, md5sig->keys4,
887 sizeof(*keys) * md5sig->entries4);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800888
889 /* Free old key list, and reference new one */
YOSHIFUJI Hideakia80cc202007-11-20 17:30:06 -0800890 kfree(md5sig->keys4);
Arnaldo Carvalho de Melof6685932006-11-17 11:06:01 -0200891 md5sig->keys4 = keys;
892 md5sig->alloced4++;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800893 }
Arnaldo Carvalho de Melof6685932006-11-17 11:06:01 -0200894 md5sig->entries4++;
David S. Millerf8ab18d2007-09-28 15:18:35 -0700895 md5sig->keys4[md5sig->entries4 - 1].addr = addr;
896 md5sig->keys4[md5sig->entries4 - 1].base.key = newkey;
897 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800898 }
899 return 0;
900}
901
902EXPORT_SYMBOL(tcp_v4_md5_do_add);
903
904static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
905 u8 *newkey, u8 newkeylen)
906{
907 return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
908 newkey, newkeylen);
909}
910
911int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
912{
913 struct tcp_sock *tp = tcp_sk(sk);
914 int i;
915
916 for (i = 0; i < tp->md5sig_info->entries4; i++) {
917 if (tp->md5sig_info->keys4[i].addr == addr) {
918 /* Free the key */
David S. Millerf8ab18d2007-09-28 15:18:35 -0700919 kfree(tp->md5sig_info->keys4[i].base.key);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800920 tp->md5sig_info->entries4--;
921
922 if (tp->md5sig_info->entries4 == 0) {
923 kfree(tp->md5sig_info->keys4);
924 tp->md5sig_info->keys4 = NULL;
Leigh Brown8228a18d2006-12-17 17:12:30 -0800925 tp->md5sig_info->alloced4 = 0;
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200926 } else if (tp->md5sig_info->entries4 != i) {
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800927 /* Need to do some manipulation */
YOSHIFUJI Hideaki354faf02007-11-20 17:30:31 -0800928 memmove(&tp->md5sig_info->keys4[i],
929 &tp->md5sig_info->keys4[i+1],
930 (tp->md5sig_info->entries4 - i) *
931 sizeof(struct tcp4_md5sig_key));
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800932 }
933 tcp_free_md5sig_pool();
934 return 0;
935 }
936 }
937 return -ENOENT;
938}
939
940EXPORT_SYMBOL(tcp_v4_md5_do_del);
941
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200942static void tcp_v4_clear_md5_list(struct sock *sk)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800943{
944 struct tcp_sock *tp = tcp_sk(sk);
945
946 /* Free each key, then the set of key keys,
947 * the crypto element, and then decrement our
948 * hold on the last resort crypto.
949 */
950 if (tp->md5sig_info->entries4) {
951 int i;
952 for (i = 0; i < tp->md5sig_info->entries4; i++)
David S. Millerf8ab18d2007-09-28 15:18:35 -0700953 kfree(tp->md5sig_info->keys4[i].base.key);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800954 tp->md5sig_info->entries4 = 0;
955 tcp_free_md5sig_pool();
956 }
957 if (tp->md5sig_info->keys4) {
958 kfree(tp->md5sig_info->keys4);
959 tp->md5sig_info->keys4 = NULL;
960 tp->md5sig_info->alloced4 = 0;
961 }
962}
963
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200964static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
965 int optlen)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800966{
967 struct tcp_md5sig cmd;
968 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
969 u8 *newkey;
970
971 if (optlen < sizeof(cmd))
972 return -EINVAL;
973
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200974 if (copy_from_user(&cmd, optval, sizeof(cmd)))
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800975 return -EFAULT;
976
977 if (sin->sin_family != AF_INET)
978 return -EINVAL;
979
980 if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
981 if (!tcp_sk(sk)->md5sig_info)
982 return -ENOENT;
983 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
984 }
985
986 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
987 return -EINVAL;
988
989 if (!tcp_sk(sk)->md5sig_info) {
990 struct tcp_sock *tp = tcp_sk(sk);
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -0200991 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800992
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800993 if (!p)
994 return -EINVAL;
995
996 tp->md5sig_info = p;
David S. Miller3d7dbea2007-06-12 14:36:42 -0700997 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -0800998 }
999
Arnaldo Carvalho de Melof6685932006-11-17 11:06:01 -02001000 newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001001 if (!newkey)
1002 return -ENOMEM;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001003 return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1004 newkey, cmd.tcpm_keylen);
1005}
1006
1007static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1008 __be32 saddr, __be32 daddr,
1009 struct tcphdr *th, int protocol,
YOSHIFUJI Hideaki9cb57342008-01-12 02:16:03 -08001010 unsigned int tcplen)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001011{
1012 struct scatterlist sg[4];
1013 __u16 data_len;
1014 int block = 0;
Al Viro8e5200f2006-11-20 18:06:37 -08001015 __sum16 old_checksum;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001016 struct tcp_md5sig_pool *hp;
1017 struct tcp4_pseudohdr *bp;
1018 struct hash_desc *desc;
1019 int err;
1020 unsigned int nbytes = 0;
1021
1022 /*
1023 * Okay, so RFC2385 is turned on for this connection,
1024 * so we need to generate the MD5 hash for the packet now.
1025 */
1026
1027 hp = tcp_get_md5sig_pool();
1028 if (!hp)
1029 goto clear_hash_noput;
1030
1031 bp = &hp->md5_blk.ip4;
1032 desc = &hp->md5_desc;
1033
1034 /*
1035 * 1. the TCP pseudo-header (in the order: source IP address,
1036 * destination IP address, zero-padded protocol number, and
1037 * segment length)
1038 */
1039 bp->saddr = saddr;
1040 bp->daddr = daddr;
1041 bp->pad = 0;
1042 bp->protocol = protocol;
1043 bp->len = htons(tcplen);
David S. Millerc7da57a2007-10-26 00:41:21 -07001044
1045 sg_init_table(sg, 4);
1046
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001047 sg_set_buf(&sg[block++], bp, sizeof(*bp));
1048 nbytes += sizeof(*bp);
1049
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001050 /* 2. the TCP header, excluding options, and assuming a
1051 * checksum of zero/
1052 */
1053 old_checksum = th->check;
1054 th->check = 0;
1055 sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
1056 nbytes += sizeof(struct tcphdr);
David S. Miller08dd1a52006-11-30 16:35:01 -08001057
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001058 /* 3. the TCP segment data (if any) */
1059 data_len = tcplen - (th->doff << 2);
1060 if (data_len > 0) {
1061 unsigned char *data = (unsigned char *)th + (th->doff << 2);
1062 sg_set_buf(&sg[block++], data, data_len);
1063 nbytes += data_len;
1064 }
1065
1066 /* 4. an independently-specified key or password, known to both
1067 * TCPs and presumably connection-specific
1068 */
1069 sg_set_buf(&sg[block++], key->key, key->keylen);
1070 nbytes += key->keylen;
1071
Jens Axboec46f2332007-10-31 12:06:37 +01001072 sg_mark_end(&sg[block - 1]);
David S. Millerc7da57a2007-10-26 00:41:21 -07001073
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001074 /* Now store the Hash into the packet */
1075 err = crypto_hash_init(desc);
1076 if (err)
1077 goto clear_hash;
1078 err = crypto_hash_update(desc, sg, nbytes);
1079 if (err)
1080 goto clear_hash;
1081 err = crypto_hash_final(desc, md5_hash);
1082 if (err)
1083 goto clear_hash;
1084
1085 /* Reset header, and free up the crypto */
1086 tcp_put_md5sig_pool();
1087 th->check = old_checksum;
1088
1089out:
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001090 return 0;
1091clear_hash:
1092 tcp_put_md5sig_pool();
1093clear_hash_noput:
1094 memset(md5_hash, 0, 16);
1095 goto out;
1096}
1097
1098int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1099 struct sock *sk,
1100 struct dst_entry *dst,
1101 struct request_sock *req,
1102 struct tcphdr *th, int protocol,
YOSHIFUJI Hideaki9cb57342008-01-12 02:16:03 -08001103 unsigned int tcplen)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001104{
1105 __be32 saddr, daddr;
1106
1107 if (sk) {
1108 saddr = inet_sk(sk)->saddr;
1109 daddr = inet_sk(sk)->daddr;
1110 } else {
1111 struct rtable *rt = (struct rtable *)dst;
1112 BUG_ON(!rt);
1113 saddr = rt->rt_src;
1114 daddr = rt->rt_dst;
1115 }
1116 return tcp_v4_do_calc_md5_hash(md5_hash, key,
1117 saddr, daddr,
1118 th, protocol, tcplen);
1119}
1120
1121EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1122
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001123static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001124{
1125 /*
1126 * This gets called for each TCP segment that arrives
1127 * so we want to be efficient.
1128 * We have 3 drop cases:
1129 * o No MD5 hash and one expected.
1130 * o MD5 hash and we're not expecting one.
1131 * o MD5 hash and its wrong.
1132 */
1133 __u8 *hash_location = NULL;
1134 struct tcp_md5sig_key *hash_expected;
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001135 const struct iphdr *iph = ip_hdr(skb);
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -07001136 struct tcphdr *th = tcp_hdr(skb);
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001137 int length = (th->doff << 2) - sizeof(struct tcphdr);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001138 int genhash;
1139 unsigned char *ptr;
1140 unsigned char newhash[16];
1141
1142 hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1143
1144 /*
1145 * If the TCP option length is less than the TCP_MD5SIG
1146 * option length, then we can shortcut
1147 */
1148 if (length < TCPOLEN_MD5SIG) {
1149 if (hash_expected)
1150 return 1;
1151 else
1152 return 0;
1153 }
1154
1155 /* Okay, we can't shortcut - we have to grub through the options */
1156 ptr = (unsigned char *)(th + 1);
1157 while (length > 0) {
1158 int opcode = *ptr++;
1159 int opsize;
1160
1161 switch (opcode) {
1162 case TCPOPT_EOL:
1163 goto done_opts;
1164 case TCPOPT_NOP:
1165 length--;
1166 continue;
1167 default:
1168 opsize = *ptr++;
1169 if (opsize < 2)
1170 goto done_opts;
1171 if (opsize > length)
1172 goto done_opts;
1173
1174 if (opcode == TCPOPT_MD5SIG) {
1175 hash_location = ptr;
1176 goto done_opts;
1177 }
1178 }
1179 ptr += opsize-2;
1180 length -= opsize;
1181 }
1182done_opts:
1183 /* We've parsed the options - do we have a hash? */
1184 if (!hash_expected && !hash_location)
1185 return 0;
1186
1187 if (hash_expected && !hash_location) {
Leigh Browna9fc00c2006-12-17 17:13:10 -08001188 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001189 "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001190 NIPQUAD(iph->saddr), ntohs(th->source),
1191 NIPQUAD(iph->daddr), ntohs(th->dest));
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001192 return 1;
1193 }
1194
1195 if (!hash_expected && hash_location) {
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001196 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001197 "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001198 NIPQUAD(iph->saddr), ntohs(th->source),
1199 NIPQUAD(iph->daddr), ntohs(th->dest));
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001200 return 1;
1201 }
1202
1203 /* Okay, so this is hash_expected and hash_location -
1204 * so we need to calculate the checksum.
1205 */
1206 genhash = tcp_v4_do_calc_md5_hash(newhash,
1207 hash_expected,
1208 iph->saddr, iph->daddr,
1209 th, sk->sk_protocol,
1210 skb->len);
1211
1212 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1213 if (net_ratelimit()) {
1214 printk(KERN_INFO "MD5 Hash failed for "
1215 "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001216 NIPQUAD(iph->saddr), ntohs(th->source),
1217 NIPQUAD(iph->daddr), ntohs(th->dest),
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001218 genhash ? " tcp_v4_calc_md5_hash failed" : "");
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001219 }
1220 return 1;
1221 }
1222 return 0;
1223}
1224
1225#endif
1226
Eric Dumazet72a3eff2006-11-16 02:30:37 -08001227struct request_sock_ops tcp_request_sock_ops __read_mostly = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001228 .family = PF_INET,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001229 .obj_size = sizeof(struct tcp_request_sock),
Linus Torvalds1da177e2005-04-16 15:20:36 -07001230 .rtx_syn_ack = tcp_v4_send_synack,
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001231 .send_ack = tcp_v4_reqsk_send_ack,
1232 .destructor = tcp_v4_reqsk_destructor,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001233 .send_reset = tcp_v4_send_reset,
1234};
1235
#ifdef CONFIG_TCP_MD5SIG
/* IPv4-specific request_sock hooks: MD5 key lookup for a pending request. */
static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup	= tcp_v4_reqsk_md5_lookup,
};
#endif
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001241
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08001242static struct timewait_sock_ops tcp_timewait_sock_ops = {
1243 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1244 .twsk_unique = tcp_twsk_unique,
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001245 .twsk_destructor= tcp_twsk_destructor,
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08001246};
1247
Linus Torvalds1da177e2005-04-16 15:20:36 -07001248int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1249{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001250 struct inet_request_sock *ireq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001251 struct tcp_options_received tmp_opt;
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001252 struct request_sock *req;
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001253 __be32 saddr = ip_hdr(skb)->saddr;
1254 __be32 daddr = ip_hdr(skb)->daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001255 __u32 isn = TCP_SKB_CB(skb)->when;
1256 struct dst_entry *dst = NULL;
1257#ifdef CONFIG_SYN_COOKIES
1258 int want_cookie = 0;
1259#else
1260#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1261#endif
1262
1263 /* Never answer to SYNs send to broadcast or multicast */
Eric Dumazetee6b9672008-03-05 18:30:47 -08001264 if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001265 goto drop;
1266
1267 /* TW buckets are converted to open requests without
1268 * limitations, they conserve resources and peer is
1269 * evidently real one.
1270 */
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001271 if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001272#ifdef CONFIG_SYN_COOKIES
1273 if (sysctl_tcp_syncookies) {
1274 want_cookie = 1;
1275 } else
1276#endif
1277 goto drop;
1278 }
1279
1280 /* Accept backlog is full. If we have already queued enough
1281 * of warm entries in syn queue, drop request. It is better than
1282 * clogging syn queue with openreqs with exponentially increasing
1283 * timeout.
1284 */
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001285 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001286 goto drop;
1287
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001288 req = reqsk_alloc(&tcp_request_sock_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001289 if (!req)
1290 goto drop;
1291
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001292#ifdef CONFIG_TCP_MD5SIG
1293 tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1294#endif
1295
Linus Torvalds1da177e2005-04-16 15:20:36 -07001296 tcp_clear_options(&tmp_opt);
1297 tmp_opt.mss_clamp = 536;
1298 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1299
1300 tcp_parse_options(skb, &tmp_opt, 0);
1301
Florian Westphal4dfc2812008-04-10 03:12:40 -07001302 if (want_cookie && !tmp_opt.saw_tstamp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001303 tcp_clear_options(&tmp_opt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001304
1305 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1306 /* Some OSes (unknown ones, but I see them on web server, which
1307 * contains information interesting only for windows'
1308 * users) do not send their stamp in SYN. It is easy case.
1309 * We simply do not advertise TS support.
1310 */
1311 tmp_opt.saw_tstamp = 0;
1312 tmp_opt.tstamp_ok = 0;
1313 }
1314 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1315
1316 tcp_openreq_init(req, &tmp_opt, skb);
1317
Venkat Yekkirala4237c752006-07-24 23:32:50 -07001318 if (security_inet_conn_request(sk, skb, req))
1319 goto drop_and_free;
1320
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001321 ireq = inet_rsk(req);
1322 ireq->loc_addr = daddr;
1323 ireq->rmt_addr = saddr;
1324 ireq->opt = tcp_v4_save_options(sk, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001325 if (!want_cookie)
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -07001326 TCP_ECN_create_request(req, tcp_hdr(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001327
1328 if (want_cookie) {
1329#ifdef CONFIG_SYN_COOKIES
1330 syn_flood_warning(skb);
Florian Westphal4dfc2812008-04-10 03:12:40 -07001331 req->cookie_ts = tmp_opt.tstamp_ok;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001332#endif
1333 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1334 } else if (!isn) {
1335 struct inet_peer *peer = NULL;
1336
1337 /* VJ's idea. We save last timestamp seen
1338 * from the destination in peer table, when entering
1339 * state TIME-WAIT, and check against it before
1340 * accepting new connection request.
1341 *
1342 * If "isn" is not zero, this request hit alive
1343 * timewait bucket, so that all the necessary checks
1344 * are made in the function processing timewait state.
1345 */
1346 if (tmp_opt.saw_tstamp &&
Arnaldo Carvalho de Melo295ff7e2005-08-09 20:44:40 -07001347 tcp_death_row.sysctl_tw_recycle &&
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001348 (dst = inet_csk_route_req(sk, req)) != NULL &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001349 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1350 peer->v4daddr == saddr) {
James Morris9d729f72007-03-04 16:12:44 -08001351 if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001352 (s32)(peer->tcp_ts - req->ts_recent) >
1353 TCP_PAWS_WINDOW) {
1354 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
Denis V. Lunev7cd04fa2008-03-03 11:59:32 -08001355 goto drop_and_release;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001356 }
1357 }
1358 /* Kill the following clause, if you dislike this way. */
1359 else if (!sysctl_tcp_syncookies &&
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001360 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
Linus Torvalds1da177e2005-04-16 15:20:36 -07001361 (sysctl_max_syn_backlog >> 2)) &&
1362 (!peer || !peer->tcp_ts_stamp) &&
1363 (!dst || !dst_metric(dst, RTAX_RTT))) {
1364 /* Without syncookies last quarter of
1365 * backlog is filled with destinations,
1366 * proven to be alive.
1367 * It means that we continue to communicate
1368 * to destinations, already remembered
1369 * to the moment of synflood.
1370 */
Patrick McHardy64ce2072005-08-09 20:50:53 -07001371 LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1372 "request from %u.%u.%u.%u/%u\n",
1373 NIPQUAD(saddr),
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -07001374 ntohs(tcp_hdr(skb)->source));
Denis V. Lunev7cd04fa2008-03-03 11:59:32 -08001375 goto drop_and_release;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001376 }
1377
Gerrit Renkera94f7232006-11-10 14:06:49 -08001378 isn = tcp_v4_init_sequence(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001379 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001380 tcp_rsk(req)->snt_isn = isn;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001381
Denis V. Lunev7cd04fa2008-03-03 11:59:32 -08001382 if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001383 goto drop_and_free;
1384
Denis V. Lunev7cd04fa2008-03-03 11:59:32 -08001385 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001386 return 0;
1387
Denis V. Lunev7cd04fa2008-03-03 11:59:32 -08001388drop_and_release:
1389 dst_release(dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001390drop_and_free:
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001391 reqsk_free(req);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001392drop:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001393 return 0;
1394}
1395
1396
1397/*
1398 * The three way handshake has completed - we got a valid synack -
1399 * now create the new socket.
1400 */
1401struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001402 struct request_sock *req,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001403 struct dst_entry *dst)
1404{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001405 struct inet_request_sock *ireq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001406 struct inet_sock *newinet;
1407 struct tcp_sock *newtp;
1408 struct sock *newsk;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001409#ifdef CONFIG_TCP_MD5SIG
1410 struct tcp_md5sig_key *key;
1411#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001412
1413 if (sk_acceptq_is_full(sk))
1414 goto exit_overflow;
1415
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001416 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001417 goto exit;
1418
1419 newsk = tcp_create_openreq_child(sk, req, skb);
1420 if (!newsk)
1421 goto exit;
1422
Herbert Xubcd76112006-06-30 13:36:35 -07001423 newsk->sk_gso_type = SKB_GSO_TCPV4;
Arnaldo Carvalho de Melo6cbb0df2005-08-09 19:49:02 -07001424 sk_setup_caps(newsk, dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001425
1426 newtp = tcp_sk(newsk);
1427 newinet = inet_sk(newsk);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001428 ireq = inet_rsk(req);
1429 newinet->daddr = ireq->rmt_addr;
1430 newinet->rcv_saddr = ireq->loc_addr;
1431 newinet->saddr = ireq->loc_addr;
1432 newinet->opt = ireq->opt;
1433 ireq->opt = NULL;
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001434 newinet->mc_index = inet_iif(skb);
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001435 newinet->mc_ttl = ip_hdr(skb)->ttl;
Arnaldo Carvalho de Melod83d8462005-12-13 23:26:10 -08001436 inet_csk(newsk)->icsk_ext_hdr_len = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001437 if (newinet->opt)
Arnaldo Carvalho de Melod83d8462005-12-13 23:26:10 -08001438 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001439 newinet->id = newtp->write_seq ^ jiffies;
1440
John Heffner5d424d52006-03-20 17:53:41 -08001441 tcp_mtup_init(newsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001442 tcp_sync_mss(newsk, dst_mtu(dst));
1443 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1444 tcp_initialize_rcv_mss(newsk);
1445
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001446#ifdef CONFIG_TCP_MD5SIG
1447 /* Copy over the MD5 key from the original socket */
1448 if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1449 /*
1450 * We're using one, so create a matching key
1451 * on the newsk structure. If we fail to get
1452 * memory, then we end up not copying the key
1453 * across. Shucks.
1454 */
Arnaldo Carvalho de Melof6685932006-11-17 11:06:01 -02001455 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1456 if (newkey != NULL)
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001457 tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1458 newkey, key->keylen);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001459 }
1460#endif
1461
Arnaldo Carvalho de Meloab1e0a12008-02-03 04:06:04 -08001462 __inet_hash_nolisten(newsk);
1463 __inet_inherit_port(sk, newsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001464
1465 return newsk;
1466
1467exit_overflow:
1468 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1469exit:
1470 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1471 dst_release(dst);
1472 return NULL;
1473}
1474
1475static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1476{
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -07001477 struct tcphdr *th = tcp_hdr(skb);
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001478 const struct iphdr *iph = ip_hdr(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001479 struct sock *nsk;
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001480 struct request_sock **prev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001481 /* Find possible connection requests. */
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001482 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1483 iph->saddr, iph->daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001484 if (req)
1485 return tcp_check_req(sk, skb, req, prev);
1486
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09001487 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
Pavel Emelyanovc67499c2008-01-31 05:06:40 -08001488 th->source, iph->daddr, th->dest, inet_iif(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001489
1490 if (nsk) {
1491 if (nsk->sk_state != TCP_TIME_WAIT) {
1492 bh_lock_sock(nsk);
1493 return nsk;
1494 }
YOSHIFUJI Hideaki9469c7b2006-10-10 19:41:46 -07001495 inet_twsk_put(inet_twsk(nsk));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001496 return NULL;
1497 }
1498
1499#ifdef CONFIG_SYN_COOKIES
1500 if (!th->rst && !th->syn && th->ack)
1501 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1502#endif
1503 return sk;
1504}
1505
Al Virob51655b2006-11-14 21:40:42 -08001506static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001507{
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001508 const struct iphdr *iph = ip_hdr(skb);
1509
Patrick McHardy84fa7932006-08-29 16:44:56 -07001510 if (skb->ip_summed == CHECKSUM_COMPLETE) {
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001511 if (!tcp_v4_check(skb->len, iph->saddr,
1512 iph->daddr, skb->csum)) {
Herbert Xufb286bb2005-11-10 13:01:24 -08001513 skb->ip_summed = CHECKSUM_UNNECESSARY;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001514 return 0;
Herbert Xufb286bb2005-11-10 13:01:24 -08001515 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001516 }
Herbert Xufb286bb2005-11-10 13:01:24 -08001517
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001518 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
Herbert Xufb286bb2005-11-10 13:01:24 -08001519 skb->len, IPPROTO_TCP, 0);
1520
Linus Torvalds1da177e2005-04-16 15:20:36 -07001521 if (skb->len <= 76) {
Herbert Xufb286bb2005-11-10 13:01:24 -08001522 return __skb_checksum_complete(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001523 }
1524 return 0;
1525}
1526
1527
1528/* The socket must have it's spinlock held when we get
1529 * here.
1530 *
1531 * We have a potential double-lock case here, so even when
1532 * doing backlog processing we use the BH locking scheme.
1533 * This is because we cannot sleep with the original spinlock
1534 * held.
1535 */
1536int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1537{
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001538 struct sock *rsk;
1539#ifdef CONFIG_TCP_MD5SIG
1540 /*
1541 * We really want to reject the packet as early as possible
1542 * if:
1543 * o We're expecting an MD5'd packet and this is no MD5 tcp option
1544 * o There is an MD5 option and we're not expecting one
1545 */
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02001546 if (tcp_v4_inbound_md5_hash(sk, skb))
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001547 goto discard;
1548#endif
1549
Linus Torvalds1da177e2005-04-16 15:20:36 -07001550 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1551 TCP_CHECK_TIMER(sk);
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -07001552 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001553 rsk = sk;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001554 goto reset;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001555 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001556 TCP_CHECK_TIMER(sk);
1557 return 0;
1558 }
1559
Arnaldo Carvalho de Meloab6a5bb2007-03-18 17:43:48 -07001560 if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001561 goto csum_err;
1562
1563 if (sk->sk_state == TCP_LISTEN) {
1564 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1565 if (!nsk)
1566 goto discard;
1567
1568 if (nsk != sk) {
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001569 if (tcp_child_process(sk, nsk, skb)) {
1570 rsk = nsk;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001571 goto reset;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001572 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001573 return 0;
1574 }
1575 }
1576
1577 TCP_CHECK_TIMER(sk);
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -07001578 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001579 rsk = sk;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001580 goto reset;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001581 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001582 TCP_CHECK_TIMER(sk);
1583 return 0;
1584
1585reset:
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001586 tcp_v4_send_reset(rsk, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001587discard:
1588 kfree_skb(skb);
1589 /* Be careful here. If this function gets more complicated and
1590 * gcc suffers from register pressure on the x86, sk (in %ebx)
1591 * might be destroyed here. This current version compiles correctly,
1592 * but you have been warned.
1593 */
1594 return 0;
1595
1596csum_err:
1597 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1598 goto discard;
1599}
1600
1601/*
1602 * From tcp_input.c
1603 */
1604
1605int tcp_v4_rcv(struct sk_buff *skb)
1606{
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001607 const struct iphdr *iph;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001608 struct tcphdr *th;
1609 struct sock *sk;
1610 int ret;
1611
1612 if (skb->pkt_type != PACKET_HOST)
1613 goto discard_it;
1614
1615 /* Count it even if it's bad */
1616 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1617
1618 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1619 goto discard_it;
1620
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -07001621 th = tcp_hdr(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001622
1623 if (th->doff < sizeof(struct tcphdr) / 4)
1624 goto bad_packet;
1625 if (!pskb_may_pull(skb, th->doff * 4))
1626 goto discard_it;
1627
1628 /* An explanation is required here, I think.
1629 * Packet length and doff are validated by header prediction,
Stephen Hemmingercaa20d9a2005-11-10 17:13:47 -08001630 * provided case of th->doff==0 is eliminated.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001631 * So, we defer the checks. */
Herbert Xu60476372007-04-09 11:59:39 -07001632 if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001633 goto bad_packet;
1634
Arnaldo Carvalho de Meloaa8223c2007-04-10 21:04:22 -07001635 th = tcp_hdr(skb);
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001636 iph = ip_hdr(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001637 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1638 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1639 skb->len - th->doff * 4);
1640 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1641 TCP_SKB_CB(skb)->when = 0;
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001642 TCP_SKB_CB(skb)->flags = iph->tos;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001643 TCP_SKB_CB(skb)->sacked = 0;
1644
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001645 sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr,
Pavel Emelyanovc67499c2008-01-31 05:06:40 -08001646 th->source, iph->daddr, th->dest, inet_iif(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001647 if (!sk)
1648 goto no_tcp_socket;
1649
1650process:
1651 if (sk->sk_state == TCP_TIME_WAIT)
1652 goto do_time_wait;
1653
1654 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1655 goto discard_and_relse;
Patrick McHardyb59c2702006-01-06 23:06:10 -08001656 nf_reset(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001657
Dmitry Mishinfda9ef52006-08-31 15:28:39 -07001658 if (sk_filter(sk, skb))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001659 goto discard_and_relse;
1660
1661 skb->dev = NULL;
1662
Ingo Molnarc6366182006-07-03 00:25:13 -07001663 bh_lock_sock_nested(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001664 ret = 0;
1665 if (!sock_owned_by_user(sk)) {
Chris Leech1a2449a2006-05-23 18:05:53 -07001666#ifdef CONFIG_NET_DMA
1667 struct tcp_sock *tp = tcp_sk(sk);
1668 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1669 tp->ucopy.dma_chan = get_softnet_dma();
1670 if (tp->ucopy.dma_chan)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001671 ret = tcp_v4_do_rcv(sk, skb);
Chris Leech1a2449a2006-05-23 18:05:53 -07001672 else
1673#endif
1674 {
1675 if (!tcp_prequeue(sk, skb))
1676 ret = tcp_v4_do_rcv(sk, skb);
1677 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001678 } else
1679 sk_add_backlog(sk, skb);
1680 bh_unlock_sock(sk);
1681
1682 sock_put(sk);
1683
1684 return ret;
1685
1686no_tcp_socket:
1687 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1688 goto discard_it;
1689
1690 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1691bad_packet:
1692 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1693 } else {
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001694 tcp_v4_send_reset(NULL, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001695 }
1696
1697discard_it:
1698 /* Discard frame. */
1699 kfree_skb(skb);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001700 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001701
1702discard_and_relse:
1703 sock_put(sk);
1704 goto discard_it;
1705
1706do_time_wait:
1707 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
YOSHIFUJI Hideaki9469c7b2006-10-10 19:41:46 -07001708 inet_twsk_put(inet_twsk(sk));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001709 goto discard_it;
1710 }
1711
1712 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1713 TCP_INC_STATS_BH(TCP_MIB_INERRS);
YOSHIFUJI Hideaki9469c7b2006-10-10 19:41:46 -07001714 inet_twsk_put(inet_twsk(sk));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001715 goto discard_it;
1716 }
YOSHIFUJI Hideaki9469c7b2006-10-10 19:41:46 -07001717 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001718 case TCP_TW_SYN: {
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001719 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
Pavel Emelyanovc67499c2008-01-31 05:06:40 -08001720 &tcp_hashinfo,
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001721 iph->daddr, th->dest,
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001722 inet_iif(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001723 if (sk2) {
YOSHIFUJI Hideaki9469c7b2006-10-10 19:41:46 -07001724 inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1725 inet_twsk_put(inet_twsk(sk));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001726 sk = sk2;
1727 goto process;
1728 }
1729 /* Fall through to ACK */
1730 }
1731 case TCP_TW_ACK:
1732 tcp_v4_timewait_ack(sk, skb);
1733 break;
1734 case TCP_TW_RST:
1735 goto no_tcp_socket;
1736 case TCP_TW_SUCCESS:;
1737 }
1738 goto discard_it;
1739}
1740
Linus Torvalds1da177e2005-04-16 15:20:36 -07001741/* VJ's idea. Save last timestamp seen from this destination
1742 * and hold it at least for normal timewait interval to use for duplicate
1743 * segment detection in subsequent connections, before they enter synchronized
1744 * state.
1745 */
1746
1747int tcp_v4_remember_stamp(struct sock *sk)
1748{
1749 struct inet_sock *inet = inet_sk(sk);
1750 struct tcp_sock *tp = tcp_sk(sk);
1751 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1752 struct inet_peer *peer = NULL;
1753 int release_it = 0;
1754
1755 if (!rt || rt->rt_dst != inet->daddr) {
1756 peer = inet_getpeer(inet->daddr, 1);
1757 release_it = 1;
1758 } else {
1759 if (!rt->peer)
1760 rt_bind_peer(rt, 1);
1761 peer = rt->peer;
1762 }
1763
1764 if (peer) {
1765 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
James Morris9d729f72007-03-04 16:12:44 -08001766 (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001767 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1768 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1769 peer->tcp_ts = tp->rx_opt.ts_recent;
1770 }
1771 if (release_it)
1772 inet_putpeer(peer);
1773 return 1;
1774 }
1775
1776 return 0;
1777}
1778
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001779int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001780{
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001781 struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001782
1783 if (peer) {
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001784 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1785
1786 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
James Morris9d729f72007-03-04 16:12:44 -08001787 (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001788 peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1789 peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1790 peer->tcp_ts = tcptw->tw_ts_recent;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001791 }
1792 inet_putpeer(peer);
1793 return 1;
1794 }
1795
1796 return 0;
1797}
1798
Arnaldo Carvalho de Melo8292a172005-12-13 23:15:52 -08001799struct inet_connection_sock_af_ops ipv4_specific = {
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08001800 .queue_xmit = ip_queue_xmit,
1801 .send_check = tcp_v4_send_check,
1802 .rebuild_header = inet_sk_rebuild_header,
1803 .conn_request = tcp_v4_conn_request,
1804 .syn_recv_sock = tcp_v4_syn_recv_sock,
1805 .remember_stamp = tcp_v4_remember_stamp,
1806 .net_header_len = sizeof(struct iphdr),
1807 .setsockopt = ip_setsockopt,
1808 .getsockopt = ip_getsockopt,
1809 .addr2sockaddr = inet_csk_addr2sockaddr,
1810 .sockaddr_len = sizeof(struct sockaddr_in),
Arnaldo Carvalho de Meloab1e0a12008-02-03 04:06:04 -08001811 .bind_conflict = inet_csk_bind_conflict,
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08001812#ifdef CONFIG_COMPAT
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08001813 .compat_setsockopt = compat_ip_setsockopt,
1814 .compat_getsockopt = compat_ip_getsockopt,
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08001815#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001816};
1817
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 MD5-signature helpers, installed as tp->af_specific by
 * tcp_v4_init_sock().
 */
static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.md5_add	= tcp_v4_md5_add_func,
	.md5_parse	= tcp_v4_parse_md5_keys,
	.calc_md5_hash	= tcp_v4_calc_md5_hash,
};
#endif
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001826
Linus Torvalds1da177e2005-04-16 15:20:36 -07001827/* NOTE: A lot of things set to zero explicitly by call to
1828 * sk_alloc() so need not be done here.
1829 */
1830static int tcp_v4_init_sock(struct sock *sk)
1831{
Arnaldo Carvalho de Melo6687e982005-08-10 04:03:31 -03001832 struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001833 struct tcp_sock *tp = tcp_sk(sk);
1834
1835 skb_queue_head_init(&tp->out_of_order_queue);
1836 tcp_init_xmit_timers(sk);
1837 tcp_prequeue_init(tp);
1838
Arnaldo Carvalho de Melo6687e982005-08-10 04:03:31 -03001839 icsk->icsk_rto = TCP_TIMEOUT_INIT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001840 tp->mdev = TCP_TIMEOUT_INIT;
1841
1842 /* So many TCP implementations out there (incorrectly) count the
1843 * initial SYN frame in their delayed-ACK and congestion control
1844 * algorithms that we must have the following bandaid to talk
1845 * efficiently to them. -DaveM
1846 */
1847 tp->snd_cwnd = 2;
1848
1849 /* See draft-stevens-tcpca-spec-01 for discussion of the
1850 * initialization of these values.
1851 */
1852 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1853 tp->snd_cwnd_clamp = ~0;
David S. Millerc1b4a7e2005-07-05 15:24:38 -07001854 tp->mss_cache = 536;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001855
1856 tp->reordering = sysctl_tcp_reordering;
Arnaldo Carvalho de Melo6687e982005-08-10 04:03:31 -03001857 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001858
1859 sk->sk_state = TCP_CLOSE;
1860
1861 sk->sk_write_space = sk_stream_write_space;
1862 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1863
Arnaldo Carvalho de Melo8292a172005-12-13 23:15:52 -08001864 icsk->icsk_af_ops = &ipv4_specific;
Arnaldo Carvalho de Melod83d8462005-12-13 23:26:10 -08001865 icsk->icsk_sync_mss = tcp_sync_mss;
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001866#ifdef CONFIG_TCP_MD5SIG
1867 tp->af_specific = &tcp_sock_ipv4_specific;
1868#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001869
1870 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1871 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1872
1873 atomic_inc(&tcp_sockets_allocated);
1874
1875 return 0;
1876}
1877
1878int tcp_v4_destroy_sock(struct sock *sk)
1879{
1880 struct tcp_sock *tp = tcp_sk(sk);
1881
1882 tcp_clear_xmit_timers(sk);
1883
Arnaldo Carvalho de Melo6687e982005-08-10 04:03:31 -03001884 tcp_cleanup_congestion_control(sk);
Stephen Hemminger317a76f2005-06-23 12:19:55 -07001885
Linus Torvalds1da177e2005-04-16 15:20:36 -07001886 /* Cleanup up the write buffer. */
David S. Millerfe067e82007-03-07 12:12:44 -08001887 tcp_write_queue_purge(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001888
1889 /* Cleans up our, hopefully empty, out_of_order_queue. */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001890 __skb_queue_purge(&tp->out_of_order_queue);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001891
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08001892#ifdef CONFIG_TCP_MD5SIG
1893 /* Clean up the MD5 key list, if any */
1894 if (tp->md5sig_info) {
1895 tcp_v4_clear_md5_list(sk);
1896 kfree(tp->md5sig_info);
1897 tp->md5sig_info = NULL;
1898 }
1899#endif
1900
Chris Leech1a2449a2006-05-23 18:05:53 -07001901#ifdef CONFIG_NET_DMA
1902 /* Cleans up our sk_async_wait_queue */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001903 __skb_queue_purge(&sk->sk_async_wait_queue);
Chris Leech1a2449a2006-05-23 18:05:53 -07001904#endif
1905
Linus Torvalds1da177e2005-04-16 15:20:36 -07001906 /* Clean prequeue, it must be empty really */
1907 __skb_queue_purge(&tp->ucopy.prequeue);
1908
1909 /* Clean up a referenced TCP bind bucket. */
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001910 if (inet_csk(sk)->icsk_bind_hash)
Arnaldo Carvalho de Meloab1e0a12008-02-03 04:06:04 -08001911 inet_put_port(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001912
1913 /*
1914 * If sendmsg cached page exists, toss it.
1915 */
1916 if (sk->sk_sndmsg_page) {
1917 __free_page(sk->sk_sndmsg_page);
1918 sk->sk_sndmsg_page = NULL;
1919 }
1920
Patrick McManusec3c0982008-03-21 16:33:01 -07001921 if (tp->defer_tcp_accept.request) {
1922 reqsk_free(tp->defer_tcp_accept.request);
1923 sock_put(tp->defer_tcp_accept.listen_sk);
1924 sock_put(sk);
1925 tp->defer_tcp_accept.listen_sk = NULL;
1926 tp->defer_tcp_accept.request = NULL;
1927 }
1928
Linus Torvalds1da177e2005-04-16 15:20:36 -07001929 atomic_dec(&tcp_sockets_allocated);
1930
1931 return 0;
1932}
1933
1934EXPORT_SYMBOL(tcp_v4_destroy_sock);
1935
1936#ifdef CONFIG_PROC_FS
1937/* Proc filesystem TCP sock list dumping. */
1938
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001939static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001940{
1941 return hlist_empty(head) ? NULL :
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001942 list_entry(head->first, struct inet_timewait_sock, tw_node);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001943}
1944
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001945static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001946{
1947 return tw->tw_node.next ?
1948 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1949}
1950
1951static void *listening_get_next(struct seq_file *seq, void *cur)
1952{
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001953 struct inet_connection_sock *icsk;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001954 struct hlist_node *node;
1955 struct sock *sk = cur;
1956 struct tcp_iter_state* st = seq->private;
Denis V. Luneva4146b12008-04-13 22:11:14 -07001957 struct net *net = seq_file_net(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001958
1959 if (!sk) {
1960 st->bucket = 0;
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07001961 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001962 goto get_sk;
1963 }
1964
1965 ++st->num;
1966
1967 if (st->state == TCP_SEQ_STATE_OPENREQ) {
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001968 struct request_sock *req = cur;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001969
Eric Dumazet72a3eff2006-11-16 02:30:37 -08001970 icsk = inet_csk(st->syn_wait_sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001971 req = req->dl_next;
1972 while (1) {
1973 while (req) {
Daniel Lezcanof40c8172008-03-21 04:13:54 -07001974 if (req->rsk_ops->family == st->family &&
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09001975 net_eq(sock_net(req->sk), net)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001976 cur = req;
1977 goto out;
1978 }
1979 req = req->dl_next;
1980 }
Eric Dumazet72a3eff2006-11-16 02:30:37 -08001981 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001982 break;
1983get_req:
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001984 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
Linus Torvalds1da177e2005-04-16 15:20:36 -07001985 }
1986 sk = sk_next(st->syn_wait_sk);
1987 st->state = TCP_SEQ_STATE_LISTENING;
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001988 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001989 } else {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001990 icsk = inet_csk(sk);
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001991 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1992 if (reqsk_queue_len(&icsk->icsk_accept_queue))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001993 goto start_req;
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001994 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001995 sk = sk_next(sk);
1996 }
1997get_sk:
1998 sk_for_each_from(sk, node) {
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09001999 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002000 cur = sk;
2001 goto out;
2002 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002003 icsk = inet_csk(sk);
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07002004 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2005 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002006start_req:
2007 st->uid = sock_i_uid(sk);
2008 st->syn_wait_sk = sk;
2009 st->state = TCP_SEQ_STATE_OPENREQ;
2010 st->sbucket = 0;
2011 goto get_req;
2012 }
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07002013 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002014 }
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -07002015 if (++st->bucket < INET_LHTABLE_SIZE) {
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07002016 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002017 goto get_sk;
2018 }
2019 cur = NULL;
2020out:
2021 return cur;
2022}
2023
2024static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2025{
2026 void *rc = listening_get_next(seq, NULL);
2027
2028 while (rc && *pos) {
2029 rc = listening_get_next(seq, rc);
2030 --*pos;
2031 }
2032 return rc;
2033}
2034
2035static void *established_get_first(struct seq_file *seq)
2036{
2037 struct tcp_iter_state* st = seq->private;
Denis V. Luneva4146b12008-04-13 22:11:14 -07002038 struct net *net = seq_file_net(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002039 void *rc = NULL;
2040
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07002041 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002042 struct sock *sk;
2043 struct hlist_node *node;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002044 struct inet_timewait_sock *tw;
Eric Dumazet230140c2007-11-07 02:40:20 -08002045 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002046
Eric Dumazet230140c2007-11-07 02:40:20 -08002047 read_lock_bh(lock);
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07002048 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
Daniel Lezcanof40c8172008-03-21 04:13:54 -07002049 if (sk->sk_family != st->family ||
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09002050 !net_eq(sock_net(sk), net)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002051 continue;
2052 }
2053 rc = sk;
2054 goto out;
2055 }
2056 st->state = TCP_SEQ_STATE_TIME_WAIT;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002057 inet_twsk_for_each(tw, node,
Eric Dumazetdbca9b2752007-02-08 14:16:46 -08002058 &tcp_hashinfo.ehash[st->bucket].twchain) {
Pavel Emelyanov28518fc2008-03-21 15:52:00 -07002059 if (tw->tw_family != st->family ||
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09002060 !net_eq(twsk_net(tw), net)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002061 continue;
2062 }
2063 rc = tw;
2064 goto out;
2065 }
Eric Dumazet230140c2007-11-07 02:40:20 -08002066 read_unlock_bh(lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002067 st->state = TCP_SEQ_STATE_ESTABLISHED;
2068 }
2069out:
2070 return rc;
2071}
2072
2073static void *established_get_next(struct seq_file *seq, void *cur)
2074{
2075 struct sock *sk = cur;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002076 struct inet_timewait_sock *tw;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002077 struct hlist_node *node;
2078 struct tcp_iter_state* st = seq->private;
Denis V. Luneva4146b12008-04-13 22:11:14 -07002079 struct net *net = seq_file_net(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002080
2081 ++st->num;
2082
2083 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2084 tw = cur;
2085 tw = tw_next(tw);
2086get_tw:
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09002087 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002088 tw = tw_next(tw);
2089 }
2090 if (tw) {
2091 cur = tw;
2092 goto out;
2093 }
Eric Dumazet230140c2007-11-07 02:40:20 -08002094 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002095 st->state = TCP_SEQ_STATE_ESTABLISHED;
2096
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07002097 if (++st->bucket < tcp_hashinfo.ehash_size) {
Eric Dumazet230140c2007-11-07 02:40:20 -08002098 read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07002099 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002100 } else {
2101 cur = NULL;
2102 goto out;
2103 }
2104 } else
2105 sk = sk_next(sk);
2106
2107 sk_for_each_from(sk, node) {
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09002108 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002109 goto found;
2110 }
2111
2112 st->state = TCP_SEQ_STATE_TIME_WAIT;
Eric Dumazetdbca9b2752007-02-08 14:16:46 -08002113 tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002114 goto get_tw;
2115found:
2116 cur = sk;
2117out:
2118 return cur;
2119}
2120
2121static void *established_get_idx(struct seq_file *seq, loff_t pos)
2122{
2123 void *rc = established_get_first(seq);
2124
2125 while (rc && pos) {
2126 rc = established_get_next(seq, rc);
2127 --pos;
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02002128 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002129 return rc;
2130}
2131
2132static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2133{
2134 void *rc;
2135 struct tcp_iter_state* st = seq->private;
2136
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -07002137 inet_listen_lock(&tcp_hashinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002138 st->state = TCP_SEQ_STATE_LISTENING;
2139 rc = listening_get_idx(seq, &pos);
2140
2141 if (!rc) {
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -07002142 inet_listen_unlock(&tcp_hashinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002143 st->state = TCP_SEQ_STATE_ESTABLISHED;
2144 rc = established_get_idx(seq, pos);
2145 }
2146
2147 return rc;
2148}
2149
2150static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2151{
2152 struct tcp_iter_state* st = seq->private;
2153 st->state = TCP_SEQ_STATE_LISTENING;
2154 st->num = 0;
2155 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2156}
2157
2158static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2159{
2160 void *rc = NULL;
2161 struct tcp_iter_state* st;
2162
2163 if (v == SEQ_START_TOKEN) {
2164 rc = tcp_get_idx(seq, 0);
2165 goto out;
2166 }
2167 st = seq->private;
2168
2169 switch (st->state) {
2170 case TCP_SEQ_STATE_OPENREQ:
2171 case TCP_SEQ_STATE_LISTENING:
2172 rc = listening_get_next(seq, v);
2173 if (!rc) {
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -07002174 inet_listen_unlock(&tcp_hashinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002175 st->state = TCP_SEQ_STATE_ESTABLISHED;
2176 rc = established_get_first(seq);
2177 }
2178 break;
2179 case TCP_SEQ_STATE_ESTABLISHED:
2180 case TCP_SEQ_STATE_TIME_WAIT:
2181 rc = established_get_next(seq, v);
2182 break;
2183 }
2184out:
2185 ++*pos;
2186 return rc;
2187}
2188
2189static void tcp_seq_stop(struct seq_file *seq, void *v)
2190{
2191 struct tcp_iter_state* st = seq->private;
2192
2193 switch (st->state) {
2194 case TCP_SEQ_STATE_OPENREQ:
2195 if (v) {
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07002196 struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2197 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002198 }
2199 case TCP_SEQ_STATE_LISTENING:
2200 if (v != SEQ_START_TOKEN)
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -07002201 inet_listen_unlock(&tcp_hashinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002202 break;
2203 case TCP_SEQ_STATE_TIME_WAIT:
2204 case TCP_SEQ_STATE_ESTABLISHED:
2205 if (v)
Eric Dumazet230140c2007-11-07 02:40:20 -08002206 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002207 break;
2208 }
2209}
2210
2211static int tcp_seq_open(struct inode *inode, struct file *file)
2212{
2213 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002214 struct tcp_iter_state *s;
Denis V. Lunev52d6f3f2008-04-13 22:12:41 -07002215 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002216
2217 if (unlikely(afinfo == NULL))
2218 return -EINVAL;
2219
Denis V. Lunev52d6f3f2008-04-13 22:12:41 -07002220 err = seq_open_net(inode, file, &afinfo->seq_ops,
2221 sizeof(struct tcp_iter_state));
2222 if (err < 0)
2223 return err;
Daniel Lezcanof40c8172008-03-21 04:13:54 -07002224
Denis V. Lunev52d6f3f2008-04-13 22:12:41 -07002225 s = ((struct seq_file *)file->private_data)->private;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002226 s->family = afinfo->family;
Daniel Lezcanof40c8172008-03-21 04:13:54 -07002227 return 0;
2228}
2229
Daniel Lezcano6f8b13b2008-03-21 04:14:45 -07002230int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002231{
2232 int rc = 0;
2233 struct proc_dir_entry *p;
2234
Denis V. Lunev68fcadd2008-04-13 22:13:30 -07002235 afinfo->seq_fops.owner = afinfo->owner;
2236 afinfo->seq_fops.open = tcp_seq_open;
2237 afinfo->seq_fops.read = seq_read;
2238 afinfo->seq_fops.llseek = seq_lseek;
2239 afinfo->seq_fops.release = seq_release_net;
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02002240
Denis V. Lunev9427c4b2008-04-13 22:12:13 -07002241 afinfo->seq_ops.start = tcp_seq_start;
2242 afinfo->seq_ops.next = tcp_seq_next;
2243 afinfo->seq_ops.stop = tcp_seq_stop;
2244
Denis V. Lunev68fcadd2008-04-13 22:13:30 -07002245 p = proc_net_fops_create(net, afinfo->name, S_IRUGO, &afinfo->seq_fops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002246 if (p)
2247 p->data = afinfo;
2248 else
2249 rc = -ENOMEM;
2250 return rc;
2251}
2252
Daniel Lezcano6f8b13b2008-03-21 04:14:45 -07002253void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002254{
Daniel Lezcano6f8b13b2008-03-21 04:14:45 -07002255 proc_net_remove(net, afinfo->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002256}
2257
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07002258static void get_openreq4(struct sock *sk, struct request_sock *req,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002259 char *tmpbuf, int i, int uid)
2260{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002261 const struct inet_request_sock *ireq = inet_rsk(req);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002262 int ttd = req->expires - jiffies;
2263
2264 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2265 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2266 i,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002267 ireq->loc_addr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002268 ntohs(inet_sk(sk)->sport),
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002269 ireq->rmt_addr,
2270 ntohs(ireq->rmt_port),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002271 TCP_SYN_RECV,
2272 0, 0, /* could print option size, but that is af dependent. */
2273 1, /* timers active (only the expire timer) */
2274 jiffies_to_clock_t(ttd),
2275 req->retrans,
2276 uid,
2277 0, /* non standard timer */
2278 0, /* open_requests have no inode */
2279 atomic_read(&sk->sk_refcnt),
2280 req);
2281}
2282
Ilpo Järvinencf4c6bf2007-02-22 01:13:58 -08002283static void get_tcp4_sock(struct sock *sk, char *tmpbuf, int i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002284{
2285 int timer_active;
2286 unsigned long timer_expires;
Ilpo Järvinencf4c6bf2007-02-22 01:13:58 -08002287 struct tcp_sock *tp = tcp_sk(sk);
2288 const struct inet_connection_sock *icsk = inet_csk(sk);
2289 struct inet_sock *inet = inet_sk(sk);
Al Viro714e85b2006-11-14 20:51:49 -08002290 __be32 dest = inet->daddr;
2291 __be32 src = inet->rcv_saddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002292 __u16 destp = ntohs(inet->dport);
2293 __u16 srcp = ntohs(inet->sport);
2294
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07002295 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002296 timer_active = 1;
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07002297 timer_expires = icsk->icsk_timeout;
2298 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002299 timer_active = 4;
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07002300 timer_expires = icsk->icsk_timeout;
Ilpo Järvinencf4c6bf2007-02-22 01:13:58 -08002301 } else if (timer_pending(&sk->sk_timer)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002302 timer_active = 2;
Ilpo Järvinencf4c6bf2007-02-22 01:13:58 -08002303 timer_expires = sk->sk_timer.expires;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002304 } else {
2305 timer_active = 0;
2306 timer_expires = jiffies;
2307 }
2308
2309 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2310 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
Ilpo Järvinencf4c6bf2007-02-22 01:13:58 -08002311 i, src, srcp, dest, destp, sk->sk_state,
Sridhar Samudrala47da8ee2006-06-27 13:29:00 -07002312 tp->write_seq - tp->snd_una,
Ilpo Järvinencf4c6bf2007-02-22 01:13:58 -08002313 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02002314 (tp->rcv_nxt - tp->copied_seq),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002315 timer_active,
2316 jiffies_to_clock_t(timer_expires - jiffies),
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07002317 icsk->icsk_retransmits,
Ilpo Järvinencf4c6bf2007-02-22 01:13:58 -08002318 sock_i_uid(sk),
Arnaldo Carvalho de Melo6687e982005-08-10 04:03:31 -03002319 icsk->icsk_probes_out,
Ilpo Järvinencf4c6bf2007-02-22 01:13:58 -08002320 sock_i_ino(sk),
2321 atomic_read(&sk->sk_refcnt), sk,
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07002322 icsk->icsk_rto,
2323 icsk->icsk_ack.ato,
2324 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002325 tp->snd_cwnd,
2326 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2327}
2328
Arnaldo Carvalho de Melo71742592006-11-17 10:57:30 -02002329static void get_timewait4_sock(struct inet_timewait_sock *tw,
2330 char *tmpbuf, int i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002331{
Al Viro23f33c22006-09-27 18:43:50 -07002332 __be32 dest, src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002333 __u16 destp, srcp;
2334 int ttd = tw->tw_ttd - jiffies;
2335
2336 if (ttd < 0)
2337 ttd = 0;
2338
2339 dest = tw->tw_daddr;
2340 src = tw->tw_rcv_saddr;
2341 destp = ntohs(tw->tw_dport);
2342 srcp = ntohs(tw->tw_sport);
2343
2344 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2345 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2346 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2347 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2348 atomic_read(&tw->tw_refcnt), tw);
2349}
2350
2351#define TMPSZ 150
2352
2353static int tcp4_seq_show(struct seq_file *seq, void *v)
2354{
2355 struct tcp_iter_state* st;
2356 char tmpbuf[TMPSZ + 1];
2357
2358 if (v == SEQ_START_TOKEN) {
2359 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2360 " sl local_address rem_address st tx_queue "
2361 "rx_queue tr tm->when retrnsmt uid timeout "
2362 "inode");
2363 goto out;
2364 }
2365 st = seq->private;
2366
2367 switch (st->state) {
2368 case TCP_SEQ_STATE_LISTENING:
2369 case TCP_SEQ_STATE_ESTABLISHED:
2370 get_tcp4_sock(v, tmpbuf, st->num);
2371 break;
2372 case TCP_SEQ_STATE_OPENREQ:
2373 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2374 break;
2375 case TCP_SEQ_STATE_TIME_WAIT:
2376 get_timewait4_sock(v, tmpbuf, st->num);
2377 break;
2378 }
2379 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2380out:
2381 return 0;
2382}
2383
Linus Torvalds1da177e2005-04-16 15:20:36 -07002384static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2385 .owner = THIS_MODULE,
2386 .name = "tcp",
2387 .family = AF_INET,
Denis V. Lunev9427c4b2008-04-13 22:12:13 -07002388 .seq_ops = {
2389 .show = tcp4_seq_show,
2390 },
Linus Torvalds1da177e2005-04-16 15:20:36 -07002391};
2392
Pavel Emelyanov757764f2008-03-24 14:56:02 -07002393static int tcp4_proc_init_net(struct net *net)
2394{
2395 return tcp_proc_register(net, &tcp4_seq_afinfo);
2396}
2397
2398static void tcp4_proc_exit_net(struct net *net)
2399{
2400 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2401}
2402
2403static struct pernet_operations tcp4_net_ops = {
2404 .init = tcp4_proc_init_net,
2405 .exit = tcp4_proc_exit_net,
2406};
2407
Linus Torvalds1da177e2005-04-16 15:20:36 -07002408int __init tcp4_proc_init(void)
2409{
Pavel Emelyanov757764f2008-03-24 14:56:02 -07002410 return register_pernet_subsys(&tcp4_net_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002411}
2412
2413void tcp4_proc_exit(void)
2414{
Pavel Emelyanov757764f2008-03-24 14:56:02 -07002415 unregister_pernet_subsys(&tcp4_net_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002416}
2417#endif /* CONFIG_PROC_FS */
2418
2419struct proto tcp_prot = {
2420 .name = "TCP",
2421 .owner = THIS_MODULE,
2422 .close = tcp_close,
2423 .connect = tcp_v4_connect,
2424 .disconnect = tcp_disconnect,
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07002425 .accept = inet_csk_accept,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002426 .ioctl = tcp_ioctl,
2427 .init = tcp_v4_init_sock,
2428 .destroy = tcp_v4_destroy_sock,
2429 .shutdown = tcp_shutdown,
2430 .setsockopt = tcp_setsockopt,
2431 .getsockopt = tcp_getsockopt,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002432 .recvmsg = tcp_recvmsg,
2433 .backlog_rcv = tcp_v4_do_rcv,
Arnaldo Carvalho de Meloab1e0a12008-02-03 04:06:04 -08002434 .hash = inet_hash,
2435 .unhash = inet_unhash,
2436 .get_port = inet_csk_get_port,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002437 .enter_memory_pressure = tcp_enter_memory_pressure,
2438 .sockets_allocated = &tcp_sockets_allocated,
Arnaldo Carvalho de Melo0a5578c2005-08-09 20:11:41 -07002439 .orphan_count = &tcp_orphan_count,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002440 .memory_allocated = &tcp_memory_allocated,
2441 .memory_pressure = &tcp_memory_pressure,
2442 .sysctl_mem = sysctl_tcp_mem,
2443 .sysctl_wmem = sysctl_tcp_wmem,
2444 .sysctl_rmem = sysctl_tcp_rmem,
2445 .max_header = MAX_TCP_HEADER,
2446 .obj_size = sizeof(struct tcp_sock),
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002447 .twsk_prot = &tcp_timewait_sock_ops,
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07002448 .rsk_prot = &tcp_request_sock_ops,
Pavel Emelyanov39d8cda2008-03-22 16:50:58 -07002449 .h.hashinfo = &tcp_hashinfo,
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08002450#ifdef CONFIG_COMPAT
2451 .compat_setsockopt = compat_tcp_setsockopt,
2452 .compat_getsockopt = compat_tcp_getsockopt,
2453#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002454};
2455
Denis V. Lunev046ee902008-04-03 14:31:33 -07002456
2457static int __net_init tcp_sk_init(struct net *net)
2458{
2459 return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2460 PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2461}
2462
2463static void __net_exit tcp_sk_exit(struct net *net)
2464{
2465 inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2466}
2467
2468static struct pernet_operations __net_initdata tcp_sk_ops = {
2469 .init = tcp_sk_init,
2470 .exit = tcp_sk_exit,
2471};
2472
Denis V. Lunev9b0f9762008-02-29 11:13:15 -08002473void __init tcp_v4_init(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002474{
Denis V. Lunev046ee902008-04-03 14:31:33 -07002475 if (register_pernet_device(&tcp_sk_ops))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002476 panic("Failed to create the TCP control socket.\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07002477}
2478
/* Symbols from this file made available through EXPORT_SYMBOL(). */
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

/* The proc helpers only exist when procfs support is built in. */
#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_tcp_low_latency);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002493EXPORT_SYMBOL(sysctl_tcp_low_latency);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002494