/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol (TCP).
 *
 * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
 *
 *              IPv4 specific functions
 *
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *      David S. Miller      :  New socket lookup architecture.
 *                              This code is dedicated to John Dyson.
 *      David S. Miller      :  Change semantics of established hash,
 *                              half is devoted to TIME_WAIT sockets
 *                              and the rest go in the other half.
 *      Andi Kleen           :  Add support for syncookies and fixed
 *                              some bugs: ip options weren't passed to
 *                              the TCP layer, missed a check for an
 *                              ACK bit.
 *      Andi Kleen           :  Implemented fast path mtu discovery.
 *                              Fixed many serious bugs in the
 *                              request_sock handling and moved
 *                              most of it into the af independent code.
 *                              Added tail drop and some other bugfixes.
 *                              Added new listen semantics.
 *      Mike McLagan         :  Routing by source
 *      Juan Jose Ciarlante  :  ip_dynaddr bits
 *      Andi Kleen           :  various fixes.
 *      Vitaly E. Lavrov     :  Transparent proxy revived after a year
 *                              in coma.
 *      Andi Kleen           :  Fix new listen.
 *      Andi Kleen           :  Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and
 *      Alexey Kuznetsov     :  Support the IPV6_V6ONLY socket option, which
 *                              allows both IPv4 and IPv6 sockets to bind
 *                              a single port at the same time.
 */

#include <linux/config.h>

#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/xfrm.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

extern int sysctl_ip_dynaddr;
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct socket *tcp_socket;

void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
                       struct sk_buff *skb);

struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
        .lhash_lock     = RW_LOCK_UNLOCKED,
        .lhash_users    = ATOMIC_INIT(0),
        .lhash_wait     = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
        .portalloc_lock = SPIN_LOCK_UNLOCKED,
        .port_rover     = 1024 - 1,
};

/*
 * This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
 * 32768-61000
 */
int sysctl_local_port_range[2] = { 1024, 4999 };
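
/*
 * For example (assuming the usual sysctl name for this table), the range
 * can be widened at run time with:
 *
 *      sysctl -w net.ipv4.ip_local_port_range="32768 61000"
 *
 * or by writing the two numbers to /proc/sys/net/ipv4/ip_local_port_range.
 */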

static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
{
        const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
        struct sock *sk2;
        struct hlist_node *node;
        int reuse = sk->sk_reuse;

        sk_for_each_bound(sk2, node, &tb->owners) {
                if (sk != sk2 &&
                    !tcp_v6_ipv6only(sk2) &&
                    (!sk->sk_bound_dev_if ||
                     !sk2->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
                        if (!reuse || !sk2->sk_reuse ||
                            sk2->sk_state == TCP_LISTEN) {
                                const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
                                if (!sk2_rcv_saddr || !sk_rcv_saddr ||
                                    sk2_rcv_saddr == sk_rcv_saddr)
                                        break;
                        }
                }
        }
        return node != NULL;
}
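
/*
 * In other words, binding a second socket to an in-use port succeeds only
 * if the sockets are bound to different devices, or every conflicting
 * owner also set SO_REUSEADDR and is not listening, or the two local
 * addresses are distinct and neither is the wildcard.
 */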

/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
        struct inet_bind_hashbucket *head;
        struct hlist_node *node;
        struct inet_bind_bucket *tb;
        int ret;

        local_bh_disable();
        if (!snum) {
                int low = sysctl_local_port_range[0];
                int high = sysctl_local_port_range[1];
                int remaining = (high - low) + 1;
                int rover;

                spin_lock(&tcp_hashinfo.portalloc_lock);
                if (tcp_hashinfo.port_rover < low)
                        rover = low;
                else
                        rover = tcp_hashinfo.port_rover;
                do {
                        rover++;
                        if (rover > high)
                                rover = low;
                        head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
                        spin_lock(&head->lock);
                        inet_bind_bucket_for_each(tb, node, &head->chain)
                                if (tb->port == rover)
                                        goto next;
                        break;
                next:
                        spin_unlock(&head->lock);
                } while (--remaining > 0);
                tcp_hashinfo.port_rover = rover;
                spin_unlock(&tcp_hashinfo.portalloc_lock);

                /* Exhausted local port range during search?  It is not
                 * possible for us to be holding one of the bind hash
                 * locks if this test triggers, because if 'remaining'
                 * drops to zero, we broke out of the do/while loop at
                 * the top level, not from the 'break;' statement.
                 */
                ret = 1;
                if (unlikely(remaining <= 0))
                        goto fail;

                /* OK, here is the one we will use.  HEAD is
                 * non-NULL and we hold its mutex.
                 */
                snum = rover;
        } else {
                head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
                spin_lock(&head->lock);
                inet_bind_bucket_for_each(tb, node, &head->chain)
                        if (tb->port == snum)
                                goto tb_found;
        }
        tb = NULL;
        goto tb_not_found;
tb_found:
        if (!hlist_empty(&tb->owners)) {
                if (sk->sk_reuse > 1)
                        goto success;
                if (tb->fastreuse > 0 &&
                    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
                        goto success;
                } else {
                        ret = 1;
                        if (tcp_bind_conflict(sk, tb))
                                goto fail_unlock;
                }
        }
tb_not_found:
        ret = 1;
        if (!tb && (tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum)) == NULL)
                goto fail_unlock;
        if (hlist_empty(&tb->owners)) {
                if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
                        tb->fastreuse = 1;
                else
                        tb->fastreuse = 0;
        } else if (tb->fastreuse &&
                   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
                tb->fastreuse = 0;
success:
        if (!inet_sk(sk)->bind_hash)
                inet_bind_hash(sk, tb, snum);
        BUG_TRAP(inet_sk(sk)->bind_hash == tb);
        ret = 0;

fail_unlock:
        spin_unlock(&head->lock);
fail:
        local_bh_enable();
        return ret;
}
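
/*
 * A note on tb->fastreuse, as used above and in tcp_v4_hash_connect():
 * 1 means every current owner of the bucket set SO_REUSEADDR and none of
 * them is listening, so a like-minded bind() may share the port without a
 * full tcp_bind_conflict() scan; 0 forces the scan; -1 marks a bucket
 * allocated by connect(), which other connect()s may share only after the
 * four-tuple check in __tcp_v4_check_established() passes.
 */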

static void tcp_v4_hash(struct sock *sk)
{
        inet_hash(&tcp_hashinfo, sk);
}

void tcp_unhash(struct sock *sk)
{
        inet_unhash(&tcp_hashinfo, sk);
}

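/*
 * The skb here is the incoming SYN, so our end of the new connection is
 * described by its destination address/port and the peer by its source;
 * hence the seemingly reversed argument order.
 */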
static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
        return secure_tcp_sequence_number(skb->nh.iph->daddr,
                                          skb->nh.iph->saddr,
                                          skb->h.th->dest,
                                          skb->h.th->source);
}

/* called with local bh disabled */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
                                      struct inet_timewait_sock **twp)
{
        struct inet_sock *inet = inet_sk(sk);
        u32 daddr = inet->rcv_saddr;
        u32 saddr = inet->daddr;
        int dif = sk->sk_bound_dev_if;
        INET_ADDR_COOKIE(acookie, saddr, daddr)
        const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
        const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
        struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
        struct sock *sk2;
        const struct hlist_node *node;
        struct inet_timewait_sock *tw;

        write_lock(&head->lock);

        /* Check TIME-WAIT sockets first. */
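        /* The TIME-WAIT chains live in the second half of the table,
         * ehash_size buckets past the established chains (cf. the
         * "half is devoted to TIME_WAIT sockets" note in the changelog
         * at the top of this file).
         */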
        sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
                tw = inet_twsk(sk2);

                if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
                        const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
                        struct tcp_sock *tp = tcp_sk(sk);

                        /* With PAWS, it is safe from the viewpoint
                           of data integrity. Even without PAWS it is
                           safe provided sequence spaces do not overlap,
                           i.e. at data rates <= 80Mbit/sec.

                           Actually, the idea is close to VJ's one:
                           only the timestamp cache is held not per host,
                           but per port pair, and the TW bucket is used
                           as state holder.

                           If the TW bucket has already been destroyed,
                           we fall back to VJ's scheme and use the initial
                           timestamp retrieved from the peer table.
                         */
                        if (tcptw->tw_ts_recent_stamp &&
                            (!twp || (sysctl_tcp_tw_reuse &&
                                      xtime.tv_sec -
                                      tcptw->tw_ts_recent_stamp > 1))) {
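                                /* Choose a new ISN safely past anything
                                 * the old peer can have seen: the old
                                 * snd_nxt plus one maximal window
                                 * (65535) and a little slack.
                                 */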
                                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
                                if (tp->write_seq == 0)
                                        tp->write_seq = 1;
                                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
                                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                                sock_hold(sk2);
                                goto unique;
                        } else
                                goto not_unique;
                }
        }
        tw = NULL;

        /* And established part... */
        sk_for_each(sk2, node, &head->chain) {
                if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
                        goto not_unique;
        }

unique:
        /* Must record num and sport now. Otherwise we will see
         * a socket with a funny identity in the hash table. */
        inet->num = lport;
        inet->sport = htons(lport);
        sk->sk_hashent = hash;
        BUG_TRAP(sk_unhashed(sk));
        __sk_add_node(sk, &head->chain);
        sock_prot_inc_use(sk->sk_prot);
        write_unlock(&head->lock);

        if (twp) {
                *twp = tw;
                NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
        } else if (tw) {
                /* Silly. Should hash-dance instead... */
                tcp_tw_deschedule(tw);
                NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);

                inet_twsk_put(tw);
        }

        return 0;

not_unique:
        write_unlock(&head->lock);
        return -EADDRNOTAVAIL;
}

static inline u32 connect_port_offset(const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);

        return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
                                         inet->dport);
}
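
/*
 * Because the offset is keyed on (rcv_saddr, daddr, dport), connects to
 * different destinations walk the ephemeral range in different,
 * hard-to-predict orders instead of all contending for the same next port.
 */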

/*
 * Bind a port for a connect operation and hash it.
 */
static inline int tcp_v4_hash_connect(struct sock *sk)
{
        const unsigned short snum = inet_sk(sk)->num;
        struct inet_bind_hashbucket *head;
        struct inet_bind_bucket *tb;
        int ret;

        if (!snum) {
                int low = sysctl_local_port_range[0];
                int high = sysctl_local_port_range[1];
                int range = high - low;
                int i;
                int port;
                static u32 hint;
                u32 offset = hint + connect_port_offset(sk);
                struct hlist_node *node;
                struct inet_timewait_sock *tw = NULL;

                local_bh_disable();
                for (i = 1; i <= range; i++) {
                        port = low + (i + offset) % range;
                        head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
                        spin_lock(&head->lock);

                        /* Does not bother with rcv_saddr checks,
                         * because the established check is already
                         * unique enough.
                         */
                        inet_bind_bucket_for_each(tb, node, &head->chain) {
                                if (tb->port == port) {
                                        BUG_TRAP(!hlist_empty(&tb->owners));
                                        if (tb->fastreuse >= 0)
                                                goto next_port;
                                        if (!__tcp_v4_check_established(sk,
                                                                        port,
                                                                        &tw))
                                                goto ok;
                                        goto next_port;
                                }
                        }

                        tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
                        if (!tb) {
                                spin_unlock(&head->lock);
                                break;
                        }
                        tb->fastreuse = -1;
                        goto ok;

                next_port:
                        spin_unlock(&head->lock);
                }
                local_bh_enable();

                return -EADDRNOTAVAIL;

ok:
                hint += i;

                /* Head lock still held and bh's disabled */
                inet_bind_hash(sk, tb, port);
                if (sk_unhashed(sk)) {
                        inet_sk(sk)->sport = htons(port);
                        __inet_hash(&tcp_hashinfo, sk, 0);
                }
                spin_unlock(&head->lock);

                if (tw) {
                        tcp_tw_deschedule(tw);
                        inet_twsk_put(tw);
                }

                ret = 0;
                goto out;
        }

        head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
        tb = inet_sk(sk)->bind_hash;
        spin_lock_bh(&head->lock);
        if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
                __inet_hash(&tcp_hashinfo, sk, 0);
                spin_unlock_bh(&head->lock);
                return 0;
        } else {
                spin_unlock(&head->lock);
                /* No definite answer... Walk to established hash table */
                ret = __tcp_v4_check_established(sk, snum, NULL);
out:
                local_bh_enable();
                return ret;
        }
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct rtable *rt;
        u32 daddr, nexthop;
        int tmp;
        int err;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;
        if (inet->opt && inet->opt->srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet->opt->faddr;
        }

        tmp = ip_route_connect(&rt, nexthop, inet->saddr,
                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                               IPPROTO_TCP,
                               inet->sport, usin->sin_port, sk);
        if (tmp < 0)
                return tmp;

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        if (!inet->opt || !inet->opt->srr)
                daddr = rt->rt_dst;

        if (!inet->saddr)
                inet->saddr = rt->rt_src;
        inet->rcv_saddr = inet->saddr;

        if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                tp->write_seq              = 0;
        }

        if (sysctl_tcp_tw_recycle &&
            !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
                struct inet_peer *peer = rt_get_peer(rt);

                /* VJ's idea. We save the last timestamp seen from
                 * the destination in the peer table, when entering state
                 * TIME-WAIT, and initialize rx_opt.ts_recent from it,
                 * when trying a new connection.
                 */
                if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
                        tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
                        tp->rx_opt.ts_recent = peer->tcp_ts;
                }
        }

        inet->dport = usin->sin_port;
        inet->daddr = daddr;

        tp->ext_header_len = 0;
        if (inet->opt)
                tp->ext_header_len = inet->opt->optlen;

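        /* 536 is the classical default MSS: the 576-byte IPv4 minimum
         * reassembly buffer (RFC 1122) minus 40 bytes of IP and TCP
         * headers.
         */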
        tp->rx_opt.mss_clamp = 536;

        /* Socket identity is still unknown (sport may be zero).
         * However we set state to SYN-SENT and, without releasing the
         * socket lock, select a source port, enter ourselves into the
         * hash tables and complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = tcp_v4_hash_connect(sk);
        if (err)
                goto failure;

        err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
        if (err)
                goto failure;

        /* OK, now commit destination to socket. */
        sk_setup_caps(sk, &rt->u.dst);

        if (!tp->write_seq)
                tp->write_seq = secure_tcp_sequence_number(inet->saddr,
                                                           inet->daddr,
                                                           inet->sport,
                                                           usin->sin_port);

        inet->id = tp->write_seq ^ jiffies;

        err = tcp_connect(sk);
        rt = NULL;
        if (err)
                goto failure;

        return 0;

failure:
        /* This unhashes the socket and releases the local port, if necessary. */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->dport = 0;
        return err;
}

static __inline__ int tcp_v4_iif(struct sk_buff *skb)
{
        return ((struct rtable *)skb->dst)->rt_iif;
}

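/*
 * The hash is keyed with the listener's random hash_rnd, so the SYN-queue
 * bucket used for a given remote (addr, port) pair is not predictable from
 * the outside, which makes deliberate hash-collision flooding harder.
 */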
static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
{
        return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
}

static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
                                              struct request_sock ***prevp,
                                              __u16 rport,
                                              __u32 raddr, __u32 laddr)
{
        struct listen_sock *lopt = tp->accept_queue.listen_opt;
        struct request_sock *req, **prev;

        for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
             (req = *prev) != NULL;
             prev = &req->dl_next) {
                const struct inet_request_sock *ireq = inet_rsk(req);

                if (ireq->rmt_port == rport &&
                    ireq->rmt_addr == raddr &&
                    ireq->loc_addr == laddr &&
                    TCP_INET_FAMILY(req->rsk_ops->family)) {
                        BUG_TRAP(!req->sk);
                        *prevp = prev;
                        break;
                }
        }

        return req;
}

static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct listen_sock *lopt = tp->accept_queue.listen_opt;
        u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);

        reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
        tcp_synq_added(sk);
}


/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
                                     u32 mtu)
{
        struct dst_entry *dst;
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);

        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
         * sent out by Linux are always < 576 bytes, so they should go
         * through unfragmented).
         */
        if (sk->sk_state == TCP_LISTEN)
                return;

        /* We don't check in the destentry if pmtu discovery is forbidden
         * on this route. We just assume that no packet-too-big messages
         * are sent back when pmtu discovery is not active.
         * There is a small race when the user changes this flag in the
         * route, but I think that's acceptable.
         */
        if ((dst = __sk_dst_check(sk, 0)) == NULL)
                return;

        dst->ops->update_pmtu(dst, mtu);

        /* Something is about to be wrong... Remember the soft error
         * for the case that this connection will not be able to recover.
         */
        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                sk->sk_err_soft = EMSGSIZE;

        mtu = dst_mtu(dst);

        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
            tp->pmtu_cookie > mtu) {
                tcp_sync_mss(sk, mtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
        struct iphdr *iph = (struct iphdr *)skb->data;
        struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
        struct tcp_sock *tp;
        struct inet_sock *inet;
        int type = skb->h.icmph->type;
        int code = skb->h.icmph->code;
        struct sock *sk;
        __u32 seq;
        int err;

        if (skb->len < (iph->ihl << 2) + 8) {
                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
        }

        sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
                         th->source, tcp_v4_iif(skb));
        if (!sk) {
                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put((struct inet_timewait_sock *)sk);
                return;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         */
        if (sock_owned_by_user(sk))
                NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

        if (sk->sk_state == TCP_CLOSE)
                goto out;

        tp = tcp_sk(sk);
        seq = ntohl(th->seq);
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, tp->snd_una, tp->snd_nxt)) {
                NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        switch (type) {
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        if (!sock_owned_by_user(sk))
                                do_pmtu_discovery(sk, iph, info);
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
                struct request_sock *req, **prev;
        case TCP_LISTEN:
                if (sock_owned_by_user(sk))
                        goto out;

                req = tcp_v4_search_req(tp, &prev, th->dest,
                                        iph->daddr, iph->saddr);
                if (!req)
                        goto out;

                /* ICMPs are not backlogged, hence we cannot get
                   an established socket here.
                 */
                BUG_TRAP(!req->sk);

                if (seq != tcp_rsk(req)->snt_isn) {
                        NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
                        goto out;
                }

                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                tcp_synq_drop(sk, req, prev);
                goto out;

        case TCP_SYN_SENT:
        case TCP_SYN_RECV:  /* Cannot happen normally.
                               It can, for example, if SYNs crossed.
                             */
                if (!sock_owned_by_user(sk)) {
                        TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows us to consider as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note that in the modern internet, where routing is unreliable
         * and in each dark corner broken firewalls sit, sending random
         * errors ordered by their masters, even these two messages finally
         * lose their original sense (even Linux sends invalid PORT_UNREACHs).
         *
         * Now we are in compliance with RFCs.
         * --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else  { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
                       struct sk_buff *skb)
{
        struct inet_sock *inet = inet_sk(sk);

        if (skb->ip_summed == CHECKSUM_HW) {
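                /* The hardware will finish the job: we only seed the
                 * checksum field with the pseudo-header sum and record in
                 * skb->csum the offset of the checksum field, so the
                 * device knows where to store the final value.
                 */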
                th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
                skb->csum = offsetof(struct tcphdr, check);
        } else {
                th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
                                         csum_partial((char *)th,
                                                      th->doff << 2,
                                                      skb->csum));
        }
}

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *                    for reset.
 *      Answer: if a packet caused an RST, it is not for a socket
 *              existing in our system; if it is matched to a socket,
 *              it is just a duplicate segment or a bug in the other
 *              side's TCP. So we build the reply based only on the
 *              parameters that arrived with the segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sk_buff *skb)
{
        struct tcphdr *th = skb->h.th;
        struct tcphdr rth;
        struct ip_reply_arg arg;

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rth, 0, sizeof(struct tcphdr));
        rth.dest   = th->source;
        rth.source = th->dest;
        rth.doff   = sizeof(struct tcphdr) / 4;
        rth.rst    = 1;

        if (th->ack) {
                rth.seq = th->ack_seq;
        } else {
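                /* A SYN or FIN occupies one unit of sequence space, so
                 * count them in when computing how much we are ACKing.
                 */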
                rth.ack = 1;
                rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                    skb->len - (th->doff << 2));
        }

        memset(&arg, 0, sizeof arg);
        arg.iov[0].iov_base = (unsigned char *)&rth;
        arg.iov[0].iov_len  = sizeof rth;
        arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
                                      skb->nh.iph->saddr, /*XXX*/
                                      sizeof(struct tcphdr), IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;

        ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);

        TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
        TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 ts)
{
        struct tcphdr *th = skb->h.th;
        struct {
                struct tcphdr th;
                u32 tsopt[3];
        } rep;
        struct ip_reply_arg arg;

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof arg);

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (ts) {
                rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                     (TCPOPT_TIMESTAMP << 8) |
                                     TCPOLEN_TIMESTAMP);
                rep.tsopt[1] = htonl(tcp_time_stamp);
                rep.tsopt[2] = htonl(ts);
                arg.iov[0].iov_len = sizeof(rep);
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

        arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
                                      skb->nh.iph->saddr, /*XXX*/
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;

        ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

        tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);

        inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
{
        tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
                        req->ts_recent);
}

static struct dst_entry* tcp_v4_route_req(struct sock *sk,
                                          struct request_sock *req)
{
        struct rtable *rt;
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct ip_options *opt = inet_rsk(req)->opt;
        struct flowi fl = { .oif = sk->sk_bound_dev_if,
                            .nl_u = { .ip4_u =
                                      { .daddr = ((opt && opt->srr) ?
                                                  opt->faddr :
                                                  ireq->rmt_addr),
                                        .saddr = ireq->loc_addr,
                                        .tos = RT_CONN_FLAGS(sk) } },
                            .proto = IPPROTO_TCP,
                            .uli_u = { .ports =
                                       { .sport = inet_sk(sk)->sport,
                                         .dport = ireq->rmt_port } } };

        if (ip_route_output_flow(&rt, &fl, sk, 0)) {
                IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
                return NULL;
        }
        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
                ip_rt_put(rt);
                IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
                return NULL;
        }
        return &rt->u.dst;
}

/*
 *      Send a SYN-ACK after having received a SYN.
 *      This still operates on a request_sock only, not on a big
 *      socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
                              struct dst_entry *dst)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        int err = -1;
        struct sk_buff * skb;

        /* First, grab a route. */
        if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
                goto out;

        skb = tcp_make_synack(sk, dst, req);

        if (skb) {
                struct tcphdr *th = skb->h.th;

                th->check = tcp_v4_check(th, skb->len,
                                         ireq->loc_addr,
                                         ireq->rmt_addr,
                                         csum_partial((char *)th, skb->len,
                                                      skb->csum));

                err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
                                            ireq->rmt_addr,
                                            ireq->opt);
                if (err == NET_XMIT_CN)
                        err = 0;
        }

out:
        dst_release(dst);
        return err;
}

/*
 *      IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        if (inet_rsk(req)->opt)
                kfree(inet_rsk(req)->opt);
}

static inline void syn_flood_warning(struct sk_buff *skb)
{
        static unsigned long warntime;

        if (time_after(jiffies, (warntime + HZ * 60))) {
                warntime = jiffies;
                printk(KERN_INFO
                       "possible SYN flooding on port %d. Sending cookies.\n",
                       ntohs(skb->h.th->dest));
        }
}

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
                                                     struct sk_buff *skb)
{
        struct ip_options *opt = &(IPCB(skb)->opt);
        struct ip_options *dopt = NULL;

        if (opt && opt->optlen) {
                int opt_size = optlength(opt);
                dopt = kmalloc(opt_size, GFP_ATOMIC);
                if (dopt) {
                        if (ip_options_echo(dopt, skb)) {
                                kfree(dopt);
                                dopt = NULL;
                        }
                }
        }
        return dopt;
}

struct request_sock_ops tcp_request_sock_ops = {
        .family         = PF_INET,
        .obj_size       = sizeof(struct tcp_request_sock),
        .rtx_syn_ack    = tcp_v4_send_synack,
        .send_ack       = tcp_v4_reqsk_send_ack,
        .destructor     = tcp_v4_reqsk_destructor,
        .send_reset     = tcp_v4_send_reset,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct inet_request_sock *ireq;
        struct tcp_options_received tmp_opt;
        struct request_sock *req;
        __u32 saddr = skb->nh.iph->saddr;
        __u32 daddr = skb->nh.iph->daddr;
        __u32 isn = TCP_SKB_CB(skb)->when;
        struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
        int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

        /* Never answer SYNs sent to broadcast or multicast addresses */
        if (((struct rtable *)skb->dst)->rt_flags &
            (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        /* TW buckets are converted to open requests without
         * limitations, they conserve resources and the peer is
         * evidently a real one.
         */
        if (tcp_synq_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
                if (sysctl_tcp_syncookies) {
                        want_cookie = 1;
                } else
#endif
                goto drop;
        }

        /* Accept backlog is full. If we have already queued enough
         * warm entries in the syn queue, drop the request. It is better than
         * clogging the syn queue with openreqs with exponentially increasing
         * timeout.
         */
        if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
                goto drop;

        req = reqsk_alloc(&tcp_request_sock_ops);
        if (!req)
                goto drop;

        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = 536;
        tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

        tcp_parse_options(skb, &tmp_opt, 0);

        if (want_cookie) {
                tcp_clear_options(&tmp_opt);
                tmp_opt.saw_tstamp = 0;
        }

        if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
                /* Some OSes (unknown ones, but I see them on web servers
                 * serving content interesting only to Windows users) do
                 * not send their stamp in SYN. It is an easy case.
                 * We simply do not advertise TS support.
                 */
                tmp_opt.saw_tstamp = 0;
                tmp_opt.tstamp_ok  = 0;
        }
        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

        tcp_openreq_init(req, &tmp_opt, skb);

        ireq = inet_rsk(req);
        ireq->loc_addr = daddr;
        ireq->rmt_addr = saddr;
        ireq->opt = tcp_v4_save_options(sk, skb);
        if (!want_cookie)
                TCP_ECN_create_request(req, skb->h.th);

        if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
                syn_flood_warning(skb);
#endif
                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
        } else if (!isn) {
                struct inet_peer *peer = NULL;

                /* VJ's idea. We save the last timestamp seen
                 * from the destination in the peer table, when entering
                 * state TIME-WAIT, and check against it before
                 * accepting a new connection request.
                 *
                 * If "isn" is not zero, this request hit an alive
                 * timewait bucket, so that all the necessary checks
                 * are made in the function processing timewait state.
                 */
                if (tmp_opt.saw_tstamp &&
                    sysctl_tcp_tw_recycle &&
                    (dst = tcp_v4_route_req(sk, req)) != NULL &&
                    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
                    peer->v4daddr == saddr) {
                        if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
                            (s32)(peer->tcp_ts - req->ts_recent) >
                                                        TCP_PAWS_WINDOW) {
                                NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
                                dst_release(dst);
                                goto drop_and_free;
                        }
                }
                /* Kill the following clause, if you dislike this way. */
                else if (!sysctl_tcp_syncookies &&
                         (sysctl_max_syn_backlog - tcp_synq_len(sk) <
                          (sysctl_max_syn_backlog >> 2)) &&
                         (!peer || !peer->tcp_ts_stamp) &&
                         (!dst || !dst_metric(dst, RTAX_RTT))) {
                        /* Without syncookies the last quarter of the
                         * backlog is filled with destinations
                         * proven to be alive.
                         * It means that we continue to communicate
                         * with destinations already remembered at
                         * the moment of the synflood.
                         */
                        LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
                                              "request from %u.%u."
                                              "%u.%u/%u\n",
                                              NIPQUAD(saddr),
                                              ntohs(skb->h.th->source)));
                        dst_release(dst);
                        goto drop_and_free;
                }

                isn = tcp_v4_init_sequence(sk, skb);
        }
        tcp_rsk(req)->snt_isn = isn;

        if (tcp_v4_send_synack(sk, req, dst))
                goto drop_and_free;

        if (want_cookie) {
                reqsk_free(req);
        } else {
                tcp_v4_synq_add(sk, req);
        }
        return 0;

drop_and_free:
        reqsk_free(req);
drop:
        TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
        return 0;
}


/*
 * The three-way handshake has completed - we got a valid ACK -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req,
                                  struct dst_entry *dst)
{
        struct inet_request_sock *ireq;
        struct inet_sock *newinet;
        struct tcp_sock *newtp;
        struct sock *newsk;

        if (sk_acceptq_is_full(sk))
                goto exit_overflow;

        if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
                goto exit;

        newsk = tcp_create_openreq_child(sk, req, skb);
        if (!newsk)
                goto exit;

        sk_setup_caps(newsk, dst);

        newtp                 = tcp_sk(newsk);
        newinet               = inet_sk(newsk);
        ireq                  = inet_rsk(req);
        newinet->daddr        = ireq->rmt_addr;
        newinet->rcv_saddr    = ireq->loc_addr;
        newinet->saddr        = ireq->loc_addr;
        newinet->opt          = ireq->opt;
        ireq->opt             = NULL;
        newinet->mc_index     = tcp_v4_iif(skb);
        newinet->mc_ttl       = skb->nh.iph->ttl;
        newtp->ext_header_len = 0;
        if (newinet->opt)
                newtp->ext_header_len = newinet->opt->optlen;
        newinet->id = newtp->write_seq ^ jiffies;

        tcp_sync_mss(newsk, dst_mtu(dst));
        newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
        tcp_initialize_rcv_mss(newsk);

        __inet_hash(&tcp_hashinfo, newsk, 0);
        __inet_inherit_port(&tcp_hashinfo, sk, newsk);

        return newsk;

exit_overflow:
        NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
exit:
        NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
        dst_release(dst);
        return NULL;
}

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
        struct tcphdr *th = skb->h.th;
        struct iphdr *iph = skb->nh.iph;
        struct tcp_sock *tp = tcp_sk(sk);
        struct sock *nsk;
        struct request_sock **prev;
        /* Find possible connection requests. */
        struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
                                                     iph->saddr, iph->daddr);
        if (req)
                return tcp_check_req(sk, skb, req, prev);

        nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
                                        th->source, skb->nh.iph->daddr,
                                        ntohs(th->dest), tcp_v4_iif(skb));

        if (nsk) {
                if (nsk->sk_state != TCP_TIME_WAIT) {
                        bh_lock_sock(nsk);
                        return nsk;
                }
                inet_twsk_put((struct inet_timewait_sock *)nsk);
                return NULL;
        }

#ifdef CONFIG_SYN_COOKIES
        if (!th->rst && !th->syn && th->ack)
                sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
        return sk;
}

static int tcp_v4_checksum_init(struct sk_buff *skb)
{
        if (skb->ip_summed == CHECKSUM_HW) {
                skb->ip_summed = CHECKSUM_UNNECESSARY;
                if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
                                  skb->nh.iph->daddr, skb->csum))
                        return 0;

                LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
                skb->ip_summed = CHECKSUM_NONE;
        }
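        /* For short packets it is cheap to verify the checksum right
         * here; for longer ones we only seed skb->csum with the
         * pseudo-header sum and defer full verification until the data
         * is checksummed anyway (e.g. while being copied to the user).
         */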
        if (skb->len <= 76) {
                if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
                                 skb->nh.iph->daddr,
                                 skb_checksum(skb, 0, skb->len, 0)))
                        return -1;
                skb->ip_summed = CHECKSUM_UNNECESSARY;
        } else {
                skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
                                          skb->nh.iph->saddr,
                                          skb->nh.iph->daddr, 0);
        }
        return 0;
}


/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
                TCP_CHECK_TIMER(sk);
                if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
                        goto reset;
                TCP_CHECK_TIMER(sk);
                return 0;
        }

        if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
                goto csum_err;

        if (sk->sk_state == TCP_LISTEN) {
                struct sock *nsk = tcp_v4_hnd_req(sk, skb);
                if (!nsk)
                        goto discard;

                if (nsk != sk) {
                        if (tcp_child_process(sk, nsk, skb))
                                goto reset;
                        return 0;
                }
        }

        TCP_CHECK_TIMER(sk);
        if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
                goto reset;
        TCP_CHECK_TIMER(sk);
        return 0;

reset:
        tcp_v4_send_reset(skb);
discard:
        kfree_skb(skb);
        /* Be careful here. If this function gets more complicated and
         * gcc suffers from register pressure on the x86, sk (in %ebx)
         * might be destroyed here. This current version compiles correctly,
         * but you have been warned.
         */
        return 0;

csum_err:
        TCP_INC_STATS_BH(TCP_MIB_INERRS);
        goto discard;
}

/*
 *      From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
        struct tcphdr *th;
        struct sock *sk;
        int ret;

        if (skb->pkt_type != PACKET_HOST)
                goto discard_it;

        /* Count it even if it's bad */
        TCP_INC_STATS_BH(TCP_MIB_INSEGS);

        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
                goto discard_it;

        th = skb->h.th;

        if (th->doff < sizeof(struct tcphdr) / 4)
                goto bad_packet;
        if (!pskb_may_pull(skb, th->doff * 4))
                goto discard_it;

        /* An explanation is required here, I think.
         * Packet length and doff are validated by header prediction,
         * provided the case of th->doff == 0 is eliminated.
         * So, we defer the checks. */
        if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
             tcp_v4_checksum_init(skb) < 0))
                goto bad_packet;

        th = skb->h.th;
        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
                                    skb->len - th->doff * 4);
        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
        TCP_SKB_CB(skb)->when    = 0;
        TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
        TCP_SKB_CB(skb)->sacked  = 0;

        sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
                           skb->nh.iph->daddr, ntohs(th->dest),
                           tcp_v4_iif(skb));

        if (!sk)
                goto no_tcp_socket;

process:
        if (sk->sk_state == TCP_TIME_WAIT)
                goto do_time_wait;

        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
                goto discard_and_relse;

        if (sk_filter(sk, skb, 0))
                goto discard_and_relse;

        skb->dev = NULL;

        bh_lock_sock(sk);
        ret = 0;
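        /* tcp_prequeue() hands the skb off to a process-context reader
         * when one is waiting and sysctl_tcp_low_latency is not set;
         * otherwise we do the full receive processing right here in
         * softirq context.
         */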
        if (!sock_owned_by_user(sk)) {
                if (!tcp_prequeue(sk, skb))
                        ret = tcp_v4_do_rcv(sk, skb);
        } else
                sk_add_backlog(sk, skb);
        bh_unlock_sock(sk);

        sock_put(sk);

        return ret;

no_tcp_socket:
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
                goto discard_it;

        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
                TCP_INC_STATS_BH(TCP_MIB_INERRS);
        } else {
                tcp_v4_send_reset(skb);
        }

discard_it:
        /* Discard frame. */
        kfree_skb(skb);
        return 0;

discard_and_relse:
        sock_put(sk);
        goto discard_it;

do_time_wait:
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                inet_twsk_put((struct inet_timewait_sock *) sk);
                goto discard_it;
        }

        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
                TCP_INC_STATS_BH(TCP_MIB_INERRS);
                inet_twsk_put((struct inet_timewait_sock *) sk);
                goto discard_it;
        }
        switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
                                           skb, th)) {
        case TCP_TW_SYN: {
                struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
                                                        skb->nh.iph->daddr,
                                                        ntohs(th->dest),
                                                        tcp_v4_iif(skb));
                if (sk2) {
                        tcp_tw_deschedule((struct inet_timewait_sock *)sk);
                        inet_twsk_put((struct inet_timewait_sock *)sk);
                        sk = sk2;
                        goto process;
                }
                /* Fall through to ACK */
        }
        case TCP_TW_ACK:
                tcp_v4_timewait_ack(sk, skb);
                break;
        case TCP_TW_RST:
                goto no_tcp_socket;
        case TCP_TW_SUCCESS:;
        }
        goto discard_it;
}

static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
{
        struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
        struct inet_sock *inet = inet_sk(sk);

        sin->sin_family      = AF_INET;
        sin->sin_addr.s_addr = inet->daddr;
        sin->sin_port        = inet->dport;
}

/* VJ's idea. Save the last timestamp seen from this destination
 * and hold it at least for the normal timewait interval, to use for
 * duplicate segment detection in subsequent connections, before they
 * enter synchronized state.
 */

int tcp_v4_remember_stamp(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
        struct inet_peer *peer = NULL;
        int release_it = 0;

        if (!rt || rt->rt_dst != inet->daddr) {
                peer = inet_getpeer(inet->daddr, 1);
                release_it = 1;
        } else {
                if (!rt->peer)
                        rt_bind_peer(rt, 1);
                peer = rt->peer;
        }

        if (peer) {
                if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
                    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
                     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
                        peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
                        peer->tcp_ts = tp->rx_opt.ts_recent;
                }
                if (release_it)
                        inet_putpeer(peer);
                return 1;
        }

        return 0;
}

int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
{
        struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);

        if (peer) {
                const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

                if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
                    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
                     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
                        peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
                        peer->tcp_ts       = tcptw->tw_ts_recent;
                }
                inet_putpeer(peer);
                return 1;
        }

        return 0;
}

struct tcp_func ipv4_specific = {
        .queue_xmit     = ip_queue_xmit,
        .send_check     = tcp_v4_send_check,
        .rebuild_header = inet_sk_rebuild_header,
        .conn_request   = tcp_v4_conn_request,
        .syn_recv_sock  = tcp_v4_syn_recv_sock,
        .remember_stamp = tcp_v4_remember_stamp,
        .net_header_len = sizeof(struct iphdr),
        .setsockopt     = ip_setsockopt,
        .getsockopt     = ip_getsockopt,
        .addr2sockaddr  = v4_addr2sockaddr,
        .sockaddr_len   = sizeof(struct sockaddr_in),
};

/* NOTE: A lot of things are set to zero explicitly by the call to
 * sk_alloc(), so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        skb_queue_head_init(&tp->out_of_order_queue);
        tcp_init_xmit_timers(sk);
        tcp_prequeue_init(tp);

        tp->rto  = TCP_TIMEOUT_INIT;
        tp->mdev = TCP_TIMEOUT_INIT;

        /* So many TCP implementations out there (incorrectly) count the
         * initial SYN frame in their delayed-ACK and congestion control
         * algorithms that we must have the following bandaid to talk
         * efficiently to them.  -DaveM
         */
        tp->snd_cwnd = 2;

        /* See draft-stevens-tcpca-spec-01 for discussion of the
         * initialization of these values.
         */
        tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
        tp->snd_cwnd_clamp = ~0;
        tp->mss_cache = 536;

        tp->reordering = sysctl_tcp_reordering;
        tp->ca_ops = &tcp_init_congestion_ops;

        sk->sk_state = TCP_CLOSE;

        sk->sk_write_space = sk_stream_write_space;
        sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

        tp->af_specific = &ipv4_specific;

        sk->sk_sndbuf = sysctl_tcp_wmem[1];
        sk->sk_rcvbuf = sysctl_tcp_rmem[1];

        atomic_inc(&tcp_sockets_allocated);

        return 0;
}

int tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(tp);

	/* Clean up the write buffer. */
	sk_stream_writequeue_purge(sk);

	/* Clean up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

	/* Clean up the prequeue; it should already be empty. */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_sk(sk)->bind_hash)
		inet_put_port(&tcp_hashinfo, sk);

	/*
	 * If a sendmsg cached page exists, toss it.
	 */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	atomic_dec(&tcp_sockets_allocated);

	return 0;
}

EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
{
	return hlist_empty(head) ? NULL :
	       hlist_entry(head->first, struct inet_timewait_sock, tw_node);
}

static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{
	return tw->tw_node.next ?
	       hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}
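
/*
 * TIME_WAIT sockets sit in the second half of the established hash, at
 * bucket + ehash_size; tw_head() and tw_next() walk one such chain.
 * A sketch of the traversal the iterators below perform:
 *
 *	for (tw = tw_head(&tcp_hashinfo.ehash[b + tcp_hashinfo.ehash_size].chain);
 *	     tw; tw = tw_next(tw))
 *		... visit tw ...
 */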
1707
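/*
 * Walk the listening hash.  For each listening socket of the requested
 * family, also descend into its SYN queue via
 * accept_queue.listen_opt->syn_table, so embryonic request_socks are
 * reported alongside full sockets.
 */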
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_sock *tp;
	struct hlist_node *node;
	struct sock *sk = cur;
	struct tcp_iter_state *st = seq->private;

	if (!sk) {
		st->bucket = 0;
		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
		goto get_sk;
	}

	++st->num;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		tp = tcp_sk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= TCP_SYNQ_HSIZE)
				break;
get_req:
			req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
	} else {
		tp = tcp_sk(sk);
		read_lock_bh(&tp->accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&tp->accept_queue))
			goto start_req;
		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		tp = tcp_sk(sk);
		read_lock_bh(&tp->accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&tp->accept_queue)) {
start_req:
			st->uid = sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state = TCP_SEQ_STATE_OPENREQ;
			st->sbucket = 0;
			goto get_req;
		}
		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
	}
	if (++st->bucket < INET_LHTABLE_SIZE) {
		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	void *rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}
1789
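/*
 * Return the first socket of the requested family in the established
 * hash: scan each bucket's established chain first, then its TIME_WAIT
 * twin at bucket + ehash_size.
 */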
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
		struct sock *sk;
		struct hlist_node *node;
		struct inet_timewait_sock *tw;

		/* We can reschedule _before_ having picked the target: */
		cond_resched_softirq();

		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family)
				continue;
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		inet_twsk_for_each(tw, node,
				   &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
			if (tw->tw_family != st->family)
				continue;
			rc = tw;
			goto out;
		}
		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct inet_timewait_sock *tw;
	struct hlist_node *node;
	struct tcp_iter_state *st = seq->private;

	++st->num;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && tw->tw_family != st->family)
			tw = tw_next(tw);
		if (tw) {
			cur = tw;
			goto out;
		}
		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		/* We can reschedule between buckets: */
		cond_resched_softirq();

		if (++st->bucket < tcp_hashinfo.ehash_size) {
			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
		} else {
			cur = NULL;
			goto out;
		}
	} else
		sk = sk_next(sk);

	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family)
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	inet_listen_lock(&tcp_hashinfo);
	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);

	if (!rc) {
		inet_listen_unlock(&tcp_hashinfo);
		local_bh_disable();
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}

	return rc;
}
1906
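/*
 * seq_file iterator glue: start/next/stop implement the usual seq_file
 * protocol.  Returning SEQ_START_TOKEN when *pos == 0 makes
 * tcp4_seq_show() emit the header line before any socket records.
 */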
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *rc = NULL;
	struct tcp_iter_state *st;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			inet_listen_unlock(&tcp_hashinfo);
			local_bh_disable();
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
			read_unlock_bh(&tp->accept_queue.syn_wait_lock);
		}
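		/* Fall through: the listening lock taken in tcp_get_idx()
		 * is still held and is released below.
		 */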
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			inet_listen_unlock(&tcp_hashinfo);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		local_bh_enable();
		break;
	}
}

static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct seq_file *seq;
	struct tcp_iter_state *s;
	int rc;

	if (unlikely(afinfo == NULL))
		return -EINVAL;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;
	memset(s, 0, sizeof(*s));
	s->family = afinfo->family;
	s->seq_ops.start = tcp_seq_start;
	s->seq_ops.next = tcp_seq_next;
	s->seq_ops.show = afinfo->seq_show;
	s->seq_ops.stop = tcp_seq_stop;

	rc = seq_open(file, &s->seq_ops);
	if (rc)
		goto out_kfree;
	seq = file->private_data;
	seq->private = s;
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}

int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	if (!afinfo)
		return -EINVAL;
	afinfo->seq_fops->owner = afinfo->owner;
	afinfo->seq_fops->open = tcp_seq_open;
	afinfo->seq_fops->read = seq_read;
	afinfo->seq_fops->llseek = seq_lseek;
	afinfo->seq_fops->release = seq_release_private;

	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
	if (p)
		p->data = afinfo;
	else
		rc = -ENOMEM;
	return rc;
}
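
/*
 * Usage sketch: an address family fills in a struct tcp_seq_afinfo and
 * registers it; tcp4_proc_init() below does exactly that with
 * tcp4_seq_afinfo, creating /proc/net/tcp for AF_INET sockets.
 */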

void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
{
	if (!afinfo)
		return;
	proc_net_remove(afinfo->name);
	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
}

static void get_openreq4(struct sock *sk, struct request_sock *req,
			 char *tmpbuf, int i, int uid)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int ttd = req->expires - jiffies;

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
		i,
		ireq->loc_addr,
		ntohs(inet_sk(sk)->sport),
		ireq->rmt_addr,
		ntohs(ireq->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non standard timer */
		0,  /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req);
}

static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_sock *tp = tcp_sk(sp);
	struct inet_sock *inet = inet_sk(sp);
	unsigned int dest = inet->daddr;
	unsigned int src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);

	if (tp->pending == TCP_TIME_RETRANS) {
		timer_active = 1;
		timer_expires = tp->timeout;
	} else if (tp->pending == TCP_TIME_PROBE0) {
		timer_active = 4;
		timer_expires = tp->timeout;
	} else if (timer_pending(&sp->sk_timer)) {
		timer_active = 2;
		timer_expires = sp->sk_timer.expires;
	} else {
		timer_active = 0;
		timer_expires = jiffies;
	}

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
		i, src, srcp, dest, destp, sp->sk_state,
		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		tp->retransmits,
		sock_i_uid(sp),
		tp->probes_out,
		sock_i_ino(sp),
		atomic_read(&sp->sk_refcnt), sp,
		tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
		tp->snd_cwnd,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}

static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
{
	unsigned int dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest = tw->tw_daddr;
	src = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp = ntohs(tw->tw_sport);

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}
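
/*
 * The three helpers above each emit one fixed-width record in the
 * common /proc/net/tcp layout (see the header line in tcp4_seq_show()).
 * get_openreq4() and get_timewait4_sock() cannot know some fields, such
 * as queue sizes or the inode, so they print zeros or fixed timer codes
 * in those columns.
 */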

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	char tmpbuf[TMPSZ + 1];

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, tmpbuf, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, tmpbuf, st->num);
		break;
	}
	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
out:
	return 0;
}

static struct file_operations tcp4_seq_fops;
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.owner = THIS_MODULE,
	.name = "tcp",
	.family = AF_INET,
	.seq_show = tcp4_seq_show,
	.seq_fops = &tcp4_seq_fops,
};

int __init tcp4_proc_init(void)
{
	return tcp_proc_register(&tcp4_seq_afinfo);
}

void tcp4_proc_exit(void)
{
	tcp_proc_unregister(&tcp4_seq_afinfo);
}
#endif /* CONFIG_PROC_FS */

struct proto tcp_prot = {
	.name = "TCP",
	.owner = THIS_MODULE,
	.close = tcp_close,
	.connect = tcp_v4_connect,
	.disconnect = tcp_disconnect,
	.accept = tcp_accept,
	.ioctl = tcp_ioctl,
	.init = tcp_v4_init_sock,
	.destroy = tcp_v4_destroy_sock,
	.shutdown = tcp_shutdown,
	.setsockopt = tcp_setsockopt,
	.getsockopt = tcp_getsockopt,
	.sendmsg = tcp_sendmsg,
	.recvmsg = tcp_recvmsg,
	.backlog_rcv = tcp_v4_do_rcv,
	.hash = tcp_v4_hash,
	.unhash = tcp_unhash,
	.get_port = tcp_v4_get_port,
	.enter_memory_pressure = tcp_enter_memory_pressure,
	.sockets_allocated = &tcp_sockets_allocated,
	.memory_allocated = &tcp_memory_allocated,
	.memory_pressure = &tcp_memory_pressure,
	.sysctl_mem = sysctl_tcp_mem,
	.sysctl_wmem = sysctl_tcp_wmem,
	.sysctl_rmem = sysctl_tcp_rmem,
	.max_header = MAX_TCP_HEADER,
	.obj_size = sizeof(struct tcp_sock),
	.twsk_obj_size = sizeof(struct tcp_timewait_sock),
	.rsk_prot = &tcp_request_sock_ops,
};
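
/*
 * Context (not in this file): tcp_prot is registered with the socket
 * layer during inet_init() in net/ipv4/af_inet.c, roughly:
 *
 *	proto_register(&tcp_prot, 1);
 */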

void __init tcp_v4_init(struct net_proto_family *ops)
{
	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
	if (err < 0)
		panic("Failed to create the TCP control socket.\n");
	tcp_socket->sk->sk_allocation = GFP_ATOMIC;
	inet_sk(tcp_socket->sk)->uc_ttl = -1;

	/* Unhash it so that IP input processing does not even see it;
	 * we do not wish this socket to see incoming packets.
	 */
	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
}

EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(inet_bind_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);