blob: 2f605b9e6b679db1a456239aa70e46a6ff19fa15 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 * IPv4 specific functions
11 *
12 *
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
17 *
18 * See tcp.c for author information
19 *
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
24 */
25
26/*
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
36 * ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -070039 * request_sock handling and moved
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
47 * coma.
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
53 */
54
55#include <linux/config.h>
56
57#include <linux/types.h>
58#include <linux/fcntl.h>
59#include <linux/module.h>
60#include <linux/random.h>
61#include <linux/cache.h>
62#include <linux/jhash.h>
63#include <linux/init.h>
64#include <linux/times.h>
65
66#include <net/icmp.h>
Arnaldo Carvalho de Melo304a1612005-08-09 19:59:20 -070067#include <net/inet_hashtables.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070068#include <net/tcp.h>
69#include <net/ipv6.h>
70#include <net/inet_common.h>
71#include <net/xfrm.h>
72
73#include <linux/inet.h>
74#include <linux/ipv6.h>
75#include <linux/stddef.h>
76#include <linux/proc_fs.h>
77#include <linux/seq_file.h>
78
/* Defined in another translation unit; only declared here. */
extern int sysctl_ip_dynaddr;

/* Consulted by __tcp_v4_check_established() before a TIME-WAIT entry is
 * taken over on behalf of a new outgoing connection.
 */
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket whose ->sk is handed to ip_send_reply() when sending RSTs and
 * the ACKs generated outside of a full socket context.
 */
static struct socket *tcp_socket;

/* Forward declaration; the definition appears further down in this file. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb);
91
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -070092struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93 .lhash_lock = RW_LOCK_UNLOCKED,
94 .lhash_users = ATOMIC_INIT(0),
95 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96 .portalloc_lock = SPIN_LOCK_UNLOCKED,
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -070097 .port_rover = 1024 - 1,
Linus Torvalds1da177e2005-04-16 15:20:36 -070098};
99
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -0700100static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
101{
102 return inet_csk_get_port(&tcp_hashinfo, sk, snum);
103}
104
Linus Torvalds1da177e2005-04-16 15:20:36 -0700105static void tcp_v4_hash(struct sock *sk)
106{
Arnaldo Carvalho de Melo81849d12005-08-09 20:08:50 -0700107 inet_hash(&tcp_hashinfo, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700108}
109
110void tcp_unhash(struct sock *sk)
111{
Arnaldo Carvalho de Melo81849d12005-08-09 20:08:50 -0700112 inet_unhash(&tcp_hashinfo, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700113}
114
Linus Torvalds1da177e2005-04-16 15:20:36 -0700115static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
116{
117 return secure_tcp_sequence_number(skb->nh.iph->daddr,
118 skb->nh.iph->saddr,
119 skb->h.th->dest,
120 skb->h.th->source);
121}
122
123/* called with local bh disabled */
124static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -0700125 struct inet_timewait_sock **twp)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700126{
127 struct inet_sock *inet = inet_sk(sk);
128 u32 daddr = inet->rcv_saddr;
129 u32 saddr = inet->daddr;
130 int dif = sk->sk_bound_dev_if;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -0700131 INET_ADDR_COOKIE(acookie, saddr, daddr)
132 const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700133 const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
134 struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700135 struct sock *sk2;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -0700136 const struct hlist_node *node;
137 struct inet_timewait_sock *tw;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700138
139 write_lock(&head->lock);
140
141 /* Check TIME-WAIT sockets first. */
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700142 sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -0700143 tw = inet_twsk(sk2);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700144
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -0700145 if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
146 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700147 struct tcp_sock *tp = tcp_sk(sk);
148
149 /* With PAWS, it is safe from the viewpoint
150 of data integrity. Even without PAWS it
151 is safe provided sequence spaces do not
152 overlap i.e. at data rates <= 80Mbit/sec.
153
154 Actually, the idea is close to VJ's one,
155 only timestamp cache is held not per host,
156 but per port pair and TW bucket is used
157 as state holder.
158
159 If TW bucket has been already destroyed we
160 fall back to VJ's scheme and use initial
161 timestamp retrieved from peer table.
162 */
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -0700163 if (tcptw->tw_ts_recent_stamp &&
Linus Torvalds1da177e2005-04-16 15:20:36 -0700164 (!twp || (sysctl_tcp_tw_reuse &&
165 xtime.tv_sec -
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -0700166 tcptw->tw_ts_recent_stamp > 1))) {
167 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
168 if (tp->write_seq == 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700169 tp->write_seq = 1;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -0700170 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
171 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700172 sock_hold(sk2);
173 goto unique;
174 } else
175 goto not_unique;
176 }
177 }
178 tw = NULL;
179
180 /* And established part... */
181 sk_for_each(sk2, node, &head->chain) {
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -0700182 if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700183 goto not_unique;
184 }
185
186unique:
187 /* Must record num and sport now. Otherwise we will see
188 * in hash table socket with a funny identity. */
189 inet->num = lport;
190 inet->sport = htons(lport);
191 sk->sk_hashent = hash;
192 BUG_TRAP(sk_unhashed(sk));
193 __sk_add_node(sk, &head->chain);
194 sock_prot_inc_use(sk->sk_prot);
195 write_unlock(&head->lock);
196
197 if (twp) {
198 *twp = tw;
199 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
200 } else if (tw) {
201 /* Silly. Should hash-dance instead... */
202 tcp_tw_deschedule(tw);
203 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
204
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -0700205 inet_twsk_put(tw);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700206 }
207
208 return 0;
209
210not_unique:
211 write_unlock(&head->lock);
212 return -EADDRNOTAVAIL;
213}
214
215static inline u32 connect_port_offset(const struct sock *sk)
216{
217 const struct inet_sock *inet = inet_sk(sk);
218
219 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
220 inet->dport);
221}
222
223/*
224 * Bind a port for a connect operation and hash it.
225 */
226static inline int tcp_v4_hash_connect(struct sock *sk)
227{
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -0700228 const unsigned short snum = inet_sk(sk)->num;
229 struct inet_bind_hashbucket *head;
230 struct inet_bind_bucket *tb;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700231 int ret;
232
233 if (!snum) {
234 int low = sysctl_local_port_range[0];
235 int high = sysctl_local_port_range[1];
236 int range = high - low;
237 int i;
238 int port;
239 static u32 hint;
240 u32 offset = hint + connect_port_offset(sk);
241 struct hlist_node *node;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -0700242 struct inet_timewait_sock *tw = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700243
244 local_bh_disable();
245 for (i = 1; i <= range; i++) {
246 port = low + (i + offset) % range;
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700247 head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700248 spin_lock(&head->lock);
249
250 /* Does not bother with rcv_saddr checks,
251 * because the established check is already
252 * unique enough.
253 */
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -0700254 inet_bind_bucket_for_each(tb, node, &head->chain) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700255 if (tb->port == port) {
256 BUG_TRAP(!hlist_empty(&tb->owners));
257 if (tb->fastreuse >= 0)
258 goto next_port;
259 if (!__tcp_v4_check_established(sk,
260 port,
261 &tw))
262 goto ok;
263 goto next_port;
264 }
265 }
266
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700267 tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700268 if (!tb) {
269 spin_unlock(&head->lock);
270 break;
271 }
272 tb->fastreuse = -1;
273 goto ok;
274
275 next_port:
276 spin_unlock(&head->lock);
277 }
278 local_bh_enable();
279
280 return -EADDRNOTAVAIL;
281
282ok:
283 hint += i;
284
285 /* Head lock still held and bh's disabled */
Arnaldo Carvalho de Melo2d8c4ce2005-08-09 20:07:13 -0700286 inet_bind_hash(sk, tb, port);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700287 if (sk_unhashed(sk)) {
288 inet_sk(sk)->sport = htons(port);
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -0700289 __inet_hash(&tcp_hashinfo, sk, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700290 }
291 spin_unlock(&head->lock);
292
293 if (tw) {
294 tcp_tw_deschedule(tw);
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -0700295 inet_twsk_put(tw);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700296 }
297
298 ret = 0;
299 goto out;
300 }
301
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -0700302 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -0700303 tb = inet_csk(sk)->icsk_bind_hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700304 spin_lock_bh(&head->lock);
305 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -0700306 __inet_hash(&tcp_hashinfo, sk, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700307 spin_unlock_bh(&head->lock);
308 return 0;
309 } else {
310 spin_unlock(&head->lock);
311 /* No definite answer... Walk to established hash table */
312 ret = __tcp_v4_check_established(sk, snum, NULL);
313out:
314 local_bh_enable();
315 return ret;
316 }
317}
318
319/* This will initiate an outgoing connection. */
320int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
321{
322 struct inet_sock *inet = inet_sk(sk);
323 struct tcp_sock *tp = tcp_sk(sk);
324 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
325 struct rtable *rt;
326 u32 daddr, nexthop;
327 int tmp;
328 int err;
329
330 if (addr_len < sizeof(struct sockaddr_in))
331 return -EINVAL;
332
333 if (usin->sin_family != AF_INET)
334 return -EAFNOSUPPORT;
335
336 nexthop = daddr = usin->sin_addr.s_addr;
337 if (inet->opt && inet->opt->srr) {
338 if (!daddr)
339 return -EINVAL;
340 nexthop = inet->opt->faddr;
341 }
342
343 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
344 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
345 IPPROTO_TCP,
346 inet->sport, usin->sin_port, sk);
347 if (tmp < 0)
348 return tmp;
349
350 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
351 ip_rt_put(rt);
352 return -ENETUNREACH;
353 }
354
355 if (!inet->opt || !inet->opt->srr)
356 daddr = rt->rt_dst;
357
358 if (!inet->saddr)
359 inet->saddr = rt->rt_src;
360 inet->rcv_saddr = inet->saddr;
361
362 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
363 /* Reset inherited state */
364 tp->rx_opt.ts_recent = 0;
365 tp->rx_opt.ts_recent_stamp = 0;
366 tp->write_seq = 0;
367 }
368
369 if (sysctl_tcp_tw_recycle &&
370 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
371 struct inet_peer *peer = rt_get_peer(rt);
372
373 /* VJ's idea. We save last timestamp seen from
374 * the destination in peer table, when entering state TIME-WAIT
375 * and initialize rx_opt.ts_recent from it, when trying new connection.
376 */
377
378 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
379 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
380 tp->rx_opt.ts_recent = peer->tcp_ts;
381 }
382 }
383
384 inet->dport = usin->sin_port;
385 inet->daddr = daddr;
386
387 tp->ext_header_len = 0;
388 if (inet->opt)
389 tp->ext_header_len = inet->opt->optlen;
390
391 tp->rx_opt.mss_clamp = 536;
392
393 /* Socket identity is still unknown (sport may be zero).
394 * However we set state to SYN-SENT and not releasing socket
395 * lock select source port, enter ourselves into the hash tables and
396 * complete initialization after this.
397 */
398 tcp_set_state(sk, TCP_SYN_SENT);
399 err = tcp_v4_hash_connect(sk);
400 if (err)
401 goto failure;
402
403 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
404 if (err)
405 goto failure;
406
407 /* OK, now commit destination to socket. */
Arnaldo Carvalho de Melo6cbb0df2005-08-09 19:49:02 -0700408 sk_setup_caps(sk, &rt->u.dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700409
410 if (!tp->write_seq)
411 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
412 inet->daddr,
413 inet->sport,
414 usin->sin_port);
415
416 inet->id = tp->write_seq ^ jiffies;
417
418 err = tcp_connect(sk);
419 rt = NULL;
420 if (err)
421 goto failure;
422
423 return 0;
424
425failure:
426 /* This unhashes the socket and releases the local port, if necessary. */
427 tcp_set_state(sk, TCP_CLOSE);
428 ip_rt_put(rt);
429 sk->sk_route_caps = 0;
430 inet->dport = 0;
431 return err;
432}
433
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -0700434static inline int inet_iif(const struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700435{
436 return ((struct rtable *)skb->dst)->rt_iif;
437}
438
Linus Torvalds1da177e2005-04-16 15:20:36 -0700439/*
440 * This routine does path mtu discovery as defined in RFC1191.
441 */
442static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
443 u32 mtu)
444{
445 struct dst_entry *dst;
446 struct inet_sock *inet = inet_sk(sk);
447 struct tcp_sock *tp = tcp_sk(sk);
448
449 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
450 * send out by Linux are always <576bytes so they should go through
451 * unfragmented).
452 */
453 if (sk->sk_state == TCP_LISTEN)
454 return;
455
456 /* We don't check in the destentry if pmtu discovery is forbidden
457 * on this route. We just assume that no packet_to_big packets
458 * are send back when pmtu discovery is not active.
459 * There is a small race when the user changes this flag in the
460 * route, but I think that's acceptable.
461 */
462 if ((dst = __sk_dst_check(sk, 0)) == NULL)
463 return;
464
465 dst->ops->update_pmtu(dst, mtu);
466
467 /* Something is about to be wrong... Remember soft error
468 * for the case, if this connection will not able to recover.
469 */
470 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
471 sk->sk_err_soft = EMSGSIZE;
472
473 mtu = dst_mtu(dst);
474
475 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
476 tp->pmtu_cookie > mtu) {
477 tcp_sync_mss(sk, mtu);
478
479 /* Resend the TCP packet because it's
480 * clear that the old packet has been
481 * dropped. This is the new "fast" path mtu
482 * discovery.
483 */
484 tcp_simple_retransmit(sk);
485 } /* else let the usual retransmit timer handle it */
486}
487
488/*
489 * This routine is called by the ICMP module when it gets some
490 * sort of error condition. If err < 0 then the socket should
491 * be closed and the error returned to the user. If err > 0
492 * it's just the icmp type << 8 | icmp code. After adjustment
493 * header points to the first 8 bytes of the tcp header. We need
494 * to find the appropriate port.
495 *
496 * The locking strategy used here is very "optimistic". When
497 * someone else accesses the socket the ICMP is just dropped
498 * and for some paths there is no check at all.
499 * A more general error queue to queue errors for later handling
500 * is probably better.
501 *
502 */
503
504void tcp_v4_err(struct sk_buff *skb, u32 info)
505{
506 struct iphdr *iph = (struct iphdr *)skb->data;
507 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
508 struct tcp_sock *tp;
509 struct inet_sock *inet;
510 int type = skb->h.icmph->type;
511 int code = skb->h.icmph->code;
512 struct sock *sk;
513 __u32 seq;
514 int err;
515
516 if (skb->len < (iph->ihl << 2) + 8) {
517 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
518 return;
519 }
520
Arnaldo Carvalho de Meloe48c4142005-08-09 20:09:46 -0700521 sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -0700522 th->source, inet_iif(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700523 if (!sk) {
524 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
525 return;
526 }
527 if (sk->sk_state == TCP_TIME_WAIT) {
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -0700528 inet_twsk_put((struct inet_timewait_sock *)sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700529 return;
530 }
531
532 bh_lock_sock(sk);
533 /* If too many ICMPs get dropped on busy
534 * servers this needs to be solved differently.
535 */
536 if (sock_owned_by_user(sk))
537 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
538
539 if (sk->sk_state == TCP_CLOSE)
540 goto out;
541
542 tp = tcp_sk(sk);
543 seq = ntohl(th->seq);
544 if (sk->sk_state != TCP_LISTEN &&
545 !between(seq, tp->snd_una, tp->snd_nxt)) {
546 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
547 goto out;
548 }
549
550 switch (type) {
551 case ICMP_SOURCE_QUENCH:
552 /* Just silently ignore these. */
553 goto out;
554 case ICMP_PARAMETERPROB:
555 err = EPROTO;
556 break;
557 case ICMP_DEST_UNREACH:
558 if (code > NR_ICMP_UNREACH)
559 goto out;
560
561 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
562 if (!sock_owned_by_user(sk))
563 do_pmtu_discovery(sk, iph, info);
564 goto out;
565 }
566
567 err = icmp_err_convert[code].errno;
568 break;
569 case ICMP_TIME_EXCEEDED:
570 err = EHOSTUNREACH;
571 break;
572 default:
573 goto out;
574 }
575
576 switch (sk->sk_state) {
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700577 struct request_sock *req, **prev;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700578 case TCP_LISTEN:
579 if (sock_owned_by_user(sk))
580 goto out;
581
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -0700582 req = inet_csk_search_req(sk, &prev, th->dest,
583 iph->daddr, iph->saddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700584 if (!req)
585 goto out;
586
587 /* ICMPs are not backlogged, hence we cannot get
588 an established socket here.
589 */
590 BUG_TRAP(!req->sk);
591
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700592 if (seq != tcp_rsk(req)->snt_isn) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700593 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
594 goto out;
595 }
596
597 /*
598 * Still in SYN_RECV, just remove it silently.
599 * There is no good way to pass the error to the newly
600 * created socket, and POSIX does not want network
601 * errors returned from accept().
602 */
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -0700603 inet_csk_reqsk_queue_drop(sk, req, prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700604 goto out;
605
606 case TCP_SYN_SENT:
607 case TCP_SYN_RECV: /* Cannot happen.
608 It can f.e. if SYNs crossed.
609 */
610 if (!sock_owned_by_user(sk)) {
611 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
612 sk->sk_err = err;
613
614 sk->sk_error_report(sk);
615
616 tcp_done(sk);
617 } else {
618 sk->sk_err_soft = err;
619 }
620 goto out;
621 }
622
623 /* If we've already connected we will keep trying
624 * until we time out, or the user gives up.
625 *
626 * rfc1122 4.2.3.9 allows to consider as hard errors
627 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
628 * but it is obsoleted by pmtu discovery).
629 *
630 * Note, that in modern internet, where routing is unreliable
631 * and in each dark corner broken firewalls sit, sending random
632 * errors ordered by their masters even this two messages finally lose
633 * their original sense (even Linux sends invalid PORT_UNREACHs)
634 *
635 * Now we are in compliance with RFCs.
636 * --ANK (980905)
637 */
638
639 inet = inet_sk(sk);
640 if (!sock_owned_by_user(sk) && inet->recverr) {
641 sk->sk_err = err;
642 sk->sk_error_report(sk);
643 } else { /* Only an error on timeout */
644 sk->sk_err_soft = err;
645 }
646
647out:
648 bh_unlock_sock(sk);
649 sock_put(sk);
650}
651
652/* This routine computes an IPv4 TCP checksum. */
653void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
654 struct sk_buff *skb)
655{
656 struct inet_sock *inet = inet_sk(sk);
657
658 if (skb->ip_summed == CHECKSUM_HW) {
659 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
660 skb->csum = offsetof(struct tcphdr, check);
661 } else {
662 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
663 csum_partial((char *)th,
664 th->doff << 2,
665 skb->csum));
666 }
667}
668
669/*
670 * This routine will send an RST to the other tcp.
671 *
672 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
673 * for reset.
674 * Answer: if a packet caused RST, it is not for a socket
675 * existing in our system, if it is matched to a socket,
676 * it is just duplicate segment or bug in other side's TCP.
677 * So that we build reply only basing on parameters
678 * arrived with segment.
679 * Exception: precedence violation. We do not implement it in any case.
680 */
681
682static void tcp_v4_send_reset(struct sk_buff *skb)
683{
684 struct tcphdr *th = skb->h.th;
685 struct tcphdr rth;
686 struct ip_reply_arg arg;
687
688 /* Never send a reset in response to a reset. */
689 if (th->rst)
690 return;
691
692 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
693 return;
694
695 /* Swap the send and the receive. */
696 memset(&rth, 0, sizeof(struct tcphdr));
697 rth.dest = th->source;
698 rth.source = th->dest;
699 rth.doff = sizeof(struct tcphdr) / 4;
700 rth.rst = 1;
701
702 if (th->ack) {
703 rth.seq = th->ack_seq;
704 } else {
705 rth.ack = 1;
706 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
707 skb->len - (th->doff << 2));
708 }
709
710 memset(&arg, 0, sizeof arg);
711 arg.iov[0].iov_base = (unsigned char *)&rth;
712 arg.iov[0].iov_len = sizeof rth;
713 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
714 skb->nh.iph->saddr, /*XXX*/
715 sizeof(struct tcphdr), IPPROTO_TCP, 0);
716 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
717
718 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
719
720 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
721 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
722}
723
724/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
725 outside socket context is ugly, certainly. What can I do?
726 */
727
728static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
729 u32 win, u32 ts)
730{
731 struct tcphdr *th = skb->h.th;
732 struct {
733 struct tcphdr th;
734 u32 tsopt[3];
735 } rep;
736 struct ip_reply_arg arg;
737
738 memset(&rep.th, 0, sizeof(struct tcphdr));
739 memset(&arg, 0, sizeof arg);
740
741 arg.iov[0].iov_base = (unsigned char *)&rep;
742 arg.iov[0].iov_len = sizeof(rep.th);
743 if (ts) {
744 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
745 (TCPOPT_TIMESTAMP << 8) |
746 TCPOLEN_TIMESTAMP);
747 rep.tsopt[1] = htonl(tcp_time_stamp);
748 rep.tsopt[2] = htonl(ts);
749 arg.iov[0].iov_len = sizeof(rep);
750 }
751
752 /* Swap the send and the receive. */
753 rep.th.dest = th->source;
754 rep.th.source = th->dest;
755 rep.th.doff = arg.iov[0].iov_len / 4;
756 rep.th.seq = htonl(seq);
757 rep.th.ack_seq = htonl(ack);
758 rep.th.ack = 1;
759 rep.th.window = htons(win);
760
761 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
762 skb->nh.iph->saddr, /*XXX*/
763 arg.iov[0].iov_len, IPPROTO_TCP, 0);
764 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
765
766 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
767
768 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
769}
770
771static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
772{
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -0700773 struct inet_timewait_sock *tw = inet_twsk(sk);
774 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700775
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -0700776 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
777 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700778
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -0700779 inet_twsk_put(tw);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700780}
781
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700782static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700783{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700784 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700785 req->ts_recent);
786}
787
Linus Torvalds1da177e2005-04-16 15:20:36 -0700788/*
789 * Send a SYN-ACK after having received an ACK.
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700790 * This still operates on a request_sock only, not on a big
Linus Torvalds1da177e2005-04-16 15:20:36 -0700791 * socket.
792 */
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700793static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700794 struct dst_entry *dst)
795{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700796 const struct inet_request_sock *ireq = inet_rsk(req);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700797 int err = -1;
798 struct sk_buff * skb;
799
800 /* First, grab a route. */
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -0700801 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700802 goto out;
803
804 skb = tcp_make_synack(sk, dst, req);
805
806 if (skb) {
807 struct tcphdr *th = skb->h.th;
808
809 th->check = tcp_v4_check(th, skb->len,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700810 ireq->loc_addr,
811 ireq->rmt_addr,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700812 csum_partial((char *)th, skb->len,
813 skb->csum));
814
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700815 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
816 ireq->rmt_addr,
817 ireq->opt);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700818 if (err == NET_XMIT_CN)
819 err = 0;
820 }
821
822out:
823 dst_release(dst);
824 return err;
825}
826
827/*
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700828 * IPv4 request_sock destructor.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700829 */
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700830static void tcp_v4_reqsk_destructor(struct request_sock *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700831{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700832 if (inet_rsk(req)->opt)
833 kfree(inet_rsk(req)->opt);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700834}
835
836static inline void syn_flood_warning(struct sk_buff *skb)
837{
838 static unsigned long warntime;
839
840 if (time_after(jiffies, (warntime + HZ * 60))) {
841 warntime = jiffies;
842 printk(KERN_INFO
843 "possible SYN flooding on port %d. Sending cookies.\n",
844 ntohs(skb->h.th->dest));
845 }
846}
847
848/*
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700849 * Save and compile IPv4 options into the request_sock if needed.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700850 */
851static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
852 struct sk_buff *skb)
853{
854 struct ip_options *opt = &(IPCB(skb)->opt);
855 struct ip_options *dopt = NULL;
856
857 if (opt && opt->optlen) {
858 int opt_size = optlength(opt);
859 dopt = kmalloc(opt_size, GFP_ATOMIC);
860 if (dopt) {
861 if (ip_options_echo(dopt, skb)) {
862 kfree(dopt);
863 dopt = NULL;
864 }
865 }
866 }
867 return dopt;
868}
869
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700870struct request_sock_ops tcp_request_sock_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700871 .family = PF_INET,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700872 .obj_size = sizeof(struct tcp_request_sock),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700873 .rtx_syn_ack = tcp_v4_send_synack,
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700874 .send_ack = tcp_v4_reqsk_send_ack,
875 .destructor = tcp_v4_reqsk_destructor,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700876 .send_reset = tcp_v4_send_reset,
877};
878
879int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
880{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700881 struct inet_request_sock *ireq;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700882 struct tcp_options_received tmp_opt;
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700883 struct request_sock *req;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700884 __u32 saddr = skb->nh.iph->saddr;
885 __u32 daddr = skb->nh.iph->daddr;
886 __u32 isn = TCP_SKB_CB(skb)->when;
887 struct dst_entry *dst = NULL;
888#ifdef CONFIG_SYN_COOKIES
889 int want_cookie = 0;
890#else
891#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
892#endif
893
894 /* Never answer to SYNs send to broadcast or multicast */
895 if (((struct rtable *)skb->dst)->rt_flags &
896 (RTCF_BROADCAST | RTCF_MULTICAST))
897 goto drop;
898
899 /* TW buckets are converted to open requests without
900 * limitations, they conserve resources and peer is
901 * evidently real one.
902 */
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -0700903 if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700904#ifdef CONFIG_SYN_COOKIES
905 if (sysctl_tcp_syncookies) {
906 want_cookie = 1;
907 } else
908#endif
909 goto drop;
910 }
911
912 /* Accept backlog is full. If we have already queued enough
913 * of warm entries in syn queue, drop request. It is better than
914 * clogging syn queue with openreqs with exponentially increasing
915 * timeout.
916 */
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -0700917 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700918 goto drop;
919
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -0700920 req = reqsk_alloc(&tcp_request_sock_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700921 if (!req)
922 goto drop;
923
924 tcp_clear_options(&tmp_opt);
925 tmp_opt.mss_clamp = 536;
926 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
927
928 tcp_parse_options(skb, &tmp_opt, 0);
929
930 if (want_cookie) {
931 tcp_clear_options(&tmp_opt);
932 tmp_opt.saw_tstamp = 0;
933 }
934
935 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
936 /* Some OSes (unknown ones, but I see them on web server, which
937 * contains information interesting only for windows'
938 * users) do not send their stamp in SYN. It is easy case.
939 * We simply do not advertise TS support.
940 */
941 tmp_opt.saw_tstamp = 0;
942 tmp_opt.tstamp_ok = 0;
943 }
944 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
945
946 tcp_openreq_init(req, &tmp_opt, skb);
947
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700948 ireq = inet_rsk(req);
949 ireq->loc_addr = daddr;
950 ireq->rmt_addr = saddr;
951 ireq->opt = tcp_v4_save_options(sk, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700952 if (!want_cookie)
953 TCP_ECN_create_request(req, skb->h.th);
954
955 if (want_cookie) {
956#ifdef CONFIG_SYN_COOKIES
957 syn_flood_warning(skb);
958#endif
959 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
960 } else if (!isn) {
961 struct inet_peer *peer = NULL;
962
963 /* VJ's idea. We save last timestamp seen
964 * from the destination in peer table, when entering
965 * state TIME-WAIT, and check against it before
966 * accepting new connection request.
967 *
968 * If "isn" is not zero, this request hit alive
969 * timewait bucket, so that all the necessary checks
970 * are made in the function processing timewait state.
971 */
972 if (tmp_opt.saw_tstamp &&
973 sysctl_tcp_tw_recycle &&
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -0700974 (dst = inet_csk_route_req(sk, req)) != NULL &&
Linus Torvalds1da177e2005-04-16 15:20:36 -0700975 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
976 peer->v4daddr == saddr) {
977 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
978 (s32)(peer->tcp_ts - req->ts_recent) >
979 TCP_PAWS_WINDOW) {
980 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
981 dst_release(dst);
982 goto drop_and_free;
983 }
984 }
985 /* Kill the following clause, if you dislike this way. */
986 else if (!sysctl_tcp_syncookies &&
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -0700987 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
Linus Torvalds1da177e2005-04-16 15:20:36 -0700988 (sysctl_max_syn_backlog >> 2)) &&
989 (!peer || !peer->tcp_ts_stamp) &&
990 (!dst || !dst_metric(dst, RTAX_RTT))) {
991 /* Without syncookies last quarter of
992 * backlog is filled with destinations,
993 * proven to be alive.
994 * It means that we continue to communicate
995 * to destinations, already remembered
996 * to the moment of synflood.
997 */
Heikki Orsilaca933452005-08-08 14:26:52 -0700998 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
999 "request from %u.%u."
1000 "%u.%u/%u\n",
1001 NIPQUAD(saddr),
1002 ntohs(skb->h.th->source)));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001003 dst_release(dst);
1004 goto drop_and_free;
1005 }
1006
1007 isn = tcp_v4_init_sequence(sk, skb);
1008 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001009 tcp_rsk(req)->snt_isn = isn;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001010
1011 if (tcp_v4_send_synack(sk, req, dst))
1012 goto drop_and_free;
1013
1014 if (want_cookie) {
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001015 reqsk_free(req);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001016 } else {
Arnaldo Carvalho de Melo3f421ba2005-08-09 20:11:08 -07001017 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001018 }
1019 return 0;
1020
1021drop_and_free:
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001022 reqsk_free(req);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001023drop:
1024 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1025 return 0;
1026}
1027
1028
1029/*
1030 * The three way handshake has completed - we got a valid synack -
1031 * now create the new socket.
1032 */
1033struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001034 struct request_sock *req,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001035 struct dst_entry *dst)
1036{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001037 struct inet_request_sock *ireq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001038 struct inet_sock *newinet;
1039 struct tcp_sock *newtp;
1040 struct sock *newsk;
1041
1042 if (sk_acceptq_is_full(sk))
1043 goto exit_overflow;
1044
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001045 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001046 goto exit;
1047
1048 newsk = tcp_create_openreq_child(sk, req, skb);
1049 if (!newsk)
1050 goto exit;
1051
Arnaldo Carvalho de Melo6cbb0df2005-08-09 19:49:02 -07001052 sk_setup_caps(newsk, dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001053
1054 newtp = tcp_sk(newsk);
1055 newinet = inet_sk(newsk);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001056 ireq = inet_rsk(req);
1057 newinet->daddr = ireq->rmt_addr;
1058 newinet->rcv_saddr = ireq->loc_addr;
1059 newinet->saddr = ireq->loc_addr;
1060 newinet->opt = ireq->opt;
1061 ireq->opt = NULL;
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001062 newinet->mc_index = inet_iif(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001063 newinet->mc_ttl = skb->nh.iph->ttl;
1064 newtp->ext_header_len = 0;
1065 if (newinet->opt)
1066 newtp->ext_header_len = newinet->opt->optlen;
1067 newinet->id = newtp->write_seq ^ jiffies;
1068
1069 tcp_sync_mss(newsk, dst_mtu(dst));
1070 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1071 tcp_initialize_rcv_mss(newsk);
1072
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -07001073 __inet_hash(&tcp_hashinfo, newsk, 0);
Arnaldo Carvalho de Melo2d8c4ce2005-08-09 20:07:13 -07001074 __inet_inherit_port(&tcp_hashinfo, sk, newsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001075
1076 return newsk;
1077
1078exit_overflow:
1079 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1080exit:
1081 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1082 dst_release(dst);
1083 return NULL;
1084}
1085
1086static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1087{
1088 struct tcphdr *th = skb->h.th;
1089 struct iphdr *iph = skb->nh.iph;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001090 struct sock *nsk;
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001091 struct request_sock **prev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001092 /* Find possible connection requests. */
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001093 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1094 iph->saddr, iph->daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001095 if (req)
1096 return tcp_check_req(sk, skb, req, prev);
1097
Arnaldo Carvalho de Meloe48c4142005-08-09 20:09:46 -07001098 nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
1099 th->source, skb->nh.iph->daddr,
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001100 ntohs(th->dest), inet_iif(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001101
1102 if (nsk) {
1103 if (nsk->sk_state != TCP_TIME_WAIT) {
1104 bh_lock_sock(nsk);
1105 return nsk;
1106 }
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001107 inet_twsk_put((struct inet_timewait_sock *)nsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001108 return NULL;
1109 }
1110
1111#ifdef CONFIG_SYN_COOKIES
1112 if (!th->rst && !th->syn && th->ack)
1113 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1114#endif
1115 return sk;
1116}
1117
1118static int tcp_v4_checksum_init(struct sk_buff *skb)
1119{
1120 if (skb->ip_summed == CHECKSUM_HW) {
1121 skb->ip_summed = CHECKSUM_UNNECESSARY;
1122 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1123 skb->nh.iph->daddr, skb->csum))
1124 return 0;
1125
Heikki Orsilaca933452005-08-08 14:26:52 -07001126 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001127 skb->ip_summed = CHECKSUM_NONE;
1128 }
1129 if (skb->len <= 76) {
1130 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1131 skb->nh.iph->daddr,
1132 skb_checksum(skb, 0, skb->len, 0)))
1133 return -1;
1134 skb->ip_summed = CHECKSUM_UNNECESSARY;
1135 } else {
1136 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1137 skb->nh.iph->saddr,
1138 skb->nh.iph->daddr, 0);
1139 }
1140 return 0;
1141}
1142
1143
1144/* The socket must have it's spinlock held when we get
1145 * here.
1146 *
1147 * We have a potential double-lock case here, so even when
1148 * doing backlog processing we use the BH locking scheme.
1149 * This is because we cannot sleep with the original spinlock
1150 * held.
1151 */
1152int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1153{
1154 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1155 TCP_CHECK_TIMER(sk);
1156 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1157 goto reset;
1158 TCP_CHECK_TIMER(sk);
1159 return 0;
1160 }
1161
1162 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1163 goto csum_err;
1164
1165 if (sk->sk_state == TCP_LISTEN) {
1166 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1167 if (!nsk)
1168 goto discard;
1169
1170 if (nsk != sk) {
1171 if (tcp_child_process(sk, nsk, skb))
1172 goto reset;
1173 return 0;
1174 }
1175 }
1176
1177 TCP_CHECK_TIMER(sk);
1178 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1179 goto reset;
1180 TCP_CHECK_TIMER(sk);
1181 return 0;
1182
1183reset:
1184 tcp_v4_send_reset(skb);
1185discard:
1186 kfree_skb(skb);
1187 /* Be careful here. If this function gets more complicated and
1188 * gcc suffers from register pressure on the x86, sk (in %ebx)
1189 * might be destroyed here. This current version compiles correctly,
1190 * but you have been warned.
1191 */
1192 return 0;
1193
1194csum_err:
1195 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1196 goto discard;
1197}
1198
1199/*
1200 * From tcp_input.c
1201 */
1202
1203int tcp_v4_rcv(struct sk_buff *skb)
1204{
1205 struct tcphdr *th;
1206 struct sock *sk;
1207 int ret;
1208
1209 if (skb->pkt_type != PACKET_HOST)
1210 goto discard_it;
1211
1212 /* Count it even if it's bad */
1213 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1214
1215 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1216 goto discard_it;
1217
1218 th = skb->h.th;
1219
1220 if (th->doff < sizeof(struct tcphdr) / 4)
1221 goto bad_packet;
1222 if (!pskb_may_pull(skb, th->doff * 4))
1223 goto discard_it;
1224
1225 /* An explanation is required here, I think.
1226 * Packet length and doff are validated by header prediction,
1227 * provided case of th->doff==0 is elimineted.
1228 * So, we defer the checks. */
1229 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1230 tcp_v4_checksum_init(skb) < 0))
1231 goto bad_packet;
1232
1233 th = skb->h.th;
1234 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1235 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1236 skb->len - th->doff * 4);
1237 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1238 TCP_SKB_CB(skb)->when = 0;
1239 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1240 TCP_SKB_CB(skb)->sacked = 0;
1241
Arnaldo Carvalho de Meloe48c4142005-08-09 20:09:46 -07001242 sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1243 skb->nh.iph->daddr, ntohs(th->dest),
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001244 inet_iif(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001245
1246 if (!sk)
1247 goto no_tcp_socket;
1248
1249process:
1250 if (sk->sk_state == TCP_TIME_WAIT)
1251 goto do_time_wait;
1252
1253 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1254 goto discard_and_relse;
1255
1256 if (sk_filter(sk, skb, 0))
1257 goto discard_and_relse;
1258
1259 skb->dev = NULL;
1260
1261 bh_lock_sock(sk);
1262 ret = 0;
1263 if (!sock_owned_by_user(sk)) {
1264 if (!tcp_prequeue(sk, skb))
1265 ret = tcp_v4_do_rcv(sk, skb);
1266 } else
1267 sk_add_backlog(sk, skb);
1268 bh_unlock_sock(sk);
1269
1270 sock_put(sk);
1271
1272 return ret;
1273
1274no_tcp_socket:
1275 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1276 goto discard_it;
1277
1278 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1279bad_packet:
1280 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1281 } else {
1282 tcp_v4_send_reset(skb);
1283 }
1284
1285discard_it:
1286 /* Discard frame. */
1287 kfree_skb(skb);
1288 return 0;
1289
1290discard_and_relse:
1291 sock_put(sk);
1292 goto discard_it;
1293
1294do_time_wait:
1295 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001296 inet_twsk_put((struct inet_timewait_sock *) sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001297 goto discard_it;
1298 }
1299
1300 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1301 TCP_INC_STATS_BH(TCP_MIB_INERRS);
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001302 inet_twsk_put((struct inet_timewait_sock *) sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001303 goto discard_it;
1304 }
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001305 switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
1306 skb, th)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001307 case TCP_TW_SYN: {
Arnaldo Carvalho de Melo33b62232005-08-09 20:09:06 -07001308 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1309 skb->nh.iph->daddr,
1310 ntohs(th->dest),
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001311 inet_iif(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001312 if (sk2) {
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001313 tcp_tw_deschedule((struct inet_timewait_sock *)sk);
1314 inet_twsk_put((struct inet_timewait_sock *)sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001315 sk = sk2;
1316 goto process;
1317 }
1318 /* Fall through to ACK */
1319 }
1320 case TCP_TW_ACK:
1321 tcp_v4_timewait_ack(sk, skb);
1322 break;
1323 case TCP_TW_RST:
1324 goto no_tcp_socket;
1325 case TCP_TW_SUCCESS:;
1326 }
1327 goto discard_it;
1328}
1329
Linus Torvalds1da177e2005-04-16 15:20:36 -07001330static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1331{
1332 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1333 struct inet_sock *inet = inet_sk(sk);
1334
1335 sin->sin_family = AF_INET;
1336 sin->sin_addr.s_addr = inet->daddr;
1337 sin->sin_port = inet->dport;
1338}
1339
1340/* VJ's idea. Save last timestamp seen from this destination
1341 * and hold it at least for normal timewait interval to use for duplicate
1342 * segment detection in subsequent connections, before they enter synchronized
1343 * state.
1344 */
1345
1346int tcp_v4_remember_stamp(struct sock *sk)
1347{
1348 struct inet_sock *inet = inet_sk(sk);
1349 struct tcp_sock *tp = tcp_sk(sk);
1350 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1351 struct inet_peer *peer = NULL;
1352 int release_it = 0;
1353
1354 if (!rt || rt->rt_dst != inet->daddr) {
1355 peer = inet_getpeer(inet->daddr, 1);
1356 release_it = 1;
1357 } else {
1358 if (!rt->peer)
1359 rt_bind_peer(rt, 1);
1360 peer = rt->peer;
1361 }
1362
1363 if (peer) {
1364 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1365 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1366 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1367 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1368 peer->tcp_ts = tp->rx_opt.ts_recent;
1369 }
1370 if (release_it)
1371 inet_putpeer(peer);
1372 return 1;
1373 }
1374
1375 return 0;
1376}
1377
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001378int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001379{
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001380 struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001381
1382 if (peer) {
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001383 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1384
1385 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
Linus Torvalds1da177e2005-04-16 15:20:36 -07001386 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001387 peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1388 peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1389 peer->tcp_ts = tcptw->tw_ts_recent;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001390 }
1391 inet_putpeer(peer);
1392 return 1;
1393 }
1394
1395 return 0;
1396}
1397
1398struct tcp_func ipv4_specific = {
1399 .queue_xmit = ip_queue_xmit,
1400 .send_check = tcp_v4_send_check,
Arnaldo Carvalho de Melo32519f12005-08-09 19:50:02 -07001401 .rebuild_header = inet_sk_rebuild_header,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001402 .conn_request = tcp_v4_conn_request,
1403 .syn_recv_sock = tcp_v4_syn_recv_sock,
1404 .remember_stamp = tcp_v4_remember_stamp,
1405 .net_header_len = sizeof(struct iphdr),
1406 .setsockopt = ip_setsockopt,
1407 .getsockopt = ip_getsockopt,
1408 .addr2sockaddr = v4_addr2sockaddr,
1409 .sockaddr_len = sizeof(struct sockaddr_in),
1410};
1411
1412/* NOTE: A lot of things set to zero explicitly by call to
1413 * sk_alloc() so need not be done here.
1414 */
1415static int tcp_v4_init_sock(struct sock *sk)
1416{
1417 struct tcp_sock *tp = tcp_sk(sk);
1418
1419 skb_queue_head_init(&tp->out_of_order_queue);
1420 tcp_init_xmit_timers(sk);
1421 tcp_prequeue_init(tp);
1422
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001423 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001424 tp->mdev = TCP_TIMEOUT_INIT;
1425
1426 /* So many TCP implementations out there (incorrectly) count the
1427 * initial SYN frame in their delayed-ACK and congestion control
1428 * algorithms that we must have the following bandaid to talk
1429 * efficiently to them. -DaveM
1430 */
1431 tp->snd_cwnd = 2;
1432
1433 /* See draft-stevens-tcpca-spec-01 for discussion of the
1434 * initialization of these values.
1435 */
1436 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1437 tp->snd_cwnd_clamp = ~0;
David S. Millerc1b4a7e2005-07-05 15:24:38 -07001438 tp->mss_cache = 536;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001439
1440 tp->reordering = sysctl_tcp_reordering;
Stephen Hemminger5f8ef482005-06-23 20:37:36 -07001441 tp->ca_ops = &tcp_init_congestion_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001442
1443 sk->sk_state = TCP_CLOSE;
1444
1445 sk->sk_write_space = sk_stream_write_space;
1446 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1447
1448 tp->af_specific = &ipv4_specific;
1449
1450 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1451 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1452
1453 atomic_inc(&tcp_sockets_allocated);
1454
1455 return 0;
1456}
1457
1458int tcp_v4_destroy_sock(struct sock *sk)
1459{
1460 struct tcp_sock *tp = tcp_sk(sk);
1461
1462 tcp_clear_xmit_timers(sk);
1463
Stephen Hemminger317a76f2005-06-23 12:19:55 -07001464 tcp_cleanup_congestion_control(tp);
1465
Linus Torvalds1da177e2005-04-16 15:20:36 -07001466 /* Cleanup up the write buffer. */
1467 sk_stream_writequeue_purge(sk);
1468
1469 /* Cleans up our, hopefully empty, out_of_order_queue. */
1470 __skb_queue_purge(&tp->out_of_order_queue);
1471
1472 /* Clean prequeue, it must be empty really */
1473 __skb_queue_purge(&tp->ucopy.prequeue);
1474
1475 /* Clean up a referenced TCP bind bucket. */
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001476 if (inet_csk(sk)->icsk_bind_hash)
Arnaldo Carvalho de Melo2d8c4ce2005-08-09 20:07:13 -07001477 inet_put_port(&tcp_hashinfo, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001478
1479 /*
1480 * If sendmsg cached page exists, toss it.
1481 */
1482 if (sk->sk_sndmsg_page) {
1483 __free_page(sk->sk_sndmsg_page);
1484 sk->sk_sndmsg_page = NULL;
1485 }
1486
1487 atomic_dec(&tcp_sockets_allocated);
1488
1489 return 0;
1490}
1491
1492EXPORT_SYMBOL(tcp_v4_destroy_sock);
1493
1494#ifdef CONFIG_PROC_FS
1495/* Proc filesystem TCP sock list dumping. */
1496
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001497static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001498{
1499 return hlist_empty(head) ? NULL :
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001500 list_entry(head->first, struct inet_timewait_sock, tw_node);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001501}
1502
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001503static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001504{
1505 return tw->tw_node.next ?
1506 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1507}
1508
1509static void *listening_get_next(struct seq_file *seq, void *cur)
1510{
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001511 struct inet_connection_sock *icsk;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001512 struct hlist_node *node;
1513 struct sock *sk = cur;
1514 struct tcp_iter_state* st = seq->private;
1515
1516 if (!sk) {
1517 st->bucket = 0;
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07001518 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001519 goto get_sk;
1520 }
1521
1522 ++st->num;
1523
1524 if (st->state == TCP_SEQ_STATE_OPENREQ) {
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001525 struct request_sock *req = cur;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001526
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001527 icsk = inet_csk(st->syn_wait_sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001528 req = req->dl_next;
1529 while (1) {
1530 while (req) {
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001531 if (req->rsk_ops->family == st->family) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001532 cur = req;
1533 goto out;
1534 }
1535 req = req->dl_next;
1536 }
1537 if (++st->sbucket >= TCP_SYNQ_HSIZE)
1538 break;
1539get_req:
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001540 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
Linus Torvalds1da177e2005-04-16 15:20:36 -07001541 }
1542 sk = sk_next(st->syn_wait_sk);
1543 st->state = TCP_SEQ_STATE_LISTENING;
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001544 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001545 } else {
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001546 icsk = inet_csk(sk);
1547 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1548 if (reqsk_queue_len(&icsk->icsk_accept_queue))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001549 goto start_req;
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001550 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001551 sk = sk_next(sk);
1552 }
1553get_sk:
1554 sk_for_each_from(sk, node) {
1555 if (sk->sk_family == st->family) {
1556 cur = sk;
1557 goto out;
1558 }
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001559 icsk = inet_csk(sk);
1560 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1561 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001562start_req:
1563 st->uid = sock_i_uid(sk);
1564 st->syn_wait_sk = sk;
1565 st->state = TCP_SEQ_STATE_OPENREQ;
1566 st->sbucket = 0;
1567 goto get_req;
1568 }
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001569 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001570 }
Arnaldo Carvalho de Melo0f7ff922005-08-09 19:59:44 -07001571 if (++st->bucket < INET_LHTABLE_SIZE) {
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07001572 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001573 goto get_sk;
1574 }
1575 cur = NULL;
1576out:
1577 return cur;
1578}
1579
1580static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1581{
1582 void *rc = listening_get_next(seq, NULL);
1583
1584 while (rc && *pos) {
1585 rc = listening_get_next(seq, rc);
1586 --*pos;
1587 }
1588 return rc;
1589}
1590
1591static void *established_get_first(struct seq_file *seq)
1592{
1593 struct tcp_iter_state* st = seq->private;
1594 void *rc = NULL;
1595
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07001596 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001597 struct sock *sk;
1598 struct hlist_node *node;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001599 struct inet_timewait_sock *tw;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001600
1601 /* We can reschedule _before_ having picked the target: */
1602 cond_resched_softirq();
1603
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07001604 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1605 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001606 if (sk->sk_family != st->family) {
1607 continue;
1608 }
1609 rc = sk;
1610 goto out;
1611 }
1612 st->state = TCP_SEQ_STATE_TIME_WAIT;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001613 inet_twsk_for_each(tw, node,
1614 &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001615 if (tw->tw_family != st->family) {
1616 continue;
1617 }
1618 rc = tw;
1619 goto out;
1620 }
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07001621 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001622 st->state = TCP_SEQ_STATE_ESTABLISHED;
1623 }
1624out:
1625 return rc;
1626}
1627
1628static void *established_get_next(struct seq_file *seq, void *cur)
1629{
1630 struct sock *sk = cur;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001631 struct inet_timewait_sock *tw;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001632 struct hlist_node *node;
1633 struct tcp_iter_state* st = seq->private;
1634
1635 ++st->num;
1636
1637 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1638 tw = cur;
1639 tw = tw_next(tw);
1640get_tw:
1641 while (tw && tw->tw_family != st->family) {
1642 tw = tw_next(tw);
1643 }
1644 if (tw) {
1645 cur = tw;
1646 goto out;
1647 }
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07001648 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001649 st->state = TCP_SEQ_STATE_ESTABLISHED;
1650
1651 /* We can reschedule between buckets: */
1652 cond_resched_softirq();
1653
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07001654 if (++st->bucket < tcp_hashinfo.ehash_size) {
1655 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1656 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001657 } else {
1658 cur = NULL;
1659 goto out;
1660 }
1661 } else
1662 sk = sk_next(sk);
1663
1664 sk_for_each_from(sk, node) {
1665 if (sk->sk_family == st->family)
1666 goto found;
1667 }
1668
1669 st->state = TCP_SEQ_STATE_TIME_WAIT;
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07001670 tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001671 goto get_tw;
1672found:
1673 cur = sk;
1674out:
1675 return cur;
1676}
1677
1678static void *established_get_idx(struct seq_file *seq, loff_t pos)
1679{
1680 void *rc = established_get_first(seq);
1681
1682 while (rc && pos) {
1683 rc = established_get_next(seq, rc);
1684 --pos;
1685 }
1686 return rc;
1687}
1688
1689static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1690{
1691 void *rc;
1692 struct tcp_iter_state* st = seq->private;
1693
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -07001694 inet_listen_lock(&tcp_hashinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001695 st->state = TCP_SEQ_STATE_LISTENING;
1696 rc = listening_get_idx(seq, &pos);
1697
1698 if (!rc) {
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -07001699 inet_listen_unlock(&tcp_hashinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001700 local_bh_disable();
1701 st->state = TCP_SEQ_STATE_ESTABLISHED;
1702 rc = established_get_idx(seq, pos);
1703 }
1704
1705 return rc;
1706}
1707
1708static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1709{
1710 struct tcp_iter_state* st = seq->private;
1711 st->state = TCP_SEQ_STATE_LISTENING;
1712 st->num = 0;
1713 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1714}
1715
1716static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1717{
1718 void *rc = NULL;
1719 struct tcp_iter_state* st;
1720
1721 if (v == SEQ_START_TOKEN) {
1722 rc = tcp_get_idx(seq, 0);
1723 goto out;
1724 }
1725 st = seq->private;
1726
1727 switch (st->state) {
1728 case TCP_SEQ_STATE_OPENREQ:
1729 case TCP_SEQ_STATE_LISTENING:
1730 rc = listening_get_next(seq, v);
1731 if (!rc) {
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -07001732 inet_listen_unlock(&tcp_hashinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001733 local_bh_disable();
1734 st->state = TCP_SEQ_STATE_ESTABLISHED;
1735 rc = established_get_first(seq);
1736 }
1737 break;
1738 case TCP_SEQ_STATE_ESTABLISHED:
1739 case TCP_SEQ_STATE_TIME_WAIT:
1740 rc = established_get_next(seq, v);
1741 break;
1742 }
1743out:
1744 ++*pos;
1745 return rc;
1746}
1747
1748static void tcp_seq_stop(struct seq_file *seq, void *v)
1749{
1750 struct tcp_iter_state* st = seq->private;
1751
1752 switch (st->state) {
1753 case TCP_SEQ_STATE_OPENREQ:
1754 if (v) {
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001755 struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
1756 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001757 }
1758 case TCP_SEQ_STATE_LISTENING:
1759 if (v != SEQ_START_TOKEN)
Arnaldo Carvalho de Melof3f05f72005-08-09 20:08:09 -07001760 inet_listen_unlock(&tcp_hashinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001761 break;
1762 case TCP_SEQ_STATE_TIME_WAIT:
1763 case TCP_SEQ_STATE_ESTABLISHED:
1764 if (v)
Arnaldo Carvalho de Melo6e04e022005-08-09 20:07:35 -07001765 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001766 local_bh_enable();
1767 break;
1768 }
1769}
1770
1771static int tcp_seq_open(struct inode *inode, struct file *file)
1772{
1773 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
1774 struct seq_file *seq;
1775 struct tcp_iter_state *s;
1776 int rc;
1777
1778 if (unlikely(afinfo == NULL))
1779 return -EINVAL;
1780
1781 s = kmalloc(sizeof(*s), GFP_KERNEL);
1782 if (!s)
1783 return -ENOMEM;
1784 memset(s, 0, sizeof(*s));
1785 s->family = afinfo->family;
1786 s->seq_ops.start = tcp_seq_start;
1787 s->seq_ops.next = tcp_seq_next;
1788 s->seq_ops.show = afinfo->seq_show;
1789 s->seq_ops.stop = tcp_seq_stop;
1790
1791 rc = seq_open(file, &s->seq_ops);
1792 if (rc)
1793 goto out_kfree;
1794 seq = file->private_data;
1795 seq->private = s;
1796out:
1797 return rc;
1798out_kfree:
1799 kfree(s);
1800 goto out;
1801}
1802
1803int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
1804{
1805 int rc = 0;
1806 struct proc_dir_entry *p;
1807
1808 if (!afinfo)
1809 return -EINVAL;
1810 afinfo->seq_fops->owner = afinfo->owner;
1811 afinfo->seq_fops->open = tcp_seq_open;
1812 afinfo->seq_fops->read = seq_read;
1813 afinfo->seq_fops->llseek = seq_lseek;
1814 afinfo->seq_fops->release = seq_release_private;
1815
1816 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
1817 if (p)
1818 p->data = afinfo;
1819 else
1820 rc = -ENOMEM;
1821 return rc;
1822}
1823
1824void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
1825{
1826 if (!afinfo)
1827 return;
1828 proc_net_remove(afinfo->name);
1829 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
1830}
1831
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07001832static void get_openreq4(struct sock *sk, struct request_sock *req,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001833 char *tmpbuf, int i, int uid)
1834{
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001835 const struct inet_request_sock *ireq = inet_rsk(req);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001836 int ttd = req->expires - jiffies;
1837
1838 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1839 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
1840 i,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001841 ireq->loc_addr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001842 ntohs(inet_sk(sk)->sport),
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07001843 ireq->rmt_addr,
1844 ntohs(ireq->rmt_port),
Linus Torvalds1da177e2005-04-16 15:20:36 -07001845 TCP_SYN_RECV,
1846 0, 0, /* could print option size, but that is af dependent. */
1847 1, /* timers active (only the expire timer) */
1848 jiffies_to_clock_t(ttd),
1849 req->retrans,
1850 uid,
1851 0, /* non standard timer */
1852 0, /* open_requests have no inode */
1853 atomic_read(&sk->sk_refcnt),
1854 req);
1855}
1856
1857static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
1858{
1859 int timer_active;
1860 unsigned long timer_expires;
1861 struct tcp_sock *tp = tcp_sk(sp);
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001862 const struct inet_connection_sock *icsk = inet_csk(sp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001863 struct inet_sock *inet = inet_sk(sp);
1864 unsigned int dest = inet->daddr;
1865 unsigned int src = inet->rcv_saddr;
1866 __u16 destp = ntohs(inet->dport);
1867 __u16 srcp = ntohs(inet->sport);
1868
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001869 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001870 timer_active = 1;
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001871 timer_expires = icsk->icsk_timeout;
1872 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001873 timer_active = 4;
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001874 timer_expires = icsk->icsk_timeout;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001875 } else if (timer_pending(&sp->sk_timer)) {
1876 timer_active = 2;
1877 timer_expires = sp->sk_timer.expires;
1878 } else {
1879 timer_active = 0;
1880 timer_expires = jiffies;
1881 }
1882
1883 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
1884 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
1885 i, src, srcp, dest, destp, sp->sk_state,
1886 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
1887 timer_active,
1888 jiffies_to_clock_t(timer_expires - jiffies),
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001889 icsk->icsk_retransmits,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001890 sock_i_uid(sp),
1891 tp->probes_out,
1892 sock_i_ino(sp),
1893 atomic_read(&sp->sk_refcnt), sp,
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001894 icsk->icsk_rto,
1895 icsk->icsk_ack.ato,
1896 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001897 tp->snd_cwnd,
1898 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
1899}
1900
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07001901static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001902{
1903 unsigned int dest, src;
1904 __u16 destp, srcp;
1905 int ttd = tw->tw_ttd - jiffies;
1906
1907 if (ttd < 0)
1908 ttd = 0;
1909
1910 dest = tw->tw_daddr;
1911 src = tw->tw_rcv_saddr;
1912 destp = ntohs(tw->tw_dport);
1913 srcp = ntohs(tw->tw_sport);
1914
1915 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1916 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
1917 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
1918 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
1919 atomic_read(&tw->tw_refcnt), tw);
1920}
1921
1922#define TMPSZ 150
1923
1924static int tcp4_seq_show(struct seq_file *seq, void *v)
1925{
1926 struct tcp_iter_state* st;
1927 char tmpbuf[TMPSZ + 1];
1928
1929 if (v == SEQ_START_TOKEN) {
1930 seq_printf(seq, "%-*s\n", TMPSZ - 1,
1931 " sl local_address rem_address st tx_queue "
1932 "rx_queue tr tm->when retrnsmt uid timeout "
1933 "inode");
1934 goto out;
1935 }
1936 st = seq->private;
1937
1938 switch (st->state) {
1939 case TCP_SEQ_STATE_LISTENING:
1940 case TCP_SEQ_STATE_ESTABLISHED:
1941 get_tcp4_sock(v, tmpbuf, st->num);
1942 break;
1943 case TCP_SEQ_STATE_OPENREQ:
1944 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
1945 break;
1946 case TCP_SEQ_STATE_TIME_WAIT:
1947 get_timewait4_sock(v, tmpbuf, st->num);
1948 break;
1949 }
1950 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
1951out:
1952 return 0;
1953}
1954
1955static struct file_operations tcp4_seq_fops;
1956static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1957 .owner = THIS_MODULE,
1958 .name = "tcp",
1959 .family = AF_INET,
1960 .seq_show = tcp4_seq_show,
1961 .seq_fops = &tcp4_seq_fops,
1962};
1963
1964int __init tcp4_proc_init(void)
1965{
1966 return tcp_proc_register(&tcp4_seq_afinfo);
1967}
1968
1969void tcp4_proc_exit(void)
1970{
1971 tcp_proc_unregister(&tcp4_seq_afinfo);
1972}
1973#endif /* CONFIG_PROC_FS */
1974
1975struct proto tcp_prot = {
1976 .name = "TCP",
1977 .owner = THIS_MODULE,
1978 .close = tcp_close,
1979 .connect = tcp_v4_connect,
1980 .disconnect = tcp_disconnect,
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07001981 .accept = inet_csk_accept,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001982 .ioctl = tcp_ioctl,
1983 .init = tcp_v4_init_sock,
1984 .destroy = tcp_v4_destroy_sock,
1985 .shutdown = tcp_shutdown,
1986 .setsockopt = tcp_setsockopt,
1987 .getsockopt = tcp_getsockopt,
1988 .sendmsg = tcp_sendmsg,
1989 .recvmsg = tcp_recvmsg,
1990 .backlog_rcv = tcp_v4_do_rcv,
1991 .hash = tcp_v4_hash,
1992 .unhash = tcp_unhash,
1993 .get_port = tcp_v4_get_port,
1994 .enter_memory_pressure = tcp_enter_memory_pressure,
1995 .sockets_allocated = &tcp_sockets_allocated,
1996 .memory_allocated = &tcp_memory_allocated,
1997 .memory_pressure = &tcp_memory_pressure,
1998 .sysctl_mem = sysctl_tcp_mem,
1999 .sysctl_wmem = sysctl_tcp_wmem,
2000 .sysctl_rmem = sysctl_tcp_rmem,
2001 .max_header = MAX_TCP_HEADER,
2002 .obj_size = sizeof(struct tcp_sock),
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002003 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
Arnaldo Carvalho de Melo60236fd2005-06-18 22:47:21 -07002004 .rsk_prot = &tcp_request_sock_ops,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002005};
2006
2007
2008
2009void __init tcp_v4_init(struct net_proto_family *ops)
2010{
2011 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2012 if (err < 0)
2013 panic("Failed to create the TCP control socket.\n");
2014 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2015 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2016
2017 /* Unhash it so that IP input processing does not even
2018 * see it, we do not wish this socket to see incoming
2019 * packets.
2020 */
2021 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2022}
2023
/* Symbols made available outside this file. */
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(inet_bind_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

/* The proc registration helpers only exist when CONFIG_PROC_FS is set. */
#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif

/* Exported sysctl variables. */
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);

2043