Blame - net/ipv4/tcp_ipv4.c - SHIFTPHONES/mainline/linux

blob: 62f62bb05c2ae479eae4c03ee2986fd43e20b9a4 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* INET An implementation of the TCP/IP protocol suite for the LINUX
				3	* operating system. INET is implemented using the BSD Socket
				4	* interface as the means of communication with the user level.
				5	*
				6	* Implementation of the Transmission Control Protocol(TCP).
				7	*
				8	* Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
				9	*
				10	* IPv4 specific functions
				11	*
				12	*
				13	* code split from:
				14	* linux/ipv4/tcp.c
				15	* linux/ipv4/tcp_input.c
				16	* linux/ipv4/tcp_output.c
				17	*
				18	* See tcp.c for author information
				19	*
				20	* This program is free software; you can redistribute it and/or
				21	* modify it under the terms of the GNU General Public License
				22	* as published by the Free Software Foundation; either version
				23	* 2 of the License, or (at your option) any later version.
				24	*/
				25
				26	/*
				27	* Changes:
				28	* David S. Miller : New socket lookup architecture.
				29	* This code is dedicated to John Dyson.
				30	* David S. Miller : Change semantics of established hash,
				31	* half is devoted to TIME_WAIT sockets
				32	* and the rest go in the other half.
				33	* Andi Kleen : Add support for syncookies and fixed
				34	* some bugs: ip options weren't passed to
				35	* the TCP layer, missed a check for an
				36	* ACK bit.
				37	* Andi Kleen : Implemented fast path mtu discovery.
				38	* Fixed many serious bugs in the
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	39	* request_sock handling and moved
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	40	* most of it into the af independent code.
				41	* Added tail drop and some other bugfixes.
				42	* Added new listen semantics.
				43	* Mike McLagan : Routing by source
				44	* Juan Jose Ciarlante: ip_dynaddr bits
				45	* Andi Kleen: various fixes.
				46	* Vitaly E. Lavrov : Transparent proxy revived after year
				47	* coma.
				48	* Andi Kleen : Fix new listen.
				49	* Andi Kleen : Fix accept error reporting.
				50	* YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
				51	* Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
				52	* a single port at the same time.
				53	*/
				54
				55	#include <linux/config.h>
				56
				57	#include <linux/types.h>
				58	#include <linux/fcntl.h>
				59	#include <linux/module.h>
				60	#include <linux/random.h>
				61	#include <linux/cache.h>
				62	#include <linux/jhash.h>
				63	#include <linux/init.h>
				64	#include <linux/times.h>
				65
				66	#include <net/icmp.h>
				67	#include <net/tcp.h>
				68	#include <net/ipv6.h>
				69	#include <net/inet_common.h>
				70	#include <net/xfrm.h>
				71
				72	#include <linux/inet.h>
				73	#include <linux/ipv6.h>
				74	#include <linux/stddef.h>
				75	#include <linux/proc_fs.h>
				76	#include <linux/seq_file.h>
				77
				78	extern int sysctl_ip_dynaddr;
				79	int sysctl_tcp_tw_reuse;
				80	int sysctl_tcp_low_latency;
				81
				82	/* Check TCP sequence numbers in ICMP packets. */
				83	#define ICMP_MIN_LENGTH 8
				84
				85	/* Socket used for sending RSTs */
				86	static struct socket *tcp_socket;
				87
				88	void tcp_v4_send_check(struct sock sk, struct tcphdr th, int len,
				89	struct sk_buff *skb);
				90
				91	struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
				92	.__tcp_lhash_lock = RW_LOCK_UNLOCKED,
				93	.__tcp_lhash_users = ATOMIC_INIT(0),
				94	.__tcp_lhash_wait
				95	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
				96	.__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED
				97	};
				98
				99	/*
				100	* This array holds the first and last local port number.
				101	* For high-usage systems, use sysctl to change this to
				102	* 32768-61000
				103	*/
				104	int sysctl_local_port_range[2] = { 1024, 4999 };
				105	int tcp_port_rover = 1024 - 1;
				106
				107	static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
				108	__u32 faddr, __u16 fport)
				109	{
				110	int h = (laddr ^ lport) ^ (faddr ^ fport);
				111	h ^= h >> 16;
				112	h ^= h >> 8;
				113	return h & (tcp_ehash_size - 1);
				114	}
				115
				116	static __inline__ int tcp_sk_hashfn(struct sock *sk)
				117	{
				118	struct inet_sock *inet = inet_sk(sk);
				119	__u32 laddr = inet->rcv_saddr;
				120	__u16 lport = inet->num;
				121	__u32 faddr = inet->daddr;
				122	__u16 fport = inet->dport;
				123
				124	return tcp_hashfn(laddr, lport, faddr, fport);
				125	}
				126
				127	/* Allocate and initialize a new TCP local port bind bucket.
				128	* The bindhash mutex for snum's hash chain must be held here.
				129	*/
				130	struct tcp_bind_bucket tcp_bucket_create(struct tcp_bind_hashbucket head,
				131	unsigned short snum)
				132	{
				133	struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
				134	SLAB_ATOMIC);
				135	if (tb) {
				136	tb->port = snum;
				137	tb->fastreuse = 0;
				138	INIT_HLIST_HEAD(&tb->owners);
				139	hlist_add_head(&tb->node, &head->chain);
				140	}
				141	return tb;
				142	}
				143
				144	/* Caller must hold hashbucket lock for this tb with local BH disabled */
				145	void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
				146	{
				147	if (hlist_empty(&tb->owners)) {
				148	__hlist_del(&tb->node);
				149	kmem_cache_free(tcp_bucket_cachep, tb);
				150	}
				151	}
				152
				153	/* Caller must disable local BH processing. */
				154	static __inline__ void __tcp_inherit_port(struct sock sk, struct sock child)
				155	{
				156	struct tcp_bind_hashbucket *head =
				157	&tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
				158	struct tcp_bind_bucket *tb;
				159
				160	spin_lock(&head->lock);
				161	tb = tcp_sk(sk)->bind_hash;
				162	sk_add_bind_node(child, &tb->owners);
				163	tcp_sk(child)->bind_hash = tb;
				164	spin_unlock(&head->lock);
				165	}
				166
				167	inline void tcp_inherit_port(struct sock sk, struct sock child)
				168	{
				169	local_bh_disable();
				170	__tcp_inherit_port(sk, child);
				171	local_bh_enable();
				172	}
				173
				174	void tcp_bind_hash(struct sock sk, struct tcp_bind_bucket tb,
				175	unsigned short snum)
				176	{
				177	inet_sk(sk)->num = snum;
				178	sk_add_bind_node(sk, &tb->owners);
				179	tcp_sk(sk)->bind_hash = tb;
				180	}
				181
				182	static inline int tcp_bind_conflict(struct sock sk, struct tcp_bind_bucket tb)
				183	{
				184	const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
				185	struct sock *sk2;
				186	struct hlist_node *node;
				187	int reuse = sk->sk_reuse;
				188
				189	sk_for_each_bound(sk2, node, &tb->owners) {
				190	if (sk != sk2 &&
				191	!tcp_v6_ipv6only(sk2) &&
				192	(!sk->sk_bound_dev_if \|\|
				193	!sk2->sk_bound_dev_if \|\|
				194	sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
				195	if (!reuse \|\| !sk2->sk_reuse \|\|
				196	sk2->sk_state == TCP_LISTEN) {
				197	const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
				198	if (!sk2_rcv_saddr \|\| !sk_rcv_saddr \|\|
				199	sk2_rcv_saddr == sk_rcv_saddr)
				200	break;
				201	}
				202	}
				203	}
				204	return node != NULL;
				205	}
				206
				207	/* Obtain a reference to a local port for the given sock,
				208	* if snum is zero it means select any available local port.
				209	*/
				210	static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
				211	{
				212	struct tcp_bind_hashbucket *head;
				213	struct hlist_node *node;
				214	struct tcp_bind_bucket *tb;
				215	int ret;
				216
				217	local_bh_disable();
				218	if (!snum) {
				219	int low = sysctl_local_port_range[0];
				220	int high = sysctl_local_port_range[1];
				221	int remaining = (high - low) + 1;
				222	int rover;
				223
				224	spin_lock(&tcp_portalloc_lock);
Folkert van Heusden	0b2531b	2005-05-03 14:36:08 -0700	[diff] [blame]	225	if (tcp_port_rover < low)
				226	rover = low;
				227	else
				228	rover = tcp_port_rover;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	229	do {
				230	rover++;
Folkert van Heusden	0b2531b	2005-05-03 14:36:08 -0700	[diff] [blame]	231	if (rover > high)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	232	rover = low;
				233	head = &tcp_bhash[tcp_bhashfn(rover)];
				234	spin_lock(&head->lock);
				235	tb_for_each(tb, node, &head->chain)
				236	if (tb->port == rover)
				237	goto next;
				238	break;
				239	next:
				240	spin_unlock(&head->lock);
				241	} while (--remaining > 0);
				242	tcp_port_rover = rover;
				243	spin_unlock(&tcp_portalloc_lock);
				244
				245	/* Exhausted local port range during search? */
				246	ret = 1;
				247	if (remaining <= 0)
				248	goto fail;
				249
				250	/* OK, here is the one we will use. HEAD is
				251	* non-NULL and we hold it's mutex.
				252	*/
				253	snum = rover;
				254	} else {
				255	head = &tcp_bhash[tcp_bhashfn(snum)];
				256	spin_lock(&head->lock);
				257	tb_for_each(tb, node, &head->chain)
				258	if (tb->port == snum)
				259	goto tb_found;
				260	}
				261	tb = NULL;
				262	goto tb_not_found;
				263	tb_found:
				264	if (!hlist_empty(&tb->owners)) {
				265	if (sk->sk_reuse > 1)
				266	goto success;
				267	if (tb->fastreuse > 0 &&
				268	sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
				269	goto success;
				270	} else {
				271	ret = 1;
				272	if (tcp_bind_conflict(sk, tb))
				273	goto fail_unlock;
				274	}
				275	}
				276	tb_not_found:
				277	ret = 1;
				278	if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
				279	goto fail_unlock;
				280	if (hlist_empty(&tb->owners)) {
				281	if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
				282	tb->fastreuse = 1;
				283	else
				284	tb->fastreuse = 0;
				285	} else if (tb->fastreuse &&
				286	(!sk->sk_reuse \|\| sk->sk_state == TCP_LISTEN))
				287	tb->fastreuse = 0;
				288	success:
				289	if (!tcp_sk(sk)->bind_hash)
				290	tcp_bind_hash(sk, tb, snum);
				291	BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
				292	ret = 0;
				293
				294	fail_unlock:
				295	spin_unlock(&head->lock);
				296	fail:
				297	local_bh_enable();
				298	return ret;
				299	}
				300
				301	/* Get rid of any references to a local port held by the
				302	* given sock.
				303	*/
				304	static void __tcp_put_port(struct sock *sk)
				305	{
				306	struct inet_sock *inet = inet_sk(sk);
				307	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
				308	struct tcp_bind_bucket *tb;
				309
				310	spin_lock(&head->lock);
				311	tb = tcp_sk(sk)->bind_hash;
				312	__sk_del_bind_node(sk);
				313	tcp_sk(sk)->bind_hash = NULL;
				314	inet->num = 0;
				315	tcp_bucket_destroy(tb);
				316	spin_unlock(&head->lock);
				317	}
				318
				319	void tcp_put_port(struct sock *sk)
				320	{
				321	local_bh_disable();
				322	__tcp_put_port(sk);
				323	local_bh_enable();
				324	}
				325
				326	/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
				327	* Look, when several writers sleep and reader wakes them up, all but one
				328	* immediately hit write lock and grab all the cpus. Exclusive sleep solves
				329	* this, _but_ remember, it adds useless work on UP machines (wake up each
				330	* exclusive lock release). It should be ifdefed really.
				331	*/
				332
				333	void tcp_listen_wlock(void)
				334	{
				335	write_lock(&tcp_lhash_lock);
				336
				337	if (atomic_read(&tcp_lhash_users)) {
				338	DEFINE_WAIT(wait);
				339
				340	for (;;) {
				341	prepare_to_wait_exclusive(&tcp_lhash_wait,
				342	&wait, TASK_UNINTERRUPTIBLE);
				343	if (!atomic_read(&tcp_lhash_users))
				344	break;
				345	write_unlock_bh(&tcp_lhash_lock);
				346	schedule();
				347	write_lock_bh(&tcp_lhash_lock);
				348	}
				349
				350	finish_wait(&tcp_lhash_wait, &wait);
				351	}
				352	}
				353
				354	static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
				355	{
				356	struct hlist_head *list;
				357	rwlock_t *lock;
				358
				359	BUG_TRAP(sk_unhashed(sk));
				360	if (listen_possible && sk->sk_state == TCP_LISTEN) {
				361	list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
				362	lock = &tcp_lhash_lock;
				363	tcp_listen_wlock();
				364	} else {
				365	list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
				366	lock = &tcp_ehash[sk->sk_hashent].lock;
				367	write_lock(lock);
				368	}
				369	__sk_add_node(sk, list);
				370	sock_prot_inc_use(sk->sk_prot);
				371	write_unlock(lock);
				372	if (listen_possible && sk->sk_state == TCP_LISTEN)
				373	wake_up(&tcp_lhash_wait);
				374	}
				375
				376	static void tcp_v4_hash(struct sock *sk)
				377	{
				378	if (sk->sk_state != TCP_CLOSE) {
				379	local_bh_disable();
				380	__tcp_v4_hash(sk, 1);
				381	local_bh_enable();
				382	}
				383	}
				384
				385	void tcp_unhash(struct sock *sk)
				386	{
				387	rwlock_t *lock;
				388
				389	if (sk_unhashed(sk))
				390	goto ende;
				391
				392	if (sk->sk_state == TCP_LISTEN) {
				393	local_bh_disable();
				394	tcp_listen_wlock();
				395	lock = &tcp_lhash_lock;
				396	} else {
				397	struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
				398	lock = &head->lock;
				399	write_lock_bh(&head->lock);
				400	}
				401
				402	if (__sk_del_node_init(sk))
				403	sock_prot_dec_use(sk->sk_prot);
				404	write_unlock_bh(lock);
				405
				406	ende:
				407	if (sk->sk_state == TCP_LISTEN)
				408	wake_up(&tcp_lhash_wait);
				409	}
				410
				411	/* Don't inline this cruft. Here are some nice properties to
				412	* exploit here. The BSD API does not allow a listening TCP
				413	* to specify the remote port nor the remote address for the
				414	* connection. So always assume those are both wildcarded
				415	* during the search since they can never be otherwise.
				416	*/
				417	static struct sock __tcp_v4_lookup_listener(struct hlist_head head, u32 daddr,
				418	unsigned short hnum, int dif)
				419	{
				420	struct sock result = NULL, sk;
				421	struct hlist_node *node;
				422	int score, hiscore;
				423
				424	hiscore=-1;
				425	sk_for_each(sk, node, head) {
				426	struct inet_sock *inet = inet_sk(sk);
				427
				428	if (inet->num == hnum && !ipv6_only_sock(sk)) {
				429	__u32 rcv_saddr = inet->rcv_saddr;
				430
				431	score = (sk->sk_family == PF_INET ? 1 : 0);
				432	if (rcv_saddr) {
				433	if (rcv_saddr != daddr)
				434	continue;
				435	score+=2;
				436	}
				437	if (sk->sk_bound_dev_if) {
				438	if (sk->sk_bound_dev_if != dif)
				439	continue;
				440	score+=2;
				441	}
				442	if (score == 5)
				443	return sk;
				444	if (score > hiscore) {
				445	hiscore = score;
				446	result = sk;
				447	}
				448	}
				449	}
				450	return result;
				451	}
				452
				453	/* Optimize the common listener case. */
				454	static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
				455	unsigned short hnum, int dif)
				456	{
				457	struct sock *sk = NULL;
				458	struct hlist_head *head;
				459
				460	read_lock(&tcp_lhash_lock);
				461	head = &tcp_listening_hash[tcp_lhashfn(hnum)];
				462	if (!hlist_empty(head)) {
				463	struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
				464
				465	if (inet->num == hnum && !sk->sk_node.next &&
				466	(!inet->rcv_saddr \|\| inet->rcv_saddr == daddr) &&
				467	(sk->sk_family == PF_INET \|\| !ipv6_only_sock(sk)) &&
				468	!sk->sk_bound_dev_if)
				469	goto sherry_cache;
				470	sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
				471	}
				472	if (sk) {
				473	sherry_cache:
				474	sock_hold(sk);
				475	}
				476	read_unlock(&tcp_lhash_lock);
				477	return sk;
				478	}
				479
				480	/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
				481	* we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
				482	*
				483	* Local BH must be disabled here.
				484	*/
				485
				486	static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
				487	u32 daddr, u16 hnum,
				488	int dif)
				489	{
				490	struct tcp_ehash_bucket *head;
				491	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
				492	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
				493	struct sock *sk;
				494	struct hlist_node *node;
				495	/* Optimize here for direct hit, only listening connections can
				496	* have wildcards anyways.
				497	*/
				498	int hash = tcp_hashfn(daddr, hnum, saddr, sport);
				499	head = &tcp_ehash[hash];
				500	read_lock(&head->lock);
				501	sk_for_each(sk, node, &head->chain) {
				502	if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
				503	goto hit; /* You sunk my battleship! */
				504	}
				505
				506	/* Must check for a TIME_WAIT'er before going to listener hash. */
				507	sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
				508	if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
				509	goto hit;
				510	}
				511	sk = NULL;
				512	out:
				513	read_unlock(&head->lock);
				514	return sk;
				515	hit:
				516	sock_hold(sk);
				517	goto out;
				518	}
				519
				520	static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
				521	u32 daddr, u16 hnum, int dif)
				522	{
				523	struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
				524	daddr, hnum, dif);
				525
				526	return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
				527	}
				528
				529	inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
				530	u16 dport, int dif)
				531	{
				532	struct sock *sk;
				533
				534	local_bh_disable();
				535	sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
				536	local_bh_enable();
				537
				538	return sk;
				539	}
				540
				541	EXPORT_SYMBOL_GPL(tcp_v4_lookup);
				542
				543	static inline __u32 tcp_v4_init_sequence(struct sock sk, struct sk_buff skb)
				544	{
				545	return secure_tcp_sequence_number(skb->nh.iph->daddr,
				546	skb->nh.iph->saddr,
				547	skb->h.th->dest,
				548	skb->h.th->source);
				549	}
				550
				551	/* called with local bh disabled */
				552	static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
				553	struct tcp_tw_bucket **twp)
				554	{
				555	struct inet_sock *inet = inet_sk(sk);
				556	u32 daddr = inet->rcv_saddr;
				557	u32 saddr = inet->daddr;
				558	int dif = sk->sk_bound_dev_if;
				559	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
				560	__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
				561	int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
				562	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
				563	struct sock *sk2;
				564	struct hlist_node *node;
				565	struct tcp_tw_bucket *tw;
				566
				567	write_lock(&head->lock);
				568
				569	/* Check TIME-WAIT sockets first. */
				570	sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
				571	tw = (struct tcp_tw_bucket *)sk2;
				572
				573	if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
				574	struct tcp_sock *tp = tcp_sk(sk);
				575
				576	/* With PAWS, it is safe from the viewpoint
				577	of data integrity. Even without PAWS it
				578	is safe provided sequence spaces do not
				579	overlap i.e. at data rates <= 80Mbit/sec.
				580
				581	Actually, the idea is close to VJ's one,
				582	only timestamp cache is held not per host,
				583	but per port pair and TW bucket is used
				584	as state holder.
				585
				586	If TW bucket has been already destroyed we
				587	fall back to VJ's scheme and use initial
				588	timestamp retrieved from peer table.
				589	*/
				590	if (tw->tw_ts_recent_stamp &&
				591	(!twp \|\| (sysctl_tcp_tw_reuse &&
				592	xtime.tv_sec -
				593	tw->tw_ts_recent_stamp > 1))) {
				594	if ((tp->write_seq =
				595	tw->tw_snd_nxt + 65535 + 2) == 0)
				596	tp->write_seq = 1;
				597	tp->rx_opt.ts_recent = tw->tw_ts_recent;
				598	tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
				599	sock_hold(sk2);
				600	goto unique;
				601	} else
				602	goto not_unique;
				603	}
				604	}
				605	tw = NULL;
				606
				607	/* And established part... */
				608	sk_for_each(sk2, node, &head->chain) {
				609	if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
				610	goto not_unique;
				611	}
				612
				613	unique:
				614	/* Must record num and sport now. Otherwise we will see
				615	* in hash table socket with a funny identity. */
				616	inet->num = lport;
				617	inet->sport = htons(lport);
				618	sk->sk_hashent = hash;
				619	BUG_TRAP(sk_unhashed(sk));
				620	__sk_add_node(sk, &head->chain);
				621	sock_prot_inc_use(sk->sk_prot);
				622	write_unlock(&head->lock);
				623
				624	if (twp) {
				625	*twp = tw;
				626	NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
				627	} else if (tw) {
				628	/* Silly. Should hash-dance instead... */
				629	tcp_tw_deschedule(tw);
				630	NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
				631
				632	tcp_tw_put(tw);
				633	}
				634
				635	return 0;
				636
				637	not_unique:
				638	write_unlock(&head->lock);
				639	return -EADDRNOTAVAIL;
				640	}
				641
				642	static inline u32 connect_port_offset(const struct sock *sk)
				643	{
				644	const struct inet_sock *inet = inet_sk(sk);
				645
				646	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
				647	inet->dport);
				648	}
				649
				650	/*
				651	* Bind a port for a connect operation and hash it.
				652	*/
				653	static inline int tcp_v4_hash_connect(struct sock *sk)
				654	{
				655	unsigned short snum = inet_sk(sk)->num;
				656	struct tcp_bind_hashbucket *head;
				657	struct tcp_bind_bucket *tb;
				658	int ret;
				659
				660	if (!snum) {
				661	int low = sysctl_local_port_range[0];
				662	int high = sysctl_local_port_range[1];
				663	int range = high - low;
				664	int i;
				665	int port;
				666	static u32 hint;
				667	u32 offset = hint + connect_port_offset(sk);
				668	struct hlist_node *node;
				669	struct tcp_tw_bucket *tw = NULL;
				670
				671	local_bh_disable();
				672	for (i = 1; i <= range; i++) {
				673	port = low + (i + offset) % range;
				674	head = &tcp_bhash[tcp_bhashfn(port)];
				675	spin_lock(&head->lock);
				676
				677	/* Does not bother with rcv_saddr checks,
				678	* because the established check is already
				679	* unique enough.
				680	*/
				681	tb_for_each(tb, node, &head->chain) {
				682	if (tb->port == port) {
				683	BUG_TRAP(!hlist_empty(&tb->owners));
				684	if (tb->fastreuse >= 0)
				685	goto next_port;
				686	if (!__tcp_v4_check_established(sk,
				687	port,
				688	&tw))
				689	goto ok;
				690	goto next_port;
				691	}
				692	}
				693
				694	tb = tcp_bucket_create(head, port);
				695	if (!tb) {
				696	spin_unlock(&head->lock);
				697	break;
				698	}
				699	tb->fastreuse = -1;
				700	goto ok;
				701
				702	next_port:
				703	spin_unlock(&head->lock);
				704	}
				705	local_bh_enable();
				706
				707	return -EADDRNOTAVAIL;
				708
				709	ok:
				710	hint += i;
				711
				712	/* Head lock still held and bh's disabled */
				713	tcp_bind_hash(sk, tb, port);
				714	if (sk_unhashed(sk)) {
				715	inet_sk(sk)->sport = htons(port);
				716	__tcp_v4_hash(sk, 0);
				717	}
				718	spin_unlock(&head->lock);
				719
				720	if (tw) {
				721	tcp_tw_deschedule(tw);
				722	tcp_tw_put(tw);
				723	}
				724
				725	ret = 0;
				726	goto out;
				727	}
				728
				729	head = &tcp_bhash[tcp_bhashfn(snum)];
				730	tb = tcp_sk(sk)->bind_hash;
				731	spin_lock_bh(&head->lock);
				732	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
				733	__tcp_v4_hash(sk, 0);
				734	spin_unlock_bh(&head->lock);
				735	return 0;
				736	} else {
				737	spin_unlock(&head->lock);
				738	/* No definite answer... Walk to established hash table */
				739	ret = __tcp_v4_check_established(sk, snum, NULL);
				740	out:
				741	local_bh_enable();
				742	return ret;
				743	}
				744	}
				745
				746	/* This will initiate an outgoing connection. */
				747	int tcp_v4_connect(struct sock sk, struct sockaddr uaddr, int addr_len)
				748	{
				749	struct inet_sock *inet = inet_sk(sk);
				750	struct tcp_sock *tp = tcp_sk(sk);
				751	struct sockaddr_in usin = (struct sockaddr_in )uaddr;
				752	struct rtable *rt;
				753	u32 daddr, nexthop;
				754	int tmp;
				755	int err;
				756
				757	if (addr_len < sizeof(struct sockaddr_in))
				758	return -EINVAL;
				759
				760	if (usin->sin_family != AF_INET)
				761	return -EAFNOSUPPORT;
				762
				763	nexthop = daddr = usin->sin_addr.s_addr;
				764	if (inet->opt && inet->opt->srr) {
				765	if (!daddr)
				766	return -EINVAL;
				767	nexthop = inet->opt->faddr;
				768	}
				769
				770	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
				771	RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
				772	IPPROTO_TCP,
				773	inet->sport, usin->sin_port, sk);
				774	if (tmp < 0)
				775	return tmp;
				776
				777	if (rt->rt_flags & (RTCF_MULTICAST \| RTCF_BROADCAST)) {
				778	ip_rt_put(rt);
				779	return -ENETUNREACH;
				780	}
				781
				782	if (!inet->opt \|\| !inet->opt->srr)
				783	daddr = rt->rt_dst;
				784
				785	if (!inet->saddr)
				786	inet->saddr = rt->rt_src;
				787	inet->rcv_saddr = inet->saddr;
				788
				789	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
				790	/* Reset inherited state */
				791	tp->rx_opt.ts_recent = 0;
				792	tp->rx_opt.ts_recent_stamp = 0;
				793	tp->write_seq = 0;
				794	}
				795
				796	if (sysctl_tcp_tw_recycle &&
				797	!tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
				798	struct inet_peer *peer = rt_get_peer(rt);
				799
				800	/* VJ's idea. We save last timestamp seen from
				801	* the destination in peer table, when entering state TIME-WAIT
				802	* and initialize rx_opt.ts_recent from it, when trying new connection.
				803	*/
				804
				805	if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
				806	tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
				807	tp->rx_opt.ts_recent = peer->tcp_ts;
				808	}
				809	}
				810
				811	inet->dport = usin->sin_port;
				812	inet->daddr = daddr;
				813
				814	tp->ext_header_len = 0;
				815	if (inet->opt)
				816	tp->ext_header_len = inet->opt->optlen;
				817
				818	tp->rx_opt.mss_clamp = 536;
				819
				820	/* Socket identity is still unknown (sport may be zero).
				821	* However we set state to SYN-SENT and not releasing socket
				822	* lock select source port, enter ourselves into the hash tables and
				823	* complete initialization after this.
				824	*/
				825	tcp_set_state(sk, TCP_SYN_SENT);
				826	err = tcp_v4_hash_connect(sk);
				827	if (err)
				828	goto failure;
				829
				830	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
				831	if (err)
				832	goto failure;
				833
				834	/* OK, now commit destination to socket. */
				835	__sk_dst_set(sk, &rt->u.dst);
				836	tcp_v4_setup_caps(sk, &rt->u.dst);
				837
				838	if (!tp->write_seq)
				839	tp->write_seq = secure_tcp_sequence_number(inet->saddr,
				840	inet->daddr,
				841	inet->sport,
				842	usin->sin_port);
				843
				844	inet->id = tp->write_seq ^ jiffies;
				845
				846	err = tcp_connect(sk);
				847	rt = NULL;
				848	if (err)
				849	goto failure;
				850
				851	return 0;
				852
				853	failure:
				854	/* This unhashes the socket and releases the local port, if necessary. */
				855	tcp_set_state(sk, TCP_CLOSE);
				856	ip_rt_put(rt);
				857	sk->sk_route_caps = 0;
				858	inet->dport = 0;
				859	return err;
				860	}
				861
				862	static __inline__ int tcp_v4_iif(struct sk_buff *skb)
				863	{
				864	return ((struct rtable *)skb->dst)->rt_iif;
				865	}
				866
				867	static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
				868	{
				869	return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
				870	}
				871
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	872	static struct request_sock tcp_v4_search_req(struct tcp_sock tp,
				873	struct request_sock ***prevp,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	874	__u16 rport,
				875	__u32 raddr, __u32 laddr)
				876	{
Arnaldo Carvalho de Melo	2ad69c5	2005-06-18 22:48:55 -0700	[diff] [blame]	877	struct listen_sock *lopt = tp->accept_queue.listen_opt;
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	878	struct request_sock req, *prev;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	879
				880	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
				881	(req = *prev) != NULL;
				882	prev = &req->dl_next) {
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	883	const struct inet_request_sock *ireq = inet_rsk(req);
				884
				885	if (ireq->rmt_port == rport &&
				886	ireq->rmt_addr == raddr &&
				887	ireq->loc_addr == laddr &&
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	888	TCP_INET_FAMILY(req->rsk_ops->family)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	889	BUG_TRAP(!req->sk);
				890	*prevp = prev;
				891	break;
				892	}
				893	}
				894
				895	return req;
				896	}
				897
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	898	static void tcp_v4_synq_add(struct sock sk, struct request_sock req)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	899	{
				900	struct tcp_sock *tp = tcp_sk(sk);
Arnaldo Carvalho de Melo	2ad69c5	2005-06-18 22:48:55 -0700	[diff] [blame]	901	struct listen_sock *lopt = tp->accept_queue.listen_opt;
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	902	u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	903
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	904	reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	905	tcp_synq_added(sk);
				906	}
				907
				908
				909	/*
				910	* This routine does path mtu discovery as defined in RFC1191.
				911	*/
				912	static inline void do_pmtu_discovery(struct sock sk, struct iphdr iph,
				913	u32 mtu)
				914	{
				915	struct dst_entry *dst;
				916	struct inet_sock *inet = inet_sk(sk);
				917	struct tcp_sock *tp = tcp_sk(sk);
				918
				919	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
				920	* send out by Linux are always <576bytes so they should go through
				921	* unfragmented).
				922	*/
				923	if (sk->sk_state == TCP_LISTEN)
				924	return;
				925
				926	/* We don't check in the destentry if pmtu discovery is forbidden
				927	* on this route. We just assume that no packet_to_big packets
				928	* are send back when pmtu discovery is not active.
				929	* There is a small race when the user changes this flag in the
				930	* route, but I think that's acceptable.
				931	*/
				932	if ((dst = __sk_dst_check(sk, 0)) == NULL)
				933	return;
				934
				935	dst->ops->update_pmtu(dst, mtu);
				936
				937	/* Something is about to be wrong... Remember soft error
				938	* for the case, if this connection will not able to recover.
				939	*/
				940	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
				941	sk->sk_err_soft = EMSGSIZE;
				942
				943	mtu = dst_mtu(dst);
				944
				945	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
				946	tp->pmtu_cookie > mtu) {
				947	tcp_sync_mss(sk, mtu);
				948
				949	/* Resend the TCP packet because it's
				950	* clear that the old packet has been
				951	* dropped. This is the new "fast" path mtu
				952	* discovery.
				953	*/
				954	tcp_simple_retransmit(sk);
				955	} /* else let the usual retransmit timer handle it */
				956	}
				957
				958	/*
				959	* This routine is called by the ICMP module when it gets some
				960	* sort of error condition. If err < 0 then the socket should
				961	* be closed and the error returned to the user. If err > 0
				962	* it's just the icmp type << 8 \| icmp code. After adjustment
				963	* header points to the first 8 bytes of the tcp header. We need
				964	* to find the appropriate port.
				965	*
				966	* The locking strategy used here is very "optimistic". When
				967	* someone else accesses the socket the ICMP is just dropped
				968	* and for some paths there is no check at all.
				969	* A more general error queue to queue errors for later handling
				970	* is probably better.
				971	*
				972	*/
				973
				974	void tcp_v4_err(struct sk_buff *skb, u32 info)
				975	{
				976	struct iphdr iph = (struct iphdr )skb->data;
				977	struct tcphdr th = (struct tcphdr )(skb->data + (iph->ihl << 2));
				978	struct tcp_sock *tp;
				979	struct inet_sock *inet;
				980	int type = skb->h.icmph->type;
				981	int code = skb->h.icmph->code;
				982	struct sock *sk;
				983	__u32 seq;
				984	int err;
				985
				986	if (skb->len < (iph->ihl << 2) + 8) {
				987	ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
				988	return;
				989	}
				990
				991	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
				992	th->source, tcp_v4_iif(skb));
				993	if (!sk) {
				994	ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
				995	return;
				996	}
				997	if (sk->sk_state == TCP_TIME_WAIT) {
				998	tcp_tw_put((struct tcp_tw_bucket *)sk);
				999	return;
				1000	}
				1001
				1002	bh_lock_sock(sk);
				1003	/* If too many ICMPs get dropped on busy
				1004	* servers this needs to be solved differently.
				1005	*/
				1006	if (sock_owned_by_user(sk))
				1007	NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
				1008
				1009	if (sk->sk_state == TCP_CLOSE)
				1010	goto out;
				1011
				1012	tp = tcp_sk(sk);
				1013	seq = ntohl(th->seq);
				1014	if (sk->sk_state != TCP_LISTEN &&
				1015	!between(seq, tp->snd_una, tp->snd_nxt)) {
				1016	NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
				1017	goto out;
				1018	}
				1019
				1020	switch (type) {
				1021	case ICMP_SOURCE_QUENCH:
				1022	/* Just silently ignore these. */
				1023	goto out;
				1024	case ICMP_PARAMETERPROB:
				1025	err = EPROTO;
				1026	break;
				1027	case ICMP_DEST_UNREACH:
				1028	if (code > NR_ICMP_UNREACH)
				1029	goto out;
				1030
				1031	if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
				1032	if (!sock_owned_by_user(sk))
				1033	do_pmtu_discovery(sk, iph, info);
				1034	goto out;
				1035	}
				1036
				1037	err = icmp_err_convert[code].errno;
				1038	break;
				1039	case ICMP_TIME_EXCEEDED:
				1040	err = EHOSTUNREACH;
				1041	break;
				1042	default:
				1043	goto out;
				1044	}
				1045
				1046	switch (sk->sk_state) {
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1047	struct request_sock req, *prev;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1048	case TCP_LISTEN:
				1049	if (sock_owned_by_user(sk))
				1050	goto out;
				1051
				1052	req = tcp_v4_search_req(tp, &prev, th->dest,
				1053	iph->daddr, iph->saddr);
				1054	if (!req)
				1055	goto out;
				1056
				1057	/* ICMPs are not backlogged, hence we cannot get
				1058	an established socket here.
				1059	*/
				1060	BUG_TRAP(!req->sk);
				1061
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1062	if (seq != tcp_rsk(req)->snt_isn) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1063	NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
				1064	goto out;
				1065	}
				1066
				1067	/*
				1068	* Still in SYN_RECV, just remove it silently.
				1069	* There is no good way to pass the error to the newly
				1070	* created socket, and POSIX does not want network
				1071	* errors returned from accept().
				1072	*/
				1073	tcp_synq_drop(sk, req, prev);
				1074	goto out;
				1075
				1076	case TCP_SYN_SENT:
				1077	case TCP_SYN_RECV: /* Cannot happen.
				1078	It can f.e. if SYNs crossed.
				1079	*/
				1080	if (!sock_owned_by_user(sk)) {
				1081	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
				1082	sk->sk_err = err;
				1083
				1084	sk->sk_error_report(sk);
				1085
				1086	tcp_done(sk);
				1087	} else {
				1088	sk->sk_err_soft = err;
				1089	}
				1090	goto out;
				1091	}
				1092
				1093	/* If we've already connected we will keep trying
				1094	* until we time out, or the user gives up.
				1095	*
				1096	* rfc1122 4.2.3.9 allows to consider as hard errors
				1097	* only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
				1098	* but it is obsoleted by pmtu discovery).
				1099	*
				1100	* Note, that in modern internet, where routing is unreliable
				1101	* and in each dark corner broken firewalls sit, sending random
				1102	* errors ordered by their masters even this two messages finally lose
				1103	* their original sense (even Linux sends invalid PORT_UNREACHs)
				1104	*
				1105	* Now we are in compliance with RFCs.
				1106	* --ANK (980905)
				1107	*/
				1108
				1109	inet = inet_sk(sk);
				1110	if (!sock_owned_by_user(sk) && inet->recverr) {
				1111	sk->sk_err = err;
				1112	sk->sk_error_report(sk);
				1113	} else { /* Only an error on timeout */
				1114	sk->sk_err_soft = err;
				1115	}
				1116
				1117	out:
				1118	bh_unlock_sock(sk);
				1119	sock_put(sk);
				1120	}
				1121
				1122	/* This routine computes an IPv4 TCP checksum. */
				1123	void tcp_v4_send_check(struct sock sk, struct tcphdr th, int len,
				1124	struct sk_buff *skb)
				1125	{
				1126	struct inet_sock *inet = inet_sk(sk);
				1127
				1128	if (skb->ip_summed == CHECKSUM_HW) {
				1129	th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
				1130	skb->csum = offsetof(struct tcphdr, check);
				1131	} else {
				1132	th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
				1133	csum_partial((char *)th,
				1134	th->doff << 2,
				1135	skb->csum));
				1136	}
				1137	}
				1138
				1139	/*
				1140	* This routine will send an RST to the other tcp.
				1141	*
				1142	* Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
				1143	* for reset.
				1144	* Answer: if a packet caused RST, it is not for a socket
				1145	* existing in our system, if it is matched to a socket,
				1146	* it is just duplicate segment or bug in other side's TCP.
				1147	* So that we build reply only basing on parameters
				1148	* arrived with segment.
				1149	* Exception: precedence violation. We do not implement it in any case.
				1150	*/
				1151
				1152	static void tcp_v4_send_reset(struct sk_buff *skb)
				1153	{
				1154	struct tcphdr *th = skb->h.th;
				1155	struct tcphdr rth;
				1156	struct ip_reply_arg arg;
				1157
				1158	/* Never send a reset in response to a reset. */
				1159	if (th->rst)
				1160	return;
				1161
				1162	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
				1163	return;
				1164
				1165	/* Swap the send and the receive. */
				1166	memset(&rth, 0, sizeof(struct tcphdr));
				1167	rth.dest = th->source;
				1168	rth.source = th->dest;
				1169	rth.doff = sizeof(struct tcphdr) / 4;
				1170	rth.rst = 1;
				1171
				1172	if (th->ack) {
				1173	rth.seq = th->ack_seq;
				1174	} else {
				1175	rth.ack = 1;
				1176	rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				1177	skb->len - (th->doff << 2));
				1178	}
				1179
				1180	memset(&arg, 0, sizeof arg);
				1181	arg.iov[0].iov_base = (unsigned char *)&rth;
				1182	arg.iov[0].iov_len = sizeof rth;
				1183	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				1184	skb->nh.iph->saddr, /XXX/
				1185	sizeof(struct tcphdr), IPPROTO_TCP, 0);
				1186	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
				1187
				1188	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
				1189
				1190	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
				1191	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
				1192	}
				1193
				1194	/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
				1195	outside socket context is ugly, certainly. What can I do?
				1196	*/
				1197
				1198	static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
				1199	u32 win, u32 ts)
				1200	{
				1201	struct tcphdr *th = skb->h.th;
				1202	struct {
				1203	struct tcphdr th;
				1204	u32 tsopt[3];
				1205	} rep;
				1206	struct ip_reply_arg arg;
				1207
				1208	memset(&rep.th, 0, sizeof(struct tcphdr));
				1209	memset(&arg, 0, sizeof arg);
				1210
				1211	arg.iov[0].iov_base = (unsigned char *)&rep;
				1212	arg.iov[0].iov_len = sizeof(rep.th);
				1213	if (ts) {
				1214	rep.tsopt[0] = htonl((TCPOPT_NOP << 24) \| (TCPOPT_NOP << 16) \|
				1215	(TCPOPT_TIMESTAMP << 8) \|
				1216	TCPOLEN_TIMESTAMP);
				1217	rep.tsopt[1] = htonl(tcp_time_stamp);
				1218	rep.tsopt[2] = htonl(ts);
				1219	arg.iov[0].iov_len = sizeof(rep);
				1220	}
				1221
				1222	/* Swap the send and the receive. */
				1223	rep.th.dest = th->source;
				1224	rep.th.source = th->dest;
				1225	rep.th.doff = arg.iov[0].iov_len / 4;
				1226	rep.th.seq = htonl(seq);
				1227	rep.th.ack_seq = htonl(ack);
				1228	rep.th.ack = 1;
				1229	rep.th.window = htons(win);
				1230
				1231	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				1232	skb->nh.iph->saddr, /XXX/
				1233	arg.iov[0].iov_len, IPPROTO_TCP, 0);
				1234	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
				1235
				1236	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
				1237
				1238	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
				1239	}
				1240
				1241	static void tcp_v4_timewait_ack(struct sock sk, struct sk_buff skb)
				1242	{
				1243	struct tcp_tw_bucket tw = (struct tcp_tw_bucket )sk;
				1244
				1245	tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
				1246	tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
				1247
				1248	tcp_tw_put(tw);
				1249	}
				1250
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1251	static void tcp_v4_reqsk_send_ack(struct sk_buff skb, struct request_sock req)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1252	{
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1253	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1254	req->ts_recent);
				1255	}
				1256
				1257	static struct dst_entry* tcp_v4_route_req(struct sock *sk,
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1258	struct request_sock *req)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1259	{
				1260	struct rtable *rt;
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1261	const struct inet_request_sock *ireq = inet_rsk(req);
				1262	struct ip_options *opt = inet_rsk(req)->opt;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1263	struct flowi fl = { .oif = sk->sk_bound_dev_if,
				1264	.nl_u = { .ip4_u =
				1265	{ .daddr = ((opt && opt->srr) ?
				1266	opt->faddr :
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1267	ireq->rmt_addr),
				1268	.saddr = ireq->loc_addr,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1269	.tos = RT_CONN_FLAGS(sk) } },
				1270	.proto = IPPROTO_TCP,
				1271	.uli_u = { .ports =
				1272	{ .sport = inet_sk(sk)->sport,
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1273	.dport = ireq->rmt_port } } };
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1274
				1275	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
				1276	IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
				1277	return NULL;
				1278	}
				1279	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
				1280	ip_rt_put(rt);
				1281	IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
				1282	return NULL;
				1283	}
				1284	return &rt->u.dst;
				1285	}
				1286
				1287	/*
				1288	* Send a SYN-ACK after having received an ACK.
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1289	* This still operates on a request_sock only, not on a big
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1290	* socket.
				1291	*/
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1292	static int tcp_v4_send_synack(struct sock sk, struct request_sock req,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1293	struct dst_entry *dst)
				1294	{
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1295	const struct inet_request_sock *ireq = inet_rsk(req);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1296	int err = -1;
				1297	struct sk_buff * skb;
				1298
				1299	/* First, grab a route. */
				1300	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
				1301	goto out;
				1302
				1303	skb = tcp_make_synack(sk, dst, req);
				1304
				1305	if (skb) {
				1306	struct tcphdr *th = skb->h.th;
				1307
				1308	th->check = tcp_v4_check(th, skb->len,
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1309	ireq->loc_addr,
				1310	ireq->rmt_addr,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1311	csum_partial((char *)th, skb->len,
				1312	skb->csum));
				1313
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1314	err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
				1315	ireq->rmt_addr,
				1316	ireq->opt);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1317	if (err == NET_XMIT_CN)
				1318	err = 0;
				1319	}
				1320
				1321	out:
				1322	dst_release(dst);
				1323	return err;
				1324	}
				1325
				1326	/*
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1327	* IPv4 request_sock destructor.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1328	*/
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1329	static void tcp_v4_reqsk_destructor(struct request_sock *req)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1330	{
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1331	if (inet_rsk(req)->opt)
				1332	kfree(inet_rsk(req)->opt);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1333	}
				1334
				1335	static inline void syn_flood_warning(struct sk_buff *skb)
				1336	{
				1337	static unsigned long warntime;
				1338
				1339	if (time_after(jiffies, (warntime + HZ * 60))) {
				1340	warntime = jiffies;
				1341	printk(KERN_INFO
				1342	"possible SYN flooding on port %d. Sending cookies.\n",
				1343	ntohs(skb->h.th->dest));
				1344	}
				1345	}
				1346
				1347	/*
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1348	* Save and compile IPv4 options into the request_sock if needed.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1349	*/
				1350	static inline struct ip_options tcp_v4_save_options(struct sock sk,
				1351	struct sk_buff *skb)
				1352	{
				1353	struct ip_options *opt = &(IPCB(skb)->opt);
				1354	struct ip_options *dopt = NULL;
				1355
				1356	if (opt && opt->optlen) {
				1357	int opt_size = optlength(opt);
				1358	dopt = kmalloc(opt_size, GFP_ATOMIC);
				1359	if (dopt) {
				1360	if (ip_options_echo(dopt, skb)) {
				1361	kfree(dopt);
				1362	dopt = NULL;
				1363	}
				1364	}
				1365	}
				1366	return dopt;
				1367	}
				1368
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1369	struct request_sock_ops tcp_request_sock_ops = {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1370	.family = PF_INET,
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1371	.obj_size = sizeof(struct tcp_request_sock),
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1372	.rtx_syn_ack = tcp_v4_send_synack,
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1373	.send_ack = tcp_v4_reqsk_send_ack,
				1374	.destructor = tcp_v4_reqsk_destructor,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1375	.send_reset = tcp_v4_send_reset,
				1376	};
				1377
				1378	int tcp_v4_conn_request(struct sock sk, struct sk_buff skb)
				1379	{
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1380	struct inet_request_sock *ireq;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1381	struct tcp_options_received tmp_opt;
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1382	struct request_sock *req;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1383	__u32 saddr = skb->nh.iph->saddr;
				1384	__u32 daddr = skb->nh.iph->daddr;
				1385	__u32 isn = TCP_SKB_CB(skb)->when;
				1386	struct dst_entry *dst = NULL;
				1387	#ifdef CONFIG_SYN_COOKIES
				1388	int want_cookie = 0;
				1389	#else
				1390	#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
				1391	#endif
				1392
				1393	/* Never answer to SYNs send to broadcast or multicast */
				1394	if (((struct rtable *)skb->dst)->rt_flags &
				1395	(RTCF_BROADCAST \| RTCF_MULTICAST))
				1396	goto drop;
				1397
				1398	/* TW buckets are converted to open requests without
				1399	* limitations, they conserve resources and peer is
				1400	* evidently real one.
				1401	*/
				1402	if (tcp_synq_is_full(sk) && !isn) {
				1403	#ifdef CONFIG_SYN_COOKIES
				1404	if (sysctl_tcp_syncookies) {
				1405	want_cookie = 1;
				1406	} else
				1407	#endif
				1408	goto drop;
				1409	}
				1410
				1411	/* Accept backlog is full. If we have already queued enough
				1412	* of warm entries in syn queue, drop request. It is better than
				1413	* clogging syn queue with openreqs with exponentially increasing
				1414	* timeout.
				1415	*/
				1416	if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
				1417	goto drop;
				1418
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1419	req = reqsk_alloc(&tcp_request_sock_ops);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1420	if (!req)
				1421	goto drop;
				1422
				1423	tcp_clear_options(&tmp_opt);
				1424	tmp_opt.mss_clamp = 536;
				1425	tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
				1426
				1427	tcp_parse_options(skb, &tmp_opt, 0);
				1428
				1429	if (want_cookie) {
				1430	tcp_clear_options(&tmp_opt);
				1431	tmp_opt.saw_tstamp = 0;
				1432	}
				1433
				1434	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
				1435	/* Some OSes (unknown ones, but I see them on web server, which
				1436	* contains information interesting only for windows'
				1437	* users) do not send their stamp in SYN. It is easy case.
				1438	* We simply do not advertise TS support.
				1439	*/
				1440	tmp_opt.saw_tstamp = 0;
				1441	tmp_opt.tstamp_ok = 0;
				1442	}
				1443	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
				1444
				1445	tcp_openreq_init(req, &tmp_opt, skb);
				1446
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1447	ireq = inet_rsk(req);
				1448	ireq->loc_addr = daddr;
				1449	ireq->rmt_addr = saddr;
				1450	ireq->opt = tcp_v4_save_options(sk, skb);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1451	if (!want_cookie)
				1452	TCP_ECN_create_request(req, skb->h.th);
				1453
				1454	if (want_cookie) {
				1455	#ifdef CONFIG_SYN_COOKIES
				1456	syn_flood_warning(skb);
				1457	#endif
				1458	isn = cookie_v4_init_sequence(sk, skb, &req->mss);
				1459	} else if (!isn) {
				1460	struct inet_peer *peer = NULL;
				1461
				1462	/* VJ's idea. We save last timestamp seen
				1463	* from the destination in peer table, when entering
				1464	* state TIME-WAIT, and check against it before
				1465	* accepting new connection request.
				1466	*
				1467	* If "isn" is not zero, this request hit alive
				1468	* timewait bucket, so that all the necessary checks
				1469	* are made in the function processing timewait state.
				1470	*/
				1471	if (tmp_opt.saw_tstamp &&
				1472	sysctl_tcp_tw_recycle &&
				1473	(dst = tcp_v4_route_req(sk, req)) != NULL &&
				1474	(peer = rt_get_peer((struct rtable *)dst)) != NULL &&
				1475	peer->v4daddr == saddr) {
				1476	if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
				1477	(s32)(peer->tcp_ts - req->ts_recent) >
				1478	TCP_PAWS_WINDOW) {
				1479	NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
				1480	dst_release(dst);
				1481	goto drop_and_free;
				1482	}
				1483	}
				1484	/* Kill the following clause, if you dislike this way. */
				1485	else if (!sysctl_tcp_syncookies &&
				1486	(sysctl_max_syn_backlog - tcp_synq_len(sk) <
				1487	(sysctl_max_syn_backlog >> 2)) &&
				1488	(!peer \|\| !peer->tcp_ts_stamp) &&
				1489	(!dst \|\| !dst_metric(dst, RTAX_RTT))) {
				1490	/* Without syncookies last quarter of
				1491	* backlog is filled with destinations,
				1492	* proven to be alive.
				1493	* It means that we continue to communicate
				1494	* to destinations, already remembered
				1495	* to the moment of synflood.
				1496	*/
				1497	NETDEBUG(if (net_ratelimit()) \
				1498	printk(KERN_DEBUG "TCP: drop open "
				1499	"request from %u.%u."
				1500	"%u.%u/%u\n", \
				1501	NIPQUAD(saddr),
				1502	ntohs(skb->h.th->source)));
				1503	dst_release(dst);
				1504	goto drop_and_free;
				1505	}
				1506
				1507	isn = tcp_v4_init_sequence(sk, skb);
				1508	}
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1509	tcp_rsk(req)->snt_isn = isn;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1510
				1511	if (tcp_v4_send_synack(sk, req, dst))
				1512	goto drop_and_free;
				1513
				1514	if (want_cookie) {
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1515	reqsk_free(req);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1516	} else {
				1517	tcp_v4_synq_add(sk, req);
				1518	}
				1519	return 0;
				1520
				1521	drop_and_free:
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1522	reqsk_free(req);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1523	drop:
				1524	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
				1525	return 0;
				1526	}
				1527
				1528
				1529	/*
				1530	* The three way handshake has completed - we got a valid synack -
				1531	* now create the new socket.
				1532	*/
				1533	struct sock tcp_v4_syn_recv_sock(struct sock sk, struct sk_buff *skb,
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1534	struct request_sock *req,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1535	struct dst_entry *dst)
				1536	{
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1537	struct inet_request_sock *ireq;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1538	struct inet_sock *newinet;
				1539	struct tcp_sock *newtp;
				1540	struct sock *newsk;
				1541
				1542	if (sk_acceptq_is_full(sk))
				1543	goto exit_overflow;
				1544
				1545	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
				1546	goto exit;
				1547
				1548	newsk = tcp_create_openreq_child(sk, req, skb);
				1549	if (!newsk)
				1550	goto exit;
				1551
				1552	newsk->sk_dst_cache = dst;
				1553	tcp_v4_setup_caps(newsk, dst);
				1554
				1555	newtp = tcp_sk(newsk);
				1556	newinet = inet_sk(newsk);
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1557	ireq = inet_rsk(req);
				1558	newinet->daddr = ireq->rmt_addr;
				1559	newinet->rcv_saddr = ireq->loc_addr;
				1560	newinet->saddr = ireq->loc_addr;
				1561	newinet->opt = ireq->opt;
				1562	ireq->opt = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1563	newinet->mc_index = tcp_v4_iif(skb);
				1564	newinet->mc_ttl = skb->nh.iph->ttl;
				1565	newtp->ext_header_len = 0;
				1566	if (newinet->opt)
				1567	newtp->ext_header_len = newinet->opt->optlen;
				1568	newinet->id = newtp->write_seq ^ jiffies;
				1569
				1570	tcp_sync_mss(newsk, dst_mtu(dst));
				1571	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
				1572	tcp_initialize_rcv_mss(newsk);
				1573
				1574	__tcp_v4_hash(newsk, 0);
				1575	__tcp_inherit_port(sk, newsk);
				1576
				1577	return newsk;
				1578
				1579	exit_overflow:
				1580	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
				1581	exit:
				1582	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
				1583	dst_release(dst);
				1584	return NULL;
				1585	}
				1586
				1587	static struct sock tcp_v4_hnd_req(struct sock sk, struct sk_buff *skb)
				1588	{
				1589	struct tcphdr *th = skb->h.th;
				1590	struct iphdr *iph = skb->nh.iph;
				1591	struct tcp_sock *tp = tcp_sk(sk);
				1592	struct sock *nsk;
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1593	struct request_sock **prev;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1594	/* Find possible connection requests. */
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	1595	struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1596	iph->saddr, iph->daddr);
				1597	if (req)
				1598	return tcp_check_req(sk, skb, req, prev);
				1599
				1600	nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
				1601	th->source,
				1602	skb->nh.iph->daddr,
				1603	ntohs(th->dest),
				1604	tcp_v4_iif(skb));
				1605
				1606	if (nsk) {
				1607	if (nsk->sk_state != TCP_TIME_WAIT) {
				1608	bh_lock_sock(nsk);
				1609	return nsk;
				1610	}
				1611	tcp_tw_put((struct tcp_tw_bucket *)nsk);
				1612	return NULL;
				1613	}
				1614
				1615	#ifdef CONFIG_SYN_COOKIES
				1616	if (!th->rst && !th->syn && th->ack)
				1617	sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
				1618	#endif
				1619	return sk;
				1620	}
				1621
				1622	static int tcp_v4_checksum_init(struct sk_buff *skb)
				1623	{
				1624	if (skb->ip_summed == CHECKSUM_HW) {
				1625	skb->ip_summed = CHECKSUM_UNNECESSARY;
				1626	if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				1627	skb->nh.iph->daddr, skb->csum))
				1628	return 0;
				1629
				1630	NETDEBUG(if (net_ratelimit())
				1631	printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
				1632	skb->ip_summed = CHECKSUM_NONE;
				1633	}
				1634	if (skb->len <= 76) {
				1635	if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				1636	skb->nh.iph->daddr,
				1637	skb_checksum(skb, 0, skb->len, 0)))
				1638	return -1;
				1639	skb->ip_summed = CHECKSUM_UNNECESSARY;
				1640	} else {
				1641	skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
				1642	skb->nh.iph->saddr,
				1643	skb->nh.iph->daddr, 0);
				1644	}
				1645	return 0;
				1646	}
				1647
				1648
				1649	/* The socket must have it's spinlock held when we get
				1650	* here.
				1651	*
				1652	* We have a potential double-lock case here, so even when
				1653	* doing backlog processing we use the BH locking scheme.
				1654	* This is because we cannot sleep with the original spinlock
				1655	* held.
				1656	*/
				1657	int tcp_v4_do_rcv(struct sock sk, struct sk_buff skb)
				1658	{
				1659	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
				1660	TCP_CHECK_TIMER(sk);
				1661	if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
				1662	goto reset;
				1663	TCP_CHECK_TIMER(sk);
				1664	return 0;
				1665	}
				1666
				1667	if (skb->len < (skb->h.th->doff << 2) \|\| tcp_checksum_complete(skb))
				1668	goto csum_err;
				1669
				1670	if (sk->sk_state == TCP_LISTEN) {
				1671	struct sock *nsk = tcp_v4_hnd_req(sk, skb);
				1672	if (!nsk)
				1673	goto discard;
				1674
				1675	if (nsk != sk) {
				1676	if (tcp_child_process(sk, nsk, skb))
				1677	goto reset;
				1678	return 0;
				1679	}
				1680	}
				1681
				1682	TCP_CHECK_TIMER(sk);
				1683	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
				1684	goto reset;
				1685	TCP_CHECK_TIMER(sk);
				1686	return 0;
				1687
				1688	reset:
				1689	tcp_v4_send_reset(skb);
				1690	discard:
				1691	kfree_skb(skb);
				1692	/* Be careful here. If this function gets more complicated and
				1693	* gcc suffers from register pressure on the x86, sk (in %ebx)
				1694	* might be destroyed here. This current version compiles correctly,
				1695	* but you have been warned.
				1696	*/
				1697	return 0;
				1698
				1699	csum_err:
				1700	TCP_INC_STATS_BH(TCP_MIB_INERRS);
				1701	goto discard;
				1702	}
				1703
				1704	/*
				1705	* From tcp_input.c
				1706	*/
				1707
				1708	int tcp_v4_rcv(struct sk_buff *skb)
				1709	{
				1710	struct tcphdr *th;
				1711	struct sock *sk;
				1712	int ret;
				1713
				1714	if (skb->pkt_type != PACKET_HOST)
				1715	goto discard_it;
				1716
				1717	/* Count it even if it's bad */
				1718	TCP_INC_STATS_BH(TCP_MIB_INSEGS);
				1719
				1720	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
				1721	goto discard_it;
				1722
				1723	th = skb->h.th;
				1724
				1725	if (th->doff < sizeof(struct tcphdr) / 4)
				1726	goto bad_packet;
				1727	if (!pskb_may_pull(skb, th->doff * 4))
				1728	goto discard_it;
				1729
				1730	/* An explanation is required here, I think.
				1731	* Packet length and doff are validated by header prediction,
				1732	* provided case of th->doff==0 is elimineted.
				1733	* So, we defer the checks. */
				1734	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
				1735	tcp_v4_checksum_init(skb) < 0))
				1736	goto bad_packet;
				1737
				1738	th = skb->h.th;
				1739	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
				1740	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				1741	skb->len - th->doff * 4);
				1742	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
				1743	TCP_SKB_CB(skb)->when = 0;
				1744	TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
				1745	TCP_SKB_CB(skb)->sacked = 0;
				1746
				1747	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
				1748	skb->nh.iph->daddr, ntohs(th->dest),
				1749	tcp_v4_iif(skb));
				1750
				1751	if (!sk)
				1752	goto no_tcp_socket;
				1753
				1754	process:
				1755	if (sk->sk_state == TCP_TIME_WAIT)
				1756	goto do_time_wait;
				1757
				1758	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
				1759	goto discard_and_relse;
				1760
				1761	if (sk_filter(sk, skb, 0))
				1762	goto discard_and_relse;
				1763
				1764	skb->dev = NULL;
				1765
				1766	bh_lock_sock(sk);
				1767	ret = 0;
				1768	if (!sock_owned_by_user(sk)) {
				1769	if (!tcp_prequeue(sk, skb))
				1770	ret = tcp_v4_do_rcv(sk, skb);
				1771	} else
				1772	sk_add_backlog(sk, skb);
				1773	bh_unlock_sock(sk);
				1774
				1775	sock_put(sk);
				1776
				1777	return ret;
				1778
				1779	no_tcp_socket:
				1780	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
				1781	goto discard_it;
				1782
				1783	if (skb->len < (th->doff << 2) \|\| tcp_checksum_complete(skb)) {
				1784	bad_packet:
				1785	TCP_INC_STATS_BH(TCP_MIB_INERRS);
				1786	} else {
				1787	tcp_v4_send_reset(skb);
				1788	}
				1789
				1790	discard_it:
				1791	/* Discard frame. */
				1792	kfree_skb(skb);
				1793	return 0;
				1794
				1795	discard_and_relse:
				1796	sock_put(sk);
				1797	goto discard_it;
				1798
				1799	do_time_wait:
				1800	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
				1801	tcp_tw_put((struct tcp_tw_bucket *) sk);
				1802	goto discard_it;
				1803	}
				1804
				1805	if (skb->len < (th->doff << 2) \|\| tcp_checksum_complete(skb)) {
				1806	TCP_INC_STATS_BH(TCP_MIB_INERRS);
				1807	tcp_tw_put((struct tcp_tw_bucket *) sk);
				1808	goto discard_it;
				1809	}
				1810	switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
				1811	skb, th, skb->len)) {
				1812	case TCP_TW_SYN: {
				1813	struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
				1814	ntohs(th->dest),
				1815	tcp_v4_iif(skb));
				1816	if (sk2) {
				1817	tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
				1818	tcp_tw_put((struct tcp_tw_bucket *)sk);
				1819	sk = sk2;
				1820	goto process;
				1821	}
				1822	/* Fall through to ACK */
				1823	}
				1824	case TCP_TW_ACK:
				1825	tcp_v4_timewait_ack(sk, skb);
				1826	break;
				1827	case TCP_TW_RST:
				1828	goto no_tcp_socket;
				1829	case TCP_TW_SUCCESS:;
				1830	}
				1831	goto discard_it;
				1832	}
				1833
				1834	/* With per-bucket locks this operation is not-atomic, so that
				1835	* this version is not worse.
				1836	*/
				1837	static void __tcp_v4_rehash(struct sock *sk)
				1838	{
				1839	sk->sk_prot->unhash(sk);
				1840	sk->sk_prot->hash(sk);
				1841	}
				1842
				1843	static int tcp_v4_reselect_saddr(struct sock *sk)
				1844	{
				1845	struct inet_sock *inet = inet_sk(sk);
				1846	int err;
				1847	struct rtable *rt;
				1848	__u32 old_saddr = inet->saddr;
				1849	__u32 new_saddr;
				1850	__u32 daddr = inet->daddr;
				1851
				1852	if (inet->opt && inet->opt->srr)
				1853	daddr = inet->opt->faddr;
				1854
				1855	/* Query new route. */
				1856	err = ip_route_connect(&rt, daddr, 0,
				1857	RT_CONN_FLAGS(sk),
				1858	sk->sk_bound_dev_if,
				1859	IPPROTO_TCP,
				1860	inet->sport, inet->dport, sk);
				1861	if (err)
				1862	return err;
				1863
				1864	__sk_dst_set(sk, &rt->u.dst);
				1865	tcp_v4_setup_caps(sk, &rt->u.dst);
				1866
				1867	new_saddr = rt->rt_src;
				1868
				1869	if (new_saddr == old_saddr)
				1870	return 0;
				1871
				1872	if (sysctl_ip_dynaddr > 1) {
				1873	printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
				1874	"saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
				1875	NIPQUAD(old_saddr),
				1876	NIPQUAD(new_saddr));
				1877	}
				1878
				1879	inet->saddr = new_saddr;
				1880	inet->rcv_saddr = new_saddr;
				1881
				1882	/* XXX The only one ugly spot where we need to
				1883	* XXX really change the sockets identity after
				1884	* XXX it has entered the hashes. -DaveM
				1885	*
				1886	* Besides that, it does not check for connection
				1887	* uniqueness. Wait for troubles.
				1888	*/
				1889	__tcp_v4_rehash(sk);
				1890	return 0;
				1891	}
				1892
				1893	int tcp_v4_rebuild_header(struct sock *sk)
				1894	{
				1895	struct inet_sock *inet = inet_sk(sk);
				1896	struct rtable rt = (struct rtable )__sk_dst_check(sk, 0);
				1897	u32 daddr;
				1898	int err;
				1899
				1900	/* Route is OK, nothing to do. */
				1901	if (rt)
				1902	return 0;
				1903
				1904	/* Reroute. */
				1905	daddr = inet->daddr;
				1906	if (inet->opt && inet->opt->srr)
				1907	daddr = inet->opt->faddr;
				1908
				1909	{
				1910	struct flowi fl = { .oif = sk->sk_bound_dev_if,
				1911	.nl_u = { .ip4_u =
				1912	{ .daddr = daddr,
				1913	.saddr = inet->saddr,
				1914	.tos = RT_CONN_FLAGS(sk) } },
				1915	.proto = IPPROTO_TCP,
				1916	.uli_u = { .ports =
				1917	{ .sport = inet->sport,
				1918	.dport = inet->dport } } };
				1919
				1920	err = ip_route_output_flow(&rt, &fl, sk, 0);
				1921	}
				1922	if (!err) {
				1923	__sk_dst_set(sk, &rt->u.dst);
				1924	tcp_v4_setup_caps(sk, &rt->u.dst);
				1925	return 0;
				1926	}
				1927
				1928	/* Routing failed... */
				1929	sk->sk_route_caps = 0;
				1930
				1931	if (!sysctl_ip_dynaddr \|\|
				1932	sk->sk_state != TCP_SYN_SENT \|\|
				1933	(sk->sk_userlocks & SOCK_BINDADDR_LOCK) \|\|
				1934	(err = tcp_v4_reselect_saddr(sk)) != 0)
				1935	sk->sk_err_soft = -err;
				1936
				1937	return err;
				1938	}
				1939
				1940	static void v4_addr2sockaddr(struct sock sk, struct sockaddr uaddr)
				1941	{
				1942	struct sockaddr_in sin = (struct sockaddr_in ) uaddr;
				1943	struct inet_sock *inet = inet_sk(sk);
				1944
				1945	sin->sin_family = AF_INET;
				1946	sin->sin_addr.s_addr = inet->daddr;
				1947	sin->sin_port = inet->dport;
				1948	}
				1949
				1950	/* VJ's idea. Save last timestamp seen from this destination
				1951	* and hold it at least for normal timewait interval to use for duplicate
				1952	* segment detection in subsequent connections, before they enter synchronized
				1953	* state.
				1954	*/
				1955
				1956	int tcp_v4_remember_stamp(struct sock *sk)
				1957	{
				1958	struct inet_sock *inet = inet_sk(sk);
				1959	struct tcp_sock *tp = tcp_sk(sk);
				1960	struct rtable rt = (struct rtable )__sk_dst_get(sk);
				1961	struct inet_peer *peer = NULL;
				1962	int release_it = 0;
				1963
				1964	if (!rt \|\| rt->rt_dst != inet->daddr) {
				1965	peer = inet_getpeer(inet->daddr, 1);
				1966	release_it = 1;
				1967	} else {
				1968	if (!rt->peer)
				1969	rt_bind_peer(rt, 1);
				1970	peer = rt->peer;
				1971	}
				1972
				1973	if (peer) {
				1974	if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 \|\|
				1975	(peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
				1976	peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
				1977	peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
				1978	peer->tcp_ts = tp->rx_opt.ts_recent;
				1979	}
				1980	if (release_it)
				1981	inet_putpeer(peer);
				1982	return 1;
				1983	}
				1984
				1985	return 0;
				1986	}
				1987
				1988	int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
				1989	{
				1990	struct inet_peer *peer = NULL;
				1991
				1992	peer = inet_getpeer(tw->tw_daddr, 1);
				1993
				1994	if (peer) {
				1995	if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 \|\|
				1996	(peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
				1997	peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
				1998	peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
				1999	peer->tcp_ts = tw->tw_ts_recent;
				2000	}
				2001	inet_putpeer(peer);
				2002	return 1;
				2003	}
				2004
				2005	return 0;
				2006	}
				2007
				2008	struct tcp_func ipv4_specific = {
				2009	.queue_xmit = ip_queue_xmit,
				2010	.send_check = tcp_v4_send_check,
				2011	.rebuild_header = tcp_v4_rebuild_header,
				2012	.conn_request = tcp_v4_conn_request,
				2013	.syn_recv_sock = tcp_v4_syn_recv_sock,
				2014	.remember_stamp = tcp_v4_remember_stamp,
				2015	.net_header_len = sizeof(struct iphdr),
				2016	.setsockopt = ip_setsockopt,
				2017	.getsockopt = ip_getsockopt,
				2018	.addr2sockaddr = v4_addr2sockaddr,
				2019	.sockaddr_len = sizeof(struct sockaddr_in),
				2020	};
				2021
				2022	/* NOTE: A lot of things set to zero explicitly by call to
				2023	* sk_alloc() so need not be done here.
				2024	*/
				2025	static int tcp_v4_init_sock(struct sock *sk)
				2026	{
				2027	struct tcp_sock *tp = tcp_sk(sk);
				2028
				2029	skb_queue_head_init(&tp->out_of_order_queue);
				2030	tcp_init_xmit_timers(sk);
				2031	tcp_prequeue_init(tp);
				2032
				2033	tp->rto = TCP_TIMEOUT_INIT;
				2034	tp->mdev = TCP_TIMEOUT_INIT;
				2035
				2036	/* So many TCP implementations out there (incorrectly) count the
				2037	* initial SYN frame in their delayed-ACK and congestion control
				2038	* algorithms that we must have the following bandaid to talk
				2039	* efficiently to them. -DaveM
				2040	*/
				2041	tp->snd_cwnd = 2;
				2042
				2043	/* See draft-stevens-tcpca-spec-01 for discussion of the
				2044	* initialization of these values.
				2045	*/
				2046	tp->snd_ssthresh = 0x7fffffff; /* Infinity */
				2047	tp->snd_cwnd_clamp = ~0;
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame^]	2048	tp->mss_cache = 536;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2049
				2050	tp->reordering = sysctl_tcp_reordering;
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	2051	tp->ca_ops = &tcp_init_congestion_ops;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2052
				2053	sk->sk_state = TCP_CLOSE;
				2054
				2055	sk->sk_write_space = sk_stream_write_space;
				2056	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
				2057
				2058	tp->af_specific = &ipv4_specific;
				2059
				2060	sk->sk_sndbuf = sysctl_tcp_wmem[1];
				2061	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
				2062
				2063	atomic_inc(&tcp_sockets_allocated);
				2064
				2065	return 0;
				2066	}
				2067
				2068	int tcp_v4_destroy_sock(struct sock *sk)
				2069	{
				2070	struct tcp_sock *tp = tcp_sk(sk);
				2071
				2072	tcp_clear_xmit_timers(sk);
				2073
Stephen Hemminger	317a76f	2005-06-23 12:19:55 -0700	[diff] [blame]	2074	tcp_cleanup_congestion_control(tp);
				2075
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2076	/* Cleanup up the write buffer. */
				2077	sk_stream_writequeue_purge(sk);
				2078
				2079	/* Cleans up our, hopefully empty, out_of_order_queue. */
				2080	__skb_queue_purge(&tp->out_of_order_queue);
				2081
				2082	/* Clean prequeue, it must be empty really */
				2083	__skb_queue_purge(&tp->ucopy.prequeue);
				2084
				2085	/* Clean up a referenced TCP bind bucket. */
				2086	if (tp->bind_hash)
				2087	tcp_put_port(sk);
				2088
				2089	/*
				2090	* If sendmsg cached page exists, toss it.
				2091	*/
				2092	if (sk->sk_sndmsg_page) {
				2093	__free_page(sk->sk_sndmsg_page);
				2094	sk->sk_sndmsg_page = NULL;
				2095	}
				2096
				2097	atomic_dec(&tcp_sockets_allocated);
				2098
				2099	return 0;
				2100	}
				2101
				2102	EXPORT_SYMBOL(tcp_v4_destroy_sock);
				2103
				2104	#ifdef CONFIG_PROC_FS
				2105	/* Proc filesystem TCP sock list dumping. */
				2106
				2107	static inline struct tcp_tw_bucket tw_head(struct hlist_head head)
				2108	{
				2109	return hlist_empty(head) ? NULL :
				2110	list_entry(head->first, struct tcp_tw_bucket, tw_node);
				2111	}
				2112
				2113	static inline struct tcp_tw_bucket tw_next(struct tcp_tw_bucket tw)
				2114	{
				2115	return tw->tw_node.next ?
				2116	hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
				2117	}
				2118
				2119	static void listening_get_next(struct seq_file seq, void *cur)
				2120	{
				2121	struct tcp_sock *tp;
				2122	struct hlist_node *node;
				2123	struct sock *sk = cur;
				2124	struct tcp_iter_state* st = seq->private;
				2125
				2126	if (!sk) {
				2127	st->bucket = 0;
				2128	sk = sk_head(&tcp_listening_hash[0]);
				2129	goto get_sk;
				2130	}
				2131
				2132	++st->num;
				2133
				2134	if (st->state == TCP_SEQ_STATE_OPENREQ) {
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	2135	struct request_sock *req = cur;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2136
				2137	tp = tcp_sk(st->syn_wait_sk);
				2138	req = req->dl_next;
				2139	while (1) {
				2140	while (req) {
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	2141	if (req->rsk_ops->family == st->family) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2142	cur = req;
				2143	goto out;
				2144	}
				2145	req = req->dl_next;
				2146	}
				2147	if (++st->sbucket >= TCP_SYNQ_HSIZE)
				2148	break;
				2149	get_req:
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	2150	req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2151	}
				2152	sk = sk_next(st->syn_wait_sk);
				2153	st->state = TCP_SEQ_STATE_LISTENING;
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	2154	read_unlock_bh(&tp->accept_queue.syn_wait_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2155	} else {
				2156	tp = tcp_sk(sk);
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	2157	read_lock_bh(&tp->accept_queue.syn_wait_lock);
				2158	if (reqsk_queue_len(&tp->accept_queue))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2159	goto start_req;
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	2160	read_unlock_bh(&tp->accept_queue.syn_wait_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2161	sk = sk_next(sk);
				2162	}
				2163	get_sk:
				2164	sk_for_each_from(sk, node) {
				2165	if (sk->sk_family == st->family) {
				2166	cur = sk;
				2167	goto out;
				2168	}
				2169	tp = tcp_sk(sk);
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	2170	read_lock_bh(&tp->accept_queue.syn_wait_lock);
				2171	if (reqsk_queue_len(&tp->accept_queue)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2172	start_req:
				2173	st->uid = sock_i_uid(sk);
				2174	st->syn_wait_sk = sk;
				2175	st->state = TCP_SEQ_STATE_OPENREQ;
				2176	st->sbucket = 0;
				2177	goto get_req;
				2178	}
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	2179	read_unlock_bh(&tp->accept_queue.syn_wait_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2180	}
				2181	if (++st->bucket < TCP_LHTABLE_SIZE) {
				2182	sk = sk_head(&tcp_listening_hash[st->bucket]);
				2183	goto get_sk;
				2184	}
				2185	cur = NULL;
				2186	out:
				2187	return cur;
				2188	}
				2189
				2190	static void listening_get_idx(struct seq_file seq, loff_t *pos)
				2191	{
				2192	void *rc = listening_get_next(seq, NULL);
				2193
				2194	while (rc && *pos) {
				2195	rc = listening_get_next(seq, rc);
				2196	--*pos;
				2197	}
				2198	return rc;
				2199	}
				2200
				2201	static void established_get_first(struct seq_file seq)
				2202	{
				2203	struct tcp_iter_state* st = seq->private;
				2204	void *rc = NULL;
				2205
				2206	for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
				2207	struct sock *sk;
				2208	struct hlist_node *node;
				2209	struct tcp_tw_bucket *tw;
				2210
				2211	/* We can reschedule _before_ having picked the target: */
				2212	cond_resched_softirq();
				2213
				2214	read_lock(&tcp_ehash[st->bucket].lock);
				2215	sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
				2216	if (sk->sk_family != st->family) {
				2217	continue;
				2218	}
				2219	rc = sk;
				2220	goto out;
				2221	}
				2222	st->state = TCP_SEQ_STATE_TIME_WAIT;
				2223	tw_for_each(tw, node,
				2224	&tcp_ehash[st->bucket + tcp_ehash_size].chain) {
				2225	if (tw->tw_family != st->family) {
				2226	continue;
				2227	}
				2228	rc = tw;
				2229	goto out;
				2230	}
				2231	read_unlock(&tcp_ehash[st->bucket].lock);
				2232	st->state = TCP_SEQ_STATE_ESTABLISHED;
				2233	}
				2234	out:
				2235	return rc;
				2236	}
				2237
				2238	static void established_get_next(struct seq_file seq, void *cur)
				2239	{
				2240	struct sock *sk = cur;
				2241	struct tcp_tw_bucket *tw;
				2242	struct hlist_node *node;
				2243	struct tcp_iter_state* st = seq->private;
				2244
				2245	++st->num;
				2246
				2247	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
				2248	tw = cur;
				2249	tw = tw_next(tw);
				2250	get_tw:
				2251	while (tw && tw->tw_family != st->family) {
				2252	tw = tw_next(tw);
				2253	}
				2254	if (tw) {
				2255	cur = tw;
				2256	goto out;
				2257	}
				2258	read_unlock(&tcp_ehash[st->bucket].lock);
				2259	st->state = TCP_SEQ_STATE_ESTABLISHED;
				2260
				2261	/* We can reschedule between buckets: */
				2262	cond_resched_softirq();
				2263
				2264	if (++st->bucket < tcp_ehash_size) {
				2265	read_lock(&tcp_ehash[st->bucket].lock);
				2266	sk = sk_head(&tcp_ehash[st->bucket].chain);
				2267	} else {
				2268	cur = NULL;
				2269	goto out;
				2270	}
				2271	} else
				2272	sk = sk_next(sk);
				2273
				2274	sk_for_each_from(sk, node) {
				2275	if (sk->sk_family == st->family)
				2276	goto found;
				2277	}
				2278
				2279	st->state = TCP_SEQ_STATE_TIME_WAIT;
				2280	tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
				2281	goto get_tw;
				2282	found:
				2283	cur = sk;
				2284	out:
				2285	return cur;
				2286	}
				2287
				2288	static void established_get_idx(struct seq_file seq, loff_t pos)
				2289	{
				2290	void *rc = established_get_first(seq);
				2291
				2292	while (rc && pos) {
				2293	rc = established_get_next(seq, rc);
				2294	--pos;
				2295	}
				2296	return rc;
				2297	}
				2298
				2299	static void tcp_get_idx(struct seq_file seq, loff_t pos)
				2300	{
				2301	void *rc;
				2302	struct tcp_iter_state* st = seq->private;
				2303
				2304	tcp_listen_lock();
				2305	st->state = TCP_SEQ_STATE_LISTENING;
				2306	rc = listening_get_idx(seq, &pos);
				2307
				2308	if (!rc) {
				2309	tcp_listen_unlock();
				2310	local_bh_disable();
				2311	st->state = TCP_SEQ_STATE_ESTABLISHED;
				2312	rc = established_get_idx(seq, pos);
				2313	}
				2314
				2315	return rc;
				2316	}
				2317
				2318	static void tcp_seq_start(struct seq_file seq, loff_t *pos)
				2319	{
				2320	struct tcp_iter_state* st = seq->private;
				2321	st->state = TCP_SEQ_STATE_LISTENING;
				2322	st->num = 0;
				2323	return pos ? tcp_get_idx(seq, pos - 1) : SEQ_START_TOKEN;
				2324	}
				2325
				2326	static void tcp_seq_next(struct seq_file seq, void v, loff_t pos)
				2327	{
				2328	void *rc = NULL;
				2329	struct tcp_iter_state* st;
				2330
				2331	if (v == SEQ_START_TOKEN) {
				2332	rc = tcp_get_idx(seq, 0);
				2333	goto out;
				2334	}
				2335	st = seq->private;
				2336
				2337	switch (st->state) {
				2338	case TCP_SEQ_STATE_OPENREQ:
				2339	case TCP_SEQ_STATE_LISTENING:
				2340	rc = listening_get_next(seq, v);
				2341	if (!rc) {
				2342	tcp_listen_unlock();
				2343	local_bh_disable();
				2344	st->state = TCP_SEQ_STATE_ESTABLISHED;
				2345	rc = established_get_first(seq);
				2346	}
				2347	break;
				2348	case TCP_SEQ_STATE_ESTABLISHED:
				2349	case TCP_SEQ_STATE_TIME_WAIT:
				2350	rc = established_get_next(seq, v);
				2351	break;
				2352	}
				2353	out:
				2354	++*pos;
				2355	return rc;
				2356	}
				2357
				2358	static void tcp_seq_stop(struct seq_file seq, void v)
				2359	{
				2360	struct tcp_iter_state* st = seq->private;
				2361
				2362	switch (st->state) {
				2363	case TCP_SEQ_STATE_OPENREQ:
				2364	if (v) {
				2365	struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
Arnaldo Carvalho de Melo	0e87506	2005-06-18 22:47:59 -0700	[diff] [blame]	2366	read_unlock_bh(&tp->accept_queue.syn_wait_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2367	}
				2368	case TCP_SEQ_STATE_LISTENING:
				2369	if (v != SEQ_START_TOKEN)
				2370	tcp_listen_unlock();
				2371	break;
				2372	case TCP_SEQ_STATE_TIME_WAIT:
				2373	case TCP_SEQ_STATE_ESTABLISHED:
				2374	if (v)
				2375	read_unlock(&tcp_ehash[st->bucket].lock);
				2376	local_bh_enable();
				2377	break;
				2378	}
				2379	}
				2380
				2381	static int tcp_seq_open(struct inode inode, struct file file)
				2382	{
				2383	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
				2384	struct seq_file *seq;
				2385	struct tcp_iter_state *s;
				2386	int rc;
				2387
				2388	if (unlikely(afinfo == NULL))
				2389	return -EINVAL;
				2390
				2391	s = kmalloc(sizeof(*s), GFP_KERNEL);
				2392	if (!s)
				2393	return -ENOMEM;
				2394	memset(s, 0, sizeof(*s));
				2395	s->family = afinfo->family;
				2396	s->seq_ops.start = tcp_seq_start;
				2397	s->seq_ops.next = tcp_seq_next;
				2398	s->seq_ops.show = afinfo->seq_show;
				2399	s->seq_ops.stop = tcp_seq_stop;
				2400
				2401	rc = seq_open(file, &s->seq_ops);
				2402	if (rc)
				2403	goto out_kfree;
				2404	seq = file->private_data;
				2405	seq->private = s;
				2406	out:
				2407	return rc;
				2408	out_kfree:
				2409	kfree(s);
				2410	goto out;
				2411	}
				2412
				2413	int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
				2414	{
				2415	int rc = 0;
				2416	struct proc_dir_entry *p;
				2417
				2418	if (!afinfo)
				2419	return -EINVAL;
				2420	afinfo->seq_fops->owner = afinfo->owner;
				2421	afinfo->seq_fops->open = tcp_seq_open;
				2422	afinfo->seq_fops->read = seq_read;
				2423	afinfo->seq_fops->llseek = seq_lseek;
				2424	afinfo->seq_fops->release = seq_release_private;
				2425
				2426	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
				2427	if (p)
				2428	p->data = afinfo;
				2429	else
				2430	rc = -ENOMEM;
				2431	return rc;
				2432	}
				2433
				2434	void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
				2435	{
				2436	if (!afinfo)
				2437	return;
				2438	proc_net_remove(afinfo->name);
				2439	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
				2440	}
				2441
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	2442	static void get_openreq4(struct sock sk, struct request_sock req,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2443	char *tmpbuf, int i, int uid)
				2444	{
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	2445	const struct inet_request_sock *ireq = inet_rsk(req);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2446	int ttd = req->expires - jiffies;
				2447
				2448	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
				2449	" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
				2450	i,
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	2451	ireq->loc_addr,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2452	ntohs(inet_sk(sk)->sport),
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	2453	ireq->rmt_addr,
				2454	ntohs(ireq->rmt_port),
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2455	TCP_SYN_RECV,
				2456	0, 0, /* could print option size, but that is af dependent. */
				2457	1, /* timers active (only the expire timer) */
				2458	jiffies_to_clock_t(ttd),
				2459	req->retrans,
				2460	uid,
				2461	0, /* non standard timer */
				2462	0, /* open_requests have no inode */
				2463	atomic_read(&sk->sk_refcnt),
				2464	req);
				2465	}
				2466
				2467	static void get_tcp4_sock(struct sock sp, char tmpbuf, int i)
				2468	{
				2469	int timer_active;
				2470	unsigned long timer_expires;
				2471	struct tcp_sock *tp = tcp_sk(sp);
				2472	struct inet_sock *inet = inet_sk(sp);
				2473	unsigned int dest = inet->daddr;
				2474	unsigned int src = inet->rcv_saddr;
				2475	__u16 destp = ntohs(inet->dport);
				2476	__u16 srcp = ntohs(inet->sport);
				2477
				2478	if (tp->pending == TCP_TIME_RETRANS) {
				2479	timer_active = 1;
				2480	timer_expires = tp->timeout;
				2481	} else if (tp->pending == TCP_TIME_PROBE0) {
				2482	timer_active = 4;
				2483	timer_expires = tp->timeout;
				2484	} else if (timer_pending(&sp->sk_timer)) {
				2485	timer_active = 2;
				2486	timer_expires = sp->sk_timer.expires;
				2487	} else {
				2488	timer_active = 0;
				2489	timer_expires = jiffies;
				2490	}
				2491
				2492	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
				2493	"%08X %5d %8d %lu %d %p %u %u %u %u %d",
				2494	i, src, srcp, dest, destp, sp->sk_state,
				2495	tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
				2496	timer_active,
				2497	jiffies_to_clock_t(timer_expires - jiffies),
				2498	tp->retransmits,
				2499	sock_i_uid(sp),
				2500	tp->probes_out,
				2501	sock_i_ino(sp),
				2502	atomic_read(&sp->sk_refcnt), sp,
				2503	tp->rto, tp->ack.ato, (tp->ack.quick << 1) \| tp->ack.pingpong,
				2504	tp->snd_cwnd,
				2505	tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
				2506	}
				2507
				2508	static void get_timewait4_sock(struct tcp_tw_bucket tw, char tmpbuf, int i)
				2509	{
				2510	unsigned int dest, src;
				2511	__u16 destp, srcp;
				2512	int ttd = tw->tw_ttd - jiffies;
				2513
				2514	if (ttd < 0)
				2515	ttd = 0;
				2516
				2517	dest = tw->tw_daddr;
				2518	src = tw->tw_rcv_saddr;
				2519	destp = ntohs(tw->tw_dport);
				2520	srcp = ntohs(tw->tw_sport);
				2521
				2522	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
				2523	" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
				2524	i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
				2525	3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
				2526	atomic_read(&tw->tw_refcnt), tw);
				2527	}
				2528
				2529	#define TMPSZ 150
				2530
				2531	static int tcp4_seq_show(struct seq_file seq, void v)
				2532	{
				2533	struct tcp_iter_state* st;
				2534	char tmpbuf[TMPSZ + 1];
				2535
				2536	if (v == SEQ_START_TOKEN) {
				2537	seq_printf(seq, "%-*s\n", TMPSZ - 1,
				2538	" sl local_address rem_address st tx_queue "
				2539	"rx_queue tr tm->when retrnsmt uid timeout "
				2540	"inode");
				2541	goto out;
				2542	}
				2543	st = seq->private;
				2544
				2545	switch (st->state) {
				2546	case TCP_SEQ_STATE_LISTENING:
				2547	case TCP_SEQ_STATE_ESTABLISHED:
				2548	get_tcp4_sock(v, tmpbuf, st->num);
				2549	break;
				2550	case TCP_SEQ_STATE_OPENREQ:
				2551	get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
				2552	break;
				2553	case TCP_SEQ_STATE_TIME_WAIT:
				2554	get_timewait4_sock(v, tmpbuf, st->num);
				2555	break;
				2556	}
				2557	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
				2558	out:
				2559	return 0;
				2560	}
				2561
				2562	static struct file_operations tcp4_seq_fops;
				2563	static struct tcp_seq_afinfo tcp4_seq_afinfo = {
				2564	.owner = THIS_MODULE,
				2565	.name = "tcp",
				2566	.family = AF_INET,
				2567	.seq_show = tcp4_seq_show,
				2568	.seq_fops = &tcp4_seq_fops,
				2569	};
				2570
				2571	int __init tcp4_proc_init(void)
				2572	{
				2573	return tcp_proc_register(&tcp4_seq_afinfo);
				2574	}
				2575
				2576	void tcp4_proc_exit(void)
				2577	{
				2578	tcp_proc_unregister(&tcp4_seq_afinfo);
				2579	}
				2580	#endif /* CONFIG_PROC_FS */
				2581
				2582	struct proto tcp_prot = {
				2583	.name = "TCP",
				2584	.owner = THIS_MODULE,
				2585	.close = tcp_close,
				2586	.connect = tcp_v4_connect,
				2587	.disconnect = tcp_disconnect,
				2588	.accept = tcp_accept,
				2589	.ioctl = tcp_ioctl,
				2590	.init = tcp_v4_init_sock,
				2591	.destroy = tcp_v4_destroy_sock,
				2592	.shutdown = tcp_shutdown,
				2593	.setsockopt = tcp_setsockopt,
				2594	.getsockopt = tcp_getsockopt,
				2595	.sendmsg = tcp_sendmsg,
				2596	.recvmsg = tcp_recvmsg,
				2597	.backlog_rcv = tcp_v4_do_rcv,
				2598	.hash = tcp_v4_hash,
				2599	.unhash = tcp_unhash,
				2600	.get_port = tcp_v4_get_port,
				2601	.enter_memory_pressure = tcp_enter_memory_pressure,
				2602	.sockets_allocated = &tcp_sockets_allocated,
				2603	.memory_allocated = &tcp_memory_allocated,
				2604	.memory_pressure = &tcp_memory_pressure,
				2605	.sysctl_mem = sysctl_tcp_mem,
				2606	.sysctl_wmem = sysctl_tcp_wmem,
				2607	.sysctl_rmem = sysctl_tcp_rmem,
				2608	.max_header = MAX_TCP_HEADER,
				2609	.obj_size = sizeof(struct tcp_sock),
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame]	2610	.rsk_prot = &tcp_request_sock_ops,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2611	};
				2612
				2613
				2614
				2615	void __init tcp_v4_init(struct net_proto_family *ops)
				2616	{
				2617	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
				2618	if (err < 0)
				2619	panic("Failed to create the TCP control socket.\n");
				2620	tcp_socket->sk->sk_allocation = GFP_ATOMIC;
				2621	inet_sk(tcp_socket->sk)->uc_ttl = -1;
				2622
				2623	/* Unhash it so that IP input processing does not even
				2624	* see it, we do not wish this socket to see incoming
				2625	* packets.
				2626	*/
				2627	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
				2628	}
				2629
				2630	EXPORT_SYMBOL(ipv4_specific);
				2631	EXPORT_SYMBOL(tcp_bind_hash);
				2632	EXPORT_SYMBOL(tcp_bucket_create);
				2633	EXPORT_SYMBOL(tcp_hashinfo);
				2634	EXPORT_SYMBOL(tcp_inherit_port);
				2635	EXPORT_SYMBOL(tcp_listen_wlock);
				2636	EXPORT_SYMBOL(tcp_port_rover);
				2637	EXPORT_SYMBOL(tcp_prot);
				2638	EXPORT_SYMBOL(tcp_put_port);
				2639	EXPORT_SYMBOL(tcp_unhash);
				2640	EXPORT_SYMBOL(tcp_v4_conn_request);
				2641	EXPORT_SYMBOL(tcp_v4_connect);
				2642	EXPORT_SYMBOL(tcp_v4_do_rcv);
				2643	EXPORT_SYMBOL(tcp_v4_rebuild_header);
				2644	EXPORT_SYMBOL(tcp_v4_remember_stamp);
				2645	EXPORT_SYMBOL(tcp_v4_send_check);
				2646	EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
				2647
				2648	#ifdef CONFIG_PROC_FS
				2649	EXPORT_SYMBOL(tcp_proc_register);
				2650	EXPORT_SYMBOL(tcp_proc_unregister);
				2651	#endif
				2652	EXPORT_SYMBOL(sysctl_local_port_range);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2653	EXPORT_SYMBOL(sysctl_tcp_low_latency);
				2654	EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
				2655