Blame - include/net/tcp.h - SHIFTPHONES/android_kernel_shift_sdm845

blob: 6663086a5e357ca60029b70d979cec19fe277e8f [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* INET An implementation of the TCP/IP protocol suite for the LINUX
				3	* operating system. INET is implemented using the BSD Socket
				4	* interface as the means of communication with the user level.
				5	*
				6	* Definitions for the TCP module.
				7	*
				8	* Version: @(#)tcp.h 1.0.5 05/23/93
				9	*
Jesper Juhl	02c30a8	2005-05-05 16:16:16 -0700	[diff] [blame]	10	* Authors: Ross Biro
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	11	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
				12	*
				13	* This program is free software; you can redistribute it and/or
				14	* modify it under the terms of the GNU General Public License
				15	* as published by the Free Software Foundation; either version
				16	* 2 of the License, or (at your option) any later version.
				17	*/
				18	#ifndef _TCP_H
				19	#define _TCP_H
				20
				21	#define TCP_DEBUG 1
				22	#define FASTRETRANS_DEBUG 1
				23
				24	/* Cancel timers, when they are not required. */
				25	#undef TCP_CLEAR_TIMERS
				26
				27	#include <linux/config.h>
				28	#include <linux/list.h>
				29	#include <linux/tcp.h>
				30	#include <linux/slab.h>
				31	#include <linux/cache.h>
				32	#include <linux/percpu.h>
				33	#include <net/checksum.h>
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	34	#include <net/request_sock.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	35	#include <net/sock.h>
				36	#include <net/snmp.h>
				37	#include <net/ip.h>
				38	#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
				39	#include <linux/ipv6.h>
				40	#endif
				41	#include <linux/seq_file.h>
				42
				43	/* This is for all connections with a full identity, no wildcards.
				44	* New scheme, half the table is for TIME_WAIT, the other half is
				45	* for the rest. I'll experiment with dynamic table growth later.
				46	*/
				47	struct tcp_ehash_bucket {
				48	rwlock_t lock;
				49	struct hlist_head chain;
				50	} __attribute__((__aligned__(8)));
				51
				52	/* This is for listening sockets, thus all sockets which possess wildcards. */
				53	#define TCP_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */
				54
				55	/* There are a few simple rules, which allow for local port reuse by
				56	* an application. In essence:
				57	*
				58	* 1) Sockets bound to different interfaces may share a local port.
				59	* Failing that, goto test 2.
				60	* 2) If all sockets have sk->sk_reuse set, and none of them are in
				61	* TCP_LISTEN state, the port may be shared.
				62	* Failing that, goto test 3.
				63	* 3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local
				64	* address, and none of them are the same, the port may be
				65	* shared.
				66	* Failing this, the port cannot be shared.
				67	*
				68	* The interesting point, is test #2. This is what an FTP server does
				69	* all day. To optimize this case we use a specific flag bit defined
				70	* below. As we add sockets to a bind bucket list, we perform a
				71	* check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN))
				72	* As long as all sockets added to a bind bucket pass this test,
				73	* the flag bit will be set.
				74	* The resulting situation is that tcp_v[46]_verify_bind() can just check
				75	* for this flag bit, if it is set and the socket trying to bind has
				76	* sk->sk_reuse set, we don't even have to walk the owners list at all,
				77	* we return that it is ok to bind this socket to the requested local port.
				78	*
				79	* Sounds like a lot of work, but it is worth it. In a more naive
				80	* implementation (ie. current FreeBSD etc.) the entire list of ports
				81	* must be walked for each data port opened by an ftp server. Needless
				82	* to say, this does not scale at all. With a couple thousand FTP
				83	* users logged onto your box, isn't it nice to know that new data
				84	* ports are created in O(1) time? I thought so. ;-) -DaveM
				85	*/
				86	struct tcp_bind_bucket {
				87	unsigned short port;
				88	signed short fastreuse;
				89	struct hlist_node node;
				90	struct hlist_head owners;
				91	};
				92
				93	#define tb_for_each(tb, node, head) hlist_for_each_entry(tb, node, head, node)
				94
				95	struct tcp_bind_hashbucket {
				96	spinlock_t lock;
				97	struct hlist_head chain;
				98	};
				99
				100	static inline struct tcp_bind_bucket *__tb_head(struct tcp_bind_hashbucket *head)
				101	{
				102	return hlist_entry(head->chain.first, struct tcp_bind_bucket, node);
				103	}
				104
				105	static inline struct tcp_bind_bucket *tb_head(struct tcp_bind_hashbucket *head)
				106	{
				107	return hlist_empty(&head->chain) ? NULL : __tb_head(head);
				108	}
				109
				110	extern struct tcp_hashinfo {
				111	/* This is for sockets with full identity only. Sockets here will
				112	* always be without wildcards and will have the following invariant:
				113	*
				114	* TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
				115	*
				116	* First half of the table is for sockets not in TIME_WAIT, second half
				117	* is for TIME_WAIT sockets only.
				118	*/
				119	struct tcp_ehash_bucket *__tcp_ehash;
				120
				121	/* Ok, let's try this, I give up, we do need a local binding
				122	* TCP hash as well as the others for fast bind/connect.
				123	*/
				124	struct tcp_bind_hashbucket *__tcp_bhash;
				125
				126	int __tcp_bhash_size;
				127	int __tcp_ehash_size;
				128
				129	/* All sockets in TCP_LISTEN state will be in here. This is the only
				130	* table where wildcard'd TCP sockets can exist. Hash function here
				131	* is just local port number.
				132	*/
				133	struct hlist_head __tcp_listening_hash[TCP_LHTABLE_SIZE];
				134
				135	/* All the above members are written once at bootup and
				136	* never written again _or_ are predominantly read-access.
				137	*
				138	* Now align to a new cache line as all the following members
				139	* are often dirty.
				140	*/
				141	rwlock_t __tcp_lhash_lock ____cacheline_aligned;
				142	atomic_t __tcp_lhash_users;
				143	wait_queue_head_t __tcp_lhash_wait;
				144	spinlock_t __tcp_portalloc_lock;
				145	} tcp_hashinfo;
				146
				147	#define tcp_ehash (tcp_hashinfo.__tcp_ehash)
				148	#define tcp_bhash (tcp_hashinfo.__tcp_bhash)
				149	#define tcp_ehash_size (tcp_hashinfo.__tcp_ehash_size)
				150	#define tcp_bhash_size (tcp_hashinfo.__tcp_bhash_size)
				151	#define tcp_listening_hash (tcp_hashinfo.__tcp_listening_hash)
				152	#define tcp_lhash_lock (tcp_hashinfo.__tcp_lhash_lock)
				153	#define tcp_lhash_users (tcp_hashinfo.__tcp_lhash_users)
				154	#define tcp_lhash_wait (tcp_hashinfo.__tcp_lhash_wait)
				155	#define tcp_portalloc_lock (tcp_hashinfo.__tcp_portalloc_lock)
				156
				157	extern kmem_cache_t *tcp_bucket_cachep;
				158	extern struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
				159	unsigned short snum);
				160	extern void tcp_bucket_destroy(struct tcp_bind_bucket *tb);
				161	extern void tcp_bucket_unlock(struct sock *sk);
				162	extern int tcp_port_rover;
				163
				164	/* These are AF independent. */
				165	static __inline__ int tcp_bhashfn(__u16 lport)
				166	{
				167	return (lport & (tcp_bhash_size - 1));
				168	}
				169
				170	extern void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
				171	unsigned short snum);
				172
				173	#if (BITS_PER_LONG == 64)
				174	#define TCP_ADDRCMP_ALIGN_BYTES 8
				175	#else
				176	#define TCP_ADDRCMP_ALIGN_BYTES 4
				177	#endif
				178
				179	/* This is a TIME_WAIT bucket. It works around the memory consumption
				180	* problems of sockets in such a state on heavily loaded servers, but
				181	* without violating the protocol specification.
				182	*/
				183	struct tcp_tw_bucket {
				184	/*
				185	* Now struct sock also uses sock_common, so please just
				186	* don't add nothing before this first member (__tw_common) --acme
				187	*/
				188	struct sock_common __tw_common;
				189	#define tw_family __tw_common.skc_family
				190	#define tw_state __tw_common.skc_state
				191	#define tw_reuse __tw_common.skc_reuse
				192	#define tw_bound_dev_if __tw_common.skc_bound_dev_if
				193	#define tw_node __tw_common.skc_node
				194	#define tw_bind_node __tw_common.skc_bind_node
				195	#define tw_refcnt __tw_common.skc_refcnt
				196	volatile unsigned char tw_substate;
				197	unsigned char tw_rcv_wscale;
				198	__u16 tw_sport;
				199	/* Socket demultiplex comparisons on incoming packets. */
				200	/* these five are in inet_sock */
				201	__u32 tw_daddr
				202	__attribute__((aligned(TCP_ADDRCMP_ALIGN_BYTES)));
				203	__u32 tw_rcv_saddr;
				204	__u16 tw_dport;
				205	__u16 tw_num;
				206	/* And these are ours. */
				207	int tw_hashent;
				208	int tw_timeout;
				209	__u32 tw_rcv_nxt;
				210	__u32 tw_snd_nxt;
				211	__u32 tw_rcv_wnd;
				212	__u32 tw_ts_recent;
				213	long tw_ts_recent_stamp;
				214	unsigned long tw_ttd;
				215	struct tcp_bind_bucket *tw_tb;
				216	struct hlist_node tw_death_node;
				217	#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
				218	struct in6_addr tw_v6_daddr;
				219	struct in6_addr tw_v6_rcv_saddr;
				220	int tw_v6_ipv6only;
				221	#endif
				222	};
				223
				224	static __inline__ void tw_add_node(struct tcp_tw_bucket *tw,
				225	struct hlist_head *list)
				226	{
				227	hlist_add_head(&tw->tw_node, list);
				228	}
				229
				230	static __inline__ void tw_add_bind_node(struct tcp_tw_bucket *tw,
				231	struct hlist_head *list)
				232	{
				233	hlist_add_head(&tw->tw_bind_node, list);
				234	}
				235
				236	static inline int tw_dead_hashed(struct tcp_tw_bucket *tw)
				237	{
				238	return tw->tw_death_node.pprev != NULL;
				239	}
				240
				241	static __inline__ void tw_dead_node_init(struct tcp_tw_bucket *tw)
				242	{
				243	tw->tw_death_node.pprev = NULL;
				244	}
				245
				246	static __inline__ void __tw_del_dead_node(struct tcp_tw_bucket *tw)
				247	{
				248	__hlist_del(&tw->tw_death_node);
				249	tw_dead_node_init(tw);
				250	}
				251
				252	static __inline__ int tw_del_dead_node(struct tcp_tw_bucket *tw)
				253	{
				254	if (tw_dead_hashed(tw)) {
				255	__tw_del_dead_node(tw);
				256	return 1;
				257	}
				258	return 0;
				259	}
				260
				261	#define tw_for_each(tw, node, head) \
				262	hlist_for_each_entry(tw, node, head, tw_node)
				263
				264	#define tw_for_each_inmate(tw, node, jail) \
				265	hlist_for_each_entry(tw, node, jail, tw_death_node)
				266
				267	#define tw_for_each_inmate_safe(tw, node, safe, jail) \
				268	hlist_for_each_entry_safe(tw, node, safe, jail, tw_death_node)
				269
				270	#define tcptw_sk(__sk) ((struct tcp_tw_bucket *)(__sk))
				271
				272	static inline u32 tcp_v4_rcv_saddr(const struct sock *sk)
				273	{
				274	return likely(sk->sk_state != TCP_TIME_WAIT) ?
				275	inet_sk(sk)->rcv_saddr : tcptw_sk(sk)->tw_rcv_saddr;
				276	}
				277
				278	#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
				279	static inline struct in6_addr *__tcp_v6_rcv_saddr(const struct sock *sk)
				280	{
				281	return likely(sk->sk_state != TCP_TIME_WAIT) ?
				282	&inet6_sk(sk)->rcv_saddr : &tcptw_sk(sk)->tw_v6_rcv_saddr;
				283	}
				284
				285	static inline struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk)
				286	{
				287	return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL;
				288	}
				289
				290	#define tcptw_sk_ipv6only(__sk) (tcptw_sk(__sk)->tw_v6_ipv6only)
				291
				292	static inline int tcp_v6_ipv6only(const struct sock *sk)
				293	{
				294	return likely(sk->sk_state != TCP_TIME_WAIT) ?
				295	ipv6_only_sock(sk) : tcptw_sk_ipv6only(sk);
				296	}
				297	#else
				298	# define __tcp_v6_rcv_saddr(__sk) NULL
				299	# define tcp_v6_rcv_saddr(__sk) NULL
				300	# define tcptw_sk_ipv6only(__sk) 0
				301	# define tcp_v6_ipv6only(__sk) 0
				302	#endif
				303
				304	extern kmem_cache_t *tcp_timewait_cachep;
				305
				306	static inline void tcp_tw_put(struct tcp_tw_bucket *tw)
				307	{
				308	if (atomic_dec_and_test(&tw->tw_refcnt)) {
				309	#ifdef INET_REFCNT_DEBUG
				310	printk(KERN_DEBUG "tw_bucket %p released\n", tw);
				311	#endif
				312	kmem_cache_free(tcp_timewait_cachep, tw);
				313	}
				314	}
				315
				316	extern atomic_t tcp_orphan_count;
				317	extern int tcp_tw_count;
				318	extern void tcp_time_wait(struct sock *sk, int state, int timeo);
				319	extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw);
				320
				321
				322	/* Socket demux engine toys. */
				323	#ifdef __BIG_ENDIAN
				324	#define TCP_COMBINED_PORTS(__sport, __dport) \
				325	(((__u32)(__sport)<<16) | (__u32)(__dport))
				326	#else /* __LITTLE_ENDIAN */
				327	#define TCP_COMBINED_PORTS(__sport, __dport) \
				328	(((__u32)(__dport)<<16) | (__u32)(__sport))
				329	#endif
				330
				331	#if (BITS_PER_LONG == 64)
				332	#ifdef __BIG_ENDIAN
				333	#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) \
				334	__u64 __name = (((__u64)(__saddr))<<32)|((__u64)(__daddr));
				335	#else /* __LITTLE_ENDIAN */
				336	#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) \
				337	__u64 __name = (((__u64)(__daddr))<<32)|((__u64)(__saddr));
				338	#endif /* __BIG_ENDIAN */
				339	#define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
				340	(((*((__u64 *)&(inet_sk(__sk)->daddr)))== (__cookie)) && \
				341	((*((__u32 *)&(inet_sk(__sk)->dport)))== (__ports)) && \
				342	(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
				343	#define TCP_IPV4_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
				344	(((*((__u64 *)&(tcptw_sk(__sk)->tw_daddr))) == (__cookie)) && \
				345	((*((__u32 *)&(tcptw_sk(__sk)->tw_dport))) == (__ports)) && \
				346	(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
				347	#else /* 32-bit arch */
				348	#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr)
				349	#define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
				350	((inet_sk(__sk)->daddr == (__saddr)) && \
				351	(inet_sk(__sk)->rcv_saddr == (__daddr)) && \
				352	((*((__u32 *)&(inet_sk(__sk)->dport)))== (__ports)) && \
				353	(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
				354	#define TCP_IPV4_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
				355	((tcptw_sk(__sk)->tw_daddr == (__saddr)) && \
				356	(tcptw_sk(__sk)->tw_rcv_saddr == (__daddr)) && \
				357	((*((__u32 *)&(tcptw_sk(__sk)->tw_dport))) == (__ports)) && \
				358	(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
				359	#endif /* 64-bit arch */
				360
				361	#define TCP_IPV6_MATCH(__sk, __saddr, __daddr, __ports, __dif) \
				362	(((*((__u32 *)&(inet_sk(__sk)->dport)))== (__ports)) && \
				363	((__sk)->sk_family == AF_INET6) && \
				364	ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr)) && \
				365	ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr)) && \
				366	(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
				367
				368	/* These can have wildcards, don't try too hard. */
				369	static __inline__ int tcp_lhashfn(unsigned short num)
				370	{
				371	return num & (TCP_LHTABLE_SIZE - 1);
				372	}
				373
				374	static __inline__ int tcp_sk_listen_hashfn(struct sock *sk)
				375	{
				376	return tcp_lhashfn(inet_sk(sk)->num);
				377	}
				378
				379	#define MAX_TCP_HEADER (128 + MAX_HEADER)
				380
				381	/*
				382	* Never offer a window over 32767 without using window scaling. Some
				383	* poor stacks do signed 16bit maths!
				384	*/
				385	#define MAX_TCP_WINDOW 32767U
				386
				387	/* Minimal accepted MSS. It is (60+60+8) - (20+20). */
				388	#define TCP_MIN_MSS 88U
				389
				390	/* Minimal RCV_MSS. */
				391	#define TCP_MIN_RCVMSS 536U
				392
				393	/* After receiving this amount of duplicate ACKs fast retransmit starts. */
				394	#define TCP_FASTRETRANS_THRESH 3
				395
				396	/* Maximal reordering. */
				397	#define TCP_MAX_REORDERING 127
				398
				399	/* Maximal number of ACKs sent quickly to accelerate slow-start. */
				400	#define TCP_MAX_QUICKACKS 16U
				401
				402	/* urg_data states */
				403	#define TCP_URG_VALID 0x0100
				404	#define TCP_URG_NOTYET 0x0200
				405	#define TCP_URG_READ 0x0400
				406
				407	#define TCP_RETR1 3 /*
				408	* This is how many retries it does before it
				409	* tries to figure out if the gateway is
				410	* down. Minimal RFC value is 3; it corresponds
				411	* to ~3sec-8min depending on RTO.
				412	*/
				413
				414	#define TCP_RETR2 15 /*
				415	* This should take at least
				416	* 90 minutes to time out.
				417	* RFC1122 says that the limit is 100 sec.
				418	* 15 is ~13-30min depending on RTO.
				419	*/
				420
				421	#define TCP_SYN_RETRIES 5 /* number of times to retry active opening a
				422	* connection: ~180sec is RFC minimum */
				423
				424	#define TCP_SYNACK_RETRIES 5 /* number of times to retry passive opening a
				425	* connection: ~180sec is RFC minimum */
				426
				427
				428	#define TCP_ORPHAN_RETRIES 7 /* number of times to retry on an orphaned
				429	* socket. 7 is ~50sec-16min.
				430	*/
				431
				432
				433	#define TCP_TIMEWAIT_LEN (60*HZ) /* how long to wait to destroy TIME-WAIT
				434	* state, about 60 seconds */
				435	#define TCP_FIN_TIMEOUT TCP_TIMEWAIT_LEN
				436	/* BSD style FIN_WAIT2 deadlock breaker.
				437	* It used to be 3min, new value is 60sec,
				438	* to combine FIN-WAIT-2 timeout with
				439	* TIME-WAIT timer.
				440	*/
				441
				442	#define TCP_DELACK_MAX ((unsigned)(HZ/5)) /* maximal time to delay before sending an ACK */
				443	#if HZ >= 100
				444	#define TCP_DELACK_MIN ((unsigned)(HZ/25)) /* minimal time to delay before sending an ACK */
				445	#define TCP_ATO_MIN ((unsigned)(HZ/25))
				446	#else
				447	#define TCP_DELACK_MIN 4U
				448	#define TCP_ATO_MIN 4U
				449	#endif
				450	#define TCP_RTO_MAX ((unsigned)(120*HZ))
				451	#define TCP_RTO_MIN ((unsigned)(HZ/5))
				452	#define TCP_TIMEOUT_INIT ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value */
				453
				454	#define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes
				455	* for local resources.
				456	*/
				457
				458	#define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */
				459	#define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */
				460	#define TCP_KEEPALIVE_INTVL (75*HZ)
				461
				462	#define MAX_TCP_KEEPIDLE 32767
				463	#define MAX_TCP_KEEPINTVL 32767
				464	#define MAX_TCP_KEEPCNT 127
				465	#define MAX_TCP_SYNCNT 127
				466
				467	#define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */
				468	#define TCP_SYNQ_HSIZE 512 /* Size of SYNACK hash table */
				469
				470	#define TCP_PAWS_24DAYS (60 * 60 * 24 * 24)
				471	#define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated
				472	* after this time. It should be equal
				473	* (or greater than) TCP_TIMEWAIT_LEN
				474	* to provide reliability equal to one
				475	* provided by timewait state.
				476	*/
				477	#define TCP_PAWS_WINDOW 1 /* Replay window for per-host
				478	* timestamps. It must be less than
				479	* minimal timewait lifetime.
				480	*/
				481
				482	#define TCP_TW_RECYCLE_SLOTS_LOG 5
				483	#define TCP_TW_RECYCLE_SLOTS (1<<TCP_TW_RECYCLE_SLOTS_LOG)
				484
				485	/* If time > 4sec, it is "slow" path, no recycling is required,
				486	so that we select tick to get range about 4 seconds.
				487	*/
				488
				489	#if HZ <= 16 || HZ > 4096
				490	# error Unsupported: HZ <= 16 or HZ > 4096
				491	#elif HZ <= 32
				492	# define TCP_TW_RECYCLE_TICK (5+2-TCP_TW_RECYCLE_SLOTS_LOG)
				493	#elif HZ <= 64
				494	# define TCP_TW_RECYCLE_TICK (6+2-TCP_TW_RECYCLE_SLOTS_LOG)
				495	#elif HZ <= 128
				496	# define TCP_TW_RECYCLE_TICK (7+2-TCP_TW_RECYCLE_SLOTS_LOG)
				497	#elif HZ <= 256
				498	# define TCP_TW_RECYCLE_TICK (8+2-TCP_TW_RECYCLE_SLOTS_LOG)
				499	#elif HZ <= 512
				500	# define TCP_TW_RECYCLE_TICK (9+2-TCP_TW_RECYCLE_SLOTS_LOG)
				501	#elif HZ <= 1024
				502	# define TCP_TW_RECYCLE_TICK (10+2-TCP_TW_RECYCLE_SLOTS_LOG)
				503	#elif HZ <= 2048
				504	# define TCP_TW_RECYCLE_TICK (11+2-TCP_TW_RECYCLE_SLOTS_LOG)
				505	#else
				506	# define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG)
				507	#endif
				508
				509	#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation
				510	* max_cwnd = snd_cwnd * beta
				511	*/
				512	#define BICTCP_MAX_INCREMENT 32 /*
				513	* Limit on the amount of
				514	* increment allowed during
				515	* binary search.
				516	*/
				517	#define BICTCP_FUNC_OF_MIN_INCR 11 /*
				518	* log(B/Smin)/log(B/(B-1))+1,
				519	* Smin:min increment
				520	* B:log factor
				521	*/
				522	#define BICTCP_B 4 /*
				523	* In binary search,
				524	* go to point (max+min)/N
				525	*/
				526
				527	/*
				528	* TCP option
				529	*/
				530
				531	#define TCPOPT_NOP 1 /* Padding */
				532	#define TCPOPT_EOL 0 /* End of options */
				533	#define TCPOPT_MSS 2 /* Segment size negotiating */
				534	#define TCPOPT_WINDOW 3 /* Window scaling */
				535	#define TCPOPT_SACK_PERM 4 /* SACK Permitted */
				536	#define TCPOPT_SACK 5 /* SACK Block */
				537	#define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */
				538
				539	/*
				540	* TCP option lengths
				541	*/
				542
				543	#define TCPOLEN_MSS 4
				544	#define TCPOLEN_WINDOW 3
				545	#define TCPOLEN_SACK_PERM 2
				546	#define TCPOLEN_TIMESTAMP 10
				547
				548	/* But this is what stacks really send out. */
				549	#define TCPOLEN_TSTAMP_ALIGNED 12
				550	#define TCPOLEN_WSCALE_ALIGNED 4
				551	#define TCPOLEN_SACKPERM_ALIGNED 4
				552	#define TCPOLEN_SACK_BASE 2
				553	#define TCPOLEN_SACK_BASE_ALIGNED 4
				554	#define TCPOLEN_SACK_PERBLOCK 8
				555
				556	#define TCP_TIME_RETRANS 1 /* Retransmit timer */
				557	#define TCP_TIME_DACK 2 /* Delayed ack timer */
				558	#define TCP_TIME_PROBE0 3 /* Zero window probe timer */
				559	#define TCP_TIME_KEEPOPEN 4 /* Keepalive timer */
				560
				561	/* Flags in tp->nonagle */
				562	#define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */
				563	#define TCP_NAGLE_CORK 2 /* Socket is corked */
				564	#define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */
				565
				566	/* sysctl variables for tcp */
				567	extern int sysctl_max_syn_backlog;
				568	extern int sysctl_tcp_timestamps;
				569	extern int sysctl_tcp_window_scaling;
				570	extern int sysctl_tcp_sack;
				571	extern int sysctl_tcp_fin_timeout;
				572	extern int sysctl_tcp_tw_recycle;
				573	extern int sysctl_tcp_keepalive_time;
				574	extern int sysctl_tcp_keepalive_probes;
				575	extern int sysctl_tcp_keepalive_intvl;
				576	extern int sysctl_tcp_syn_retries;
				577	extern int sysctl_tcp_synack_retries;
				578	extern int sysctl_tcp_retries1;
				579	extern int sysctl_tcp_retries2;
				580	extern int sysctl_tcp_orphan_retries;
				581	extern int sysctl_tcp_syncookies;
				582	extern int sysctl_tcp_retrans_collapse;
				583	extern int sysctl_tcp_stdurg;
				584	extern int sysctl_tcp_rfc1337;
				585	extern int sysctl_tcp_abort_on_overflow;
				586	extern int sysctl_tcp_max_orphans;
				587	extern int sysctl_tcp_max_tw_buckets;
				588	extern int sysctl_tcp_fack;
				589	extern int sysctl_tcp_reordering;
				590	extern int sysctl_tcp_ecn;
				591	extern int sysctl_tcp_dsack;
				592	extern int sysctl_tcp_mem[3];
				593	extern int sysctl_tcp_wmem[3];
				594	extern int sysctl_tcp_rmem[3];
				595	extern int sysctl_tcp_app_win;
				596	extern int sysctl_tcp_adv_win_scale;
				597	extern int sysctl_tcp_tw_reuse;
				598	extern int sysctl_tcp_frto;
				599	extern int sysctl_tcp_low_latency;
				600	extern int sysctl_tcp_westwood;
				601	extern int sysctl_tcp_vegas_cong_avoid;
				602	extern int sysctl_tcp_vegas_alpha;
				603	extern int sysctl_tcp_vegas_beta;
				604	extern int sysctl_tcp_vegas_gamma;
				605	extern int sysctl_tcp_nometrics_save;
				606	extern int sysctl_tcp_bic;
				607	extern int sysctl_tcp_bic_fast_convergence;
				608	extern int sysctl_tcp_bic_low_window;
				609	extern int sysctl_tcp_bic_beta;
				610	extern int sysctl_tcp_moderate_rcvbuf;
				611	extern int sysctl_tcp_tso_win_divisor;
				612
				613	extern atomic_t tcp_memory_allocated;
				614	extern atomic_t tcp_sockets_allocated;
				615	extern int tcp_memory_pressure;
				616
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	617	#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
				618	#define TCP_INET_FAMILY(fam) ((fam) == AF_INET)
				619	#else
				620	#define TCP_INET_FAMILY(fam) 1
				621	#endif
				622
				623	/*
				624	* Pointers to address related TCP functions
				625	* (i.e. things that depend on the address family)
				626	*/
				627
				628	struct tcp_func {
				629	int (*queue_xmit) (struct sk_buff *skb,
				630	int ipfragok);
				631
				632	void (*send_check) (struct sock *sk,
				633	struct tcphdr *th,
				634	int len,
				635	struct sk_buff *skb);
				636
				637	int (*rebuild_header) (struct sock *sk);
				638
				639	int (*conn_request) (struct sock *sk,
				640	struct sk_buff *skb);
				641
				642	struct sock * (*syn_recv_sock) (struct sock *sk,
				643	struct sk_buff *skb,
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame^]	644	struct request_sock *req,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	645	struct dst_entry *dst);
				646
				647	int (*remember_stamp) (struct sock *sk);
				648
				649	__u16 net_header_len;
				650
				651	int (*setsockopt) (struct sock *sk,
				652	int level,
				653	int optname,
				654	char __user *optval,
				655	int optlen);
				656
				657	int (*getsockopt) (struct sock *sk,
				658	int level,
				659	int optname,
				660	char __user *optval,
				661	int __user *optlen);
				662
				663
				664	void (*addr2sockaddr) (struct sock *sk,
				665	struct sockaddr *);
				666
				667	int sockaddr_len;
				668	};
				669
				670	/*
				671	* The next routines deal with comparing 32 bit unsigned ints
				672	* and worry about wraparound (automatic with unsigned arithmetic).
				673	*/
				674
				675	static inline int before(__u32 seq1, __u32 seq2)
				676	{
				677	return (__s32)(seq1-seq2) < 0;
				678	}
				679
				680	static inline int after(__u32 seq1, __u32 seq2)
				681	{
				682	return (__s32)(seq2-seq1) < 0;
				683	}
				684
				685
				686	/* is s2<=s1<=s3 ? */
				687	static inline int between(__u32 seq1, __u32 seq2, __u32 seq3)
				688	{
				689	return seq3 - seq2 >= seq1 - seq2;
				690	}
				691
				692
				693	extern struct proto tcp_prot;
				694
				695	DECLARE_SNMP_STAT(struct tcp_mib, tcp_statistics);
				696	#define TCP_INC_STATS(field) SNMP_INC_STATS(tcp_statistics, field)
				697	#define TCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(tcp_statistics, field)
				698	#define TCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(tcp_statistics, field)
				699	#define TCP_DEC_STATS(field) SNMP_DEC_STATS(tcp_statistics, field)
				700	#define TCP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(tcp_statistics, field, val)
				701	#define TCP_ADD_STATS_USER(field, val) SNMP_ADD_STATS_USER(tcp_statistics, field, val)
				702
				703	extern void tcp_put_port(struct sock *sk);
				704	extern void tcp_inherit_port(struct sock *sk, struct sock *child);
				705
				706	extern void tcp_v4_err(struct sk_buff *skb, u32);
				707
				708	extern void tcp_shutdown (struct sock *sk, int how);
				709
				710	extern int tcp_v4_rcv(struct sk_buff *skb);
				711
				712	extern int tcp_v4_remember_stamp(struct sock *sk);
				713
				714	extern int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw);
				715
				716	extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk,
				717	struct msghdr *msg, size_t size);
				718	extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
				719
				720	extern int tcp_ioctl(struct sock *sk,
				721	int cmd,
				722	unsigned long arg);
				723
				724	extern int tcp_rcv_state_process(struct sock *sk,
				725	struct sk_buff *skb,
				726	struct tcphdr *th,
				727	unsigned len);
				728
				729	extern int tcp_rcv_established(struct sock *sk,
				730	struct sk_buff *skb,
				731	struct tcphdr *th,
				732	unsigned len);
				733
				734	extern void tcp_rcv_space_adjust(struct sock *sk);
				735
				736	enum tcp_ack_state_t
				737	{
				738	TCP_ACK_SCHED = 1,
				739	TCP_ACK_TIMER = 2,
				740	TCP_ACK_PUSHED= 4
				741	};
				742
				743	static inline void tcp_schedule_ack(struct tcp_sock *tp)
				744	{
				745	tp->ack.pending |= TCP_ACK_SCHED;
				746	}
				747
				748	static inline int tcp_ack_scheduled(struct tcp_sock *tp)
				749	{
				750	return tp->ack.pending&TCP_ACK_SCHED;
				751	}
				752
				753	static __inline__ void tcp_dec_quickack_mode(struct tcp_sock *tp)
				754	{
				755	if (tp->ack.quick && --tp->ack.quick == 0) {
				756	/* Leaving quickack mode we deflate ATO. */
				757	tp->ack.ato = TCP_ATO_MIN;
				758	}
				759	}
				760
				761	extern void tcp_enter_quickack_mode(struct tcp_sock *tp);
				762
				763	static __inline__ void tcp_delack_init(struct tcp_sock *tp)
				764	{
				765	memset(&tp->ack, 0, sizeof(tp->ack));
				766	}
				767
				768	static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
				769	{
				770	rx_opt->tstamp_ok = rx_opt->sack_ok = rx_opt->wscale_ok = rx_opt->snd_wscale = 0;
				771	}
				772
				773	enum tcp_tw_status
				774	{
				775	TCP_TW_SUCCESS = 0,
				776	TCP_TW_RST = 1,
				777	TCP_TW_ACK = 2,
				778	TCP_TW_SYN = 3
				779	};
				780
				781
				782	extern enum tcp_tw_status tcp_timewait_state_process(struct tcp_tw_bucket *tw,
				783	struct sk_buff *skb,
				784	struct tcphdr *th,
				785	unsigned len);
				786
				787	extern struct sock * tcp_check_req(struct sock *sk,struct sk_buff *skb,
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame^]	788	struct request_sock *req,
				789	struct request_sock **prev);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	790	extern int tcp_child_process(struct sock *parent,
				791	struct sock *child,
				792	struct sk_buff *skb);
				793	extern void tcp_enter_frto(struct sock *sk);
				794	extern void tcp_enter_loss(struct sock *sk, int how);
				795	extern void tcp_clear_retrans(struct tcp_sock *tp);
				796	extern void tcp_update_metrics(struct sock *sk);
				797
				798	extern void tcp_close(struct sock *sk,
				799	long timeout);
				800	extern struct sock * tcp_accept(struct sock *sk, int flags, int *err);
				801	extern unsigned int tcp_poll(struct file * file, struct socket *sock, struct poll_table_struct *wait);
				802
				803	extern int tcp_getsockopt(struct sock *sk, int level,
				804	int optname,
				805	char __user *optval,
				806	int __user *optlen);
				807	extern int tcp_setsockopt(struct sock *sk, int level,
				808	int optname, char __user *optval,
				809	int optlen);
				810	extern void tcp_set_keepalive(struct sock *sk, int val);
				811	extern int tcp_recvmsg(struct kiocb *iocb, struct sock *sk,
				812	struct msghdr *msg,
				813	size_t len, int nonblock,
				814	int flags, int *addr_len);
				815
				816	extern int tcp_listen_start(struct sock *sk);
				817
				818	extern void tcp_parse_options(struct sk_buff *skb,
				819	struct tcp_options_received *opt_rx,
				820	int estab);
				821
				822	/*
				823	* TCP v4 functions exported for the inet6 API
				824	*/
				825
				826	extern int tcp_v4_rebuild_header(struct sock *sk);
				827
				828	extern int tcp_v4_build_header(struct sock *sk,
				829	struct sk_buff *skb);
				830
				831	extern void tcp_v4_send_check(struct sock *sk,
				832	struct tcphdr *th, int len,
				833	struct sk_buff *skb);
				834
				835	extern int tcp_v4_conn_request(struct sock *sk,
				836	struct sk_buff *skb);
				837
				838	extern struct sock * tcp_create_openreq_child(struct sock *sk,
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame^]	839	struct request_sock *req,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	840	struct sk_buff *skb);
				841
				842	extern struct sock * tcp_v4_syn_recv_sock(struct sock *sk,
				843	struct sk_buff *skb,
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame^]	844	struct request_sock *req,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	845	struct dst_entry *dst);
				846
				847	extern int tcp_v4_do_rcv(struct sock *sk,
				848	struct sk_buff *skb);
				849
				850	extern int tcp_v4_connect(struct sock *sk,
				851	struct sockaddr *uaddr,
				852	int addr_len);
				853
				854	extern int tcp_connect(struct sock *sk);
				855
				856	extern struct sk_buff * tcp_make_synack(struct sock *sk,
				857	struct dst_entry *dst,
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame^]	858	struct request_sock *req);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	859
				860	extern int tcp_disconnect(struct sock *sk, int flags);
				861
				862	extern void tcp_unhash(struct sock *sk);
				863
				864	extern int tcp_v4_hash_connecting(struct sock *sk);
				865
				866
				867	/* From syncookies.c */
				868	extern struct sock cookie_v4_check(struct sock sk, struct sk_buff *skb,
				869	struct ip_options *opt);
				870	extern __u32 cookie_v4_init_sequence(struct sock sk, struct sk_buff skb,
				871	__u16 *mss);
				872
				873	/* tcp_output.c */
				874
				875	extern int tcp_write_xmit(struct sock *, int nonagle);
				876	extern int tcp_retransmit_skb(struct sock , struct sk_buff );
				877	extern void tcp_xmit_retransmit_queue(struct sock *);
				878	extern void tcp_simple_retransmit(struct sock *);
				879	extern int tcp_trim_head(struct sock , struct sk_buff , u32);
				880
				881	extern void tcp_send_probe0(struct sock *);
				882	extern void tcp_send_partial(struct sock *);
				883	extern int tcp_write_wakeup(struct sock *);
				884	extern void tcp_send_fin(struct sock *sk);
				885	extern void tcp_send_active_reset(struct sock *sk, int priority);
				886	extern int tcp_send_synack(struct sock *);
				887	extern void tcp_push_one(struct sock *, unsigned mss_now);
				888	extern void tcp_send_ack(struct sock *sk);
				889	extern void tcp_send_delayed_ack(struct sock *sk);
				890
				891	/* tcp_timer.c */
				892	extern void tcp_init_xmit_timers(struct sock *);
				893	extern void tcp_clear_xmit_timers(struct sock *);
				894
				895	extern void tcp_delete_keepalive_timer(struct sock *);
				896	extern void tcp_reset_keepalive_timer(struct sock *, unsigned long);
				897	extern unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu);
				898	extern unsigned int tcp_current_mss(struct sock *sk, int large);
				899
				900	#ifdef TCP_DEBUG
				901	extern const char tcp_timer_bug_msg[];
				902	#endif
				903
				904	/* tcp_diag.c */
				905	extern void tcp_get_info(struct sock , struct tcp_info );
				906
				907	/* Read 'sendfile()'-style from a TCP socket */
				908	typedef int (sk_read_actor_t)(read_descriptor_t , struct sk_buff *,
				909	unsigned int, size_t);
				910	extern int tcp_read_sock(struct sock sk, read_descriptor_t desc,
				911	sk_read_actor_t recv_actor);
				912
				913	static inline void tcp_clear_xmit_timer(struct sock *sk, int what)
				914	{
				915	struct tcp_sock *tp = tcp_sk(sk);
				916
				917	switch (what) {
				918	case TCP_TIME_RETRANS:
				919	case TCP_TIME_PROBE0:
				920	tp->pending = 0;
				921
				922	#ifdef TCP_CLEAR_TIMERS
				923	sk_stop_timer(sk, &tp->retransmit_timer);
				924	#endif
				925	break;
				926	case TCP_TIME_DACK:
				927	tp->ack.blocked = 0;
				928	tp->ack.pending = 0;
				929
				930	#ifdef TCP_CLEAR_TIMERS
				931	sk_stop_timer(sk, &tp->delack_timer);
				932	#endif
				933	break;
				934	default:
				935	#ifdef TCP_DEBUG
				936	printk(tcp_timer_bug_msg);
				937	#endif
				938	return;
				939	};
				940
				941	}
				942
				943	/*
				944	* Reset the retransmission timer
				945	*/
				946	static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
				947	{
				948	struct tcp_sock *tp = tcp_sk(sk);
				949
				950	if (when > TCP_RTO_MAX) {
				951	#ifdef TCP_DEBUG
				952	printk(KERN_DEBUG "reset_xmit_timer sk=%p %d when=0x%lx, caller=%p\n", sk, what, when, current_text_addr());
				953	#endif
				954	when = TCP_RTO_MAX;
				955	}
				956
				957	switch (what) {
				958	case TCP_TIME_RETRANS:
				959	case TCP_TIME_PROBE0:
				960	tp->pending = what;
				961	tp->timeout = jiffies+when;
				962	sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout);
				963	break;
				964
				965	case TCP_TIME_DACK:
				966	tp->ack.pending \|= TCP_ACK_TIMER;
				967	tp->ack.timeout = jiffies+when;
				968	sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout);
				969	break;
				970
				971	default:
				972	#ifdef TCP_DEBUG
				973	printk(tcp_timer_bug_msg);
				974	#endif
				975	return;
				976	};
				977	}
				978
				979	/* Initialize RCV_MSS value.
				980	* RCV_MSS is an our guess about MSS used by the peer.
				981	* We haven't any direct information about the MSS.
				982	* It's better to underestimate the RCV_MSS rather than overestimate.
				983	* Overestimations make us ACKing less frequently than needed.
				984	* Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
				985	*/
				986
				987	static inline void tcp_initialize_rcv_mss(struct sock *sk)
				988	{
				989	struct tcp_sock *tp = tcp_sk(sk);
				990	unsigned int hint = min(tp->advmss, tp->mss_cache_std);
				991
				992	hint = min(hint, tp->rcv_wnd/2);
				993	hint = min(hint, TCP_MIN_RCVMSS);
				994	hint = max(hint, TCP_MIN_MSS);
				995
				996	tp->ack.rcv_mss = hint;
				997	}
				998
				999	static __inline__ void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
				1000	{
				1001	tp->pred_flags = htonl((tp->tcp_header_len << 26) \|
				1002	ntohl(TCP_FLAG_ACK) \|
				1003	snd_wnd);
				1004	}
				1005
				1006	static __inline__ void tcp_fast_path_on(struct tcp_sock *tp)
				1007	{
				1008	__tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale);
				1009	}
				1010
				1011	static inline void tcp_fast_path_check(struct sock sk, struct tcp_sock tp)
				1012	{
				1013	if (skb_queue_len(&tp->out_of_order_queue) == 0 &&
				1014	tp->rcv_wnd &&
				1015	atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
				1016	!tp->urg_data)
				1017	tcp_fast_path_on(tp);
				1018	}
				1019
				1020	/* Compute the actual receive window we are currently advertising.
				1021	* Rcv_nxt can be after the window if our peer push more data
				1022	* than the offered window.
				1023	*/
				1024	static __inline__ u32 tcp_receive_window(const struct tcp_sock *tp)
				1025	{
				1026	s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;
				1027
				1028	if (win < 0)
				1029	win = 0;
				1030	return (u32) win;
				1031	}
				1032
				1033	/* Choose a new window, without checks for shrinking, and without
				1034	* scaling applied to the result. The caller does these things
				1035	* if necessary. This is a "raw" window selection.
				1036	*/
				1037	extern u32 __tcp_select_window(struct sock *sk);
				1038
				1039	/* TCP timestamps are only 32-bits, this causes a slight
				1040	* complication on 64-bit systems since we store a snapshot
				1041	* of jiffies in the buffer control blocks below. We decidely
				1042	* only use of the low 32-bits of jiffies and hide the ugly
				1043	* casts with the following macro.
				1044	*/
				1045	#define tcp_time_stamp ((__u32)(jiffies))
				1046
				1047	/* This is what the send packet queueing engine uses to pass
				1048	* TCP per-packet control information to the transmission
				1049	* code. We also store the host-order sequence numbers in
				1050	* here too. This is 36 bytes on 32-bit architectures,
				1051	* 40 bytes on 64-bit machines, if this grows please adjust
				1052	* skbuff.h:skbuff->cb[xxx] size appropriately.
				1053	*/
				1054	struct tcp_skb_cb {
				1055	union {
				1056	struct inet_skb_parm h4;
				1057	#if defined(CONFIG_IPV6) \|\| defined (CONFIG_IPV6_MODULE)
				1058	struct inet6_skb_parm h6;
				1059	#endif
				1060	} header; /* For incoming frames */
				1061	__u32 seq; /* Starting sequence number */
				1062	__u32 end_seq; /* SEQ + FIN + SYN + datalen */
				1063	__u32 when; /* used to compute rtt's */
				1064	__u8 flags; /* TCP header flags. */
				1065
				1066	/* NOTE: These must match up to the flags byte in a
				1067	* real TCP header.
				1068	*/
				1069	#define TCPCB_FLAG_FIN 0x01
				1070	#define TCPCB_FLAG_SYN 0x02
				1071	#define TCPCB_FLAG_RST 0x04
				1072	#define TCPCB_FLAG_PSH 0x08
				1073	#define TCPCB_FLAG_ACK 0x10
				1074	#define TCPCB_FLAG_URG 0x20
				1075	#define TCPCB_FLAG_ECE 0x40
				1076	#define TCPCB_FLAG_CWR 0x80
				1077
				1078	__u8 sacked; /* State flags for SACK/FACK. */
				1079	#define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */
				1080	#define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */
				1081	#define TCPCB_LOST 0x04 /* SKB is lost */
				1082	#define TCPCB_TAGBITS 0x07 /* All tag bits */
				1083
				1084	#define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */
				1085	#define TCPCB_RETRANS (TCPCB_SACKED_RETRANS\|TCPCB_EVER_RETRANS)
				1086
				1087	#define TCPCB_URG 0x20 /* Urgent pointer advenced here */
				1088
				1089	#define TCPCB_AT_TAIL (TCPCB_URG)
				1090
				1091	__u16 urg_ptr; /* Valid w/URG flags is set. */
				1092	__u32 ack_seq; /* Sequence number ACK'd */
				1093	};
				1094
				1095	#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
				1096
				1097	#include <net/tcp_ecn.h>
				1098
				1099	/* Due to TSO, an SKB can be composed of multiple actual
				1100	* packets. To keep these tracked properly, we use this.
				1101	*/
				1102	static inline int tcp_skb_pcount(const struct sk_buff *skb)
				1103	{
				1104	return skb_shinfo(skb)->tso_segs;
				1105	}
				1106
				1107	/* This is valid iff tcp_skb_pcount() > 1. */
				1108	static inline int tcp_skb_mss(const struct sk_buff *skb)
				1109	{
				1110	return skb_shinfo(skb)->tso_size;
				1111	}
				1112
				1113	static inline void tcp_dec_pcount_approx(__u32 *count,
				1114	const struct sk_buff *skb)
				1115	{
				1116	if (*count) {
				1117	*count -= tcp_skb_pcount(skb);
				1118	if ((int)*count < 0)
				1119	*count = 0;
				1120	}
				1121	}
				1122
				1123	static inline void tcp_packets_out_inc(struct sock *sk,
				1124	struct tcp_sock *tp,
				1125	const struct sk_buff *skb)
				1126	{
				1127	int orig = tp->packets_out;
				1128
				1129	tp->packets_out += tcp_skb_pcount(skb);
				1130	if (!orig)
				1131	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
				1132	}
				1133
				1134	static inline void tcp_packets_out_dec(struct tcp_sock *tp,
				1135	const struct sk_buff *skb)
				1136	{
				1137	tp->packets_out -= tcp_skb_pcount(skb);
				1138	}
				1139
				1140	/* This determines how many packets are "in the network" to the best
				1141	* of our knowledge. In many cases it is conservative, but where
				1142	* detailed information is available from the receiver (via SACK
				1143	* blocks etc.) we can make more aggressive calculations.
				1144	*
				1145	* Use this for decisions involving congestion control, use just
				1146	* tp->packets_out to determine if the send queue is empty or not.
				1147	*
				1148	* Read this equation as:
				1149	*
				1150	* "Packets sent once on transmission queue" MINUS
				1151	* "Packets left network, but not honestly ACKed yet" PLUS
				1152	* "Packets fast retransmitted"
				1153	*/
				1154	static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
				1155	{
				1156	return (tp->packets_out - tp->left_out + tp->retrans_out);
				1157	}
				1158
				1159	/*
				1160	* Which congestion algorithim is in use on the connection.
				1161	*/
				1162	#define tcp_is_vegas(__tp) ((__tp)->adv_cong == TCP_VEGAS)
				1163	#define tcp_is_westwood(__tp) ((__tp)->adv_cong == TCP_WESTWOOD)
				1164	#define tcp_is_bic(__tp) ((__tp)->adv_cong == TCP_BIC)
				1165
				1166	/* Recalculate snd_ssthresh, we want to set it to:
				1167	*
				1168	* Reno:
				1169	* one half the current congestion window, but no
				1170	* less than two segments
				1171	*
				1172	* BIC:
				1173	* behave like Reno until low_window is reached,
				1174	* then increase congestion window slowly
				1175	*/
				1176	static inline __u32 tcp_recalc_ssthresh(struct tcp_sock *tp)
				1177	{
				1178	if (tcp_is_bic(tp)) {
				1179	if (sysctl_tcp_bic_fast_convergence &&
				1180	tp->snd_cwnd < tp->bictcp.last_max_cwnd)
				1181	tp->bictcp.last_max_cwnd = (tp->snd_cwnd *
				1182	(BICTCP_BETA_SCALE
				1183	+ sysctl_tcp_bic_beta))
				1184	/ (2 * BICTCP_BETA_SCALE);
				1185	else
				1186	tp->bictcp.last_max_cwnd = tp->snd_cwnd;
				1187
				1188	if (tp->snd_cwnd > sysctl_tcp_bic_low_window)
				1189	return max((tp->snd_cwnd * sysctl_tcp_bic_beta)
				1190	/ BICTCP_BETA_SCALE, 2U);
				1191	}
				1192
				1193	return max(tp->snd_cwnd >> 1U, 2U);
				1194	}
				1195
				1196	/* Stop taking Vegas samples for now. */
				1197	#define tcp_vegas_disable(__tp) ((__tp)->vegas.doing_vegas_now = 0)
				1198
				1199	static inline void tcp_vegas_enable(struct tcp_sock *tp)
				1200	{
				1201	/* There are several situations when we must "re-start" Vegas:
				1202	*
				1203	* o when a connection is established
				1204	* o after an RTO
				1205	* o after fast recovery
				1206	* o when we send a packet and there is no outstanding
				1207	* unacknowledged data (restarting an idle connection)
				1208	*
				1209	* In these circumstances we cannot do a Vegas calculation at the
				1210	* end of the first RTT, because any calculation we do is using
				1211	* stale info -- both the saved cwnd and congestion feedback are
				1212	* stale.
				1213	*
				1214	* Instead we must wait until the completion of an RTT during
				1215	* which we actually receive ACKs.
				1216	*/
				1217
				1218	/* Begin taking Vegas samples next time we send something. */
				1219	tp->vegas.doing_vegas_now = 1;
				1220
				1221	/* Set the beginning of the next send window. */
				1222	tp->vegas.beg_snd_nxt = tp->snd_nxt;
				1223
				1224	tp->vegas.cntRTT = 0;
				1225	tp->vegas.minRTT = 0x7fffffff;
				1226	}
				1227
				1228	/* Should we be taking Vegas samples right now? */
				1229	#define tcp_vegas_enabled(__tp) ((__tp)->vegas.doing_vegas_now)
				1230
				1231	extern void tcp_ca_init(struct tcp_sock *tp);
				1232
				1233	static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state)
				1234	{
				1235	if (tcp_is_vegas(tp)) {
				1236	if (ca_state == TCP_CA_Open)
				1237	tcp_vegas_enable(tp);
				1238	else
				1239	tcp_vegas_disable(tp);
				1240	}
				1241	tp->ca_state = ca_state;
				1242	}
				1243
				1244	/* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd.
				1245	* The exception is rate halving phase, when cwnd is decreasing towards
				1246	* ssthresh.
				1247	*/
				1248	static inline __u32 tcp_current_ssthresh(struct tcp_sock *tp)
				1249	{
				1250	if ((1<<tp->ca_state)&(TCPF_CA_CWR\|TCPF_CA_Recovery))
				1251	return tp->snd_ssthresh;
				1252	else
				1253	return max(tp->snd_ssthresh,
				1254	((tp->snd_cwnd >> 1) +
				1255	(tp->snd_cwnd >> 2)));
				1256	}
				1257
				1258	static inline void tcp_sync_left_out(struct tcp_sock *tp)
				1259	{
				1260	if (tp->rx_opt.sack_ok &&
				1261	(tp->sacked_out >= tp->packets_out - tp->lost_out))
				1262	tp->sacked_out = tp->packets_out - tp->lost_out;
				1263	tp->left_out = tp->sacked_out + tp->lost_out;
				1264	}
				1265
				1266	extern void tcp_cwnd_application_limited(struct sock *sk);
				1267
				1268	/* Congestion window validation. (RFC2861) */
				1269
				1270	static inline void tcp_cwnd_validate(struct sock sk, struct tcp_sock tp)
				1271	{
				1272	__u32 packets_out = tp->packets_out;
				1273
				1274	if (packets_out >= tp->snd_cwnd) {
				1275	/* Network is feed fully. */
				1276	tp->snd_cwnd_used = 0;
				1277	tp->snd_cwnd_stamp = tcp_time_stamp;
				1278	} else {
				1279	/* Network starves. */
				1280	if (tp->packets_out > tp->snd_cwnd_used)
				1281	tp->snd_cwnd_used = tp->packets_out;
				1282
				1283	if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
				1284	tcp_cwnd_application_limited(sk);
				1285	}
				1286	}
				1287
				1288	/* Set slow start threshould and cwnd not falling to slow start */
				1289	static inline void __tcp_enter_cwr(struct tcp_sock *tp)
				1290	{
				1291	tp->undo_marker = 0;
				1292	tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
				1293	tp->snd_cwnd = min(tp->snd_cwnd,
				1294	tcp_packets_in_flight(tp) + 1U);
				1295	tp->snd_cwnd_cnt = 0;
				1296	tp->high_seq = tp->snd_nxt;
				1297	tp->snd_cwnd_stamp = tcp_time_stamp;
				1298	TCP_ECN_queue_cwr(tp);
				1299	}
				1300
				1301	static inline void tcp_enter_cwr(struct tcp_sock *tp)
				1302	{
				1303	tp->prior_ssthresh = 0;
				1304	if (tp->ca_state < TCP_CA_CWR) {
				1305	__tcp_enter_cwr(tp);
				1306	tcp_set_ca_state(tp, TCP_CA_CWR);
				1307	}
				1308	}
				1309
				1310	extern __u32 tcp_init_cwnd(struct tcp_sock tp, struct dst_entry dst);
				1311
				1312	/* Slow start with delack produces 3 packets of burst, so that
				1313	* it is safe "de facto".
				1314	*/
				1315	static __inline__ __u32 tcp_max_burst(const struct tcp_sock *tp)
				1316	{
				1317	return 3;
				1318	}
				1319
				1320	static __inline__ int tcp_minshall_check(const struct tcp_sock *tp)
				1321	{
				1322	return after(tp->snd_sml,tp->snd_una) &&
				1323	!after(tp->snd_sml, tp->snd_nxt);
				1324	}
				1325
				1326	static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss,
				1327	const struct sk_buff *skb)
				1328	{
				1329	if (skb->len < mss)
				1330	tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
				1331	}
				1332
				1333	/* Return 0, if packet can be sent now without violation Nagle's rules:
				1334	1. It is full sized.
				1335	2. Or it contains FIN.
				1336	3. Or TCP_NODELAY was set.
				1337	4. Or TCP_CORK is not set, and all sent packets are ACKed.
				1338	With Minshall's modification: all sent small packets are ACKed.
				1339	*/
				1340
				1341	static __inline__ int
				1342	tcp_nagle_check(const struct tcp_sock tp, const struct sk_buff skb,
				1343	unsigned mss_now, int nonagle)
				1344	{
				1345	return (skb->len < mss_now &&
				1346	!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
				1347	((nonagle&TCP_NAGLE_CORK) \|\|
				1348	(!nonagle &&
				1349	tp->packets_out &&
				1350	tcp_minshall_check(tp))));
				1351	}
				1352
/* Defined outside this header; tcp_snd_test() calls it as (sk, skb). */
extern void tcp_set_skb_tso_segs(struct sock *, struct sk_buff *);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1354
				1355	/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
				1356	* should be put on the wire right now.
				1357	*/
David S. Miller	d5ac99a	2005-04-24 19:12:33 -0700	[diff] [blame]	1358	static __inline__ int tcp_snd_test(struct sock *sk,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1359	struct sk_buff *skb,
				1360	unsigned cur_mss, int nonagle)
				1361	{
David S. Miller	d5ac99a	2005-04-24 19:12:33 -0700	[diff] [blame]	1362	struct tcp_sock *tp = tcp_sk(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1363	int pkts = tcp_skb_pcount(skb);
				1364
				1365	if (!pkts) {
David S. Miller	d5ac99a	2005-04-24 19:12:33 -0700	[diff] [blame]	1366	tcp_set_skb_tso_segs(sk, skb);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1367	pkts = tcp_skb_pcount(skb);
				1368	}
				1369
				1370	/* RFC 1122 - section 4.2.3.4
				1371	*
				1372	* We must queue if
				1373	*
				1374	* a) The right edge of this frame exceeds the window
				1375	* b) There are packets in flight and we have a small segment
				1376	* [SWS avoidance and Nagle algorithm]
				1377	* (part of SWS is done on packetization)
				1378	* Minshall version sounds: there are no _small_
				1379	* segments in flight. (tcp_nagle_check)
				1380	* c) We have too many packets 'in flight'
				1381	*
				1382	* Don't use the nagle rule for urgent data (or
				1383	* for the final FIN -DaveM).
				1384	*
				1385	* Also, Nagle rule does not apply to frames, which
				1386	* sit in the middle of queue (they have no chances
				1387	* to get new data) and if room at tail of skb is
				1388	* not enough to save something seriously (<32 for now).
				1389	*/
				1390
				1391	/* Don't be strict about the congestion window for the
				1392	* final FIN frame. -DaveM
				1393	*/
				1394	return (((nonagle&TCP_NAGLE_PUSH) \|\| tp->urg_mode
				1395	\|\| !tcp_nagle_check(tp, skb, cur_mss, nonagle)) &&
				1396	(((tcp_packets_in_flight(tp) + (pkts-1)) < tp->snd_cwnd) \|\|
				1397	(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) &&
				1398	!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd));
				1399	}
				1400
				1401	static __inline__ void tcp_check_probe_timer(struct sock sk, struct tcp_sock tp)
				1402	{
				1403	if (!tp->packets_out && !tp->pending)
				1404	tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, tp->rto);
				1405	}
				1406
				1407	static __inline__ int tcp_skb_is_last(const struct sock *sk,
				1408	const struct sk_buff *skb)
				1409	{
				1410	return skb->next == (struct sk_buff *)&sk->sk_write_queue;
				1411	}
				1412
				1413	/* Push out any pending frames which were held back due to
				1414	* TCP_CORK or attempt at coalescing tiny packets.
				1415	* The socket must be locked by the caller.
				1416	*/
				1417	static __inline__ void __tcp_push_pending_frames(struct sock *sk,
				1418	struct tcp_sock *tp,
				1419	unsigned cur_mss,
				1420	int nonagle)
				1421	{
				1422	struct sk_buff *skb = sk->sk_send_head;
				1423
				1424	if (skb) {
				1425	if (!tcp_skb_is_last(sk, skb))
				1426	nonagle = TCP_NAGLE_PUSH;
David S. Miller	d5ac99a	2005-04-24 19:12:33 -0700	[diff] [blame]	1427	if (!tcp_snd_test(sk, skb, cur_mss, nonagle) \|\|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1428	tcp_write_xmit(sk, nonagle))
				1429	tcp_check_probe_timer(sk, tp);
				1430	}
				1431	tcp_cwnd_validate(sk, tp);
				1432	}
				1433
				1434	static __inline__ void tcp_push_pending_frames(struct sock *sk,
				1435	struct tcp_sock *tp)
				1436	{
				1437	__tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), tp->nonagle);
				1438	}
				1439
				1440	static __inline__ int tcp_may_send_now(struct sock sk, struct tcp_sock tp)
				1441	{
				1442	struct sk_buff *skb = sk->sk_send_head;
				1443
				1444	return (skb &&
David S. Miller	d5ac99a	2005-04-24 19:12:33 -0700	[diff] [blame]	1445	tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1446	tcp_skb_is_last(sk, skb) ? TCP_NAGLE_PUSH : tp->nonagle));
				1447	}
				1448
				1449	static __inline__ void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq)
				1450	{
				1451	tp->snd_wl1 = seq;
				1452	}
				1453
				1454	static __inline__ void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq)
				1455	{
				1456	tp->snd_wl1 = seq;
				1457	}
				1458
				1459	extern void tcp_destroy_sock(struct sock *sk);
				1460
				1461
				1462	/*
				1463	* Calculate(/check) TCP checksum
				1464	*/
				1465	static __inline__ u16 tcp_v4_check(struct tcphdr *th, int len,
				1466	unsigned long saddr, unsigned long daddr,
				1467	unsigned long base)
				1468	{
				1469	return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
				1470	}
				1471
				1472	static __inline__ int __tcp_checksum_complete(struct sk_buff *skb)
				1473	{
				1474	return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
				1475	}
				1476
				1477	static __inline__ int tcp_checksum_complete(struct sk_buff *skb)
				1478	{
				1479	return skb->ip_summed != CHECKSUM_UNNECESSARY &&
				1480	__tcp_checksum_complete(skb);
				1481	}
				1482
				1483	/* Prequeue for VJ style copy to user, combined with checksumming. */
				1484
				1485	static __inline__ void tcp_prequeue_init(struct tcp_sock *tp)
				1486	{
				1487	tp->ucopy.task = NULL;
				1488	tp->ucopy.len = 0;
				1489	tp->ucopy.memory = 0;
				1490	skb_queue_head_init(&tp->ucopy.prequeue);
				1491	}
				1492
				1493	/* Packet is added to VJ-style prequeue for processing in process
				1494	* context, if a reader task is waiting. Apparently, this exciting
				1495	* idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
				1496	* failed somewhere. Latency? Burstiness? Well, at least now we will
				1497	* see, why it failed. 8)8) --ANK
				1498	*
				1499	* NOTE: is this not too big to inline?
				1500	*/
				1501	static __inline__ int tcp_prequeue(struct sock sk, struct sk_buff skb)
				1502	{
				1503	struct tcp_sock *tp = tcp_sk(sk);
				1504
				1505	if (!sysctl_tcp_low_latency && tp->ucopy.task) {
				1506	__skb_queue_tail(&tp->ucopy.prequeue, skb);
				1507	tp->ucopy.memory += skb->truesize;
				1508	if (tp->ucopy.memory > sk->sk_rcvbuf) {
				1509	struct sk_buff *skb1;
				1510
				1511	BUG_ON(sock_owned_by_user(sk));
				1512
				1513	while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
				1514	sk->sk_backlog_rcv(sk, skb1);
				1515	NET_INC_STATS_BH(LINUX_MIB_TCPPREQUEUEDROPPED);
				1516	}
				1517
				1518	tp->ucopy.memory = 0;
				1519	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
				1520	wake_up_interruptible(sk->sk_sleep);
				1521	if (!tcp_ack_scheduled(tp))
				1522	tcp_reset_xmit_timer(sk, TCP_TIME_DACK, (3*TCP_RTO_MIN)/4);
				1523	}
				1524	return 1;
				1525	}
				1526	return 0;
				1527	}
				1528
				1529
				1530	#undef STATE_TRACE
				1531
				1532	#ifdef STATE_TRACE
				1533	static const char *statename[]={
				1534	"Unused","Established","Syn Sent","Syn Recv",
				1535	"Fin Wait 1","Fin Wait 2","Time Wait", "Close",
				1536	"Close Wait","Last ACK","Listen","Closing"
				1537	};
				1538	#endif
				1539
				1540	static __inline__ void tcp_set_state(struct sock *sk, int state)
				1541	{
				1542	int oldstate = sk->sk_state;
				1543
				1544	switch (state) {
				1545	case TCP_ESTABLISHED:
				1546	if (oldstate != TCP_ESTABLISHED)
				1547	TCP_INC_STATS(TCP_MIB_CURRESTAB);
				1548	break;
				1549
				1550	case TCP_CLOSE:
				1551	if (oldstate == TCP_CLOSE_WAIT \|\| oldstate == TCP_ESTABLISHED)
				1552	TCP_INC_STATS(TCP_MIB_ESTABRESETS);
				1553
				1554	sk->sk_prot->unhash(sk);
				1555	if (tcp_sk(sk)->bind_hash &&
				1556	!(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
				1557	tcp_put_port(sk);
				1558	/* fall through */
				1559	default:
				1560	if (oldstate==TCP_ESTABLISHED)
				1561	TCP_DEC_STATS(TCP_MIB_CURRESTAB);
				1562	}
				1563
				1564	/* Change state AFTER socket is unhashed to avoid closed
				1565	* socket sitting in hash tables.
				1566	*/
				1567	sk->sk_state = state;
				1568
				1569	#ifdef STATE_TRACE
				1570	SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n",sk, statename[oldstate],statename[state]);
				1571	#endif
				1572	}
				1573
				1574	static __inline__ void tcp_done(struct sock *sk)
				1575	{
				1576	tcp_set_state(sk, TCP_CLOSE);
				1577	tcp_clear_xmit_timers(sk);
				1578
				1579	sk->sk_shutdown = SHUTDOWN_MASK;
				1580
				1581	if (!sock_flag(sk, SOCK_DEAD))
				1582	sk->sk_state_change(sk);
				1583	else
				1584	tcp_destroy_sock(sk);
				1585	}
				1586
				1587	static __inline__ void tcp_sack_reset(struct tcp_options_received *rx_opt)
				1588	{
				1589	rx_opt->dsack = 0;
				1590	rx_opt->eff_sacks = 0;
				1591	rx_opt->num_sacks = 0;
				1592	}
				1593
				1594	static __inline__ void tcp_build_and_update_options(__u32 ptr, struct tcp_sock tp, __u32 tstamp)
				1595	{
				1596	if (tp->rx_opt.tstamp_ok) {
				1597	*ptr++ = __constant_htonl((TCPOPT_NOP << 24) \|
				1598	(TCPOPT_NOP << 16) \|
				1599	(TCPOPT_TIMESTAMP << 8) \|
				1600	TCPOLEN_TIMESTAMP);
				1601	*ptr++ = htonl(tstamp);
				1602	*ptr++ = htonl(tp->rx_opt.ts_recent);
				1603	}
				1604	if (tp->rx_opt.eff_sacks) {
				1605	struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks;
				1606	int this_sack;
				1607
				1608	*ptr++ = __constant_htonl((TCPOPT_NOP << 24) \|
				1609	(TCPOPT_NOP << 16) \|
				1610	(TCPOPT_SACK << 8) \|
				1611	(TCPOLEN_SACK_BASE +
				1612	(tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)));
				1613	for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) {
				1614	*ptr++ = htonl(sp[this_sack].start_seq);
				1615	*ptr++ = htonl(sp[this_sack].end_seq);
				1616	}
				1617	if (tp->rx_opt.dsack) {
				1618	tp->rx_opt.dsack = 0;
				1619	tp->rx_opt.eff_sacks--;
				1620	}
				1621	}
				1622	}
				1623
				1624	/* Construct a tcp options header for a SYN or SYN_ACK packet.
				1625	* If this is every changed make sure to change the definition of
				1626	* MAX_SYN_SIZE to match the new maximum number of options that you
				1627	* can generate.
				1628	*/
				1629	static inline void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack,
				1630	int offer_wscale, int wscale, __u32 tstamp, __u32 ts_recent)
				1631	{
				1632	/* We always get an MSS option.
				1633	* The option bytes which will be seen in normal data
				1634	* packets should timestamps be used, must be in the MSS
				1635	* advertised. But we subtract them from tp->mss_cache so
				1636	* that calculations in tcp_sendmsg are simpler etc.
				1637	* So account for this fact here if necessary. If we
				1638	* don't do this correctly, as a receiver we won't
				1639	* recognize data packets as being full sized when we
				1640	* should, and thus we won't abide by the delayed ACK
				1641	* rules correctly.
				1642	* SACKs don't matter, we never delay an ACK when we
				1643	* have any of those going out.
				1644	*/
				1645	*ptr++ = htonl((TCPOPT_MSS << 24) \| (TCPOLEN_MSS << 16) \| mss);
				1646	if (ts) {
				1647	if(sack)
				1648	*ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) \| (TCPOLEN_SACK_PERM << 16) \|
				1649	(TCPOPT_TIMESTAMP << 8) \| TCPOLEN_TIMESTAMP);
				1650	else
				1651	*ptr++ = __constant_htonl((TCPOPT_NOP << 24) \| (TCPOPT_NOP << 16) \|
				1652	(TCPOPT_TIMESTAMP << 8) \| TCPOLEN_TIMESTAMP);
				1653	ptr++ = htonl(tstamp); / TSVAL */
				1654	ptr++ = htonl(ts_recent); / TSECR */
				1655	} else if(sack)
				1656	*ptr++ = __constant_htonl((TCPOPT_NOP << 24) \| (TCPOPT_NOP << 16) \|
				1657	(TCPOPT_SACK_PERM << 8) \| TCPOLEN_SACK_PERM);
				1658	if (offer_wscale)
				1659	*ptr++ = htonl((TCPOPT_NOP << 24) \| (TCPOPT_WINDOW << 16) \| (TCPOLEN_WINDOW << 8) \| (wscale));
				1660	}
				1661
				1662	/* Determine a window scaling and initial window to offer. */
				1663	extern void tcp_select_initial_window(int __space, __u32 mss,
				1664	__u32 rcv_wnd, __u32 window_clamp,
				1665	int wscale_ok, __u8 *rcv_wscale);
				1666
				1667	static inline int tcp_win_from_space(int space)
				1668	{
				1669	return sysctl_tcp_adv_win_scale<=0 ?
				1670	(space>>(-sysctl_tcp_adv_win_scale)) :
				1671	space - (space>>sysctl_tcp_adv_win_scale);
				1672	}
				1673
				1674	/* Note: caller must be prepared to deal with negative returns */
				1675	static inline int tcp_space(const struct sock *sk)
				1676	{
				1677	return tcp_win_from_space(sk->sk_rcvbuf -
				1678	atomic_read(&sk->sk_rmem_alloc));
				1679	}
				1680
				1681	static inline int tcp_full_space(const struct sock *sk)
				1682	{
				1683	return tcp_win_from_space(sk->sk_rcvbuf);
				1684	}
				1685
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame^]	1686	static inline void tcp_acceptq_queue(struct sock sk, struct request_sock req,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1687	struct sock *child)
				1688	{
				1689	struct tcp_sock *tp = tcp_sk(sk);
				1690
				1691	req->sk = child;
				1692	sk_acceptq_added(sk);
				1693
				1694	if (!tp->accept_queue_tail) {
				1695	tp->accept_queue = req;
				1696	} else {
				1697	tp->accept_queue_tail->dl_next = req;
				1698	}
				1699	tp->accept_queue_tail = req;
				1700	req->dl_next = NULL;
				1701	}
				1702
				1703	struct tcp_listen_opt
				1704	{
				1705	u8 max_qlen_log; /* log_2 of maximal queued SYNs */
				1706	int qlen;
				1707	int qlen_young;
				1708	int clock_hand;
				1709	u32 hash_rnd;
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame^]	1710	struct request_sock *syn_table[TCP_SYNQ_HSIZE];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1711	};
				1712
				1713	static inline void
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame^]	1714	tcp_synq_removed(struct sock sk, struct request_sock req)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1715	{
				1716	struct tcp_listen_opt *lopt = tcp_sk(sk)->listen_opt;
				1717
				1718	if (--lopt->qlen == 0)
				1719	tcp_delete_keepalive_timer(sk);
				1720	if (req->retrans == 0)
				1721	lopt->qlen_young--;
				1722	}
				1723
				1724	static inline void tcp_synq_added(struct sock *sk)
				1725	{
				1726	struct tcp_listen_opt *lopt = tcp_sk(sk)->listen_opt;
				1727
				1728	if (lopt->qlen++ == 0)
				1729	tcp_reset_keepalive_timer(sk, TCP_TIMEOUT_INIT);
				1730	lopt->qlen_young++;
				1731	}
				1732
				1733	static inline int tcp_synq_len(struct sock *sk)
				1734	{
				1735	return tcp_sk(sk)->listen_opt->qlen;
				1736	}
				1737
				1738	static inline int tcp_synq_young(struct sock *sk)
				1739	{
				1740	return tcp_sk(sk)->listen_opt->qlen_young;
				1741	}
				1742
				1743	static inline int tcp_synq_is_full(struct sock *sk)
				1744	{
				1745	return tcp_synq_len(sk) >> tcp_sk(sk)->listen_opt->max_qlen_log;
				1746	}
				1747
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame^]	1748	static inline void tcp_synq_unlink(struct tcp_sock tp, struct request_sock req,
				1749	struct request_sock **prev)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1750	{
				1751	write_lock(&tp->syn_wait_lock);
				1752	*prev = req->dl_next;
				1753	write_unlock(&tp->syn_wait_lock);
				1754	}
				1755
/*
 * Drop @req from the SYN queue of @sk: unlink it from its chain (via
 * @prev, the link pointing at it), update the queue accounting, then
 * free the request.  @req must not be used after this returns.
 */
static inline void tcp_synq_drop(struct sock *sk, struct request_sock *req,
				 struct request_sock **prev)
{
	tcp_synq_unlink(tcp_sk(sk), req, prev);
	tcp_synq_removed(sk, req);
	reqsk_free(req);
}
				1763
Arnaldo Carvalho de Melo	60236fd	2005-06-18 22:47:21 -0700	[diff] [blame^]	1764	static __inline__ void tcp_openreq_init(struct request_sock *req,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1765	struct tcp_options_received *rx_opt,
				1766	struct sk_buff *skb)
				1767	{
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1768	struct inet_request_sock *ireq = inet_rsk(req);
				1769
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1770	req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1771	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1772	req->mss = rx_opt->mss_clamp;
				1773	req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
Arnaldo Carvalho de Melo	2e6599c	2005-06-18 22:46:52 -0700	[diff] [blame]	1774	ireq->tstamp_ok = rx_opt->tstamp_ok;
				1775	ireq->sack_ok = rx_opt->sack_ok;
				1776	ireq->snd_wscale = rx_opt->snd_wscale;
				1777	ireq->wscale_ok = rx_opt->wscale_ok;
				1778	ireq->acked = 0;
				1779	ireq->ecn_ok = 0;
				1780	ireq->rmt_port = skb->h.th->source;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1781	}
				1782
				1783	extern void tcp_enter_memory_pressure(void);
				1784
				1785	extern void tcp_listen_wlock(void);
				1786
/*
 * Register as a user of the listening hash.
 *
 * - We may sleep inside this lock.
 * - If sleeping is not required (or called from BH),
 *   use plain read_(un)lock(&tcp_lhash_lock).
 *
 * tcp_lhash_lock is only held while tcp_lhash_users is incremented;
 * the matching tcp_listen_unlock() drops the user count again.
 */

static inline void tcp_listen_lock(void)
{
	/* Taking the read lock synchronizes us against would-be writers. */
	read_lock(&tcp_lhash_lock);
	atomic_inc(&tcp_lhash_users);
	read_unlock(&tcp_lhash_lock);
}
				1799
				1800	static inline void tcp_listen_unlock(void)
				1801	{
				1802	if (atomic_dec_and_test(&tcp_lhash_users))
				1803	wake_up(&tcp_lhash_wait);
				1804	}
				1805
				1806	static inline int keepalive_intvl_when(const struct tcp_sock *tp)
				1807	{
				1808	return tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl;
				1809	}
				1810
				1811	static inline int keepalive_time_when(const struct tcp_sock *tp)
				1812	{
				1813	return tp->keepalive_time ? : sysctl_tcp_keepalive_time;
				1814	}
				1815
				1816	static inline int tcp_fin_time(const struct tcp_sock *tp)
				1817	{
				1818	int fin_timeout = tp->linger2 ? : sysctl_tcp_fin_timeout;
				1819
				1820	if (fin_timeout < (tp->rto<<2) - (tp->rto>>1))
				1821	fin_timeout = (tp->rto<<2) - (tp->rto>>1);
				1822
				1823	return fin_timeout;
				1824	}
				1825
				1826	static inline int tcp_paws_check(const struct tcp_options_received *rx_opt, int rst)
				1827	{
				1828	if ((s32)(rx_opt->rcv_tsval - rx_opt->ts_recent) >= 0)
				1829	return 0;
				1830	if (xtime.tv_sec >= rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS)
				1831	return 0;
				1832
				1833	/* RST segments are not recommended to carry timestamp,
				1834	and, if they do, it is recommended to ignore PAWS because
				1835	"their cleanup function should take precedence over timestamps."
				1836	Certainly, it is mistake. It is necessary to understand the reasons
				1837	of this constraint to relax it: if peer reboots, clock may go
				1838	out-of-sync and half-open connections will not be reset.
				1839	Actually, the problem would be not existing if all
				1840	the implementations followed draft about maintaining clock
				1841	via reboots. Linux-2.2 DOES NOT!
				1842
				1843	However, we can relax time bounds for RST segments to MSL.
				1844	*/
				1845	if (rst && xtime.tv_sec >= rx_opt->ts_recent_stamp + TCP_PAWS_MSL)
				1846	return 0;
				1847	return 1;
				1848	}
				1849
				1850	static inline void tcp_v4_setup_caps(struct sock sk, struct dst_entry dst)
				1851	{
				1852	sk->sk_route_caps = dst->dev->features;
				1853	if (sk->sk_route_caps & NETIF_F_TSO) {
				1854	if (sock_flag(sk, SOCK_NO_LARGESEND) \|\| dst->header_len)
				1855	sk->sk_route_caps &= ~NETIF_F_TSO;
				1856	}
				1857	}
				1858
/* Expands to an empty statement; the argument is not evaluated. */
#define TCP_CHECK_TIMER(sk) do { } while (0)
				1860
				1861	static inline int tcp_use_frto(const struct sock *sk)
				1862	{
				1863	const struct tcp_sock *tp = tcp_sk(sk);
				1864
				1865	/* F-RTO must be activated in sysctl and there must be some
				1866	* unsent new data, and the advertised window should allow
				1867	* sending it.
				1868	*/
				1869	return (sysctl_tcp_frto && sk->sk_send_head &&
				1870	!after(TCP_SKB_CB(sk->sk_send_head)->end_seq,
				1871	tp->snd_una + tp->snd_wnd));
				1872	}
				1873
/*
 * Seed the TCP MIB counters that hold fixed values.
 * The RTO bounds are converted from jiffies to milliseconds.
 */
static inline void tcp_mib_init(void)
{
	/* See RFC 2012 */
	TCP_ADD_STATS_USER(TCP_MIB_RTOALGORITHM, 1);
	TCP_ADD_STATS_USER(TCP_MIB_RTOMIN, TCP_RTO_MIN*1000/HZ);
	TCP_ADD_STATS_USER(TCP_MIB_RTOMAX, TCP_RTO_MAX*1000/HZ);
	TCP_ADD_STATS_USER(TCP_MIB_MAXCONN, -1);
}
				1882
				1883	/* /proc */
				1884	enum tcp_seq_states {
				1885	TCP_SEQ_STATE_LISTENING,
				1886	TCP_SEQ_STATE_OPENREQ,
				1887	TCP_SEQ_STATE_ESTABLISHED,
				1888	TCP_SEQ_STATE_TIME_WAIT,
				1889	};
				1890
				1891	struct tcp_seq_afinfo {
				1892	struct module *owner;
				1893	char *name;
				1894	sa_family_t family;
				1895	int (seq_show) (struct seq_file m, void *v);
				1896	struct file_operations *seq_fops;
				1897	};
				1898
				1899	struct tcp_iter_state {
				1900	sa_family_t family;
				1901	enum tcp_seq_states state;
				1902	struct sock *syn_wait_sk;
				1903	int bucket, sbucket, num, uid;
				1904	struct seq_operations seq_ops;
				1905	};
				1906
				1907	extern int tcp_proc_register(struct tcp_seq_afinfo *afinfo);
				1908	extern void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo);
				1909
				1910	/* TCP Westwood functions and constants */
				1911
				1912	#define TCP_WESTWOOD_INIT_RTT (20HZ) / maybe too conservative?! */
				1913	#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */
				1914
				1915	static inline void tcp_westwood_update_rtt(struct tcp_sock *tp, __u32 rtt_seq)
				1916	{
				1917	if (tcp_is_westwood(tp))
				1918	tp->westwood.rtt = rtt_seq;
				1919	}
				1920
				1921	static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_sock *tp)
				1922	{
				1923	return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) /
				1924	(__u32) (tp->mss_cache_std),
				1925	2U);
				1926	}
				1927
				1928	static inline __u32 tcp_westwood_bw_rttmin(const struct tcp_sock *tp)
				1929	{
				1930	return tcp_is_westwood(tp) ? __tcp_westwood_bw_rttmin(tp) : 0;
				1931	}
				1932
				1933	static inline int tcp_westwood_ssthresh(struct tcp_sock *tp)
				1934	{
				1935	__u32 ssthresh = 0;
				1936
				1937	if (tcp_is_westwood(tp)) {
				1938	ssthresh = __tcp_westwood_bw_rttmin(tp);
				1939	if (ssthresh)
				1940	tp->snd_ssthresh = ssthresh;
				1941	}
				1942
				1943	return (ssthresh != 0);
				1944	}
				1945
				1946	static inline int tcp_westwood_cwnd(struct tcp_sock *tp)
				1947	{
				1948	__u32 cwnd = 0;
				1949
				1950	if (tcp_is_westwood(tp)) {
				1951	cwnd = __tcp_westwood_bw_rttmin(tp);
				1952	if (cwnd)
				1953	tp->snd_cwnd = cwnd;
				1954	}
				1955
				1956	return (cwnd != 0);
				1957	}
				1958	#endif /* _TCP_H */