Blame - net/ipv4/tcp.c - SHIFTPHONES/mainline/linux

blob: 5e6bc4b32875a88cff1458d1915284cc877db6f9 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* INET An implementation of the TCP/IP protocol suite for the LINUX
				3	* operating system. INET is implemented using the BSD Socket
				4	* interface as the means of communication with the user level.
				5	*
				6	* Implementation of the Transmission Control Protocol(TCP).
				7	*
				8	* Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
				9	*
Jesper Juhl	02c30a8	2005-05-05 16:16:16 -0700	[diff] [blame]	10	* Authors: Ross Biro
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	11	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
				12	* Mark Evans, <evansmp@uhura.aston.ac.uk>
				13	* Corey Minyard <wf-rch!minyard@relay.EU.net>
				14	* Florian La Roche, <flla@stud.uni-sb.de>
				15	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
				16	* Linus Torvalds, <torvalds@cs.helsinki.fi>
				17	* Alan Cox, <gw4pts@gw4pts.ampr.org>
				18	* Matthew Dillon, <dillon@apollo.west.oic.com>
				19	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
				20	* Jorge Cwik, <jorge@laser.satlink.net>
				21	*
				22	* Fixes:
				23	* Alan Cox : Numerous verify_area() calls
				24	* Alan Cox : Set the ACK bit on a reset
				25	* Alan Cox : Stopped it crashing if it closed while
				26	* sk->inuse=1 and was trying to connect
				27	* (tcp_err()).
				28	* Alan Cox : All icmp error handling was broken
				29	* pointers passed were wrong and the
				30	* socket was looked up backwards. Nobody
				31	* tested any icmp error code obviously.
				32	* Alan Cox : tcp_err() now handled properly. It
				33	* wakes people on errors. poll
				34	* behaves and the icmp error race
				35	* has gone by moving it into sock.c
				36	* Alan Cox : tcp_send_reset() fixed to work for
				37	* everything not just packets for
				38	* unknown sockets.
				39	* Alan Cox : tcp option processing.
				40	* Alan Cox : Reset tweaked (still not 100%) [Had
				41	* syn rule wrong]
				42	* Herp Rosmanith : More reset fixes
				43	* Alan Cox : No longer acks invalid rst frames.
				44	* Acking any kind of RST is right out.
				45	* Alan Cox : Sets an ignore me flag on an rst
				46	* receive otherwise odd bits of prattle
				47	* escape still
				48	* Alan Cox : Fixed another acking RST frame bug.
				49	* Should stop LAN workplace lockups.
				50	* Alan Cox : Some tidyups using the new skb list
				51	* facilities
				52	* Alan Cox : sk->keepopen now seems to work
				53	* Alan Cox : Pulls options out correctly on accepts
				54	* Alan Cox : Fixed assorted sk->rqueue->next errors
				55	* Alan Cox : PSH doesn't end a TCP read. Switched a
				56	* bit to skb ops.
				57	* Alan Cox : Tidied tcp_data to avoid a potential
				58	* nasty.
				59	* Alan Cox : Added some better commenting, as the
				60	* tcp is hard to follow
				61	* Alan Cox : Removed incorrect check for 20 * psh
				62	* Michael O'Reilly : ack < copied bug fix.
				63	* Johannes Stille : Misc tcp fixes (not all in yet).
				64	* Alan Cox : FIN with no memory -> CRASH
				65	* Alan Cox : Added socket option proto entries.
				66	* Also added awareness of them to accept.
				67	* Alan Cox : Added TCP options (SOL_TCP)
				68	* Alan Cox : Switched wakeup calls to callbacks,
				69	* so the kernel can layer network
				70	* sockets.
				71	* Alan Cox : Use ip_tos/ip_ttl settings.
				72	* Alan Cox : Handle FIN (more) properly (we hope).
				73	* Alan Cox : RST frames sent on unsynchronised
				74	* state ack error.
				75	* Alan Cox : Put in missing check for SYN bit.
				76	* Alan Cox : Added tcp_select_window() aka NET2E
				77	* window non shrink trick.
				78	* Alan Cox : Added a couple of small NET2E timer
				79	* fixes
				80	* Charles Hedrick : TCP fixes
				81	* Toomas Tamm : TCP window fixes
				82	* Alan Cox : Small URG fix to rlogin ^C ack fight
				83	* Charles Hedrick : Rewrote most of it to actually work
				84	* Linus : Rewrote tcp_read() and URG handling
				85	* completely
				86	* Gerhard Koerting: Fixed some missing timer handling
				87	* Matthew Dillon : Reworked TCP machine states as per RFC
				88	* Gerhard Koerting: PC/TCP workarounds
				89	* Adam Caldwell : Assorted timer/timing errors
				90	* Matthew Dillon : Fixed another RST bug
				91	* Alan Cox : Move to kernel side addressing changes.
				92	* Alan Cox : Beginning work on TCP fastpathing
				93	* (not yet usable)
				94	* Arnt Gulbrandsen: Turbocharged tcp_check() routine.
				95	* Alan Cox : TCP fast path debugging
				96	* Alan Cox : Window clamping
				97	* Michael Riepe : Bug in tcp_check()
				98	* Matt Dillon : More TCP improvements and RST bug fixes
				99	* Matt Dillon : Yet more small nasties removed from the
				100	* TCP code (Be very nice to this man if
				101	* tcp finally works 100%) 8)
				102	* Alan Cox : BSD accept semantics.
				103	* Alan Cox : Reset on closedown bug.
				104	* Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
				105	* Michael Pall : Handle poll() after URG properly in
				106	* all cases.
				107	* Michael Pall : Undo the last fix in tcp_read_urg()
				108	* (multi URG PUSH broke rlogin).
				109	* Michael Pall : Fix the multi URG PUSH problem in
				110	* tcp_readable(), poll() after URG
				111	* works now.
				112	* Michael Pall : recv(...,MSG_OOB) never blocks in the
				113	* BSD api.
				114	* Alan Cox : Changed the semantics of sk->socket to
				115	* fix a race and a signal problem with
				116	* accept() and async I/O.
				117	* Alan Cox : Relaxed the rules on tcp_sendto().
				118	* Yury Shevchuk : Really fixed accept() blocking problem.
				119	* Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
				120	* clients/servers which listen in on
				121	* fixed ports.
				122	* Alan Cox : Cleaned the above up and shrank it to
				123	* a sensible code size.
				124	* Alan Cox : Self connect lockup fix.
				125	* Alan Cox : No connect to multicast.
				126	* Ross Biro : Close unaccepted children on master
				127	* socket close.
				128	* Alan Cox : Reset tracing code.
				129	* Alan Cox : Spurious resets on shutdown.
				130	* Alan Cox : Giant 15 minute/60 second timer error
				131	* Alan Cox : Small whoops in polling before an
				132	* accept.
				133	* Alan Cox : Kept the state trace facility since
				134	* it's handy for debugging.
				135	* Alan Cox : More reset handler fixes.
				136	* Alan Cox : Started rewriting the code based on
				137	* the RFC's for other useful protocol
				138	* references see: Comer, KA9Q NOS, and
				139	* for a reference on the difference
				140	* between specifications and how BSD
				141	* works see the 4.4lite source.
				142	* A.N.Kuznetsov : Don't time wait on completion of tidy
				143	* close.
				144	* Linus Torvalds : Fin/Shutdown & copied_seq changes.
				145	* Linus Torvalds : Fixed BSD port reuse to work first syn
				146	* Alan Cox : Reimplemented timers as per the RFC
				147	* and using multiple timers for sanity.
				148	* Alan Cox : Small bug fixes, and a lot of new
				149	* comments.
				150	* Alan Cox : Fixed dual reader crash by locking
				151	* the buffers (much like datagram.c)
				152	* Alan Cox : Fixed stuck sockets in probe. A probe
				153	* now gets fed up of retrying without
				154	* (even a no space) answer.
				155	* Alan Cox : Extracted closing code better
				156	* Alan Cox : Fixed the closing state machine to
				157	* resemble the RFC.
				158	* Alan Cox : More 'per spec' fixes.
				159	* Jorge Cwik : Even faster checksumming.
				160	* Alan Cox : tcp_data() doesn't ack illegal PSH
				161	* only frames. At least one pc tcp stack
				162	* generates them.
				163	* Alan Cox : Cache last socket.
				164	* Alan Cox : Per route irtt.
				165	* Matt Day : poll()->select() match BSD precisely on error
				166	* Alan Cox : New buffers
				167	* Marc Tamsky : Various sk->prot->retransmits and
				168	* sk->retransmits misupdating fixed.
				169	* Fixed tcp_write_timeout: stuck close,
				170	* and TCP syn retries gets used now.
				171	* Mark Yarvis : In tcp_read_wakeup(), don't send an
				172	* ack if state is TCP_CLOSED.
				173	* Alan Cox : Look up device on a retransmit - routes may
				174	* change. Doesn't yet cope with MSS shrink right
				175	* but it's a start!
				176	* Marc Tamsky : Closing in closing fixes.
				177	* Mike Shaver : RFC1122 verifications.
				178	* Alan Cox : rcv_saddr errors.
				179	* Alan Cox : Block double connect().
				180	* Alan Cox : Small hooks for enSKIP.
				181	* Alexey Kuznetsov: Path MTU discovery.
				182	* Alan Cox : Support soft errors.
				183	* Alan Cox : Fix MTU discovery pathological case
				184	* when the remote claims no mtu!
				185	* Marc Tamsky : TCP_CLOSE fix.
				186	* Colin (G3TNE) : Send a reset on syn ack replies in
				187	* window but wrong (fixes NT lpd problems)
				188	* Pedro Roque : Better TCP window handling, delayed ack.
				189	* Joerg Reuter : No modification of locked buffers in
				190	* tcp_do_retransmit()
				191	* Eric Schenk : Changed receiver side silly window
				192	* avoidance algorithm to BSD style
				193	* algorithm. This doubles throughput
				194	* against machines running Solaris,
				195	* and seems to result in general
				196	* improvement.
				197	* Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
				198	* Willy Konynenberg : Transparent proxying support.
				199	* Mike McLagan : Routing by source
				200	* Keith Owens : Do proper merging with partial SKB's in
				201	* tcp_do_sendmsg to avoid burstiness.
				202	* Eric Schenk : Fix fast close down bug with
				203	* shutdown() followed by close().
				204	* Andi Kleen : Make poll agree with SIGIO
				205	* Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
				206	* lingertime == 0 (RFC 793 ABORT Call)
				207	* Hirokazu Takahashi : Use copy_from_user() instead of
				208	* csum_and_copy_from_user() if possible.
				209	*
				210	* This program is free software; you can redistribute it and/or
				211	* modify it under the terms of the GNU General Public License
				212	* as published by the Free Software Foundation; either version
				213	* 2 of the License, or(at your option) any later version.
				214	*
				215	* Description of States:
				216	*
				217	* TCP_SYN_SENT sent a connection request, waiting for ack
				218	*
				219	* TCP_SYN_RECV received a connection request, sent ack,
				220	* waiting for final ack in three-way handshake.
				221	*
				222	* TCP_ESTABLISHED connection established
				223	*
				224	* TCP_FIN_WAIT1 our side has shutdown, waiting to complete
				225	* transmission of remaining buffered data
				226	*
				227	* TCP_FIN_WAIT2 all buffered data sent, waiting for remote
				228	* to shutdown
				229	*
				230	* TCP_CLOSING both sides have shutdown but we still have
				231	* data we have to finish sending
				232	*
				233	* TCP_TIME_WAIT timeout to catch resent junk before entering
				234	* closed, can only be entered from FIN_WAIT2
				235	* or CLOSING. Required because the other end
				236	* may not have gotten our last ACK causing it
				237	* to retransmit the data packet (which we ignore)
				238	*
				239	* TCP_CLOSE_WAIT remote side has shutdown and is waiting for
				240	* us to finish writing our data and to shutdown
				241	* (we have to close() to move on to LAST_ACK)
				242	*
				243	* TCP_LAST_ACK our side has shutdown after remote has
				244	* shutdown. There may still be data in our
				245	* buffer that we have to finish sending
				246	*
				247	* TCP_CLOSE socket is finished
				248	*/
				249
				250	#include <linux/config.h>
				251	#include <linux/module.h>
				252	#include <linux/types.h>
				253	#include <linux/fcntl.h>
				254	#include <linux/poll.h>
				255	#include <linux/init.h>
				256	#include <linux/smp_lock.h>
				257	#include <linux/fs.h>
				258	#include <linux/random.h>
				259	#include <linux/bootmem.h>
				260
				261	#include <net/icmp.h>
				262	#include <net/tcp.h>
				263	#include <net/xfrm.h>
				264	#include <net/ip.h>
				265
				266
				267	#include <asm/uaccess.h>
				268	#include <asm/ioctls.h>
				269
				270	int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
				271
Eric Dumazet	ba89966	2005-08-26 12:05:31 -0700	[diff] [blame]	272	DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	273
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	274	atomic_t tcp_orphan_count = ATOMIC_INIT(0);
				275
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	276	EXPORT_SYMBOL_GPL(tcp_orphan_count);
				277
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	278	int sysctl_tcp_mem[3];
				279	int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
				280	int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
				281
				282	EXPORT_SYMBOL(sysctl_tcp_mem);
				283	EXPORT_SYMBOL(sysctl_tcp_rmem);
				284	EXPORT_SYMBOL(sysctl_tcp_wmem);
				285
				286	atomic_t tcp_memory_allocated; /* Current allocated memory. */
				287	atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
				288
				289	EXPORT_SYMBOL(tcp_memory_allocated);
				290	EXPORT_SYMBOL(tcp_sockets_allocated);
				291
				292	/*
				293	* Pressure flag: try to collapse.
				294	* Technical note: it is used by multiple contexts non atomically.
				295	* All the sk_stream_mem_schedule() is of this nature: accounting
				296	* is strict, actions are advisory and have some latency.
				297	*/
				298	int tcp_memory_pressure;
				299
				300	EXPORT_SYMBOL(tcp_memory_pressure);
				301
				302	void tcp_enter_memory_pressure(void)
				303	{
				304	if (!tcp_memory_pressure) {
				305	NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
				306	tcp_memory_pressure = 1;
				307	}
				308	}
				309
				310	EXPORT_SYMBOL(tcp_enter_memory_pressure);
				311
				312	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	313	* Wait for a TCP event.
				314	*
				315	* Note that we don't need to lock the socket, as the upper poll layers
				316	* take care of normal races (between the test and the event) and we don't
				317	* go look at any of the socket buffers directly.
				318	*/
				319	unsigned int tcp_poll(struct file file, struct socket sock, poll_table *wait)
				320	{
				321	unsigned int mask;
				322	struct sock *sk = sock->sk;
				323	struct tcp_sock *tp = tcp_sk(sk);
				324
				325	poll_wait(file, sk->sk_sleep, wait);
				326	if (sk->sk_state == TCP_LISTEN)
Arnaldo Carvalho de Melo	dc40c7b	2005-08-23 21:52:58 -0700	[diff] [blame]	327	return inet_csk_listen_poll(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	328
				329	/* Socket is not locked. We are protected from async events
				330	by poll logic and correct handling of state changes
				331	made by another threads is impossible in any case.
				332	*/
				333
				334	mask = 0;
				335	if (sk->sk_err)
				336	mask = POLLERR;
				337
				338	/*
				339	* POLLHUP is certainly not done right. But poll() doesn't
				340	* have a notion of HUP in just one direction, and for a
				341	* socket the read side is more interesting.
				342	*
				343	* Some poll() documentation says that POLLHUP is incompatible
				344	* with the POLLOUT/POLLWR flags, so somebody should check this
				345	* all. But careful, it tends to be safer to return too many
				346	* bits than too few, and you can easily break real applications
				347	* if you don't tell them that something has hung up!
				348	*
				349	* Check-me.
				350	*
				351	* Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
				352	* our fs/select.c). It means that after we received EOF,
				353	* poll always returns immediately, making impossible poll() on write()
				354	* in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
				355	* if and only if shutdown has been made in both directions.
				356	* Actually, it is interesting to look how Solaris and DUX
				357	* solve this dilemma. I would prefer, if PULLHUP were maskable,
				358	* then we could set it on SND_SHUTDOWN. BTW examples given
				359	* in Stevens' books assume exactly this behaviour, it explains
				360	* why PULLHUP is incompatible with POLLOUT. --ANK
				361	*
				362	* NOTE. Check for TCP_CLOSE is added. The goal is to prevent
				363	* blocking on fresh not-connected or disconnected socket. --ANK
				364	*/
				365	if (sk->sk_shutdown == SHUTDOWN_MASK \|\| sk->sk_state == TCP_CLOSE)
				366	mask \|= POLLHUP;
				367	if (sk->sk_shutdown & RCV_SHUTDOWN)
				368	mask \|= POLLIN \| POLLRDNORM;
				369
				370	/* Connected? */
				371	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT \| TCPF_SYN_RECV)) {
				372	/* Potential race condition. If read of tp below will
				373	* escape above sk->sk_state, we can be illegally awaken
				374	* in SYN_* states. */
				375	if ((tp->rcv_nxt != tp->copied_seq) &&
				376	(tp->urg_seq != tp->copied_seq \|\|
				377	tp->rcv_nxt != tp->copied_seq + 1 \|\|
				378	sock_flag(sk, SOCK_URGINLINE) \|\| !tp->urg_data))
				379	mask \|= POLLIN \| POLLRDNORM;
				380
				381	if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
				382	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
				383	mask \|= POLLOUT \| POLLWRNORM;
				384	} else { /* send SIGIO later */
				385	set_bit(SOCK_ASYNC_NOSPACE,
				386	&sk->sk_socket->flags);
				387	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				388
				389	/* Race breaker. If space is freed after
				390	* wspace test but before the flags are set,
				391	* IO signal will be lost.
				392	*/
				393	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
				394	mask \|= POLLOUT \| POLLWRNORM;
				395	}
				396	}
				397
				398	if (tp->urg_data & TCP_URG_VALID)
				399	mask \|= POLLPRI;
				400	}
				401	return mask;
				402	}
				403
				404	int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
				405	{
				406	struct tcp_sock *tp = tcp_sk(sk);
				407	int answ;
				408
				409	switch (cmd) {
				410	case SIOCINQ:
				411	if (sk->sk_state == TCP_LISTEN)
				412	return -EINVAL;
				413
				414	lock_sock(sk);
				415	if ((1 << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV))
				416	answ = 0;
				417	else if (sock_flag(sk, SOCK_URGINLINE) \|\|
				418	!tp->urg_data \|\|
				419	before(tp->urg_seq, tp->copied_seq) \|\|
				420	!before(tp->urg_seq, tp->rcv_nxt)) {
				421	answ = tp->rcv_nxt - tp->copied_seq;
				422
				423	/* Subtract 1, if FIN is in queue. */
				424	if (answ && !skb_queue_empty(&sk->sk_receive_queue))
				425	answ -=
				426	((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
				427	} else
				428	answ = tp->urg_seq - tp->copied_seq;
				429	release_sock(sk);
				430	break;
				431	case SIOCATMARK:
				432	answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
				433	break;
				434	case SIOCOUTQ:
				435	if (sk->sk_state == TCP_LISTEN)
				436	return -EINVAL;
				437
				438	if ((1 << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV))
				439	answ = 0;
				440	else
				441	answ = tp->write_seq - tp->snd_una;
				442	break;
				443	default:
				444	return -ENOIOCTLCMD;
				445	};
				446
				447	return put_user(answ, (int __user *)arg);
				448	}
				449
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	450	static inline void tcp_mark_push(struct tcp_sock tp, struct sk_buff skb)
				451	{
				452	TCP_SKB_CB(skb)->flags \|= TCPCB_FLAG_PSH;
				453	tp->pushed_seq = tp->write_seq;
				454	}
				455
				456	static inline int forced_push(struct tcp_sock *tp)
				457	{
				458	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
				459	}
				460
				461	static inline void skb_entail(struct sock sk, struct tcp_sock tp,
				462	struct sk_buff *skb)
				463	{
				464	skb->csum = 0;
				465	TCP_SKB_CB(skb)->seq = tp->write_seq;
				466	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
				467	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
				468	TCP_SKB_CB(skb)->sacked = 0;
				469	skb_header_release(skb);
				470	__skb_queue_tail(&sk->sk_write_queue, skb);
				471	sk_charge_skb(sk, skb);
				472	if (!sk->sk_send_head)
				473	sk->sk_send_head = skb;
David S. Miller	89ebd19	2005-08-23 10:13:06 -0700	[diff] [blame]	474	if (tp->nonagle & TCP_NAGLE_PUSH)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	475	tp->nonagle &= ~TCP_NAGLE_PUSH;
				476	}
				477
				478	static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
				479	struct sk_buff *skb)
				480	{
				481	if (flags & MSG_OOB) {
				482	tp->urg_mode = 1;
				483	tp->snd_up = tp->write_seq;
				484	TCP_SKB_CB(skb)->sacked \|= TCPCB_URG;
				485	}
				486	}
				487
				488	static inline void tcp_push(struct sock sk, struct tcp_sock tp, int flags,
				489	int mss_now, int nonagle)
				490	{
				491	if (sk->sk_send_head) {
				492	struct sk_buff *skb = sk->sk_write_queue.prev;
				493	if (!(flags & MSG_MORE) \|\| forced_push(tp))
				494	tcp_mark_push(tp, skb);
				495	tcp_mark_urg(tp, flags, skb);
				496	__tcp_push_pending_frames(sk, tp, mss_now,
				497	(flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
				498	}
				499	}
				500
				501	static ssize_t do_tcp_sendpages(struct sock sk, struct page *pages, int poffset,
				502	size_t psize, int flags)
				503	{
				504	struct tcp_sock *tp = tcp_sk(sk);
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	505	int mss_now, size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	506	int err;
				507	ssize_t copied;
				508	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
				509
				510	/* Wait for a connection to finish. */
				511	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT))
				512	if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
				513	goto out_err;
				514
				515	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
				516
				517	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	518	size_goal = tp->xmit_size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	519	copied = 0;
				520
				521	err = -EPIPE;
				522	if (sk->sk_err \|\| (sk->sk_shutdown & SEND_SHUTDOWN))
				523	goto do_error;
				524
				525	while (psize > 0) {
				526	struct sk_buff *skb = sk->sk_write_queue.prev;
				527	struct page *page = pages[poffset / PAGE_SIZE];
				528	int copy, i, can_coalesce;
				529	int offset = poffset % PAGE_SIZE;
				530	int size = min_t(size_t, psize, PAGE_SIZE - offset);
				531
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	532	if (!sk->sk_send_head \|\| (copy = size_goal - skb->len) <= 0) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	533	new_segment:
				534	if (!sk_stream_memory_free(sk))
				535	goto wait_for_sndbuf;
				536
				537	skb = sk_stream_alloc_pskb(sk, 0, 0,
				538	sk->sk_allocation);
				539	if (!skb)
				540	goto wait_for_memory;
				541
				542	skb_entail(sk, tp, skb);
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	543	copy = size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	544	}
				545
				546	if (copy > size)
				547	copy = size;
				548
				549	i = skb_shinfo(skb)->nr_frags;
				550	can_coalesce = skb_can_coalesce(skb, i, page, offset);
				551	if (!can_coalesce && i >= MAX_SKB_FRAGS) {
				552	tcp_mark_push(tp, skb);
				553	goto new_segment;
				554	}
Herbert Xu	d80d99d6	2005-09-01 17:48:23 -0700	[diff] [blame]	555	if (!sk_stream_wmem_schedule(sk, copy))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	556	goto wait_for_memory;
				557
				558	if (can_coalesce) {
				559	skb_shinfo(skb)->frags[i - 1].size += copy;
				560	} else {
				561	get_page(page);
				562	skb_fill_page_desc(skb, i, page, offset, copy);
				563	}
				564
				565	skb->len += copy;
				566	skb->data_len += copy;
				567	skb->truesize += copy;
				568	sk->sk_wmem_queued += copy;
				569	sk->sk_forward_alloc -= copy;
				570	skb->ip_summed = CHECKSUM_HW;
				571	tp->write_seq += copy;
				572	TCP_SKB_CB(skb)->end_seq += copy;
				573	skb_shinfo(skb)->tso_segs = 0;
				574
				575	if (!copied)
				576	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
				577
				578	copied += copy;
				579	poffset += copy;
				580	if (!(psize -= copy))
				581	goto out;
				582
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	583	if (skb->len < mss_now \|\| (flags & MSG_OOB))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	584	continue;
				585
				586	if (forced_push(tp)) {
				587	tcp_mark_push(tp, skb);
				588	__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
				589	} else if (skb == sk->sk_send_head)
				590	tcp_push_one(sk, mss_now);
				591	continue;
				592
				593	wait_for_sndbuf:
				594	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				595	wait_for_memory:
				596	if (copied)
				597	tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
				598
				599	if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				600	goto do_error;
				601
				602	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	603	size_goal = tp->xmit_size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	604	}
				605
				606	out:
				607	if (copied)
				608	tcp_push(sk, tp, flags, mss_now, tp->nonagle);
				609	return copied;
				610
				611	do_error:
				612	if (copied)
				613	goto out;
				614	out_err:
				615	return sk_stream_error(sk, flags, err);
				616	}
				617
				618	ssize_t tcp_sendpage(struct socket sock, struct page page, int offset,
				619	size_t size, int flags)
				620	{
				621	ssize_t res;
				622	struct sock *sk = sock->sk;
				623
				624	#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM \| NETIF_F_NO_CSUM \| NETIF_F_HW_CSUM)
				625
				626	if (!(sk->sk_route_caps & NETIF_F_SG) \|\|
				627	!(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
				628	return sock_no_sendpage(sock, page, offset, size, flags);
				629
				630	#undef TCP_ZC_CSUM_FLAGS
				631
				632	lock_sock(sk);
				633	TCP_CHECK_TIMER(sk);
				634	res = do_tcp_sendpages(sk, &page, offset, size, flags);
				635	TCP_CHECK_TIMER(sk);
				636	release_sock(sk);
				637	return res;
				638	}
				639
				640	#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
				641	#define TCP_OFF(sk) (sk->sk_sndmsg_off)
				642
				643	static inline int select_size(struct sock sk, struct tcp_sock tp)
				644	{
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	645	int tmp = tp->mss_cache;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	646
David S. Miller	b4e26f5	2005-07-05 15:20:27 -0700	[diff] [blame]	647	if (sk->sk_route_caps & NETIF_F_SG) {
				648	if (sk->sk_route_caps & NETIF_F_TSO)
				649	tmp = 0;
				650	else {
				651	int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
				652
				653	if (tmp >= pgbreak &&
				654	tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
				655	tmp = pgbreak;
				656	}
				657	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	658
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	659	return tmp;
				660	}
				661
				662	int tcp_sendmsg(struct kiocb iocb, struct sock sk, struct msghdr *msg,
				663	size_t size)
				664	{
				665	struct iovec *iov;
				666	struct tcp_sock *tp = tcp_sk(sk);
				667	struct sk_buff *skb;
				668	int iovlen, flags;
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	669	int mss_now, size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	670	int err, copied;
				671	long timeo;
				672
				673	lock_sock(sk);
				674	TCP_CHECK_TIMER(sk);
				675
				676	flags = msg->msg_flags;
				677	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
				678
				679	/* Wait for a connection to finish. */
				680	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT))
				681	if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
				682	goto out_err;
				683
				684	/* This should be in poll */
				685	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
				686
				687	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	688	size_goal = tp->xmit_size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	689
				690	/* Ok commence sending. */
				691	iovlen = msg->msg_iovlen;
				692	iov = msg->msg_iov;
				693	copied = 0;
				694
				695	err = -EPIPE;
				696	if (sk->sk_err \|\| (sk->sk_shutdown & SEND_SHUTDOWN))
				697	goto do_error;
				698
				699	while (--iovlen >= 0) {
				700	int seglen = iov->iov_len;
				701	unsigned char __user *from = iov->iov_base;
				702
				703	iov++;
				704
				705	while (seglen > 0) {
				706	int copy;
				707
				708	skb = sk->sk_write_queue.prev;
				709
				710	if (!sk->sk_send_head \|\|
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	711	(copy = size_goal - skb->len) <= 0) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	712
				713	new_segment:
				714	/* Allocate new segment. If the interface is SG,
				715	* allocate skb fitting to single page.
				716	*/
				717	if (!sk_stream_memory_free(sk))
				718	goto wait_for_sndbuf;
				719
				720	skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
				721	0, sk->sk_allocation);
				722	if (!skb)
				723	goto wait_for_memory;
				724
				725	/*
				726	* Check whether we can use HW checksum.
				727	*/
				728	if (sk->sk_route_caps &
				729	(NETIF_F_IP_CSUM \| NETIF_F_NO_CSUM \|
				730	NETIF_F_HW_CSUM))
				731	skb->ip_summed = CHECKSUM_HW;
				732
				733	skb_entail(sk, tp, skb);
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	734	copy = size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	735	}
				736
				737	/* Try to append data to the end of skb. */
				738	if (copy > seglen)
				739	copy = seglen;
				740
				741	/* Where to copy to? */
				742	if (skb_tailroom(skb) > 0) {
				743	/* We have some space in skb head. Superb! */
				744	if (copy > skb_tailroom(skb))
				745	copy = skb_tailroom(skb);
				746	if ((err = skb_add_data(skb, from, copy)) != 0)
				747	goto do_fault;
				748	} else {
				749	int merge = 0;
				750	int i = skb_shinfo(skb)->nr_frags;
				751	struct page *page = TCP_PAGE(sk);
				752	int off = TCP_OFF(sk);
				753
				754	if (skb_can_coalesce(skb, i, page, off) &&
				755	off != PAGE_SIZE) {
				756	/* We can extend the last page
				757	* fragment. */
				758	merge = 1;
				759	} else if (i == MAX_SKB_FRAGS \|\|
				760	(!i &&
				761	!(sk->sk_route_caps & NETIF_F_SG))) {
				762	/* Need to add new fragment and cannot
				763	* do this because interface is non-SG,
				764	* or because all the page slots are
				765	* busy. */
				766	tcp_mark_push(tp, skb);
				767	goto new_segment;
				768	} else if (page) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	769	if (off == PAGE_SIZE) {
				770	put_page(page);
				771	TCP_PAGE(sk) = page = NULL;
Herbert Xu	fb5f5e6	2005-09-05 18:55:48 -0700	[diff] [blame]	772	off = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	773	}
Herbert Xu	ef01578	2005-09-01 17:48:59 -0700	[diff] [blame]	774	} else
Herbert Xu	fb5f5e6	2005-09-05 18:55:48 -0700	[diff] [blame]	775	off = 0;
Herbert Xu	ef01578	2005-09-01 17:48:59 -0700	[diff] [blame]	776
				777	if (copy > PAGE_SIZE - off)
				778	copy = PAGE_SIZE - off;
				779
				780	if (!sk_stream_wmem_schedule(sk, copy))
				781	goto wait_for_memory;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	782
				783	if (!page) {
				784	/* Allocate new cache page. */
				785	if (!(page = sk_stream_alloc_page(sk)))
				786	goto wait_for_memory;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	787	}
				788
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	789	/* Time to copy data. We are close to
				790	* the end! */
				791	err = skb_copy_to_page(sk, from, skb, page,
				792	off, copy);
				793	if (err) {
				794	/* If this page was new, give it to the
				795	* socket so it does not get leaked.
				796	*/
				797	if (!TCP_PAGE(sk)) {
				798	TCP_PAGE(sk) = page;
				799	TCP_OFF(sk) = 0;
				800	}
				801	goto do_error;
				802	}
				803
				804	/* Update the skb. */
				805	if (merge) {
				806	skb_shinfo(skb)->frags[i - 1].size +=
				807	copy;
				808	} else {
				809	skb_fill_page_desc(skb, i, page, off, copy);
				810	if (TCP_PAGE(sk)) {
				811	get_page(page);
				812	} else if (off + copy < PAGE_SIZE) {
				813	get_page(page);
				814	TCP_PAGE(sk) = page;
				815	}
				816	}
				817
				818	TCP_OFF(sk) = off + copy;
				819	}
				820
				821	if (!copied)
				822	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
				823
				824	tp->write_seq += copy;
				825	TCP_SKB_CB(skb)->end_seq += copy;
				826	skb_shinfo(skb)->tso_segs = 0;
				827
				828	from += copy;
				829	copied += copy;
				830	if ((seglen -= copy) == 0 && iovlen == 0)
				831	goto out;
				832
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	833	if (skb->len < mss_now \|\| (flags & MSG_OOB))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	834	continue;
				835
				836	if (forced_push(tp)) {
				837	tcp_mark_push(tp, skb);
				838	__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
				839	} else if (skb == sk->sk_send_head)
				840	tcp_push_one(sk, mss_now);
				841	continue;
				842
				843	wait_for_sndbuf:
				844	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				845	wait_for_memory:
				846	if (copied)
				847	tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
				848
				849	if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				850	goto do_error;
				851
				852	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	853	size_goal = tp->xmit_size_goal;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	854	}
				855	}
				856
				857	out:
				858	if (copied)
				859	tcp_push(sk, tp, flags, mss_now, tp->nonagle);
				860	TCP_CHECK_TIMER(sk);
				861	release_sock(sk);
				862	return copied;
				863
				864	do_fault:
				865	if (!skb->len) {
				866	if (sk->sk_send_head == skb)
				867	sk->sk_send_head = NULL;
David S. Miller	8728b83	2005-08-09 19:25:21 -0700	[diff] [blame]	868	__skb_unlink(skb, &sk->sk_write_queue);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	869	sk_stream_free_skb(sk, skb);
				870	}
				871
				872	do_error:
				873	if (copied)
				874	goto out;
				875	out_err:
				876	err = sk_stream_error(sk, flags, err);
				877	TCP_CHECK_TIMER(sk);
				878	release_sock(sk);
				879	return err;
				880	}
				881
				882	/*
				883	* Handle reading urgent data. BSD has very simple semantics for
				884	* this, no blocking and very strange errors 8)
				885	*/
				886
				887	static int tcp_recv_urg(struct sock *sk, long timeo,
				888	struct msghdr *msg, int len, int flags,
				889	int *addr_len)
				890	{
				891	struct tcp_sock *tp = tcp_sk(sk);
				892
				893	/* No URG data to read. */
				894	if (sock_flag(sk, SOCK_URGINLINE) \|\| !tp->urg_data \|\|
				895	tp->urg_data == TCP_URG_READ)
				896	return -EINVAL; /* Yes this is right ! */
				897
				898	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
				899	return -ENOTCONN;
				900
				901	if (tp->urg_data & TCP_URG_VALID) {
				902	int err = 0;
				903	char c = tp->urg_data;
				904
				905	if (!(flags & MSG_PEEK))
				906	tp->urg_data = TCP_URG_READ;
				907
				908	/* Read urgent data. */
				909	msg->msg_flags \|= MSG_OOB;
				910
				911	if (len > 0) {
				912	if (!(flags & MSG_TRUNC))
				913	err = memcpy_toiovec(msg->msg_iov, &c, 1);
				914	len = 1;
				915	} else
				916	msg->msg_flags \|= MSG_TRUNC;
				917
				918	return err ? -EFAULT : len;
				919	}
				920
				921	if (sk->sk_state == TCP_CLOSE \|\| (sk->sk_shutdown & RCV_SHUTDOWN))
				922	return 0;
				923
				924	/* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
				925	* the available implementations agree in this case:
				926	* this call should never block, independent of the
				927	* blocking state of the socket.
				928	* Mike <pall@rz.uni-karlsruhe.de>
				929	*/
				930	return -EAGAIN;
				931	}
				932
				933	/* Clean up the receive buffer for full frames taken by the user,
				934	* then send an ACK if necessary. COPIED is the number of bytes
				935	* tcp_recvmsg has given to the user so far, it speeds up the
				936	* calculation of whether or not we must ACK for the sake of
				937	* a window update.
				938	*/
				939	static void cleanup_rbuf(struct sock *sk, int copied)
				940	{
				941	struct tcp_sock *tp = tcp_sk(sk);
				942	int time_to_ack = 0;
				943
				944	#if TCP_DEBUG
				945	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
				946
				947	BUG_TRAP(!skb \|\| before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
				948	#endif
				949
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	950	if (inet_csk_ack_scheduled(sk)) {
				951	const struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	952	/* Delayed ACKs frequently hit locked sockets during bulk
				953	* receive. */
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	954	if (icsk->icsk_ack.blocked \|\|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	955	/* Once-per-two-segments ACK was not sent by tcp_input.c */
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	956	tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss \|\|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	957	/*
				958	* If this read emptied read buffer, we send ACK, if
				959	* connection is not bidirectional, user drained
				960	* receive buffer and there was a small segment
				961	* in queue.
				962	*/
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	963	(copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
				964	!icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	965	time_to_ack = 1;
				966	}
				967
				968	/* We send an ACK if we can now advertise a non-zero window
				969	* which has been raised "significantly".
				970	*
				971	* Even if window raised up to infinity, do not send window open ACK
				972	* in states, where we will not receive more. It is useless.
				973	*/
				974	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
				975	__u32 rcv_window_now = tcp_receive_window(tp);
				976
				977	/* Optimize, __tcp_select_window() is not cheap. */
				978	if (2*rcv_window_now <= tp->window_clamp) {
				979	__u32 new_window = __tcp_select_window(sk);
				980
				981	/* Send ACK now, if this read freed lots of space
				982	* in our buffer. Certainly, new_window is new window.
				983	* We can advertise it now, if it is not less than current one.
				984	* "Lots" means "at least twice" here.
				985	*/
				986	if (new_window && new_window >= 2 * rcv_window_now)
				987	time_to_ack = 1;
				988	}
				989	}
				990	if (time_to_ack)
				991	tcp_send_ack(sk);
				992	}
				993
				994	static void tcp_prequeue_process(struct sock *sk)
				995	{
				996	struct sk_buff *skb;
				997	struct tcp_sock *tp = tcp_sk(sk);
				998
David S. Miller	b03efcf	2005-07-08 14:57:23 -0700	[diff] [blame]	999	NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1000
				1001	/* RX process wants to run with disabled BHs, though it is not
				1002	* necessary */
				1003	local_bh_disable();
				1004	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
				1005	sk->sk_backlog_rcv(sk, skb);
				1006	local_bh_enable();
				1007
				1008	/* Clear memory counter. */
				1009	tp->ucopy.memory = 0;
				1010	}
				1011
				1012	static inline struct sk_buff tcp_recv_skb(struct sock sk, u32 seq, u32 *off)
				1013	{
				1014	struct sk_buff *skb;
				1015	u32 offset;
				1016
				1017	skb_queue_walk(&sk->sk_receive_queue, skb) {
				1018	offset = seq - TCP_SKB_CB(skb)->seq;
				1019	if (skb->h.th->syn)
				1020	offset--;
				1021	if (offset < skb->len \|\| skb->h.th->fin) {
				1022	*off = offset;
				1023	return skb;
				1024	}
				1025	}
				1026	return NULL;
				1027	}
				1028
				1029	/*
				1030	* This routine provides an alternative to tcp_recvmsg() for routines
				1031	* that would like to handle copying from skbuffs directly in 'sendfile'
				1032	* fashion.
				1033	* Note:
				1034	* - It is assumed that the socket was locked by the caller.
				1035	* - The routine does not block.
				1036	* - At present, there is no support for reading OOB data
				1037	* or for 'peeking' the socket using this routine
				1038	* (although both would be easy to implement).
				1039	*/
				1040	int tcp_read_sock(struct sock sk, read_descriptor_t desc,
				1041	sk_read_actor_t recv_actor)
				1042	{
				1043	struct sk_buff *skb;
				1044	struct tcp_sock *tp = tcp_sk(sk);
				1045	u32 seq = tp->copied_seq;
				1046	u32 offset;
				1047	int copied = 0;
				1048
				1049	if (sk->sk_state == TCP_LISTEN)
				1050	return -ENOTCONN;
				1051	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
				1052	if (offset < skb->len) {
				1053	size_t used, len;
				1054
				1055	len = skb->len - offset;
				1056	/* Stop reading if we hit a patch of urgent data */
				1057	if (tp->urg_data) {
				1058	u32 urg_offset = tp->urg_seq - seq;
				1059	if (urg_offset < len)
				1060	len = urg_offset;
				1061	if (!len)
				1062	break;
				1063	}
				1064	used = recv_actor(desc, skb, offset, len);
				1065	if (used <= len) {
				1066	seq += used;
				1067	copied += used;
				1068	offset += used;
				1069	}
				1070	if (offset != skb->len)
				1071	break;
				1072	}
				1073	if (skb->h.th->fin) {
				1074	sk_eat_skb(sk, skb);
				1075	++seq;
				1076	break;
				1077	}
				1078	sk_eat_skb(sk, skb);
				1079	if (!desc->count)
				1080	break;
				1081	}
				1082	tp->copied_seq = seq;
				1083
				1084	tcp_rcv_space_adjust(sk);
				1085
				1086	/* Clean up data we have read: This will do ACK frames. */
				1087	if (copied)
				1088	cleanup_rbuf(sk, copied);
				1089	return copied;
				1090	}
				1091
				1092	/*
				1093	* This routine copies from a sock struct into the user buffer.
				1094	*
				1095	* Technical note: in 2.3 we work on _locked_ socket, so that
				1096	* tricks with *seq access order and skb->users are not required.
				1097	* Probably, code can be easily improved even more.
				1098	*/
				1099
				1100	int tcp_recvmsg(struct kiocb iocb, struct sock sk, struct msghdr *msg,
				1101	size_t len, int nonblock, int flags, int *addr_len)
				1102	{
				1103	struct tcp_sock *tp = tcp_sk(sk);
				1104	int copied = 0;
				1105	u32 peek_seq;
				1106	u32 *seq;
				1107	unsigned long used;
				1108	int err;
				1109	int target; /* Read at least this many bytes */
				1110	long timeo;
				1111	struct task_struct *user_recv = NULL;
				1112
				1113	lock_sock(sk);
				1114
				1115	TCP_CHECK_TIMER(sk);
				1116
				1117	err = -ENOTCONN;
				1118	if (sk->sk_state == TCP_LISTEN)
				1119	goto out;
				1120
				1121	timeo = sock_rcvtimeo(sk, nonblock);
				1122
				1123	/* Urgent data needs to be handled specially. */
				1124	if (flags & MSG_OOB)
				1125	goto recv_urg;
				1126
				1127	seq = &tp->copied_seq;
				1128	if (flags & MSG_PEEK) {
				1129	peek_seq = tp->copied_seq;
				1130	seq = &peek_seq;
				1131	}
				1132
				1133	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
				1134
				1135	do {
				1136	struct sk_buff *skb;
				1137	u32 offset;
				1138
				1139	/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
				1140	if (tp->urg_data && tp->urg_seq == *seq) {
				1141	if (copied)
				1142	break;
				1143	if (signal_pending(current)) {
				1144	copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
				1145	break;
				1146	}
				1147	}
				1148
				1149	/* Next get a buffer. */
				1150
				1151	skb = skb_peek(&sk->sk_receive_queue);
				1152	do {
				1153	if (!skb)
				1154	break;
				1155
				1156	/* Now that we have two receive queues this
				1157	* shouldn't happen.
				1158	*/
				1159	if (before(*seq, TCP_SKB_CB(skb)->seq)) {
				1160	printk(KERN_INFO "recvmsg bug: copied %X "
				1161	"seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
				1162	break;
				1163	}
				1164	offset = *seq - TCP_SKB_CB(skb)->seq;
				1165	if (skb->h.th->syn)
				1166	offset--;
				1167	if (offset < skb->len)
				1168	goto found_ok_skb;
				1169	if (skb->h.th->fin)
				1170	goto found_fin_ok;
				1171	BUG_TRAP(flags & MSG_PEEK);
				1172	skb = skb->next;
				1173	} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
				1174
				1175	/* Well, if we have backlog, try to process it now yet. */
				1176
				1177	if (copied >= target && !sk->sk_backlog.tail)
				1178	break;
				1179
				1180	if (copied) {
				1181	if (sk->sk_err \|\|
				1182	sk->sk_state == TCP_CLOSE \|\|
				1183	(sk->sk_shutdown & RCV_SHUTDOWN) \|\|
				1184	!timeo \|\|
				1185	signal_pending(current) \|\|
				1186	(flags & MSG_PEEK))
				1187	break;
				1188	} else {
				1189	if (sock_flag(sk, SOCK_DONE))
				1190	break;
				1191
				1192	if (sk->sk_err) {
				1193	copied = sock_error(sk);
				1194	break;
				1195	}
				1196
				1197	if (sk->sk_shutdown & RCV_SHUTDOWN)
				1198	break;
				1199
				1200	if (sk->sk_state == TCP_CLOSE) {
				1201	if (!sock_flag(sk, SOCK_DONE)) {
				1202	/* This occurs when user tries to read
				1203	* from never connected socket.
				1204	*/
				1205	copied = -ENOTCONN;
				1206	break;
				1207	}
				1208	break;
				1209	}
				1210
				1211	if (!timeo) {
				1212	copied = -EAGAIN;
				1213	break;
				1214	}
				1215
				1216	if (signal_pending(current)) {
				1217	copied = sock_intr_errno(timeo);
				1218	break;
				1219	}
				1220	}
				1221
				1222	cleanup_rbuf(sk, copied);
				1223
David S. Miller	7df5512	2005-06-18 23:01:10 -0700	[diff] [blame]	1224	if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1225	/* Install new reader */
				1226	if (!user_recv && !(flags & (MSG_TRUNC \| MSG_PEEK))) {
				1227	user_recv = current;
				1228	tp->ucopy.task = user_recv;
				1229	tp->ucopy.iov = msg->msg_iov;
				1230	}
				1231
				1232	tp->ucopy.len = len;
				1233
				1234	BUG_TRAP(tp->copied_seq == tp->rcv_nxt \|\|
				1235	(flags & (MSG_PEEK \| MSG_TRUNC)));
				1236
				1237	/* Ugly... If prequeue is not empty, we have to
				1238	* process it before releasing socket, otherwise
				1239	* order will be broken at second iteration.
				1240	* More elegant solution is required!!!
				1241	*
				1242	* Look: we have the following (pseudo)queues:
				1243	*
				1244	* 1. packets in flight
				1245	* 2. backlog
				1246	* 3. prequeue
				1247	* 4. receive_queue
				1248	*
				1249	* Each queue can be processed only if the next ones
				1250	* are empty. At this point we have empty receive_queue.
				1251	* But prequeue _can_ be not empty after 2nd iteration,
				1252	* when we jumped to start of loop because backlog
				1253	* processing added something to receive_queue.
				1254	* We cannot release_sock(), because backlog contains
				1255	* packets arrived _after_ prequeued ones.
				1256	*
				1257	* Shortly, algorithm is clear --- to process all
				1258	* the queues in order. We could make it more directly,
				1259	* requeueing packets from backlog to prequeue, if
				1260	* is not empty. It is more elegant, but eats cycles,
				1261	* unfortunately.
				1262	*/
David S. Miller	b03efcf	2005-07-08 14:57:23 -0700	[diff] [blame]	1263	if (!skb_queue_empty(&tp->ucopy.prequeue))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1264	goto do_prequeue;
				1265
				1266	/* __ Set realtime policy in scheduler __ */
				1267	}
				1268
				1269	if (copied >= target) {
				1270	/* Do not sleep, just process backlog. */
				1271	release_sock(sk);
				1272	lock_sock(sk);
				1273	} else
				1274	sk_wait_data(sk, &timeo);
				1275
				1276	if (user_recv) {
				1277	int chunk;
				1278
				1279	/* __ Restore normal policy in scheduler __ */
				1280
				1281	if ((chunk = len - tp->ucopy.len) != 0) {
				1282	NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
				1283	len -= chunk;
				1284	copied += chunk;
				1285	}
				1286
				1287	if (tp->rcv_nxt == tp->copied_seq &&
David S. Miller	b03efcf	2005-07-08 14:57:23 -0700	[diff] [blame]	1288	!skb_queue_empty(&tp->ucopy.prequeue)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1289	do_prequeue:
				1290	tcp_prequeue_process(sk);
				1291
				1292	if ((chunk = len - tp->ucopy.len) != 0) {
				1293	NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
				1294	len -= chunk;
				1295	copied += chunk;
				1296	}
				1297	}
				1298	}
				1299	if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
				1300	if (net_ratelimit())
				1301	printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
				1302	current->comm, current->pid);
				1303	peek_seq = tp->copied_seq;
				1304	}
				1305	continue;
				1306
				1307	found_ok_skb:
				1308	/* Ok so how much can we use? */
				1309	used = skb->len - offset;
				1310	if (len < used)
				1311	used = len;
				1312
				1313	/* Do we have urgent data here? */
				1314	if (tp->urg_data) {
				1315	u32 urg_offset = tp->urg_seq - *seq;
				1316	if (urg_offset < used) {
				1317	if (!urg_offset) {
				1318	if (!sock_flag(sk, SOCK_URGINLINE)) {
				1319	++*seq;
				1320	offset++;
				1321	used--;
				1322	if (!used)
				1323	goto skip_copy;
				1324	}
				1325	} else
				1326	used = urg_offset;
				1327	}
				1328	}
				1329
				1330	if (!(flags & MSG_TRUNC)) {
				1331	err = skb_copy_datagram_iovec(skb, offset,
				1332	msg->msg_iov, used);
				1333	if (err) {
				1334	/* Exception. Bailout! */
				1335	if (!copied)
				1336	copied = -EFAULT;
				1337	break;
				1338	}
				1339	}
				1340
				1341	*seq += used;
				1342	copied += used;
				1343	len -= used;
				1344
				1345	tcp_rcv_space_adjust(sk);
				1346
				1347	skip_copy:
				1348	if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
				1349	tp->urg_data = 0;
				1350	tcp_fast_path_check(sk, tp);
				1351	}
				1352	if (used + offset < skb->len)
				1353	continue;
				1354
				1355	if (skb->h.th->fin)
				1356	goto found_fin_ok;
				1357	if (!(flags & MSG_PEEK))
				1358	sk_eat_skb(sk, skb);
				1359	continue;
				1360
				1361	found_fin_ok:
				1362	/* Process the FIN. */
				1363	++*seq;
				1364	if (!(flags & MSG_PEEK))
				1365	sk_eat_skb(sk, skb);
				1366	break;
				1367	} while (len > 0);
				1368
				1369	if (user_recv) {
David S. Miller	b03efcf	2005-07-08 14:57:23 -0700	[diff] [blame]	1370	if (!skb_queue_empty(&tp->ucopy.prequeue)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1371	int chunk;
				1372
				1373	tp->ucopy.len = copied > 0 ? len : 0;
				1374
				1375	tcp_prequeue_process(sk);
				1376
				1377	if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
				1378	NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
				1379	len -= chunk;
				1380	copied += chunk;
				1381	}
				1382	}
				1383
				1384	tp->ucopy.task = NULL;
				1385	tp->ucopy.len = 0;
				1386	}
				1387
				1388	/* According to UNIX98, msg_name/msg_namelen are ignored
				1389	* on connected socket. I was just happy when found this 8) --ANK
				1390	*/
				1391
				1392	/* Clean up data we have read: This will do ACK frames. */
				1393	cleanup_rbuf(sk, copied);
				1394
				1395	TCP_CHECK_TIMER(sk);
				1396	release_sock(sk);
				1397	return copied;
				1398
				1399	out:
				1400	TCP_CHECK_TIMER(sk);
				1401	release_sock(sk);
				1402	return err;
				1403
				1404	recv_urg:
				1405	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
				1406	goto out;
				1407	}
				1408
				1409	/*
				1410	* State processing on a close. This implements the state shift for
				1411	* sending our FIN frame. Note that we only send a FIN for some
				1412	* states. A shutdown() may have already sent the FIN, or we may be
				1413	* closed.
				1414	*/
				1415
				1416	static unsigned char new_state[16] = {
				1417	/* current state: new state: action: */
				1418	/* (Invalid) */ TCP_CLOSE,
				1419	/* TCP_ESTABLISHED */ TCP_FIN_WAIT1 \| TCP_ACTION_FIN,
				1420	/* TCP_SYN_SENT */ TCP_CLOSE,
				1421	/* TCP_SYN_RECV */ TCP_FIN_WAIT1 \| TCP_ACTION_FIN,
				1422	/* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
				1423	/* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
				1424	/* TCP_TIME_WAIT */ TCP_CLOSE,
				1425	/* TCP_CLOSE */ TCP_CLOSE,
				1426	/* TCP_CLOSE_WAIT */ TCP_LAST_ACK \| TCP_ACTION_FIN,
				1427	/* TCP_LAST_ACK */ TCP_LAST_ACK,
				1428	/* TCP_LISTEN */ TCP_CLOSE,
				1429	/* TCP_CLOSING */ TCP_CLOSING,
				1430	};
				1431
				1432	static int tcp_close_state(struct sock *sk)
				1433	{
				1434	int next = (int)new_state[sk->sk_state];
				1435	int ns = next & TCP_STATE_MASK;
				1436
				1437	tcp_set_state(sk, ns);
				1438
				1439	return next & TCP_ACTION_FIN;
				1440	}
				1441
				1442	/*
				1443	* Shutdown the sending side of a connection. Much like close except
				1444	* that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
				1445	*/
				1446
				1447	void tcp_shutdown(struct sock *sk, int how)
				1448	{
				1449	/* We need to grab some memory, and put together a FIN,
				1450	* and then put it into the queue to be sent.
				1451	* Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
				1452	*/
				1453	if (!(how & SEND_SHUTDOWN))
				1454	return;
				1455
				1456	/* If we've already sent a FIN, or it's a closed state, skip this. */
				1457	if ((1 << sk->sk_state) &
				1458	(TCPF_ESTABLISHED \| TCPF_SYN_SENT \|
				1459	TCPF_SYN_RECV \| TCPF_CLOSE_WAIT)) {
				1460	/* Clear out any half completed packets. FIN if needed. */
				1461	if (tcp_close_state(sk))
				1462	tcp_send_fin(sk);
				1463	}
				1464	}
				1465
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1466	void tcp_close(struct sock *sk, long timeout)
				1467	{
				1468	struct sk_buff *skb;
				1469	int data_was_unread = 0;
				1470
				1471	lock_sock(sk);
				1472	sk->sk_shutdown = SHUTDOWN_MASK;
				1473
				1474	if (sk->sk_state == TCP_LISTEN) {
				1475	tcp_set_state(sk, TCP_CLOSE);
				1476
				1477	/* Special case. */
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1478	inet_csk_listen_stop(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1479
				1480	goto adjudge_to_death;
				1481	}
				1482
				1483	/* We need to flush the recv. buffs. We do this only on the
				1484	* descriptor close, not protocol-sourced closes, because the
				1485	* reader process may not have drained the data yet!
				1486	*/
				1487	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
				1488	u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
				1489	skb->h.th->fin;
				1490	data_was_unread += len;
				1491	__kfree_skb(skb);
				1492	}
				1493
				1494	sk_stream_mem_reclaim(sk);
				1495
				1496	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
				1497	* 3.10, we send a RST here because data was lost. To
				1498	* witness the awful effects of the old behavior of always
				1499	* doing a FIN, run an older 2.1.x kernel or 2.0.x, start
				1500	* a bulk GET in an FTP client, suspend the process, wait
				1501	* for the client to advertise a zero window, then kill -9
				1502	* the FTP client, wheee... Note: timeout is always zero
				1503	* in such a case.
				1504	*/
				1505	if (data_was_unread) {
				1506	/* Unread data was tossed, zap the connection. */
				1507	NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
				1508	tcp_set_state(sk, TCP_CLOSE);
				1509	tcp_send_active_reset(sk, GFP_KERNEL);
				1510	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
				1511	/* Check zero linger _after_ checking for unread data. */
				1512	sk->sk_prot->disconnect(sk, 0);
				1513	NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
				1514	} else if (tcp_close_state(sk)) {
				1515	/* We FIN if the application ate all the data before
				1516	* zapping the connection.
				1517	*/
				1518
				1519	/* RED-PEN. Formally speaking, we have broken TCP state
				1520	* machine. State transitions:
				1521	*
				1522	* TCP_ESTABLISHED -> TCP_FIN_WAIT1
				1523	* TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
				1524	* TCP_CLOSE_WAIT -> TCP_LAST_ACK
				1525	*
				1526	* are legal only when FIN has been sent (i.e. in window),
				1527	* rather than queued out of window. Purists blame.
				1528	*
				1529	* F.e. "RFC state" is ESTABLISHED,
				1530	* if Linux state is FIN-WAIT-1, but FIN is still not sent.
				1531	*
				1532	* The visible declinations are that sometimes
				1533	* we enter time-wait state, when it is not required really
				1534	* (harmless), do not send active resets, when they are
				1535	* required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
				1536	* they look as CLOSING or LAST_ACK for Linux)
				1537	* Probably, I missed some more holelets.
				1538	* --ANK
				1539	*/
				1540	tcp_send_fin(sk);
				1541	}
				1542
				1543	sk_stream_wait_close(sk, timeout);
				1544
				1545	adjudge_to_death:
				1546	/* It is the last release_sock in its life. It will remove backlog. */
				1547	release_sock(sk);
				1548
				1549
				1550	/* Now socket is owned by kernel and we acquire BH lock
				1551	to finish close. No need to check for user refs.
				1552	*/
				1553	local_bh_disable();
				1554	bh_lock_sock(sk);
				1555	BUG_TRAP(!sock_owned_by_user(sk));
				1556
				1557	sock_hold(sk);
				1558	sock_orphan(sk);
				1559
				1560	/* This is a (useful) BSD violating of the RFC. There is a
				1561	* problem with TCP as specified in that the other end could
				1562	* keep a socket open forever with no application left this end.
				1563	* We use a 3 minute timeout (about the same as BSD) then kill
				1564	* our end. If they send after that then tough - BUT: long enough
				1565	* that we won't make the old 4*rto = almost no time - whoops
				1566	* reset mistake.
				1567	*
				1568	* Nope, it was not mistake. It is really desired behaviour
				1569	* f.e. on http servers, when such sockets are useless, but
				1570	* consume significant resources. Let's do it with special
				1571	* linger2 option. --ANK
				1572	*/
				1573
				1574	if (sk->sk_state == TCP_FIN_WAIT2) {
				1575	struct tcp_sock *tp = tcp_sk(sk);
				1576	if (tp->linger2 < 0) {
				1577	tcp_set_state(sk, TCP_CLOSE);
				1578	tcp_send_active_reset(sk, GFP_ATOMIC);
				1579	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
				1580	} else {
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1581	const int tmo = tcp_fin_time(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1582
				1583	if (tmo > TCP_TIMEWAIT_LEN) {
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1584	inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1585	} else {
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1586	atomic_inc(sk->sk_prot->orphan_count);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1587	tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				1588	goto out;
				1589	}
				1590	}
				1591	}
				1592	if (sk->sk_state != TCP_CLOSE) {
				1593	sk_stream_mem_reclaim(sk);
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1594	if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans \|\|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1595	(sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
				1596	atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
				1597	if (net_ratelimit())
				1598	printk(KERN_INFO "TCP: too many of orphaned "
				1599	"sockets\n");
				1600	tcp_set_state(sk, TCP_CLOSE);
				1601	tcp_send_active_reset(sk, GFP_ATOMIC);
				1602	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
				1603	}
				1604	}
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1605	atomic_inc(sk->sk_prot->orphan_count);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1606
				1607	if (sk->sk_state == TCP_CLOSE)
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1608	inet_csk_destroy_sock(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1609	/* Otherwise, socket is reprieved until protocol close. */
				1610
				1611	out:
				1612	bh_unlock_sock(sk);
				1613	local_bh_enable();
				1614	sock_put(sk);
				1615	}
				1616
				1617	/* These states need RST on ABORT according to RFC793 */
				1618
				1619	static inline int tcp_need_reset(int state)
				1620	{
				1621	return (1 << state) &
				1622	(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT \| TCPF_FIN_WAIT1 \|
				1623	TCPF_FIN_WAIT2 \| TCPF_SYN_RECV);
				1624	}
				1625
				1626	int tcp_disconnect(struct sock *sk, int flags)
				1627	{
				1628	struct inet_sock *inet = inet_sk(sk);
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1629	struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1630	struct tcp_sock *tp = tcp_sk(sk);
				1631	int err = 0;
				1632	int old_state = sk->sk_state;
				1633
				1634	if (old_state != TCP_CLOSE)
				1635	tcp_set_state(sk, TCP_CLOSE);
				1636
				1637	/* ABORT function of RFC793 */
				1638	if (old_state == TCP_LISTEN) {
Arnaldo Carvalho de Melo	0a5578c	2005-08-09 20:11:41 -0700	[diff] [blame]	1639	inet_csk_listen_stop(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1640	} else if (tcp_need_reset(old_state) \|\|
				1641	(tp->snd_nxt != tp->write_seq &&
				1642	(1 << old_state) & (TCPF_CLOSING \| TCPF_LAST_ACK))) {
Stephen Hemminger	caa20d9a	2005-11-10 17:13:47 -0800	[diff] [blame]	1643	/* The last check adjusts for discrepancy of Linux wrt. RFC
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1644	* states
				1645	*/
				1646	tcp_send_active_reset(sk, gfp_any());
				1647	sk->sk_err = ECONNRESET;
				1648	} else if (old_state == TCP_SYN_SENT)
				1649	sk->sk_err = ECONNRESET;
				1650
				1651	tcp_clear_xmit_timers(sk);
				1652	__skb_queue_purge(&sk->sk_receive_queue);
				1653	sk_stream_writequeue_purge(sk);
				1654	__skb_queue_purge(&tp->out_of_order_queue);
				1655
				1656	inet->dport = 0;
				1657
				1658	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
				1659	inet_reset_saddr(sk);
				1660
				1661	sk->sk_shutdown = 0;
				1662	sock_reset_flag(sk, SOCK_DONE);
				1663	tp->srtt = 0;
				1664	if ((tp->write_seq += tp->max_window + 2) == 0)
				1665	tp->write_seq = 1;
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1666	icsk->icsk_backoff = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1667	tp->snd_cwnd = 2;
Arnaldo Carvalho de Melo	6687e98	2005-08-10 04:03:31 -0300	[diff] [blame]	1668	icsk->icsk_probes_out = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1669	tp->packets_out = 0;
				1670	tp->snd_ssthresh = 0x7fffffff;
				1671	tp->snd_cwnd_cnt = 0;
Stephen Hemminger	9772efb	2005-11-10 17:09:53 -0800	[diff] [blame]	1672	tp->bytes_acked = 0;
Arnaldo Carvalho de Melo	6687e98	2005-08-10 04:03:31 -0300	[diff] [blame]	1673	tcp_set_ca_state(sk, TCP_CA_Open);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1674	tcp_clear_retrans(tp);
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1675	inet_csk_delack_init(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1676	sk->sk_send_head = NULL;
				1677	tp->rx_opt.saw_tstamp = 0;
				1678	tcp_sack_reset(&tp->rx_opt);
				1679	__sk_dst_reset(sk);
				1680
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1681	BUG_TRAP(!inet->num \|\| icsk->icsk_bind_hash);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1682
				1683	sk->sk_error_report(sk);
				1684	return err;
				1685	}
				1686
				1687	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1688	* Socket option code for TCP.
				1689	*/
				1690	int tcp_setsockopt(struct sock sk, int level, int optname, char __user optval,
				1691	int optlen)
				1692	{
				1693	struct tcp_sock *tp = tcp_sk(sk);
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1694	struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1695	int val;
				1696	int err = 0;
				1697
				1698	if (level != SOL_TCP)
				1699	return tp->af_specific->setsockopt(sk, level, optname,
				1700	optval, optlen);
				1701
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	1702	/* This is a string value all the others are int's */
				1703	if (optname == TCP_CONGESTION) {
				1704	char name[TCP_CA_NAME_MAX];
				1705
				1706	if (optlen < 1)
				1707	return -EINVAL;
				1708
				1709	val = strncpy_from_user(name, optval,
				1710	min(TCP_CA_NAME_MAX-1, optlen));
				1711	if (val < 0)
				1712	return -EFAULT;
				1713	name[val] = 0;
				1714
				1715	lock_sock(sk);
Arnaldo Carvalho de Melo	6687e98	2005-08-10 04:03:31 -0300	[diff] [blame]	1716	err = tcp_set_congestion_control(sk, name);
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	1717	release_sock(sk);
				1718	return err;
				1719	}
				1720
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1721	if (optlen < sizeof(int))
				1722	return -EINVAL;
				1723
				1724	if (get_user(val, (int __user *)optval))
				1725	return -EFAULT;
				1726
				1727	lock_sock(sk);
				1728
				1729	switch (optname) {
				1730	case TCP_MAXSEG:
				1731	/* Values greater than interface MTU won't take effect. However
				1732	* at the point when this call is done we typically don't yet
				1733	* know which interface is going to be used */
				1734	if (val < 8 \|\| val > MAX_TCP_WINDOW) {
				1735	err = -EINVAL;
				1736	break;
				1737	}
				1738	tp->rx_opt.user_mss = val;
				1739	break;
				1740
				1741	case TCP_NODELAY:
				1742	if (val) {
				1743	/* TCP_NODELAY is weaker than TCP_CORK, so that
				1744	* this option on corked socket is remembered, but
				1745	* it is not activated until cork is cleared.
				1746	*
				1747	* However, when TCP_NODELAY is set we make
				1748	* an explicit push, which overrides even TCP_CORK
				1749	* for currently queued segments.
				1750	*/
				1751	tp->nonagle \|= TCP_NAGLE_OFF\|TCP_NAGLE_PUSH;
				1752	tcp_push_pending_frames(sk, tp);
				1753	} else {
				1754	tp->nonagle &= ~TCP_NAGLE_OFF;
				1755	}
				1756	break;
				1757
				1758	case TCP_CORK:
				1759	/* When set indicates to always queue non-full frames.
				1760	* Later the user clears this option and we transmit
				1761	* any pending partial frames in the queue. This is
				1762	* meant to be used alongside sendfile() to get properly
				1763	* filled frames when the user (for example) must write
				1764	* out headers with a write() call first and then use
				1765	* sendfile to send out the data parts.
				1766	*
				1767	* TCP_CORK can be set together with TCP_NODELAY and it is
				1768	* stronger than TCP_NODELAY.
				1769	*/
				1770	if (val) {
				1771	tp->nonagle \|= TCP_NAGLE_CORK;
				1772	} else {
				1773	tp->nonagle &= ~TCP_NAGLE_CORK;
				1774	if (tp->nonagle&TCP_NAGLE_OFF)
				1775	tp->nonagle \|= TCP_NAGLE_PUSH;
				1776	tcp_push_pending_frames(sk, tp);
				1777	}
				1778	break;
				1779
				1780	case TCP_KEEPIDLE:
				1781	if (val < 1 \|\| val > MAX_TCP_KEEPIDLE)
				1782	err = -EINVAL;
				1783	else {
				1784	tp->keepalive_time = val * HZ;
				1785	if (sock_flag(sk, SOCK_KEEPOPEN) &&
				1786	!((1 << sk->sk_state) &
				1787	(TCPF_CLOSE \| TCPF_LISTEN))) {
				1788	__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
				1789	if (tp->keepalive_time > elapsed)
				1790	elapsed = tp->keepalive_time - elapsed;
				1791	else
				1792	elapsed = 0;
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1793	inet_csk_reset_keepalive_timer(sk, elapsed);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1794	}
				1795	}
				1796	break;
				1797	case TCP_KEEPINTVL:
				1798	if (val < 1 \|\| val > MAX_TCP_KEEPINTVL)
				1799	err = -EINVAL;
				1800	else
				1801	tp->keepalive_intvl = val * HZ;
				1802	break;
				1803	case TCP_KEEPCNT:
				1804	if (val < 1 \|\| val > MAX_TCP_KEEPCNT)
				1805	err = -EINVAL;
				1806	else
				1807	tp->keepalive_probes = val;
				1808	break;
				1809	case TCP_SYNCNT:
				1810	if (val < 1 \|\| val > MAX_TCP_SYNCNT)
				1811	err = -EINVAL;
				1812	else
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1813	icsk->icsk_syn_retries = val;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1814	break;
				1815
				1816	case TCP_LINGER2:
				1817	if (val < 0)
				1818	tp->linger2 = -1;
				1819	else if (val > sysctl_tcp_fin_timeout / HZ)
				1820	tp->linger2 = 0;
				1821	else
				1822	tp->linger2 = val * HZ;
				1823	break;
				1824
				1825	case TCP_DEFER_ACCEPT:
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame]	1826	icsk->icsk_accept_queue.rskq_defer_accept = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1827	if (val > 0) {
				1828	/* Translate value in seconds to number of
				1829	* retransmits */
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame]	1830	while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1831	val > ((TCP_TIMEOUT_INIT / HZ) <<
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame]	1832	icsk->icsk_accept_queue.rskq_defer_accept))
				1833	icsk->icsk_accept_queue.rskq_defer_accept++;
				1834	icsk->icsk_accept_queue.rskq_defer_accept++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1835	}
				1836	break;
				1837
				1838	case TCP_WINDOW_CLAMP:
				1839	if (!val) {
				1840	if (sk->sk_state != TCP_CLOSE) {
				1841	err = -EINVAL;
				1842	break;
				1843	}
				1844	tp->window_clamp = 0;
				1845	} else
				1846	tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
				1847	SOCK_MIN_RCVBUF / 2 : val;
				1848	break;
				1849
				1850	case TCP_QUICKACK:
				1851	if (!val) {
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1852	icsk->icsk_ack.pingpong = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1853	} else {
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1854	icsk->icsk_ack.pingpong = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1855	if ((1 << sk->sk_state) &
				1856	(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT) &&
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1857	inet_csk_ack_scheduled(sk)) {
				1858	icsk->icsk_ack.pending \|= ICSK_ACK_PUSHED;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1859	cleanup_rbuf(sk, 1);
				1860	if (!(val & 1))
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1861	icsk->icsk_ack.pingpong = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1862	}
				1863	}
				1864	break;
				1865
				1866	default:
				1867	err = -ENOPROTOOPT;
				1868	break;
				1869	};
				1870	release_sock(sk);
				1871	return err;
				1872	}
				1873
				1874	/* Return information about state of tcp endpoint in API format. */
				1875	void tcp_get_info(struct sock sk, struct tcp_info info)
				1876	{
				1877	struct tcp_sock *tp = tcp_sk(sk);
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1878	const struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1879	u32 now = tcp_time_stamp;
				1880
				1881	memset(info, 0, sizeof(*info));
				1882
				1883	info->tcpi_state = sk->sk_state;
Arnaldo Carvalho de Melo	6687e98	2005-08-10 04:03:31 -0300	[diff] [blame]	1884	info->tcpi_ca_state = icsk->icsk_ca_state;
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1885	info->tcpi_retransmits = icsk->icsk_retransmits;
Arnaldo Carvalho de Melo	6687e98	2005-08-10 04:03:31 -0300	[diff] [blame]	1886	info->tcpi_probes = icsk->icsk_probes_out;
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1887	info->tcpi_backoff = icsk->icsk_backoff;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1888
				1889	if (tp->rx_opt.tstamp_ok)
				1890	info->tcpi_options \|= TCPI_OPT_TIMESTAMPS;
				1891	if (tp->rx_opt.sack_ok)
				1892	info->tcpi_options \|= TCPI_OPT_SACK;
				1893	if (tp->rx_opt.wscale_ok) {
				1894	info->tcpi_options \|= TCPI_OPT_WSCALE;
				1895	info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
				1896	info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
				1897	}
				1898
				1899	if (tp->ecn_flags&TCP_ECN_OK)
				1900	info->tcpi_options \|= TCPI_OPT_ECN;
				1901
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1902	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
				1903	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	1904	info->tcpi_snd_mss = tp->mss_cache;
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1905	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1906
				1907	info->tcpi_unacked = tp->packets_out;
				1908	info->tcpi_sacked = tp->sacked_out;
				1909	info->tcpi_lost = tp->lost_out;
				1910	info->tcpi_retrans = tp->retrans_out;
				1911	info->tcpi_fackets = tp->fackets_out;
				1912
				1913	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
Arnaldo Carvalho de Melo	463c84b	2005-08-09 20:10:42 -0700	[diff] [blame]	1914	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1915	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
				1916
				1917	info->tcpi_pmtu = tp->pmtu_cookie;
				1918	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
				1919	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
				1920	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
				1921	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
				1922	info->tcpi_snd_cwnd = tp->snd_cwnd;
				1923	info->tcpi_advmss = tp->advmss;
				1924	info->tcpi_reordering = tp->reordering;
				1925
				1926	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
				1927	info->tcpi_rcv_space = tp->rcvq_space.space;
				1928
				1929	info->tcpi_total_retrans = tp->total_retrans;
				1930	}
				1931
				1932	EXPORT_SYMBOL_GPL(tcp_get_info);
				1933
				1934	int tcp_getsockopt(struct sock sk, int level, int optname, char __user optval,
				1935	int __user *optlen)
				1936	{
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame]	1937	struct inet_connection_sock *icsk = inet_csk(sk);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1938	struct tcp_sock *tp = tcp_sk(sk);
				1939	int val, len;
				1940
				1941	if (level != SOL_TCP)
				1942	return tp->af_specific->getsockopt(sk, level, optname,
				1943	optval, optlen);
				1944
				1945	if (get_user(len, optlen))
				1946	return -EFAULT;
				1947
				1948	len = min_t(unsigned int, len, sizeof(int));
				1949
				1950	if (len < 0)
				1951	return -EINVAL;
				1952
				1953	switch (optname) {
				1954	case TCP_MAXSEG:
David S. Miller	c1b4a7e	2005-07-05 15:24:38 -0700	[diff] [blame]	1955	val = tp->mss_cache;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1956	if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE \| TCPF_LISTEN)))
				1957	val = tp->rx_opt.user_mss;
				1958	break;
				1959	case TCP_NODELAY:
				1960	val = !!(tp->nonagle&TCP_NAGLE_OFF);
				1961	break;
				1962	case TCP_CORK:
				1963	val = !!(tp->nonagle&TCP_NAGLE_CORK);
				1964	break;
				1965	case TCP_KEEPIDLE:
				1966	val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
				1967	break;
				1968	case TCP_KEEPINTVL:
				1969	val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
				1970	break;
				1971	case TCP_KEEPCNT:
				1972	val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
				1973	break;
				1974	case TCP_SYNCNT:
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame]	1975	val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1976	break;
				1977	case TCP_LINGER2:
				1978	val = tp->linger2;
				1979	if (val >= 0)
				1980	val = (val ? : sysctl_tcp_fin_timeout) / HZ;
				1981	break;
				1982	case TCP_DEFER_ACCEPT:
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame]	1983	val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
				1984	((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1985	break;
				1986	case TCP_WINDOW_CLAMP:
				1987	val = tp->window_clamp;
				1988	break;
				1989	case TCP_INFO: {
				1990	struct tcp_info info;
				1991
				1992	if (get_user(len, optlen))
				1993	return -EFAULT;
				1994
				1995	tcp_get_info(sk, &info);
				1996
				1997	len = min_t(unsigned int, len, sizeof(info));
				1998	if (put_user(len, optlen))
				1999	return -EFAULT;
				2000	if (copy_to_user(optval, &info, len))
				2001	return -EFAULT;
				2002	return 0;
				2003	}
				2004	case TCP_QUICKACK:
Arnaldo Carvalho de Melo	295f732	2005-08-09 20:11:56 -0700	[diff] [blame]	2005	val = !icsk->icsk_ack.pingpong;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2006	break;
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	2007
				2008	case TCP_CONGESTION:
				2009	if (get_user(len, optlen))
				2010	return -EFAULT;
				2011	len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
				2012	if (put_user(len, optlen))
				2013	return -EFAULT;
Arnaldo Carvalho de Melo	6687e98	2005-08-10 04:03:31 -0300	[diff] [blame]	2014	if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	2015	return -EFAULT;
				2016	return 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2017	default:
				2018	return -ENOPROTOOPT;
				2019	};
				2020
				2021	if (put_user(len, optlen))
				2022	return -EFAULT;
				2023	if (copy_to_user(optval, &val, len))
				2024	return -EFAULT;
				2025	return 0;
				2026	}
				2027
				2028
				2029	extern void __skb_cb_too_small_for_tcp(int, int);
Stephen Hemminger	5f8ef48	2005-06-23 20:37:36 -0700	[diff] [blame]	2030	extern struct tcp_congestion_ops tcp_reno;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2031
				2032	static __initdata unsigned long thash_entries;
				2033	static int __init set_thash_entries(char *str)
				2034	{
				2035	if (!str)
				2036	return 0;
				2037	thash_entries = simple_strtoul(str, &str, 0);
				2038	return 1;
				2039	}
				2040	__setup("thash_entries=", set_thash_entries);
				2041
				2042	void __init tcp_init(void)
				2043	{
				2044	struct sk_buff *skb = NULL;
				2045	int order, i;
				2046
				2047	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
				2048	__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
				2049	sizeof(skb->cb));
				2050
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2051	tcp_hashinfo.bind_bucket_cachep =
				2052	kmem_cache_create("tcp_bind_bucket",
				2053	sizeof(struct inet_bind_bucket), 0,
				2054	SLAB_HWCACHE_ALIGN, NULL, NULL);
				2055	if (!tcp_hashinfo.bind_bucket_cachep)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2056	panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
				2057
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2058	/* Size and allocate the main established and bind bucket
				2059	* hash tables.
				2060	*
				2061	* The methodology is similar to that of the buffer cache.
				2062	*/
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2063	tcp_hashinfo.ehash =
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2064	alloc_large_system_hash("TCP established",
Arnaldo Carvalho de Melo	0f7ff92	2005-08-09 19:59:44 -0700	[diff] [blame]	2065	sizeof(struct inet_ehash_bucket),
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2066	thash_entries,
				2067	(num_physpages >= 128 * 1024) ?
Mike Stroyan	18955cf	2005-11-29 16:12:55 -0800	[diff] [blame^]	2068	13 : 15,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2069	HASH_HIGHMEM,
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2070	&tcp_hashinfo.ehash_size,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2071	NULL,
				2072	0);
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2073	tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
				2074	for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
				2075	rwlock_init(&tcp_hashinfo.ehash[i].lock);
				2076	INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2077	}
				2078
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2079	tcp_hashinfo.bhash =
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2080	alloc_large_system_hash("TCP bind",
Arnaldo Carvalho de Melo	0f7ff92	2005-08-09 19:59:44 -0700	[diff] [blame]	2081	sizeof(struct inet_bind_hashbucket),
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2082	tcp_hashinfo.ehash_size,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2083	(num_physpages >= 128 * 1024) ?
Mike Stroyan	18955cf	2005-11-29 16:12:55 -0800	[diff] [blame^]	2084	13 : 15,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2085	HASH_HIGHMEM,
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2086	&tcp_hashinfo.bhash_size,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2087	NULL,
				2088	64 * 1024);
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2089	tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
				2090	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
				2091	spin_lock_init(&tcp_hashinfo.bhash[i].lock);
				2092	INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2093	}
				2094
				2095	/* Try to be a bit smarter and adjust defaults depending
				2096	* on available memory.
				2097	*/
				2098	for (order = 0; ((1 << order) << PAGE_SHIFT) <
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2099	(tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2100	order++)
				2101	;
Andi Kleen	e762648	2005-06-13 14:24:52 -0700	[diff] [blame]	2102	if (order >= 4) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2103	sysctl_local_port_range[0] = 32768;
				2104	sysctl_local_port_range[1] = 61000;
Arnaldo Carvalho de Melo	295ff7e	2005-08-09 20:44:40 -0700	[diff] [blame]	2105	tcp_death_row.sysctl_max_tw_buckets = 180000;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2106	sysctl_tcp_max_orphans = 4096 << (order - 4);
				2107	sysctl_max_syn_backlog = 1024;
				2108	} else if (order < 3) {
				2109	sysctl_local_port_range[0] = 1024 * (3 - order);
Arnaldo Carvalho de Melo	295ff7e	2005-08-09 20:44:40 -0700	[diff] [blame]	2110	tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2111	sysctl_tcp_max_orphans >>= (3 - order);
				2112	sysctl_max_syn_backlog = 128;
				2113	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2114
				2115	sysctl_tcp_mem[0] = 768 << order;
				2116	sysctl_tcp_mem[1] = 1024 << order;
				2117	sysctl_tcp_mem[2] = 1536 << order;
				2118
				2119	if (order < 3) {
				2120	sysctl_tcp_wmem[2] = 64 * 1024;
				2121	sysctl_tcp_rmem[0] = PAGE_SIZE;
				2122	sysctl_tcp_rmem[1] = 43689;
				2123	sysctl_tcp_rmem[2] = 2 * 43689;
				2124	}
				2125
				2126	printk(KERN_INFO "TCP: Hash tables configured "
				2127	"(established %d bind %d)\n",
Arnaldo Carvalho de Melo	6e04e02	2005-08-09 20:07:35 -0700	[diff] [blame]	2128	tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
Stephen Hemminger	317a76f	2005-06-23 12:19:55 -0700	[diff] [blame]	2129
				2130	tcp_register_congestion_control(&tcp_reno);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2131	}
				2132
/* Symbols from this file exported via EXPORT_SYMBOL(). */
EXPORT_SYMBOL(tcp_close);
EXPORT_SYMBOL(tcp_disconnect);
EXPORT_SYMBOL(tcp_getsockopt);
EXPORT_SYMBOL(tcp_ioctl);
EXPORT_SYMBOL(tcp_poll);
EXPORT_SYMBOL(tcp_read_sock);
EXPORT_SYMBOL(tcp_recvmsg);
EXPORT_SYMBOL(tcp_sendmsg);
EXPORT_SYMBOL(tcp_sendpage);
EXPORT_SYMBOL(tcp_setsockopt);
EXPORT_SYMBOL(tcp_shutdown);
EXPORT_SYMBOL(tcp_statistics);