// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

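/* Derive the initial sequence number and timestamp offset for an incoming
 * SYN from the packet's address/port 4-tuple through a keyed hash, making
 * them hard to predict off-path (see secure_seq.h).
 */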
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;

		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

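/* Called before tcp_v4_connect() to give attached BPF cgroup programs a
 * chance to inspect or rewrite the destination address.
 */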
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = prandom_u32();

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = tcp_sk(sk)->mtu_info;
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

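/* Hand an ICMP redirect to the dst attached to this socket, if any. */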
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition. If err < 0 then the socket should
 * be closed and the error returned to the user. If err > 0
 * it's just the icmp type << 8 | icmp code. After adjustment
 * header points to the first 8 bytes of the tcp header. We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else { /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

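/* Prepare an outgoing segment for checksum offload: store the folded
 * pseudo-header checksum and tell the device (or the software fallback)
 * where to finish the sum via csum_start/csum_offset.
 */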
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not losing security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * Routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code below sends ACKs in SYN-RECV and TIME-WAIT states outside
   socket context. It is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : sk->sk_priority;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

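/* Answer a segment arriving on a TIME-WAIT socket with an ACK built from
 * the state kept in the timewait bucket rather than a full socket.
 */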
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

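/* ACK on behalf of a request socket, before the full child socket exists. */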
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	const union tcp_md5_addr *addr;
	int l3index;

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt));
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

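/* Static branch that keeps the MD5 option lookup out of the TCP fast path
 * until the first MD5 key is installed.
 */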
DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);

/* Find the Key structure for an address. */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (key->l3index && key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && (!best_match ||
			      key->prefixlen > best_match->prefixlen))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);

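/* Unlike __tcp_md5_do_lookup() above, this variant requires an exact match
 * on address, prefix length and L3 index; used when adding/deleting keys.
 */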
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (key->l3index && key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

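/* Look up the MD5 key for the peer that addr_sk is connected to. */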
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index,
		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

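/* setsockopt(TCP_MD5SIG / TCP_MD5SIG_EXT) handler for IPv4 sockets.
 * A zero tcpm_keylen deletes the key for the given address.
 *
 * Illustrative userspace sketch (not part of this file):
 *
 *	struct tcp_md5sig md5 = {};
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */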
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 char __user *optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}

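/* Feed the RFC 2385 prefix into an in-progress MD5 hash: an IPv4
 * pseudo-header (saddr, daddr, protocol, segment length) followed by the
 * TCP header with its checksum field zeroed.
 */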
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

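/* Compute the expected MD5 signature for @skb into @md5_hash, hashing in
 * RFC 2385 order: pseudo-header, TCP header (checksum zeroed), payload,
 * then the key itself.  Returns 0 on success and 1 on failure, in which
 * case @md5_hash is zeroed.
 */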
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

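/* Validate the MD5 option (or its required absence) on an incoming
 * segment.  Returns true when the packet must be dropped.
 */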
/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb,
				    int dif, int sdif)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	const union tcp_md5_addr *addr;
	unsigned char newhash[16];
	int genhash, l3index;

	/* sdif set, means packet ingressed via a device
	 * in an L3 domain and dif is set to the l3mdev
	 */
	l3index = sdif ? dif : 0;

	addr = (union tcp_md5_addr *)&iph->saddr;
	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
				     : "", l3index);
		return true;
	}
	return false;
#endif
	return false;
}

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req)
{
	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


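/* A note on *own_req: inet_ehash_nolisten() below sets it to true when
 * newsk wins the ehash insertion race.  When it comes back false, a
 * competing socket already owns the hash slot and the caller is
 * responsible for disposing of newsk.
 */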
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp = tcp_sk(newsk);
	newinet = inet_sk(newsk);
	ireq = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr = ireq->ir_loc_addr;
	inet_opt = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index = inet_iif(skb);
	newinet->mc_ttl = ip_hdr(skb)->ttl;
	newinet->rcv_tos = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = prandom_u32();

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
			       key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

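/* Used by BPF helpers (e.g. bpf_tcp_gen_syncookie()) to derive a SYN
 * cookie and MSS for a listener without going through the normal input
 * path.  Returns 0 when syncookies are unavailable or no MSS was found.
 */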
1589
Petar Penkov9349d602019-07-29 09:59:14 -07001590u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1591 struct tcphdr *th, u32 *cookie)
1592{
1593 u16 mss = 0;
1594#ifdef CONFIG_SYN_COOKIES
1595 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1596 &tcp_request_sock_ipv4_ops, sk, th);
1597 if (mss) {
1598 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1599 tcp_synq_overflow(sk);
1600 }
1601#endif
1602 return mss;
1603}
1604
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

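/* Early demux: look up an established socket from the IP receive path
 * before routing, so that a cached per-socket dst can be reused and the
 * full lookup in tcp_v4_rcv() is skipped.  Purely an optimization; on
 * any doubt the packet simply continues through the normal path.
 */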
int tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}

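/* Queue a segment on the socket backlog while the socket is owned by
 * user context.  Where possible, the segment is coalesced with the tail
 * of the backlog (a GRO-like merge restricted to in-order ACK/data with
 * identical flags and options) so that backlog memory stays bounded.
 * Returns true if the skb was dropped, false if it was queued or merged.
 */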
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	int delta;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	skb_dst_drop(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;
	shinfo = skb_shinfo(skb);

	if (!shinfo->gso_size)
		shinfo->gso_size = skb->len - hdrlen;

	if (!shinfo->gso_segs)
		shinfo->gso_segs = 1;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
	    tail->decrypted != skb->decrypted ||
#endif
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);
	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		thtail->window = th->window;

		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *       at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
						 skb_shinfo(tail)->gso_size);

		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	__skb_push(skb, hdrlen);

no_coalesce:
	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64*1024;

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);

int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;

	return sk_filter_trim_cap(sk, skb, th->doff * 4);
}
EXPORT_SYMBOL(tcp_filter);

static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}

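/* Populate TCP_SKB_CB() from the headers.  This must be done after the
 * IP control block is saved aside, because both share the skb->cb[]
 * storage; tcp_v4_restore_cb() above performs the inverse move when a
 * packet is handed back to a listener lookup.
 */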
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky: we move IPCB at its correct location into
	 * TCP_SKB_CB().  barrier() makes sure the compiler won't play
	 * fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
		skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}

/*
 * From tcp_input.c
 */

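/* Main IPv4 receive entry point, called from the IP layer for every TCP
 * segment.  Broad flow: validate the header and checksum, look up the
 * owning socket (time-wait and new-SYN-receive states get dedicated
 * handling below), then either process the segment directly or park it
 * on the socket backlog when the socket is owned by user context.
 */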
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	struct sk_buff *skb_to_free;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff == 0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
		goto discard_and_relse;

	nf_reset_ct(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		skb_to_free = sk->sk_rx_skb_cache;
		sk->sk_rx_skb_cache = NULL;
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		if (tcp_add_backlog(sk, skb))
			goto discard_and_relse;
		skb_to_free = NULL;
	}
	bh_unlock_sock(sk);
	if (skb_to_free)
		__kfree_skb(skb_to_free);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		fallthrough;
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things are set to zero explicitly by the call to
 * sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

Brian Haley7d06b2e2008-06-14 17:04:49 -07002159void tcp_v4_destroy_sock(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002160{
2161 struct tcp_sock *tp = tcp_sk(sk);
2162
Song Liue1a4aa52017-10-23 09:20:26 -07002163 trace_tcp_destroy_sock(sk);
2164
Linus Torvalds1da177e2005-04-16 15:20:36 -07002165 tcp_clear_xmit_timers(sk);
2166
Arnaldo Carvalho de Melo6687e982005-08-10 04:03:31 -03002167 tcp_cleanup_congestion_control(sk);
Stephen Hemminger317a76f2005-06-23 12:19:55 -07002168
Dave Watson734942c2017-06-14 11:37:14 -07002169 tcp_cleanup_ulp(sk);
2170
Linus Torvalds1da177e2005-04-16 15:20:36 -07002171 /* Cleanup up the write buffer. */
David S. Millerfe067e82007-03-07 12:12:44 -08002172 tcp_write_queue_purge(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002173
Wei Wangcf1ef3f2017-04-20 14:45:46 -07002174 /* Check if we want to disable active TFO */
2175 tcp_fastopen_active_disable_ofo_check(sk);
2176
Linus Torvalds1da177e2005-04-16 15:20:36 -07002177 /* Cleans up our, hopefully empty, out_of_order_queue. */
Yaogong Wang9f5afea2016-09-07 14:49:28 -07002178 skb_rbtree_purge(&tp->out_of_order_queue);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002179
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08002180#ifdef CONFIG_TCP_MD5SIG
2181 /* Clean up the MD5 key list, if any */
2182 if (tp->md5sig_info) {
Eric Dumazeta915da9b2012-01-31 05:18:33 +00002183 tcp_clear_md5_list(sk);
Mat Martineaufb7df5e2017-12-21 10:29:10 -08002184 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
YOSHIFUJI Hideakicfb6eeb2006-11-14 19:07:45 -08002185 tp->md5sig_info = NULL;
2186 }
2187#endif
2188
Linus Torvalds1da177e2005-04-16 15:20:36 -07002189 /* Clean up a referenced TCP bind bucket. */
Arnaldo Carvalho de Melo463c84b2005-08-09 20:10:42 -07002190 if (inet_csk(sk)->icsk_bind_hash)
Arnaldo Carvalho de Meloab1e0a12008-02-03 04:06:04 -08002191 inet_put_port(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002192
Eric Dumazetd983ea62019-10-10 20:17:38 -07002193 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
William Allen Simpson435cf552009-12-02 18:17:05 +00002194
Yuchung Chengcf60af02012-07-19 06:43:09 +00002195 /* If socket is aborted during connect operation */
2196 tcp_free_fastopen_req(tp);
Yuchung Cheng1fba70e2017-10-18 11:22:51 -07002197 tcp_fastopen_destroy_cipher(sk);
Eric Dumazetcd8ae852015-05-03 21:34:46 -07002198 tcp_saved_syn_free(tp);
Yuchung Chengcf60af02012-07-19 06:43:09 +00002199
Glauber Costa180d8cd2011-12-11 21:47:02 +00002200 sk_sockets_allocated_dec(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002201}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002202EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get the next listener socket following cur. If cur is NULL, get the
 * first socket starting from the bucket given in st->bucket; when
 * st->bucket is zero, the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_nulls_head(&ilb->nulls_head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == afinfo->family)
			return sk;
	}
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}
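
/*
 * Editorial sketch: the iterator above is a resumable (bucket, offset)
 * cursor over a chained hash table, re-taking the per-bucket lock and
 * re-skipping already-emitted entries on each restart. A minimal
 * userspace model of that technique follows; every name in it (struct
 * cursor, walk_next(), NBUCKETS, ...) is hypothetical, not a kernel API,
 * and the block is fenced off from the build with #if 0.
 */
#if 0
#include <stdio.h>

#define NBUCKETS 4

struct node { int val; struct node *next; };
struct cursor { int bucket; int offset; };

static struct node *table[NBUCKETS];

/* Return the entry named by cur and advance cur past it; NULL at the end.
 * As with the seq_file iterators, a restart needs only (bucket, offset). */
static struct node *walk_next(struct cursor *cur)
{
	for (; cur->bucket < NBUCKETS; cur->bucket++, cur->offset = 0) {
		struct node *n = table[cur->bucket];
		int skip = cur->offset;

		while (n && skip--)	/* re-skip entries already emitted */
			n = n->next;
		if (n) {
			cur->offset++;	/* resume point within this bucket */
			return n;
		}
	}
	return NULL;
}

int main(void)
{
	struct node a = { 1, NULL }, b = { 2, &a }, c = { 3, NULL };
	struct cursor cur = { 0, 0 };
	struct node *n;

	table[0] = &b;	/* bucket 0: 2 -> 1 */
	table[2] = &c;	/* bucket 2: 3      */
	while ((n = walk_next(&cur)))
		printf("bucket=%d val=%d\n", cur.bucket, n->val);
	return 0;
}
#endif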

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get the first established socket starting from the bucket given in
 * st->bucket. If st->bucket is zero, the very first socket in the hash
 * is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != afinfo->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == afinfo->family &&
		    net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);

void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);

void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);

static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active = 1;
		timer_expires = icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active = 4;
		timer_expires = icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active = 2;
		timer_expires = sk->sk_timer.expires;
	} else {
		timer_active = 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
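
/*
 * Editorial sketch: the rx_queue computation above reads rcv_nxt and
 * copied_seq without the socket lock, so the difference can transiently
 * be negative and must be clamped to zero. A userspace model of that
 * clamp with C11 atomics (hypothetical names, fenced off with #if 0):
 */
#if 0
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int rcv_nxt, copied_seq;

/* The two loads are not atomic as a pair; if a reader advances
 * copied_seq between them, the raw difference goes negative, so clamp
 * it exactly as max_t(int, ..., 0) does above. */
static int rx_queue_estimate(void)
{
	int d = (int)(atomic_load(&rcv_nxt) - atomic_load(&copied_seq));

	return d > 0 ? d : 0;
}

int main(void)
{
	atomic_store(&rcv_nxt, 1000);
	atomic_store(&copied_seq, 1300);	/* reader raced ahead */
	printf("rx_queue=%d\n", rx_queue_estimate());	/* prints 0, not -300 */
	return 0;
}
#endif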

static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}
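
/*
 * Editorial sketch: the three helpers above emit the fixed-width records
 * seen in /proc/net/tcp. A minimal userspace reader for the first few
 * fields follows, assuming the %08X address is the raw __be32 printed in
 * CPU byte order, so it maps straight back into in_addr.s_addr on the
 * machine that wrote the file. Fenced off with #if 0; not kernel code.
 */
#if 0
#include <arpa/inet.h>
#include <stdio.h>

int main(void)
{
	char line[256], local[INET_ADDRSTRLEN], remote[INET_ADDRSTRLEN];
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return 1;
	if (!fgets(line, sizeof(line), f)) {	/* skip the header row */
		fclose(f);
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		unsigned int sl, la, lp, ra, rp, st;
		struct in_addr laddr, raddr;

		if (sscanf(line, "%u: %X:%X %X:%X %X",
			   &sl, &la, &lp, &ra, &rp, &st) != 6)
			continue;
		laddr.s_addr = la;	/* undoes the raw %08X print on this machine */
		raddr.s_addr = ra;
		inet_ntop(AF_INET, &laddr, local, sizeof(local));
		inet_ntop(AF_INET, &raddr, remote, sizeof(remote));
		printf("%s:%u -> %s:%u state %#x\n", local, lp, remote, rp, st);
	}
	fclose(f);
	return 0;
}
#endif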

static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
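
/*
 * Editorial sketch: every handler wired up in tcp_prot above is reached
 * through ordinary BSD socket calls. A minimal userspace client showing
 * the mapping (socket() ends up in .init, connect() in .connect,
 * write()/read() in .sendmsg/.recvmsg, close() in .close). The peer
 * 127.0.0.1:80 is only a placeholder. Fenced off with #if 0.
 */
#if 0
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in peer = { .sin_family = AF_INET,
				    .sin_port = htons(80) };
	char buf[128];
	int fd = socket(AF_INET, SOCK_STREAM, 0);	/* -> tcp_v4_init_sock() */

	if (fd < 0)
		return 1;
	inet_pton(AF_INET, "127.0.0.1", &peer.sin_addr);
	/* -> tcp_v4_connect() via tcp_prot.connect */
	if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)) == 0) {
		const char req[] = "HEAD / HTTP/1.0\r\n\r\n";

		write(fd, req, strlen(req));		/* -> tcp_sendmsg() */
		if (read(fd, buf, sizeof(buf) - 1) > 0)	/* -> tcp_recvmsg() */
			printf("got a reply\n");
	}
	close(fd);	/* -> tcp_close(), later tcp_v4_destroy_sock() */
	return 0;
}
#endif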

static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
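	/* Worked example (editorial): with the default divisor of 3 and a
	 * congestion window of 45 segments, one TSO frame may consume at
	 * most about 45 / 3 = 15 segments.
	 */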
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}
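
/*
 * Editorial sketch: every net->ipv4.sysctl_* default installed above is
 * per network namespace and surfaces under /proc/sys/net/ipv4/. Reading
 * one back from userspace (fenced off with #if 0; not kernel code):
 */
#if 0
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_fin_timeout", "r");
	int val;

	if (!f)
		return 1;
	if (fscanf(f, "%d", &val) == 1)
		printf("tcp_fin_timeout = %d seconds\n", val);	/* 60 by default */
	fclose(f);
	return 0;
}
#endif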

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	   = tcp_sk_init,
	.exit	   = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}