Mat Martineauf870fa02020-01-21 16:56:15 -08001// SPDX-License-Identifier: GPL-2.0
2/* Multipath TCP
3 *
4 * Copyright (c) 2017 - 2019, Intel Corporation.
5 */
6
7#define pr_fmt(fmt) "MPTCP: " fmt
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/netdevice.h>
Paolo Abeni7a6a6cb2020-01-21 16:56:26 -080012#include <linux/sched/signal.h>
13#include <linux/atomic.h>
Mat Martineauf870fa02020-01-21 16:56:15 -080014#include <net/sock.h>
15#include <net/inet_common.h>
16#include <net/inet_hashtables.h>
17#include <net/protocol.h>
18#include <net/tcp.h>
Mat Martineau3721b9b2020-07-28 15:12:03 -070019#include <net/tcp_states.h>
Peter Krystadcf7da0d2020-01-21 16:56:19 -080020#if IS_ENABLED(CONFIG_MPTCP_IPV6)
21#include <net/transp_v6.h>
22#endif
Mat Martineauf870fa02020-01-21 16:56:15 -080023#include <net/mptcp.h>
Paolo Abenie16163b2020-11-16 10:48:09 +010024#include <net/xfrm.h>
Mat Martineauf870fa02020-01-21 16:56:15 -080025#include "protocol.h"
Florian Westphalfc518952020-03-27 14:48:50 -070026#include "mib.h"
Mat Martineauf870fa02020-01-21 16:56:15 -080027
Florian Westphalb0519de2020-02-06 00:39:37 +010028#if IS_ENABLED(CONFIG_MPTCP_IPV6)
29struct mptcp6_sock {
30 struct mptcp_sock msk;
31 struct ipv6_pinfo np;
32};
33#endif
34
Florian Westphal6771bfd2020-02-26 10:14:48 +010035struct mptcp_skb_cb {
Paolo Abeniab174ad2020-09-14 10:01:12 +020036 u64 map_seq;
37 u64 end_seq;
Florian Westphal6771bfd2020-02-26 10:14:48 +010038 u32 offset;
39};
40
41#define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0]))
42
Paolo Abenid0272362020-03-27 14:48:45 -070043static struct percpu_counter mptcp_sockets_allocated;
44
Paolo Abenie16163b2020-11-16 10:48:09 +010045static void __mptcp_destroy_sock(struct sock *sk);
Paolo Abenid9ca1de2020-11-16 10:48:10 +010046static void __mptcp_check_send_data_fin(struct sock *sk);
Paolo Abenie16163b2020-11-16 10:48:09 +010047
Peter Krystad2303f992020-01-21 16:56:17 -080048/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
49 * completed yet or has failed, return the subflow socket.
50 * Otherwise return NULL.
51 */
52static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
53{
Christoph Paaschd22f4982020-01-21 16:56:32 -080054 if (!msk->subflow || READ_ONCE(msk->can_ack))
Peter Krystad2303f992020-01-21 16:56:17 -080055 return NULL;
56
57 return msk->subflow;
58}
59
Florian Westphal6f8a6122020-11-16 10:48:13 +010060/* Returns end sequence number of the receiver's advertised window */
61static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
62{
63 return atomic64_read(&msk->wnd_end);
64}
65
Paolo Abenid2f77c52020-06-29 22:26:22 +020066static bool mptcp_is_tcpsk(struct sock *sk)
Florian Westphal0b4f33d2020-04-02 13:44:51 +020067{
68 struct socket *sock = sk->sk_socket;
69
Florian Westphal0b4f33d2020-04-02 13:44:51 +020070 if (unlikely(sk->sk_prot == &tcp_prot)) {
71 /* we are being invoked after mptcp_accept() has
72 * accepted a non-mp-capable flow: sk is a tcp_sk,
73 * not an mptcp one.
74 *
75 * Hand the socket over to tcp so all further socket ops
76 * bypass mptcp.
77 */
78 sock->ops = &inet_stream_ops;
Paolo Abenid2f77c52020-06-29 22:26:22 +020079 return true;
Florian Westphal0b4f33d2020-04-02 13:44:51 +020080#if IS_ENABLED(CONFIG_MPTCP_IPV6)
81 } else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
82 sock->ops = &inet6_stream_ops;
Paolo Abenid2f77c52020-06-29 22:26:22 +020083 return true;
Florian Westphal0b4f33d2020-04-02 13:44:51 +020084#endif
85 }
86
Paolo Abenid2f77c52020-06-29 22:26:22 +020087 return false;
Florian Westphal0b4f33d2020-04-02 13:44:51 +020088}
89
Paolo Abeni76660af2020-06-29 22:26:24 +020090static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
Peter Krystadcec37a62020-01-21 16:56:18 -080091{
Peter Krystadcec37a62020-01-21 16:56:18 -080092 sock_owned_by_me((const struct sock *)msk);
93
Davide Carattie1ff9e82020-06-29 22:26:20 +020094 if (likely(!__mptcp_check_fallback(msk)))
Peter Krystadcec37a62020-01-21 16:56:18 -080095 return NULL;
96
Paolo Abeni76660af2020-06-29 22:26:24 +020097 return msk->first;
Peter Krystadcec37a62020-01-21 16:56:18 -080098}
99
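/* create the initial (MP_CAPABLE) subflow socket and link it into the msk
 * conn_list; msk->first / msk->subflow point to it afterwards
 */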
Paolo Abenifa680182020-06-29 22:26:23 +0200100static int __mptcp_socket_create(struct mptcp_sock *msk)
Peter Krystad2303f992020-01-21 16:56:17 -0800101{
102 struct mptcp_subflow_context *subflow;
103 struct sock *sk = (struct sock *)msk;
104 struct socket *ssock;
105 int err;
106
Peter Krystad2303f992020-01-21 16:56:17 -0800107 err = mptcp_subflow_create_socket(sk, &ssock);
108 if (err)
Paolo Abenifa680182020-06-29 22:26:23 +0200109 return err;
Peter Krystad2303f992020-01-21 16:56:17 -0800110
Paolo Abeni8ab183d2020-01-21 16:56:33 -0800111 msk->first = ssock->sk;
Peter Krystad2303f992020-01-21 16:56:17 -0800112 msk->subflow = ssock;
113 subflow = mptcp_subflow_ctx(ssock->sk);
Peter Krystadcec37a62020-01-21 16:56:18 -0800114 list_add(&subflow->node, &msk->conn_list);
Paolo Abenie16163b2020-11-16 10:48:09 +0100115 sock_hold(ssock->sk);
Peter Krystad2303f992020-01-21 16:56:17 -0800116 subflow->request_mptcp = 1;
117
Davide Carattie1ff9e82020-06-29 22:26:20 +0200118	/* accept() will wait on the first subflow sk_wq, and we always wake up
119 * via msk->sk_socket
120 */
121 RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq);
122
Paolo Abenifa680182020-06-29 22:26:23 +0200123 return 0;
Peter Krystad2303f992020-01-21 16:56:17 -0800124}
125
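/* account the skb against sk->sk_drops and free it */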
Paolo Abeniab174ad2020-09-14 10:01:12 +0200126static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
127{
128 sk_drops_add(sk, skb);
129 __kfree_skb(skb);
130}
131
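/* try to merge 'from' into the tail skb 'to'; on success extend the
 * MPTCP-level end_seq of 'to' and charge the extra truesize to the msk
 */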
Paolo Abeni8268ed42020-09-14 10:01:11 +0200132static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
133 struct sk_buff *from)
134{
135 bool fragstolen;
136 int delta;
137
138 if (MPTCP_SKB_CB(from)->offset ||
139 !skb_try_coalesce(to, from, &fragstolen, &delta))
140 return false;
141
Paolo Abeni06242e42020-09-14 10:01:14 +0200142	pr_debug("coalesced seq %llx into %llx new len %d new end seq %llx",
143 MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
144 to->len, MPTCP_SKB_CB(from)->end_seq);
Paolo Abeniab174ad2020-09-14 10:01:12 +0200145 MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
Paolo Abeni8268ed42020-09-14 10:01:11 +0200146 kfree_skb_partial(from, fragstolen);
147 atomic_add(delta, &sk->sk_rmem_alloc);
148 sk_mem_charge(sk, delta);
149 return true;
150}
151
Paolo Abeniab174ad2020-09-14 10:01:12 +0200152static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
153 struct sk_buff *from)
Florian Westphal6771bfd2020-02-26 10:14:48 +0100154{
Paolo Abeniab174ad2020-09-14 10:01:12 +0200155 if (MPTCP_SKB_CB(from)->map_seq != MPTCP_SKB_CB(to)->end_seq)
156 return false;
157
158 return mptcp_try_coalesce((struct sock *)msk, to, from);
159}
160
161/* "inspired" by tcp_data_queue_ofo(), main differences:
162 * - use mptcp seqs
163 * - don't cope with sacks
164 */
165static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
166{
167 struct sock *sk = (struct sock *)msk;
168 struct rb_node **p, *parent;
169 u64 seq, end_seq, max_seq;
170 struct sk_buff *skb1;
171
172 seq = MPTCP_SKB_CB(skb)->map_seq;
173 end_seq = MPTCP_SKB_CB(skb)->end_seq;
Florian Westphalfa3fe2b2020-11-19 11:46:02 -0800174 max_seq = READ_ONCE(msk->rcv_wnd_sent);
Paolo Abeniab174ad2020-09-14 10:01:12 +0200175
Paolo Abeni06242e42020-09-14 10:01:14 +0200176 pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq,
177 RB_EMPTY_ROOT(&msk->out_of_order_queue));
Florian Westphalfa3fe2b2020-11-19 11:46:02 -0800178 if (after64(end_seq, max_seq)) {
Paolo Abeniab174ad2020-09-14 10:01:12 +0200179 /* out of window */
180 mptcp_drop(sk, skb);
Florian Westphalfa3fe2b2020-11-19 11:46:02 -0800181 pr_debug("oow by %lld, rcv_wnd_sent %llu\n",
 182		 (unsigned long long)end_seq - (unsigned long long)max_seq,
183 (unsigned long long)msk->rcv_wnd_sent);
Paolo Abeni06242e42020-09-14 10:01:14 +0200184 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
Paolo Abeniab174ad2020-09-14 10:01:12 +0200185 return;
186 }
187
188 p = &msk->out_of_order_queue.rb_node;
Paolo Abeni06242e42020-09-14 10:01:14 +0200189 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE);
Paolo Abeniab174ad2020-09-14 10:01:12 +0200190 if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) {
191 rb_link_node(&skb->rbnode, NULL, p);
192 rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
193 msk->ooo_last_skb = skb;
194 goto end;
195 }
196
 197	/* with 2 subflows, adding at the end of the ooo queue is quite likely.
 198	 * Use of ooo_last_skb avoids the O(log N) rbtree lookup.
199 */
Paolo Abeni06242e42020-09-14 10:01:14 +0200200 if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) {
201 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
202 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
Paolo Abeniab174ad2020-09-14 10:01:12 +0200203 return;
Paolo Abeni06242e42020-09-14 10:01:14 +0200204 }
Paolo Abeniab174ad2020-09-14 10:01:12 +0200205
206 /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
207 if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) {
Paolo Abeni06242e42020-09-14 10:01:14 +0200208 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
Paolo Abeniab174ad2020-09-14 10:01:12 +0200209 parent = &msk->ooo_last_skb->rbnode;
210 p = &parent->rb_right;
211 goto insert;
212 }
213
214 /* Find place to insert this segment. Handle overlaps on the way. */
215 parent = NULL;
216 while (*p) {
217 parent = *p;
218 skb1 = rb_to_skb(parent);
219 if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
220 p = &parent->rb_left;
221 continue;
222 }
223 if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) {
224 if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) {
225 /* All the bits are present. Drop. */
226 mptcp_drop(sk, skb);
Paolo Abeni06242e42020-09-14 10:01:14 +0200227 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
Paolo Abeniab174ad2020-09-14 10:01:12 +0200228 return;
229 }
230 if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
231 /* partial overlap:
232 * | skb |
233 * | skb1 |
234 * continue traversing
235 */
236 } else {
237 /* skb's seq == skb1's seq and skb covers skb1.
238 * Replace skb1 with skb.
239 */
240 rb_replace_node(&skb1->rbnode, &skb->rbnode,
241 &msk->out_of_order_queue);
242 mptcp_drop(sk, skb1);
Paolo Abeni06242e42020-09-14 10:01:14 +0200243 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
Paolo Abeniab174ad2020-09-14 10:01:12 +0200244 goto merge_right;
245 }
246 } else if (mptcp_ooo_try_coalesce(msk, skb1, skb)) {
Paolo Abeni06242e42020-09-14 10:01:14 +0200247 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
Paolo Abeniab174ad2020-09-14 10:01:12 +0200248 return;
249 }
250 p = &parent->rb_right;
251 }
Paolo Abeni06242e42020-09-14 10:01:14 +0200252
Paolo Abeniab174ad2020-09-14 10:01:12 +0200253insert:
254 /* Insert segment into RB tree. */
255 rb_link_node(&skb->rbnode, parent, p);
256 rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
257
258merge_right:
259 /* Remove other segments covered by skb. */
260 while ((skb1 = skb_rb_next(skb)) != NULL) {
261 if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq))
262 break;
263 rb_erase(&skb1->rbnode, &msk->out_of_order_queue);
264 mptcp_drop(sk, skb1);
Paolo Abeni06242e42020-09-14 10:01:14 +0200265 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
Paolo Abeniab174ad2020-09-14 10:01:12 +0200266 }
 267	/* If there is no skb after us, we are the last_skb! */
268 if (!skb1)
269 msk->ooo_last_skb = skb;
270
271end:
272 skb_condense(skb);
273 skb_set_owner_r(skb, sk);
274}
275
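/* move an skb just pulled from a subflow receive queue into the msk:
 * in-sequence data is appended to sk_receive_queue, future data goes into
 * the out-of-order rbtree and stale data is dropped; returns true if
 * in-sequence data was queued
 */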
276static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
277 struct sk_buff *skb, unsigned int offset,
278 size_t copy_len)
279{
280 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
Florian Westphal6771bfd2020-02-26 10:14:48 +0100281 struct sock *sk = (struct sock *)msk;
Florian Westphal4e637c72020-05-25 23:41:13 +0200282 struct sk_buff *tail;
Florian Westphal6771bfd2020-02-26 10:14:48 +0100283
284 __skb_unlink(skb, &ssk->sk_receive_queue);
Florian Westphal4e637c72020-05-25 23:41:13 +0200285
286 skb_ext_reset(skb);
287 skb_orphan(skb);
Paolo Abeniab174ad2020-09-14 10:01:12 +0200288
Paolo Abeni9c3f94e2020-10-27 15:59:14 +0100289 /* try to fetch required memory from subflow */
290 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
291 if (ssk->sk_forward_alloc < skb->truesize)
292 goto drop;
293 __sk_mem_reclaim(ssk, skb->truesize);
294 if (!sk_rmem_schedule(sk, skb, skb->truesize))
295 goto drop;
296 }
297
Paolo Abeniab174ad2020-09-14 10:01:12 +0200298 /* the skb map_seq accounts for the skb offset:
299 * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq
300 * value
301 */
302 MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow);
303 MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len;
Paolo Abeni8268ed42020-09-14 10:01:11 +0200304 MPTCP_SKB_CB(skb)->offset = offset;
Florian Westphal4e637c72020-05-25 23:41:13 +0200305
Paolo Abeniab174ad2020-09-14 10:01:12 +0200306 if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
307 /* in sequence */
David S. Miller8b0308f2020-10-05 17:33:26 -0700308 WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);
Paolo Abeniab174ad2020-09-14 10:01:12 +0200309 tail = skb_peek_tail(&sk->sk_receive_queue);
310 if (tail && mptcp_try_coalesce(sk, tail, skb))
311 return true;
Florian Westphal4e637c72020-05-25 23:41:13 +0200312
Paolo Abeniab174ad2020-09-14 10:01:12 +0200313 skb_set_owner_r(skb, sk);
314 __skb_queue_tail(&sk->sk_receive_queue, skb);
315 return true;
316 } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
317 mptcp_data_queue_ofo(msk, skb);
318 return false;
319 }
320
 321	/* old data, keep it simple and drop the whole pkt, the sender
 322	 * will retransmit it if needed.
323 */
Paolo Abeni06242e42020-09-14 10:01:14 +0200324 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
Paolo Abeni9c3f94e2020-10-27 15:59:14 +0100325drop:
Paolo Abeniab174ad2020-09-14 10:01:12 +0200326 mptcp_drop(sk, skb);
327 return false;
Florian Westphal6771bfd2020-02-26 10:14:48 +0100328}
329
Mat Martineau16a9a9d2020-07-28 15:12:05 -0700330static void mptcp_stop_timer(struct sock *sk)
331{
332 struct inet_connection_sock *icsk = inet_csk(sk);
333
334 sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
335 mptcp_sk(sk)->timer_ival = 0;
336}
337
Paolo Abenie16163b2020-11-16 10:48:09 +0100338static void mptcp_close_wake_up(struct sock *sk)
339{
340 if (sock_flag(sk, SOCK_DEAD))
341 return;
342
343 sk->sk_state_change(sk);
344 if (sk->sk_shutdown == SHUTDOWN_MASK ||
345 sk->sk_state == TCP_CLOSE)
346 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
347 else
348 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
349}
350
Mat Martineau16a9a9d2020-07-28 15:12:05 -0700351static void mptcp_check_data_fin_ack(struct sock *sk)
352{
353 struct mptcp_sock *msk = mptcp_sk(sk);
354
355 if (__mptcp_check_fallback(msk))
356 return;
357
358 /* Look for an acknowledged DATA_FIN */
359 if (((1 << sk->sk_state) &
360 (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) &&
361 msk->write_seq == atomic64_read(&msk->snd_una)) {
362 mptcp_stop_timer(sk);
363
364 WRITE_ONCE(msk->snd_data_fin_enable, 0);
365
366 switch (sk->sk_state) {
367 case TCP_FIN_WAIT1:
368 inet_sk_state_store(sk, TCP_FIN_WAIT2);
Mat Martineau16a9a9d2020-07-28 15:12:05 -0700369 break;
370 case TCP_CLOSING:
Mat Martineau16a9a9d2020-07-28 15:12:05 -0700371 case TCP_LAST_ACK:
372 inet_sk_state_store(sk, TCP_CLOSE);
Mat Martineau16a9a9d2020-07-28 15:12:05 -0700373 break;
374 }
375
Paolo Abenie16163b2020-11-16 10:48:09 +0100376 mptcp_close_wake_up(sk);
Mat Martineau16a9a9d2020-07-28 15:12:05 -0700377 }
378}
379
Mat Martineau3721b9b2020-07-28 15:12:03 -0700380static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq)
381{
382 struct mptcp_sock *msk = mptcp_sk(sk);
383
384 if (READ_ONCE(msk->rcv_data_fin) &&
385 ((1 << sk->sk_state) &
386 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) {
387 u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq);
388
389 if (msk->ack_seq == rcv_data_fin_seq) {
390 if (seq)
391 *seq = rcv_data_fin_seq;
392
393 return true;
394 }
395 }
396
397 return false;
398}
399
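/* refresh the msk retransmit interval: use the given subflow's pending timer
 * if any, else keep the current value, defaulting to TCP_RTO_MIN
 */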
400static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
401{
402 long tout = ssk && inet_csk(ssk)->icsk_pending ?
403 inet_csk(ssk)->icsk_timeout - jiffies : 0;
404
405 if (tout <= 0)
406 tout = mptcp_sk(sk)->timer_ival;
407 mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
408}
409
Paolo Abeniea4ca582020-11-19 11:46:03 -0800410static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
411{
412 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
413
 414	/* can't send until the JOIN handshake has completed (i.e. the subflow is usable for mptcp) */
415 if (subflow->request_join && !subflow->fully_established)
416 return false;
417
418 /* only send if our side has not closed yet */
419 return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
420}
421
Paolo Abenifd897672020-11-24 22:51:24 +0100422static bool tcp_can_send_ack(const struct sock *ssk)
423{
424 return !((1 << inet_sk_state_load(ssk)) &
425 (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE));
426}
427
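/* send a (DATA_)ACK on every subflow whose state allows it */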
428static void mptcp_send_ack(struct mptcp_sock *msk)
Paolo Abeni7ed90802020-11-16 10:48:14 +0100429{
430 struct mptcp_subflow_context *subflow;
431
432 mptcp_for_each_subflow(msk, subflow) {
433 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
434
Paolo Abenifd897672020-11-24 22:51:24 +0100435 lock_sock(ssk);
436 if (tcp_can_send_ack(ssk))
Paolo Abeniea4ca582020-11-19 11:46:03 -0800437 tcp_send_ack(ssk);
Paolo Abenifd897672020-11-24 22:51:24 +0100438 release_sock(ssk);
439 }
440}
Paolo Abeniea4ca582020-11-19 11:46:03 -0800441
Paolo Abenifd897672020-11-24 22:51:24 +0100442static bool mptcp_subflow_cleanup_rbuf(struct sock *ssk)
443{
444 int ret;
445
446 lock_sock(ssk);
447 ret = tcp_can_send_ack(ssk);
448 if (ret)
449 tcp_cleanup_rbuf(ssk, 1);
450 release_sock(ssk);
451 return ret;
452}
453
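/* emit pending receive-window updates: prefer the subflow hinted by the last
 * received data, otherwise use the first subflow able to send an ack
 */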
454static void mptcp_cleanup_rbuf(struct mptcp_sock *msk)
455{
Paolo Abeni87952602020-11-27 11:10:24 +0100456 struct sock *ack_hint = READ_ONCE(msk->ack_hint);
Paolo Abenifd897672020-11-24 22:51:24 +0100457 struct mptcp_subflow_context *subflow;
458
459 /* if the hinted ssk is still active, try to use it */
Paolo Abeni87952602020-11-27 11:10:24 +0100460 if (likely(ack_hint)) {
Paolo Abenifd897672020-11-24 22:51:24 +0100461 mptcp_for_each_subflow(msk, subflow) {
462 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
463
Paolo Abeni87952602020-11-27 11:10:24 +0100464 if (ack_hint == ssk && mptcp_subflow_cleanup_rbuf(ssk))
Paolo Abenifd897672020-11-24 22:51:24 +0100465 return;
466 }
Paolo Abeniea4ca582020-11-19 11:46:03 -0800467 }
Paolo Abenifd897672020-11-24 22:51:24 +0100468
469 /* otherwise pick the first active subflow */
470 mptcp_for_each_subflow(msk, subflow)
471 if (mptcp_subflow_cleanup_rbuf(mptcp_subflow_tcp_sock(subflow)))
472 return;
Paolo Abeni7ed90802020-11-16 10:48:14 +0100473}
474
475static bool mptcp_check_data_fin(struct sock *sk)
Mat Martineau3721b9b2020-07-28 15:12:03 -0700476{
477 struct mptcp_sock *msk = mptcp_sk(sk);
478 u64 rcv_data_fin_seq;
Paolo Abeni7ed90802020-11-16 10:48:14 +0100479 bool ret = false;
Mat Martineau3721b9b2020-07-28 15:12:03 -0700480
481 if (__mptcp_check_fallback(msk) || !msk->first)
Paolo Abeni7ed90802020-11-16 10:48:14 +0100482 return ret;
Mat Martineau3721b9b2020-07-28 15:12:03 -0700483
484 /* Need to ack a DATA_FIN received from a peer while this side
485 * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2.
486 * msk->rcv_data_fin was set when parsing the incoming options
487 * at the subflow level and the msk lock was not held, so this
488 * is the first opportunity to act on the DATA_FIN and change
489 * the msk state.
490 *
491 * If we are caught up to the sequence number of the incoming
492 * DATA_FIN, send the DATA_ACK now and do state transition. If
493 * not caught up, do nothing and let the recv code send DATA_ACK
494 * when catching up.
495 */
496
497 if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) {
Mat Martineau917944d2020-09-29 15:08:19 -0700498 WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1);
Mat Martineau3721b9b2020-07-28 15:12:03 -0700499 WRITE_ONCE(msk->rcv_data_fin, 0);
500
501 sk->sk_shutdown |= RCV_SHUTDOWN;
Mat Martineau16a9a9d2020-07-28 15:12:05 -0700502 smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
503 set_bit(MPTCP_DATA_READY, &msk->flags);
Mat Martineau3721b9b2020-07-28 15:12:03 -0700504
505 switch (sk->sk_state) {
506 case TCP_ESTABLISHED:
507 inet_sk_state_store(sk, TCP_CLOSE_WAIT);
508 break;
509 case TCP_FIN_WAIT1:
510 inet_sk_state_store(sk, TCP_CLOSING);
511 break;
512 case TCP_FIN_WAIT2:
513 inet_sk_state_store(sk, TCP_CLOSE);
Mat Martineau3721b9b2020-07-28 15:12:03 -0700514 break;
515 default:
516 /* Other states not expected */
517 WARN_ON_ONCE(1);
518 break;
519 }
520
Paolo Abeni7ed90802020-11-16 10:48:14 +0100521 ret = true;
Mat Martineau3721b9b2020-07-28 15:12:03 -0700522 mptcp_set_timeout(sk, NULL);
Paolo Abenifd897672020-11-24 22:51:24 +0100523 mptcp_send_ack(msk);
Paolo Abenie16163b2020-11-16 10:48:09 +0100524 mptcp_close_wake_up(sk);
Mat Martineau3721b9b2020-07-28 15:12:03 -0700525 }
Paolo Abeni7ed90802020-11-16 10:48:14 +0100526 return ret;
Mat Martineau3721b9b2020-07-28 15:12:03 -0700527}
528
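/* drain in-order data from the given subflow receive queue into the msk,
 * stopping when the msk receive buffer limit is reached; '*bytes' is
 * increased by the amount of data moved
 */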
Florian Westphal6771bfd2020-02-26 10:14:48 +0100529static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
530 struct sock *ssk,
531 unsigned int *bytes)
532{
533 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
Florian Westphal600911f2020-02-26 10:14:49 +0100534 struct sock *sk = (struct sock *)msk;
Florian Westphal6771bfd2020-02-26 10:14:48 +0100535 unsigned int moved = 0;
536 bool more_data_avail;
537 struct tcp_sock *tp;
538 bool done = false;
Florian Westphal13c7ba02020-11-03 11:05:03 -0800539 int sk_rbuf;
540
541 sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
542
543 if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
544 int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
545
546 if (unlikely(ssk_rbuf > sk_rbuf)) {
547 WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf);
548 sk_rbuf = ssk_rbuf;
549 }
550 }
Florian Westphal600911f2020-02-26 10:14:49 +0100551
Paolo Abeniab174ad2020-09-14 10:01:12 +0200552 pr_debug("msk=%p ssk=%p", msk, ssk);
Florian Westphal6771bfd2020-02-26 10:14:48 +0100553 tp = tcp_sk(ssk);
554 do {
555 u32 map_remaining, offset;
556 u32 seq = tp->copied_seq;
557 struct sk_buff *skb;
558 bool fin;
559
560 /* try to move as much data as available */
561 map_remaining = subflow->map_data_len -
562 mptcp_subflow_get_map_offset(subflow);
563
564 skb = skb_peek(&ssk->sk_receive_queue);
Paolo Abenid9fb8c52020-10-06 08:27:34 +0200565 if (!skb) {
566 /* if no data is found, a racing workqueue/recvmsg
567 * already processed the new data, stop here or we
568 * can enter an infinite loop
569 */
570 if (!moved)
571 done = true;
Florian Westphal6771bfd2020-02-26 10:14:48 +0100572 break;
Paolo Abenid9fb8c52020-10-06 08:27:34 +0200573 }
Florian Westphal6771bfd2020-02-26 10:14:48 +0100574
Davide Carattie1ff9e82020-06-29 22:26:20 +0200575 if (__mptcp_check_fallback(msk)) {
576 /* if we are running under the workqueue, TCP could have
 577			 * collapsed skbs between dummy map creation and now, so
578 * be sure to adjust the size
579 */
580 map_remaining = skb->len;
581 subflow->map_data_len = skb->len;
582 }
583
Florian Westphal6771bfd2020-02-26 10:14:48 +0100584 offset = seq - TCP_SKB_CB(skb)->seq;
585 fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
586 if (fin) {
587 done = true;
588 seq++;
589 }
590
591 if (offset < skb->len) {
592 size_t len = skb->len - offset;
593
594 if (tp->urg_data)
595 done = true;
596
Paolo Abeniab174ad2020-09-14 10:01:12 +0200597 if (__mptcp_move_skb(msk, ssk, skb, offset, len))
598 moved += len;
Florian Westphal6771bfd2020-02-26 10:14:48 +0100599 seq += len;
Florian Westphal6771bfd2020-02-26 10:14:48 +0100600
601 if (WARN_ON_ONCE(map_remaining < len))
602 break;
603 } else {
604 WARN_ON_ONCE(!fin);
605 sk_eat_skb(ssk, skb);
606 done = true;
607 }
608
609 WRITE_ONCE(tp->copied_seq, seq);
610 more_data_avail = mptcp_subflow_data_available(ssk);
Florian Westphal600911f2020-02-26 10:14:49 +0100611
Florian Westphal13c7ba02020-11-03 11:05:03 -0800612 if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) {
Florian Westphal600911f2020-02-26 10:14:49 +0100613 done = true;
614 break;
615 }
Florian Westphal6771bfd2020-02-26 10:14:48 +0100616 } while (more_data_avail);
Paolo Abeni87952602020-11-27 11:10:24 +0100617 WRITE_ONCE(msk->ack_hint, ssk);
Florian Westphal6771bfd2020-02-26 10:14:48 +0100618
Paolo Abeni67193312020-09-14 10:01:09 +0200619 *bytes += moved;
Florian Westphal6771bfd2020-02-26 10:14:48 +0100620 return done;
621}
622
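/* move every skb that is now in sequence from the out-of-order rbtree to the
 * msk receive queue, updating msk->ack_seq; returns true if any data moved
 */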
Paolo Abeni87952602020-11-27 11:10:24 +0100623static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
Paolo Abeniab174ad2020-09-14 10:01:12 +0200624{
625 struct sock *sk = (struct sock *)msk;
626 struct sk_buff *skb, *tail;
627 bool moved = false;
628 struct rb_node *p;
629 u64 end_seq;
630
631 p = rb_first(&msk->out_of_order_queue);
Paolo Abeni06242e42020-09-14 10:01:14 +0200632 pr_debug("msk=%p empty=%d", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue));
Paolo Abeniab174ad2020-09-14 10:01:12 +0200633 while (p) {
634 skb = rb_to_skb(p);
635 if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq))
636 break;
637
638 p = rb_next(p);
639 rb_erase(&skb->rbnode, &msk->out_of_order_queue);
640
641 if (unlikely(!after64(MPTCP_SKB_CB(skb)->end_seq,
642 msk->ack_seq))) {
643 mptcp_drop(sk, skb);
Paolo Abeni06242e42020-09-14 10:01:14 +0200644 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
Paolo Abeniab174ad2020-09-14 10:01:12 +0200645 continue;
646 }
647
648 end_seq = MPTCP_SKB_CB(skb)->end_seq;
649 tail = skb_peek_tail(&sk->sk_receive_queue);
650 if (!tail || !mptcp_ooo_try_coalesce(msk, tail, skb)) {
651 int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;
652
653 /* skip overlapping data, if any */
Paolo Abeni06242e42020-09-14 10:01:14 +0200654 pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d",
655 MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq,
656 delta);
Paolo Abeniab174ad2020-09-14 10:01:12 +0200657 MPTCP_SKB_CB(skb)->offset += delta;
658 __skb_queue_tail(&sk->sk_receive_queue, skb);
659 }
660 msk->ack_seq = end_seq;
661 moved = true;
662 }
663 return moved;
664}
665
Florian Westphal2e522132020-02-26 10:14:51 +0100666/* In most cases we will be able to lock the mptcp socket. If it's already
667 * owned, we need to defer to the work queue to avoid ABBA deadlock.
668 */
Paolo Abeni87952602020-11-27 11:10:24 +0100669static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
Florian Westphal2e522132020-02-26 10:14:51 +0100670{
671 struct sock *sk = (struct sock *)msk;
672 unsigned int moved = 0;
673
Paolo Abeni87952602020-11-27 11:10:24 +0100674 if (inet_sk_state_load(sk) == TCP_CLOSE)
675 return;
Florian Westphal2e522132020-02-26 10:14:51 +0100676
Paolo Abeni87952602020-11-27 11:10:24 +0100677 mptcp_data_lock(sk);
Florian Westphal2e522132020-02-26 10:14:51 +0100678
Paolo Abeni87952602020-11-27 11:10:24 +0100679 __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
680 __mptcp_ofo_queue(msk);
Paolo Abeniab174ad2020-09-14 10:01:12 +0200681
Paolo Abeni87952602020-11-27 11:10:24 +0100682 /* If the moves have caught up with the DATA_FIN sequence number
683 * it's time to ack the DATA_FIN and change socket state, but
684 * this is not a good place to change state. Let the workqueue
685 * do it.
686 */
687 if (mptcp_pending_data_fin(sk, NULL))
688 mptcp_schedule_work(sk);
689 mptcp_data_unlock(sk);
Florian Westphal2e522132020-02-26 10:14:51 +0100690}
691
692void mptcp_data_ready(struct sock *sk, struct sock *ssk)
Florian Westphal101f6f82020-02-26 10:14:46 +0100693{
Paolo Abeni67193312020-09-14 10:01:09 +0200694 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
Florian Westphal101f6f82020-02-26 10:14:46 +0100695 struct mptcp_sock *msk = mptcp_sk(sk);
Florian Westphal13c7ba02020-11-03 11:05:03 -0800696 int sk_rbuf, ssk_rbuf;
Paolo Abeni67193312020-09-14 10:01:09 +0200697 bool wake;
Florian Westphal101f6f82020-02-26 10:14:46 +0100698
Paolo Abeni67193312020-09-14 10:01:09 +0200699	/* move_skbs_to_msk below can legitimately clear the data_avail flag,
 700	 * but we will later need to properly wake the reader, so cache its
701 * value
702 */
703 wake = subflow->data_avail == MPTCP_SUBFLOW_DATA_AVAIL;
704 if (wake)
705 set_bit(MPTCP_DATA_READY, &msk->flags);
Florian Westphal6771bfd2020-02-26 10:14:48 +0100706
Florian Westphal13c7ba02020-11-03 11:05:03 -0800707 ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
708 sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
709 if (unlikely(ssk_rbuf > sk_rbuf))
710 sk_rbuf = ssk_rbuf;
711
712 /* over limit? can't append more skbs to msk */
713 if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf)
Florian Westphal2e522132020-02-26 10:14:51 +0100714 goto wake;
715
Paolo Abeniea4ca582020-11-19 11:46:03 -0800716 move_skbs_to_msk(msk, ssk);
Florian Westphal600911f2020-02-26 10:14:49 +0100717
Florian Westphal600911f2020-02-26 10:14:49 +0100718wake:
Paolo Abeni67193312020-09-14 10:01:09 +0200719 if (wake)
720 sk->sk_data_ready(sk);
Florian Westphal101f6f82020-02-26 10:14:46 +0100721}
722
Geliang Tang84dfe362020-11-19 11:46:00 -0800723void __mptcp_flush_join_list(struct mptcp_sock *msk)
Peter Krystadec3edaa2020-03-27 14:48:40 -0700724{
725 if (likely(list_empty(&msk->join_list)))
726 return;
727
728 spin_lock_bh(&msk->join_list_lock);
729 list_splice_tail_init(&msk->join_list, &msk->conn_list);
730 spin_unlock_bh(&msk->join_list_lock);
731}
732
Paolo Abenib51f9b82020-03-27 14:48:44 -0700733static bool mptcp_timer_pending(struct sock *sk)
734{
735 return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
736}
737
738static void mptcp_reset_timer(struct sock *sk)
739{
740 struct inet_connection_sock *icsk = inet_csk(sk);
741 unsigned long tout;
742
Paolo Abenie16163b2020-11-16 10:48:09 +0100743 /* prevent rescheduling on close */
744 if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE))
745 return;
746
Paolo Abenib51f9b82020-03-27 14:48:44 -0700747 /* should never be called with mptcp level timer cleared */
748 tout = READ_ONCE(mptcp_sk(sk)->timer_ival);
749 if (WARN_ON_ONCE(!tout))
750 tout = TCP_RTO_MIN;
751 sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
752}
753
Paolo Abeniba8f48f2020-11-16 10:48:05 +0100754bool mptcp_schedule_work(struct sock *sk)
755{
756 if (inet_sk_state_load(sk) != TCP_CLOSE &&
757 schedule_work(&mptcp_sk(sk)->work)) {
758 /* each subflow already holds a reference to the sk, and the
759 * workqueue is invoked by a subflow, so sk can't go away here.
760 */
761 sock_hold(sk);
762 return true;
763 }
764 return false;
765}
766
Paolo Abenib51f9b82020-03-27 14:48:44 -0700767void mptcp_data_acked(struct sock *sk)
768{
769 mptcp_reset_timer(sk);
Paolo Abeni3b1d6212020-03-27 14:48:48 -0700770
Florian Westphal8edf0862020-11-16 10:48:12 +0100771 if ((test_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags) ||
Paolo Abeni813e0a62020-11-16 10:48:11 +0100772 mptcp_send_head(sk) ||
Paolo Abeniba8f48f2020-11-16 10:48:05 +0100773 (inet_sk_state_load(sk) != TCP_ESTABLISHED)))
774 mptcp_schedule_work(sk);
Paolo Abenib51f9b82020-03-27 14:48:44 -0700775}
776
Florian Westphal59832e22020-04-02 13:44:52 +0200777void mptcp_subflow_eof(struct sock *sk)
778{
Paolo Abeniba8f48f2020-11-16 10:48:05 +0100779 if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags))
780 mptcp_schedule_work(sk);
Florian Westphal59832e22020-04-02 13:44:52 +0200781}
782
Paolo Abeni59698562020-06-10 10:47:41 +0200783static void mptcp_check_for_eof(struct mptcp_sock *msk)
784{
785 struct mptcp_subflow_context *subflow;
786 struct sock *sk = (struct sock *)msk;
787 int receivers = 0;
788
789 mptcp_for_each_subflow(msk, subflow)
790 receivers += !subflow->rx_eof;
Paolo Abenie16163b2020-11-16 10:48:09 +0100791 if (receivers)
792 return;
Paolo Abeni59698562020-06-10 10:47:41 +0200793
Paolo Abenie16163b2020-11-16 10:48:09 +0100794 if (!(sk->sk_shutdown & RCV_SHUTDOWN)) {
Paolo Abeni59698562020-06-10 10:47:41 +0200795 /* hopefully temporary hack: propagate shutdown status
796 * to msk, when all subflows agree on it
797 */
798 sk->sk_shutdown |= RCV_SHUTDOWN;
799
800 smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
801 set_bit(MPTCP_DATA_READY, &msk->flags);
802 sk->sk_data_ready(sk);
803 }
Paolo Abenie16163b2020-11-16 10:48:09 +0100804
805 switch (sk->sk_state) {
806 case TCP_ESTABLISHED:
807 inet_sk_state_store(sk, TCP_CLOSE_WAIT);
808 break;
809 case TCP_FIN_WAIT1:
Paolo Abeni26aa2312020-11-19 11:45:55 -0800810 inet_sk_state_store(sk, TCP_CLOSING);
811 break;
812 case TCP_FIN_WAIT2:
Paolo Abenie16163b2020-11-16 10:48:09 +0100813 inet_sk_state_store(sk, TCP_CLOSE);
814 break;
815 default:
816 return;
817 }
818 mptcp_close_wake_up(sk);
Paolo Abeni59698562020-06-10 10:47:41 +0200819}
820
Paolo Abeni7a6a6cb2020-01-21 16:56:26 -0800821static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
822{
823 struct mptcp_subflow_context *subflow;
824 struct sock *sk = (struct sock *)msk;
825
826 sock_owned_by_me(sk);
827
828 mptcp_for_each_subflow(msk, subflow) {
829 if (subflow->data_avail)
830 return mptcp_subflow_tcp_sock(subflow);
831 }
832
833 return NULL;
834}
835
Paolo Abeni3f8e0aa2020-03-27 14:48:47 -0700836static bool mptcp_skb_can_collapse_to(u64 write_seq,
837 const struct sk_buff *skb,
838 const struct mptcp_ext *mpext)
Mat Martineau6d0060f2020-01-21 16:56:23 -0800839{
Paolo Abeni57040752020-01-21 16:56:27 -0800840 if (!tcp_skb_can_collapse_to(skb))
841 return false;
842
Paolo Abeni5a369ca2020-11-03 11:05:05 -0800843 /* can collapse only if MPTCP level sequence is in order and this
844 * mapping has not been xmitted yet
845 */
846 return mpext && mpext->data_seq + mpext->data_len == write_seq &&
847 !mpext->frozen;
Paolo Abeni57040752020-01-21 16:56:27 -0800848}
849
Paolo Abeni18b683b2020-03-27 14:48:43 -0700850static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
851 const struct page_frag *pfrag,
852 const struct mptcp_data_frag *df)
853{
854 return df && pfrag->page == df->page &&
Paolo Abenid9ca1de2020-11-16 10:48:10 +0100855 pfrag->size - pfrag->offset > 0 &&
Paolo Abeni18b683b2020-03-27 14:48:43 -0700856 df->data_seq + df->data_len == msk->write_seq;
857}
858
Paolo Abeni724cfd22020-11-27 11:10:25 +0100859static int mptcp_wmem_with_overhead(struct sock *sk, int size)
Paolo Abenie93da922020-11-27 11:10:23 +0100860{
Paolo Abeni724cfd22020-11-27 11:10:25 +0100861 struct mptcp_sock *msk = mptcp_sk(sk);
862 int ret, skbs;
863
864 ret = size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT);
865 skbs = (msk->tx_pending_data + size) / msk->size_goal_cache;
866 if (skbs < msk->skb_tx_cache.qlen)
867 return ret;
868
869 return ret + (skbs - msk->skb_tx_cache.qlen) * SKB_TRUESIZE(MAX_TCP_HEADER);
Paolo Abenie93da922020-11-27 11:10:23 +0100870}
871
872static void __mptcp_wmem_reserve(struct sock *sk, int size)
873{
Paolo Abeni724cfd22020-11-27 11:10:25 +0100874 int amount = mptcp_wmem_with_overhead(sk, size);
Paolo Abenie93da922020-11-27 11:10:23 +0100875 struct mptcp_sock *msk = mptcp_sk(sk);
876
877 WARN_ON_ONCE(msk->wmem_reserved);
878 if (amount <= sk->sk_forward_alloc)
879 goto reserve;
880
 881	/* under memory pressure try to reserve at most a single page,
 882	 * otherwise try to reserve the full estimate and fall back
883 * to a single page before entering the error path
884 */
885 if ((tcp_under_memory_pressure(sk) && amount > PAGE_SIZE) ||
886 !sk_wmem_schedule(sk, amount)) {
887 if (amount <= PAGE_SIZE)
888 goto nomem;
889
890 amount = PAGE_SIZE;
891 if (!sk_wmem_schedule(sk, amount))
892 goto nomem;
893 }
894
895reserve:
896 msk->wmem_reserved = amount;
897 sk->sk_forward_alloc -= amount;
898 return;
899
900nomem:
901 /* we will wait for memory on next allocation */
902 msk->wmem_reserved = -1;
903}
904
905static void __mptcp_update_wmem(struct sock *sk)
906{
907 struct mptcp_sock *msk = mptcp_sk(sk);
908
909 if (!msk->wmem_reserved)
910 return;
911
912 if (msk->wmem_reserved < 0)
913 msk->wmem_reserved = 0;
914 if (msk->wmem_reserved > 0) {
915 sk->sk_forward_alloc += msk->wmem_reserved;
916 msk->wmem_reserved = 0;
917 }
918}
919
920static bool mptcp_wmem_alloc(struct sock *sk, int size)
921{
922 struct mptcp_sock *msk = mptcp_sk(sk);
923
924 /* check for pre-existing error condition */
925 if (msk->wmem_reserved < 0)
926 return false;
927
928 if (msk->wmem_reserved >= size)
929 goto account;
930
Paolo Abeni87952602020-11-27 11:10:24 +0100931 mptcp_data_lock(sk);
932 if (!sk_wmem_schedule(sk, size)) {
933 mptcp_data_unlock(sk);
Paolo Abenie93da922020-11-27 11:10:23 +0100934 return false;
Paolo Abeni87952602020-11-27 11:10:24 +0100935 }
Paolo Abenie93da922020-11-27 11:10:23 +0100936
937 sk->sk_forward_alloc -= size;
938 msk->wmem_reserved += size;
Paolo Abeni87952602020-11-27 11:10:24 +0100939 mptcp_data_unlock(sk);
Paolo Abenie93da922020-11-27 11:10:23 +0100940
941account:
942 msk->wmem_reserved -= size;
943 return true;
944}
945
Paolo Abeni87952602020-11-27 11:10:24 +0100946static void mptcp_wmem_uncharge(struct sock *sk, int size)
947{
948 struct mptcp_sock *msk = mptcp_sk(sk);
949
950 if (msk->wmem_reserved < 0)
951 msk->wmem_reserved = 0;
952 msk->wmem_reserved += size;
953}
954
Paolo Abeni724cfd22020-11-27 11:10:25 +0100955static void mptcp_mem_reclaim_partial(struct sock *sk)
956{
957 struct mptcp_sock *msk = mptcp_sk(sk);
958
 959	/* if we are experiencing a transient allocation error,
 960	 * the forward allocated memory has already been
961 * released
962 */
963 if (msk->wmem_reserved < 0)
964 return;
965
966 mptcp_data_lock(sk);
967 sk->sk_forward_alloc += msk->wmem_reserved;
968 sk_mem_reclaim_partial(sk);
969 msk->wmem_reserved = sk->sk_forward_alloc;
970 sk->sk_forward_alloc = 0;
971 mptcp_data_unlock(sk);
972}
973
Paolo Abenid0272362020-03-27 14:48:45 -0700974static void dfrag_uncharge(struct sock *sk, int len)
Paolo Abeni18b683b2020-03-27 14:48:43 -0700975{
Paolo Abenid0272362020-03-27 14:48:45 -0700976 sk_mem_uncharge(sk, len);
Florian Westphal7948f6c2020-03-27 14:48:46 -0700977 sk_wmem_queued_add(sk, -len);
Paolo Abenid0272362020-03-27 14:48:45 -0700978}
979
980static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
981{
982 int len = dfrag->data_len + dfrag->overhead;
983
Paolo Abeni18b683b2020-03-27 14:48:43 -0700984 list_del(&dfrag->list);
Paolo Abenid0272362020-03-27 14:48:45 -0700985 dfrag_uncharge(sk, len);
Paolo Abeni18b683b2020-03-27 14:48:43 -0700986 put_page(dfrag->page);
987}
988
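/* drop from the rtx queue every fragment fully acked at the MPTCP level
 * (below snd_una), trim the partially acked head fragment and reclaim the
 * corresponding forward memory
 */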
989static void mptcp_clean_una(struct sock *sk)
990{
991 struct mptcp_sock *msk = mptcp_sk(sk);
992 struct mptcp_data_frag *dtmp, *dfrag;
Paolo Abenid0272362020-03-27 14:48:45 -0700993 bool cleaned = false;
Davide Carattie1ff9e82020-06-29 22:26:20 +0200994 u64 snd_una;
995
996 /* on fallback we just need to ignore snd_una, as this is really
997 * plain TCP
998 */
999 if (__mptcp_check_fallback(msk))
Paolo Abenieaa2ffa2020-11-16 10:48:08 +01001000 atomic64_set(&msk->snd_una, msk->snd_nxt);
Florian Westphal6f8a6122020-11-16 10:48:13 +01001001
Paolo Abeni87952602020-11-27 11:10:24 +01001002 mptcp_data_lock(sk);
Davide Carattie1ff9e82020-06-29 22:26:20 +02001003 snd_una = atomic64_read(&msk->snd_una);
Paolo Abeni18b683b2020-03-27 14:48:43 -07001004
1005 list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
1006 if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
1007 break;
1008
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001009 if (WARN_ON_ONCE(dfrag == msk->first_pending))
1010 break;
Paolo Abenid0272362020-03-27 14:48:45 -07001011 dfrag_clear(sk, dfrag);
1012 cleaned = true;
1013 }
1014
Florian Westphal7948f6c2020-03-27 14:48:46 -07001015 dfrag = mptcp_rtx_head(sk);
1016 if (dfrag && after64(snd_una, dfrag->data_seq)) {
Paolo Abeni53eb4c32020-07-23 13:02:30 +02001017 u64 delta = snd_una - dfrag->data_seq;
1018
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001019 if (WARN_ON_ONCE(delta > dfrag->already_sent))
Paolo Abeni53eb4c32020-07-23 13:02:30 +02001020 goto out;
Florian Westphal7948f6c2020-03-27 14:48:46 -07001021
1022 dfrag->data_seq += delta;
Paolo Abeni53eb4c32020-07-23 13:02:30 +02001023 dfrag->offset += delta;
Florian Westphal7948f6c2020-03-27 14:48:46 -07001024 dfrag->data_len -= delta;
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001025 dfrag->already_sent -= delta;
Florian Westphal7948f6c2020-03-27 14:48:46 -07001026
1027 dfrag_uncharge(sk, delta);
1028 cleaned = true;
1029 }
1030
Paolo Abeni53eb4c32020-07-23 13:02:30 +02001031out:
Paolo Abenie93da922020-11-27 11:10:23 +01001032 if (cleaned && tcp_under_memory_pressure(sk))
Paolo Abenid0272362020-03-27 14:48:45 -07001033 sk_mem_reclaim_partial(sk);
Paolo Abeni87952602020-11-27 11:10:24 +01001034 mptcp_data_unlock(sk);
Florian Westphal95ed6902020-11-03 11:05:06 -08001035}
Florian Westphal7948f6c2020-03-27 14:48:46 -07001036
Florian Westphal95ed6902020-11-03 11:05:06 -08001037static void mptcp_clean_una_wakeup(struct sock *sk)
1038{
1039 struct mptcp_sock *msk = mptcp_sk(sk);
Paolo Abeni63561a42020-09-14 10:01:07 +02001040
Florian Westphal95ed6902020-11-03 11:05:06 -08001041 mptcp_clean_una(sk);
1042
1043 /* Only wake up writers if a subflow is ready */
Florian Westphal8edf0862020-11-16 10:48:12 +01001044 if (sk_stream_is_writeable(sk)) {
1045 clear_bit(MPTCP_NOSPACE, &msk->flags);
Florian Westphal95ed6902020-11-03 11:05:06 -08001046 sk_stream_write_space(sk);
Paolo Abeni18b683b2020-03-27 14:48:43 -07001047 }
1048}
1049
Paolo Abeni724cfd22020-11-27 11:10:25 +01001050static void mptcp_enter_memory_pressure(struct sock *sk)
Paolo Abeni18b683b2020-03-27 14:48:43 -07001051{
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001052 struct mptcp_subflow_context *subflow;
1053 struct mptcp_sock *msk = mptcp_sk(sk);
1054 bool first = true;
1055
Paolo Abeni18b683b2020-03-27 14:48:43 -07001056 sk_stream_moderate_sndbuf(sk);
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001057 mptcp_for_each_subflow(msk, subflow) {
1058 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
1059
1060 if (first)
1061 tcp_enter_memory_pressure(ssk);
1062 sk_stream_moderate_sndbuf(ssk);
1063 first = false;
1064 }
Paolo Abeni724cfd22020-11-27 11:10:25 +01001065}
1066
1067/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
1068 * data
1069 */
1070static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1071{
1072 if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
1073 pfrag, sk->sk_allocation)))
1074 return true;
1075
1076 mptcp_enter_memory_pressure(sk);
Paolo Abeni18b683b2020-03-27 14:48:43 -07001077 return false;
1078}
1079
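/* carve a new data fragment descriptor out of the current page frag;
 * the user data will be copied right after the descriptor itself
 */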
1080static struct mptcp_data_frag *
1081mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
1082 int orig_offset)
1083{
1084 int offset = ALIGN(orig_offset, sizeof(long));
1085 struct mptcp_data_frag *dfrag;
1086
1087 dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset);
1088 dfrag->data_len = 0;
1089 dfrag->data_seq = msk->write_seq;
1090 dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag);
1091 dfrag->offset = offset + sizeof(struct mptcp_data_frag);
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001092 dfrag->already_sent = 0;
Paolo Abeni18b683b2020-03-27 14:48:43 -07001093 dfrag->page = pfrag->page;
1094
1095 return dfrag;
1096}
1097
Paolo Abenicaf971d2020-11-16 10:48:06 +01001098struct mptcp_sendmsg_info {
1099 int mss_now;
1100 int size_goal;
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001101 u16 limit;
1102 u16 sent;
1103 unsigned int flags;
Paolo Abenicaf971d2020-11-16 10:48:06 +01001104};
1105
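/* cap the transmit size so we never send data beyond the MPTCP-level
 * receive window announced by the peer
 */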
Florian Westphal6f8a6122020-11-16 10:48:13 +01001106static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq,
1107 int avail_size)
1108{
1109 u64 window_end = mptcp_wnd_end(msk);
1110
1111 if (__mptcp_check_fallback(msk))
1112 return avail_size;
1113
1114 if (!before64(data_seq + avail_size, window_end)) {
1115 u64 allowed_size = window_end - data_seq;
1116
1117 return min_t(unsigned int, allowed_size, avail_size);
1118 }
1119
1120 return avail_size;
1121}
1122
Paolo Abeni724cfd22020-11-27 11:10:25 +01001123static bool __mptcp_add_ext(struct sk_buff *skb, gfp_t gfp)
1124{
1125 struct skb_ext *mpext = __skb_ext_alloc(gfp);
1126
1127 if (!mpext)
1128 return false;
1129 __skb_ext_set(skb, SKB_EXT_MPTCP, mpext);
1130 return true;
1131}
1132
1133static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk)
1134{
1135 struct sk_buff *skb;
1136
1137 skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
1138 if (likely(skb)) {
1139 if (likely(__mptcp_add_ext(skb, sk->sk_allocation))) {
1140 skb_reserve(skb, MAX_TCP_HEADER);
1141 skb->reserved_tailroom = skb->end - skb->tail;
1142 return skb;
1143 }
1144 __kfree_skb(skb);
1145 } else {
1146 mptcp_enter_memory_pressure(sk);
1147 }
1148 return NULL;
1149}
1150
1151static bool mptcp_tx_cache_refill(struct sock *sk, int size,
1152 struct sk_buff_head *skbs, int *total_ts)
1153{
1154 struct mptcp_sock *msk = mptcp_sk(sk);
1155 struct sk_buff *skb;
1156 int space_needed;
1157
1158 if (unlikely(tcp_under_memory_pressure(sk))) {
1159 mptcp_mem_reclaim_partial(sk);
1160
1161 /* under pressure pre-allocate at most a single skb */
1162 if (msk->skb_tx_cache.qlen)
1163 return true;
1164 space_needed = msk->size_goal_cache;
1165 } else {
1166 space_needed = msk->tx_pending_data + size -
1167 msk->skb_tx_cache.qlen * msk->size_goal_cache;
1168 }
1169
1170 while (space_needed > 0) {
1171 skb = __mptcp_do_alloc_tx_skb(sk);
1172 if (unlikely(!skb)) {
1173 /* under memory pressure, try to pass the caller a
1174 * single skb to allow forward progress
1175 */
1176 while (skbs->qlen > 1) {
1177 skb = __skb_dequeue_tail(skbs);
1178 __kfree_skb(skb);
1179 }
1180 return skbs->qlen > 0;
1181 }
1182
1183 *total_ts += skb->truesize;
1184 __skb_queue_tail(skbs, skb);
1185 space_needed -= msk->size_goal_cache;
1186 }
1187 return true;
1188}
1189
1190static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk)
1191{
1192 struct mptcp_sock *msk = mptcp_sk(sk);
1193 struct sk_buff *skb;
1194
1195 if (ssk->sk_tx_skb_cache) {
1196 skb = ssk->sk_tx_skb_cache;
1197 if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) &&
1198 !__mptcp_add_ext(skb, sk->sk_allocation)))
1199 return false;
1200 return true;
1201 }
1202
1203 skb = skb_peek(&msk->skb_tx_cache);
1204 if (skb) {
1205 if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
1206 skb = __skb_dequeue(&msk->skb_tx_cache);
1207 if (WARN_ON_ONCE(!skb))
1208 return false;
1209
1210 mptcp_wmem_uncharge(sk, skb->truesize);
1211 ssk->sk_tx_skb_cache = skb;
1212 return true;
1213 }
1214
 1215		/* over the memory limit, no point in trying to allocate a new skb */
1216 return false;
1217 }
1218
1219 skb = __mptcp_do_alloc_tx_skb(sk);
1220 if (!skb)
1221 return false;
1222
1223 if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
1224 ssk->sk_tx_skb_cache = skb;
1225 return true;
1226 }
1227 kfree_skb(skb);
1228 return false;
1229}
1230
1231static bool mptcp_must_reclaim_memory(struct sock *sk, struct sock *ssk)
1232{
1233 return !ssk->sk_tx_skb_cache &&
1234 !skb_peek(&mptcp_sk(sk)->skb_tx_cache) &&
1235 tcp_under_memory_pressure(sk);
1236}
1237
1238static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk)
1239{
1240 if (unlikely(mptcp_must_reclaim_memory(sk, ssk)))
1241 mptcp_mem_reclaim_partial(sk);
1242 return __mptcp_alloc_tx_skb(sk, ssk);
1243}
1244
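/* transmit (part of) the given dfrag on the specified subflow, attaching the
 * DSS mapping; returns the number of bytes queued or a negative error
 */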
Paolo Abeni57040752020-01-21 16:56:27 -08001245static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001246 struct mptcp_data_frag *dfrag,
Paolo Abenicaf971d2020-11-16 10:48:06 +01001247 struct mptcp_sendmsg_info *info)
Paolo Abeni57040752020-01-21 16:56:27 -08001248{
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001249 u64 data_seq = dfrag->data_seq + info->sent;
Mat Martineau6d0060f2020-01-21 16:56:23 -08001250 struct mptcp_sock *msk = mptcp_sk(sk);
Florian Westphal6f8a6122020-11-16 10:48:13 +01001251 bool zero_window_probe = false;
Mat Martineau6d0060f2020-01-21 16:56:23 -08001252 struct mptcp_ext *mpext = NULL;
Paolo Abeni57040752020-01-21 16:56:27 -08001253 struct sk_buff *skb, *tail;
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001254 bool can_collapse = false;
1255 int avail_size;
Paolo Abeni724cfd22020-11-27 11:10:25 +01001256 size_t ret = 0;
Mat Martineau6d0060f2020-01-21 16:56:23 -08001257
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001258 pr_debug("msk=%p ssk=%p sending dfrag at seq=%lld len=%d already sent=%d",
1259 msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);
Mat Martineau6d0060f2020-01-21 16:56:23 -08001260
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001261 /* compute send limit */
1262 info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
Paolo Abenicaf971d2020-11-16 10:48:06 +01001263 avail_size = info->size_goal;
Paolo Abeni724cfd22020-11-27 11:10:25 +01001264 msk->size_goal_cache = info->size_goal;
Paolo Abeni57040752020-01-21 16:56:27 -08001265 skb = tcp_write_queue_tail(ssk);
1266 if (skb) {
Paolo Abeni57040752020-01-21 16:56:27 -08001267 /* Limit the write to the size available in the
 1268		 * current skb, if any, so that we create at most one new skb.
 1269		 * Explicitly tell TCP internals to avoid collapsing on later
 1270		 * queue management operations, to avoid breaking the ext <->
1271 * SSN association set here
1272 */
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001273 mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
Paolo Abenicaf971d2020-11-16 10:48:06 +01001274 can_collapse = (info->size_goal - skb->len > 0) &&
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001275 mptcp_skb_can_collapse_to(data_seq, skb, mpext);
Paolo Abeni57040752020-01-21 16:56:27 -08001276 if (!can_collapse)
1277 TCP_SKB_CB(skb)->eor = 1;
1278 else
Paolo Abenicaf971d2020-11-16 10:48:06 +01001279 avail_size = info->size_goal - skb->len;
Paolo Abeni57040752020-01-21 16:56:27 -08001280 }
Paolo Abeni18b683b2020-03-27 14:48:43 -07001281
Florian Westphal6f8a6122020-11-16 10:48:13 +01001282 /* Zero window and all data acked? Probe. */
1283 avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size);
1284 if (avail_size == 0) {
1285 if (skb || atomic64_read(&msk->snd_una) != msk->snd_nxt)
1286 return 0;
1287 zero_window_probe = true;
1288 data_seq = atomic64_read(&msk->snd_una) - 1;
1289 avail_size = 1;
1290 }
1291
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001292 if (WARN_ON_ONCE(info->sent > info->limit ||
1293 info->limit > dfrag->data_len))
1294 return 0;
Paolo Abeni3f8e0aa2020-03-27 14:48:47 -07001295
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001296 ret = info->limit - info->sent;
1297 tail = tcp_build_frag(ssk, avail_size, info->flags, dfrag->page,
1298 dfrag->offset + info->sent, &ret);
Paolo Abenie2223992020-11-16 10:48:03 +01001299 if (!tail) {
1300 tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk));
1301 return -ENOMEM;
Florian Westphal357593832020-08-14 15:56:34 +02001302 }
Paolo Abeni18b683b2020-03-27 14:48:43 -07001303
Paolo Abenie2223992020-11-16 10:48:03 +01001304 /* if the tail skb is still the cached one, collapsing really happened.
Paolo Abeni57040752020-01-21 16:56:27 -08001305 */
Paolo Abenie2223992020-11-16 10:48:03 +01001306 if (skb == tail) {
Paolo Abeni57040752020-01-21 16:56:27 -08001307 WARN_ON_ONCE(!can_collapse);
1308 mpext->data_len += ret;
Florian Westphal6f8a6122020-11-16 10:48:13 +01001309 WARN_ON_ONCE(zero_window_probe);
Paolo Abeni57040752020-01-21 16:56:27 -08001310 goto out;
1311 }
1312
Paolo Abeni724cfd22020-11-27 11:10:25 +01001313 mpext = skb_ext_find(tail, SKB_EXT_MPTCP);
1314 if (WARN_ON_ONCE(!mpext)) {
1315 /* should never reach here, stream corrupted */
1316 return -EINVAL;
1317 }
Mat Martineau6d0060f2020-01-21 16:56:23 -08001318
1319 memset(mpext, 0, sizeof(*mpext));
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001320 mpext->data_seq = data_seq;
Mat Martineau6d0060f2020-01-21 16:56:23 -08001321 mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
1322 mpext->data_len = ret;
1323 mpext->use_map = 1;
1324 mpext->dsn64 = 1;
1325
1326 pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
1327 mpext->data_seq, mpext->subflow_seq, mpext->data_len,
1328 mpext->dsn64);
1329
Florian Westphal6f8a6122020-11-16 10:48:13 +01001330 if (zero_window_probe) {
1331 mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
1332 mpext->frozen = 1;
1333 ret = 0;
1334 tcp_push_pending_frames(ssk);
1335 }
Paolo Abeni57040752020-01-21 16:56:27 -08001336out:
Mat Martineau6d0060f2020-01-21 16:56:23 -08001337 mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
Mat Martineau6d0060f2020-01-21 16:56:23 -08001338 return ret;
1339}
1340
Paolo Abeni63561a42020-09-14 10:01:07 +02001341static void mptcp_nospace(struct mptcp_sock *msk)
Florian Westphala0e17062020-05-16 10:46:17 +02001342{
Paolo Abeni63561a42020-09-14 10:01:07 +02001343 struct mptcp_subflow_context *subflow;
1344
Florian Westphal8edf0862020-11-16 10:48:12 +01001345 set_bit(MPTCP_NOSPACE, &msk->flags);
Florian Westphala0e17062020-05-16 10:46:17 +02001346 smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
1347
Paolo Abeni63561a42020-09-14 10:01:07 +02001348 mptcp_for_each_subflow(msk, subflow) {
1349 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
Florian Westphal8edf0862020-11-16 10:48:12 +01001350 bool ssk_writeable = sk_stream_is_writeable(ssk);
Paolo Abeni63561a42020-09-14 10:01:07 +02001351 struct socket *sock = READ_ONCE(ssk->sk_socket);
1352
Florian Westphal8edf0862020-11-16 10:48:12 +01001353 if (ssk_writeable || !sock)
1354 continue;
1355
Paolo Abeni63561a42020-09-14 10:01:07 +02001356 /* enables ssk->write_space() callbacks */
Florian Westphal8edf0862020-11-16 10:48:12 +01001357 set_bit(SOCK_NOSPACE, &sock->flags);
Paolo Abeni63561a42020-09-14 10:01:07 +02001358 }
Florian Westphal8edf0862020-11-16 10:48:12 +01001359
1360 /* mptcp_data_acked() could run just before we set the NOSPACE bit,
1361 * so explicitly check for snd_una value
1362 */
1363 mptcp_clean_una((struct sock *)msk);
Florian Westphala0e17062020-05-16 10:46:17 +02001364}
1365
Paolo Abenid5f49192020-09-14 10:01:17 +02001366#define MPTCP_SEND_BURST_SIZE ((1 << 16) - \
1367 sizeof(struct tcphdr) - \
1368 MAX_TCP_OPTION_SPACE - \
1369 sizeof(struct ipv6hdr) - \
1370 sizeof(struct frag_hdr))
1371
1372struct subflow_send_info {
1373 struct sock *ssk;
1374 u64 ratio;
1375};
1376
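/* subflow scheduler: reuse the last subflow while the burst budget allows it,
 * otherwise pick the non-backup subflow with the lowest queued-bytes/pacing
 * ratio; also reports the largest subflow send window via 'sndbuf'
 */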
Paolo Abenida51aef2020-09-14 10:01:10 +02001377static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
1378 u32 *sndbuf)
Peter Krystadf2962342020-03-27 14:48:39 -07001379{
Paolo Abenid5f49192020-09-14 10:01:17 +02001380 struct subflow_send_info send_info[2];
Peter Krystadf2962342020-03-27 14:48:39 -07001381 struct mptcp_subflow_context *subflow;
Paolo Abenid5f49192020-09-14 10:01:17 +02001382 int i, nr_active = 0;
1383 struct sock *ssk;
1384 u64 ratio;
1385 u32 pace;
Peter Krystadf2962342020-03-27 14:48:39 -07001386
Paolo Abenid5f49192020-09-14 10:01:17 +02001387 sock_owned_by_me((struct sock *)msk);
Peter Krystadf2962342020-03-27 14:48:39 -07001388
Paolo Abenida51aef2020-09-14 10:01:10 +02001389 *sndbuf = 0;
Paolo Abenid5f49192020-09-14 10:01:17 +02001390 if (__mptcp_check_fallback(msk)) {
1391 if (!msk->first)
Peter Krystadf2962342020-03-27 14:48:39 -07001392 return NULL;
Paolo Abenid5f49192020-09-14 10:01:17 +02001393 *sndbuf = msk->first->sk_sndbuf;
1394 return sk_stream_memory_free(msk->first) ? msk->first : NULL;
Peter Krystadf2962342020-03-27 14:48:39 -07001395 }
1396
Paolo Abenid5f49192020-09-14 10:01:17 +02001397	/* re-use the last subflow, if the burst budget allows that */
1398 if (msk->last_snd && msk->snd_burst > 0 &&
1399 sk_stream_memory_free(msk->last_snd) &&
1400 mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
1401 mptcp_for_each_subflow(msk, subflow) {
1402 ssk = mptcp_subflow_tcp_sock(subflow);
1403 *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
1404 }
1405 return msk->last_snd;
1406 }
1407
 1408	/* pick the subflow with the lowest wmem/wspace ratio */
1409 for (i = 0; i < 2; ++i) {
1410 send_info[i].ssk = NULL;
1411 send_info[i].ratio = -1;
1412 }
1413 mptcp_for_each_subflow(msk, subflow) {
1414 ssk = mptcp_subflow_tcp_sock(subflow);
1415 if (!mptcp_subflow_active(subflow))
1416 continue;
1417
1418 nr_active += !subflow->backup;
1419 *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
1420 if (!sk_stream_memory_free(subflow->tcp_sock))
1421 continue;
1422
1423 pace = READ_ONCE(ssk->sk_pacing_rate);
1424 if (!pace)
1425 continue;
1426
1427 ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32,
1428 pace);
1429 if (ratio < send_info[subflow->backup].ratio) {
1430 send_info[subflow->backup].ssk = ssk;
1431 send_info[subflow->backup].ratio = ratio;
1432 }
1433 }
1434
1435 pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld",
1436 msk, nr_active, send_info[0].ssk, send_info[0].ratio,
1437 send_info[1].ssk, send_info[1].ratio);
1438
1439 /* pick the best backup if no other subflow is active */
1440 if (!nr_active)
1441 send_info[0].ssk = send_info[1].ssk;
1442
1443 if (send_info[0].ssk) {
1444 msk->last_snd = send_info[0].ssk;
1445 msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
1446 sk_stream_wspace(msk->last_snd));
1447 return msk->last_snd;
1448 }
1449 return NULL;
Peter Krystadf2962342020-03-27 14:48:39 -07001450}
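/* Selection sketch with illustrative numbers (not taken from a real trace):
 * the ratio is (wmem_queued << 32) / pacing_rate, i.e. a scaled estimate of
 * how long each subflow needs to drain its already-queued data, so the
 * lowest ratio wins within its class.  For example:
 *
 *   subflow A: 64000 bytes queued, paced at 8,000,000 B/s -> ~8 ms backlog
 *   subflow B: 48000 bytes queued, paced at 2,000,000 B/s -> ~24 ms backlog
 *
 * A gets picked.  send_info[0] tracks the best non-backup subflow and
 * send_info[1] the best backup; the backup slot is used only when
 * nr_active == 0.
 */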
1451
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001452static void mptcp_push_release(struct sock *sk, struct sock *ssk,
1453 struct mptcp_sendmsg_info *info)
1454{
1455 mptcp_set_timeout(sk, ssk);
1456 tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal);
1457 release_sock(ssk);
1458}
1459
1460static void mptcp_push_pending(struct sock *sk, unsigned int flags)
1461{
1462 struct sock *prev_ssk = NULL, *ssk = NULL;
1463 struct mptcp_sock *msk = mptcp_sk(sk);
1464 struct mptcp_sendmsg_info info = {
1465 .flags = flags,
1466 };
1467 struct mptcp_data_frag *dfrag;
1468 int len, copied = 0;
1469 u32 sndbuf;
1470
1471 while ((dfrag = mptcp_send_head(sk))) {
1472 info.sent = dfrag->already_sent;
1473 info.limit = dfrag->data_len;
1474 len = dfrag->data_len - dfrag->already_sent;
1475 while (len > 0) {
1476 int ret = 0;
1477
1478 prev_ssk = ssk;
1479 __mptcp_flush_join_list(msk);
1480 ssk = mptcp_subflow_get_send(msk, &sndbuf);
1481
1482 /* do auto tuning */
1483 if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
1484 sndbuf > READ_ONCE(sk->sk_sndbuf))
1485 WRITE_ONCE(sk->sk_sndbuf, sndbuf);
1486
1487 /* try to keep the subflow socket lock across consecutive
1488 * transmissions on the same subflow
1489 */
1490 if (ssk != prev_ssk && prev_ssk)
1491 mptcp_push_release(sk, prev_ssk, &info);
1492 if (!ssk)
1493 goto out;
1494
1495 if (ssk != prev_ssk || !prev_ssk)
1496 lock_sock(ssk);
1497
Paolo Abeni724cfd22020-11-27 11:10:25 +01001498 /* keep it simple and always provide a new skb for the
1499 * subflow, even if we will not use it when collapsing
1500 * on the pending one
1501 */
1502 if (!mptcp_alloc_tx_skb(sk, ssk)) {
1503 mptcp_push_release(sk, ssk, &info);
1504 goto out;
1505 }
1506
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001507 ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
1508 if (ret <= 0) {
1509 mptcp_push_release(sk, ssk, &info);
1510 goto out;
1511 }
1512
1513 info.sent += ret;
1514 dfrag->already_sent += ret;
1515 msk->snd_nxt += ret;
1516 msk->snd_burst -= ret;
Paolo Abeni724cfd22020-11-27 11:10:25 +01001517 msk->tx_pending_data -= ret;
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001518 copied += ret;
1519 len -= ret;
1520 }
1521 WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
1522 }
1523
1524 /* at this point we still hold the socket lock for the last subflow we used */
1525 if (ssk)
1526 mptcp_push_release(sk, ssk, &info);
1527
1528out:
Paolo Abenib680a212020-11-18 23:05:34 +01001529 if (copied) {
1530 /* start the timer, if it's not pending */
1531 if (!mptcp_timer_pending(sk))
1532 mptcp_reset_timer(sk);
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001533 __mptcp_check_send_data_fin(sk);
Paolo Abenib680a212020-11-18 23:05:34 +01001534 }
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001535}
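/* Worked example of the dfrag bookkeeping above, with made-up numbers: for a
 * dfrag with data_len = 3000 and already_sent = 1000, the loop starts with
 * info.sent = 1000, info.limit = 3000 and len = 2000.  If the first
 * mptcp_sendmsg_frag() call pushes 1400 bytes, info.sent and
 * dfrag->already_sent become 2400, snd_nxt advances by 1400 and len drops to
 * 600; a second call pushing the remaining 600 bytes completes the dfrag and
 * first_pending moves on to mptcp_send_next(sk).
 */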
1536
Mat Martineauf870fa02020-01-21 16:56:15 -08001537static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
1538{
1539 struct mptcp_sock *msk = mptcp_sk(sk);
Florian Westphal17091702020-05-16 10:46:21 +02001540 struct page_frag *pfrag;
Mat Martineau6d0060f2020-01-21 16:56:23 -08001541 size_t copied = 0;
Paolo Abenicaf971d2020-11-16 10:48:06 +01001542 int ret = 0;
Mat Martineau6d0060f2020-01-21 16:56:23 -08001543 long timeo;
Mat Martineauf870fa02020-01-21 16:56:15 -08001544
1545 if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
1546 return -EOPNOTSUPP;
1547
Paolo Abenie93da922020-11-27 11:10:23 +01001548 mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, len));
Mat Martineau1954b862020-02-28 15:47:39 -08001549
1550 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1551
1552 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
1553 ret = sk_stream_wait_connect(sk, &timeo);
1554 if (ret)
1555 goto out;
1556 }
1557
Florian Westphal17091702020-05-16 10:46:21 +02001558 pfrag = sk_page_frag(sk);
Paolo Abeni18b683b2020-03-27 14:48:43 -07001559 mptcp_clean_una(sk);
1560
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001561 while (msg_data_left(msg)) {
Paolo Abeni724cfd22020-11-27 11:10:25 +01001562 int total_ts, frag_truesize = 0;
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001563 struct mptcp_data_frag *dfrag;
Paolo Abeni724cfd22020-11-27 11:10:25 +01001564 struct sk_buff_head skbs;
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001565 bool dfrag_collapsed;
1566 size_t psize, offset;
Mat Martineau57baaf22020-07-28 15:12:00 -07001567
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001568 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
1569 ret = -EPIPE;
1570 goto out;
Florian Westphalfb529e62020-05-16 10:46:18 +02001571 }
1572
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001573 /* reuse tail pfrag, if possible, or carve a new one from the
1574 * page allocator
1575 */
1576 dfrag = mptcp_pending_tail(sk);
1577 dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
1578 if (!dfrag_collapsed) {
1579 if (!sk_stream_memory_free(sk)) {
1580 mptcp_push_pending(sk, msg->msg_flags);
1581 if (!sk_stream_memory_free(sk))
1582 goto wait_for_memory;
1583 }
1584 if (!mptcp_page_frag_refill(sk, pfrag))
1585 goto wait_for_memory;
1586
1587 dfrag = mptcp_carve_data_frag(msk, pfrag, pfrag->offset);
1588 frag_truesize = dfrag->overhead;
1589 }
1590
1591 /* we do not bound against wspace, so that a single packet can
1592 * always be queued; memory accounting will prevent excessive
1593 * memory usage anyway
1594 */
1595 offset = dfrag->offset + dfrag->data_len;
1596 psize = pfrag->size - offset;
1597 psize = min_t(size_t, psize, msg_data_left(msg));
Paolo Abeni724cfd22020-11-27 11:10:25 +01001598 total_ts = psize + frag_truesize;
1599 __skb_queue_head_init(&skbs);
1600 if (!mptcp_tx_cache_refill(sk, psize, &skbs, &total_ts))
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001601 goto wait_for_memory;
1602
Paolo Abeni724cfd22020-11-27 11:10:25 +01001603 if (!mptcp_wmem_alloc(sk, total_ts)) {
1604 __skb_queue_purge(&skbs);
1605 goto wait_for_memory;
1606 }
1607
1608 skb_queue_splice_tail(&skbs, &msk->skb_tx_cache);
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001609 if (copy_page_from_iter(dfrag->page, offset, psize,
1610 &msg->msg_iter) != psize) {
Paolo Abeni87952602020-11-27 11:10:24 +01001611 mptcp_wmem_uncharge(sk, psize + frag_truesize);
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001612 ret = -EFAULT;
1613 goto out;
1614 }
1615
1616 /* data successfully copied into the write queue */
1617 copied += psize;
1618 dfrag->data_len += psize;
1619 frag_truesize += psize;
1620 pfrag->offset += frag_truesize;
1621 WRITE_ONCE(msk->write_seq, msk->write_seq + psize);
1622
1623 /* charge data on mptcp pending queue to the msk socket
1624 * Note: we charge such data both to sk and ssk
1625 */
1626 sk_wmem_queued_add(sk, frag_truesize);
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001627 if (!dfrag_collapsed) {
1628 get_page(dfrag->page);
1629 list_add_tail(&dfrag->list, &msk->rtx_queue);
1630 if (!msk->first_pending)
1631 WRITE_ONCE(msk->first_pending, dfrag);
1632 }
1633 pr_debug("msk=%p dfrag at seq=%lld len=%d sent=%d new=%d", msk,
1634 dfrag->data_seq, dfrag->data_len, dfrag->already_sent,
1635 !dfrag_collapsed);
1636
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001637 continue;
1638
1639wait_for_memory:
Paolo Abeni63561a42020-09-14 10:01:07 +02001640 mptcp_nospace(msk);
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001641 if (mptcp_timer_pending(sk))
1642 mptcp_reset_timer(sk);
Peter Krystadf2962342020-03-27 14:48:39 -07001643 ret = sk_stream_wait_memory(sk, &timeo);
1644 if (ret)
1645 goto out;
Peter Krystadcec37a62020-01-21 16:56:18 -08001646 }
1647
Paolo Abeni724cfd22020-11-27 11:10:25 +01001648 if (copied) {
1649 msk->tx_pending_data += copied;
Paolo Abenid9ca1de2020-11-16 10:48:10 +01001650 mptcp_push_pending(sk, msg->msg_flags);
Paolo Abeni724cfd22020-11-27 11:10:25 +01001651 }
Paolo Abenida51aef2020-09-14 10:01:10 +02001652
Mat Martineau1954b862020-02-28 15:47:39 -08001653out:
Peter Krystadcec37a62020-01-21 16:56:18 -08001654 release_sock(sk);
Paolo Abeni8555c6b2020-08-03 18:40:39 +02001655 return copied ? : ret;
Mat Martineauf870fa02020-01-21 16:56:15 -08001656}
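/* Note on the copy path above: user data is copied once into msk-level page
 * frags (dfrags).  When the tail of the page frag can be reused
 * (dfrag_collapsed), new bytes simply extend data_len of the existing dfrag;
 * otherwise a fresh dfrag is carved and pays its own metadata overhead.
 * Subflow transmission, and any later retransmission on a different subflow,
 * references the same pages, so no second copy from userspace is needed.
 */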
1657
Paolo Abeni7a6a6cb2020-01-21 16:56:26 -08001658static void mptcp_wait_data(struct sock *sk, long *timeo)
1659{
1660 DEFINE_WAIT_FUNC(wait, woken_wake_function);
1661 struct mptcp_sock *msk = mptcp_sk(sk);
1662
1663 add_wait_queue(sk_sleep(sk), &wait);
1664 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
1665
1666 sk_wait_event(sk, timeo,
1667 test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait);
1668
1669 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
1670 remove_wait_queue(sk_sleep(sk), &wait);
1671}
1672
Florian Westphal6771bfd2020-02-26 10:14:48 +01001673static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
1674 struct msghdr *msg,
1675 size_t len)
1676{
Florian Westphal6771bfd2020-02-26 10:14:48 +01001677 struct sk_buff *skb;
1678 int copied = 0;
1679
Paolo Abeni87952602020-11-27 11:10:24 +01001680 while ((skb = skb_peek(&msk->receive_queue)) != NULL) {
Florian Westphal6771bfd2020-02-26 10:14:48 +01001681 u32 offset = MPTCP_SKB_CB(skb)->offset;
1682 u32 data_len = skb->len - offset;
1683 u32 count = min_t(size_t, len - copied, data_len);
1684 int err;
1685
1686 err = skb_copy_datagram_msg(skb, offset, msg, count);
1687 if (unlikely(err < 0)) {
1688 if (!copied)
1689 return err;
1690 break;
1691 }
1692
1693 copied += count;
1694
1695 if (count < data_len) {
1696 MPTCP_SKB_CB(skb)->offset += count;
1697 break;
1698 }
1699
Paolo Abeni87952602020-11-27 11:10:24 +01001700 /* we will bulk release the skb memory later */
1701 skb->destructor = NULL;
1702 msk->rmem_released += skb->truesize;
1703 __skb_unlink(skb, &msk->receive_queue);
Florian Westphal6771bfd2020-02-26 10:14:48 +01001704 __kfree_skb(skb);
1705
1706 if (copied >= len)
1707 break;
1708 }
1709
1710 return copied;
1711}
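/* Example of a partial read (hypothetical sizes): if the skb at the head of
 * msk->receive_queue carries 4000 bytes and the caller asked for 1000,
 * skb_copy_datagram_msg() copies 1000 bytes, MPTCP_SKB_CB(skb)->offset is
 * advanced to 1000 and the skb stays queued; the next recvmsg() resumes from
 * that offset.  Only fully consumed skbs are unlinked and freed, with their
 * truesize accumulated in rmem_released for a later bulk uncharge.
 */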
1712
Florian Westphala6b118f2020-06-30 21:24:45 +02001713/* receive buffer autotuning. See tcp_rcv_space_adjust for more information.
1714 *
1715 * Only difference: Use highest rtt estimate of the subflows in use.
1716 */
1717static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
1718{
1719 struct mptcp_subflow_context *subflow;
1720 struct sock *sk = (struct sock *)msk;
1721 u32 time, advmss = 1;
1722 u64 rtt_us, mstamp;
1723
1724 sock_owned_by_me(sk);
1725
1726 if (copied <= 0)
1727 return;
1728
1729 msk->rcvq_space.copied += copied;
1730
1731 mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC);
1732 time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time);
1733
1734 rtt_us = msk->rcvq_space.rtt_us;
1735 if (rtt_us && time < (rtt_us >> 3))
1736 return;
1737
1738 rtt_us = 0;
1739 mptcp_for_each_subflow(msk, subflow) {
1740 const struct tcp_sock *tp;
1741 u64 sf_rtt_us;
1742 u32 sf_advmss;
1743
1744 tp = tcp_sk(mptcp_subflow_tcp_sock(subflow));
1745
1746 sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us);
1747 sf_advmss = READ_ONCE(tp->advmss);
1748
1749 rtt_us = max(sf_rtt_us, rtt_us);
1750 advmss = max(sf_advmss, advmss);
1751 }
1752
1753 msk->rcvq_space.rtt_us = rtt_us;
1754 if (time < (rtt_us >> 3) || rtt_us == 0)
1755 return;
1756
1757 if (msk->rcvq_space.copied <= msk->rcvq_space.space)
1758 goto new_measure;
1759
1760 if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
1761 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
1762 int rcvmem, rcvbuf;
1763 u64 rcvwin, grow;
1764
1765 rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;
1766
1767 grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);
1768
1769 do_div(grow, msk->rcvq_space.space);
1770 rcvwin += (grow << 1);
1771
1772 rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER);
1773 while (tcp_win_from_space(sk, rcvmem) < advmss)
1774 rcvmem += 128;
1775
1776 do_div(rcvwin, advmss);
1777 rcvbuf = min_t(u64, rcvwin * rcvmem,
1778 sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
1779
1780 if (rcvbuf > sk->sk_rcvbuf) {
1781 u32 window_clamp;
1782
1783 window_clamp = tcp_win_from_space(sk, rcvbuf);
1784 WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
1785
1786 /* Make subflows follow along. If we do not do this, we
1787 * get drops at subflow level if skbs can't be moved to
1788 * the mptcp rx queue fast enough (announced rcv_win can
1789 * exceed ssk->sk_rcvbuf).
1790 */
1791 mptcp_for_each_subflow(msk, subflow) {
1792 struct sock *ssk;
Paolo Abenic76c6952020-09-14 10:01:18 +02001793 bool slow;
Florian Westphala6b118f2020-06-30 21:24:45 +02001794
1795 ssk = mptcp_subflow_tcp_sock(subflow);
Paolo Abenic76c6952020-09-14 10:01:18 +02001796 slow = lock_sock_fast(ssk);
Florian Westphala6b118f2020-06-30 21:24:45 +02001797 WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
1798 tcp_sk(ssk)->window_clamp = window_clamp;
Paolo Abenic76c6952020-09-14 10:01:18 +02001799 tcp_cleanup_rbuf(ssk, 1);
1800 unlock_sock_fast(ssk, slow);
Florian Westphala6b118f2020-06-30 21:24:45 +02001801 }
1802 }
1803 }
1804
1805 msk->rcvq_space.space = msk->rcvq_space.copied;
1806new_measure:
1807 msk->rcvq_space.copied = 0;
1808 msk->rcvq_space.time = mstamp;
1809}
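/* Worked example with illustrative numbers: assume advmss = 1460 and that
 * 65536 bytes were copied to userspace in the last measurement window while
 * rcvq_space.space was 32768.  Then:
 *
 *   rcvwin  = 2 * 65536 + 16 * 1460            = 154432
 *   grow    = 154432 * (65536 - 32768) / 32768 = 154432
 *   rcvwin += 2 * grow                        -> 463296
 *
 * rcvbuf becomes min(rcvwin / advmss * rcvmem, tcp_rmem[2]), where rcvmem is
 * the rounded-up truesize of one advmss-sized skb, and the new value is
 * propagated to every subflow so the announced window can actually be used.
 */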
1810
Paolo Abeni87952602020-11-27 11:10:24 +01001811static void __mptcp_update_rmem(struct sock *sk)
1812{
1813 struct mptcp_sock *msk = mptcp_sk(sk);
1814
1815 if (!msk->rmem_released)
1816 return;
1817
1818 atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc);
1819 sk_mem_uncharge(sk, msk->rmem_released);
1820 msk->rmem_released = 0;
1821}
1822
1823static void __mptcp_splice_receive_queue(struct sock *sk)
1824{
1825 struct mptcp_sock *msk = mptcp_sk(sk);
1826
1827 skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue);
1828}
1829
Paolo Abeniea4ca582020-11-19 11:46:03 -08001830static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv)
Florian Westphal6771bfd2020-02-26 10:14:48 +01001831{
Paolo Abeni87952602020-11-27 11:10:24 +01001832 struct sock *sk = (struct sock *)msk;
Florian Westphal6771bfd2020-02-26 10:14:48 +01001833 unsigned int moved = 0;
Paolo Abeni87952602020-11-27 11:10:24 +01001834 bool ret, done;
Paolo Abenid5f49192020-09-14 10:01:17 +02001835
1836 __mptcp_flush_join_list(msk);
Florian Westphal6771bfd2020-02-26 10:14:48 +01001837 do {
1838 struct sock *ssk = mptcp_subflow_recv_lookup(msk);
Florian Westphal65f49fe72020-11-03 11:05:04 -08001839 bool slowpath;
Florian Westphal6771bfd2020-02-26 10:14:48 +01001840
Paolo Abeni87952602020-11-27 11:10:24 +01001841 /* we can have data pending in the subflows only if the msk
1842 * receive buffer was full at subflow_data_ready() time,
1843 * that is an unlikely slow path.
1844 */
1845 if (likely(!ssk))
Florian Westphal6771bfd2020-02-26 10:14:48 +01001846 break;
1847
Florian Westphal65f49fe72020-11-03 11:05:04 -08001848 slowpath = lock_sock_fast(ssk);
Paolo Abeni87952602020-11-27 11:10:24 +01001849 mptcp_data_lock(sk);
Florian Westphal6771bfd2020-02-26 10:14:48 +01001850 done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
Paolo Abeni87952602020-11-27 11:10:24 +01001851 mptcp_data_unlock(sk);
Paolo Abeniea4ca582020-11-19 11:46:03 -08001852 if (moved && rcv) {
1853 WRITE_ONCE(msk->rmem_pending, min(rcv, moved));
1854 tcp_cleanup_rbuf(ssk, 1);
1855 WRITE_ONCE(msk->rmem_pending, 0);
1856 }
Florian Westphal65f49fe72020-11-03 11:05:04 -08001857 unlock_sock_fast(ssk, slowpath);
Florian Westphal6771bfd2020-02-26 10:14:48 +01001858 } while (!done);
1859
Paolo Abeni87952602020-11-27 11:10:24 +01001860 /* acquire the data lock only if some input data is pending */
1861 ret = moved > 0;
1862 if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) ||
1863 !skb_queue_empty_lockless(&sk->sk_receive_queue)) {
1864 mptcp_data_lock(sk);
1865 __mptcp_update_rmem(sk);
1866 ret |= __mptcp_ofo_queue(msk);
1867 __mptcp_splice_receive_queue(sk);
1868 mptcp_data_unlock(sk);
Paolo Abeniab174ad2020-09-14 10:01:12 +02001869 }
Paolo Abeni87952602020-11-27 11:10:24 +01001870 if (ret)
1871 mptcp_check_data_fin((struct sock *)msk);
1872 return !skb_queue_empty(&msk->receive_queue);
Florian Westphal6771bfd2020-02-26 10:14:48 +01001873}
1874
Mat Martineauf870fa02020-01-21 16:56:15 -08001875static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
1876 int nonblock, int flags, int *addr_len)
1877{
1878 struct mptcp_sock *msk = mptcp_sk(sk);
Peter Krystadcec37a62020-01-21 16:56:18 -08001879 int copied = 0;
Paolo Abeni7a6a6cb2020-01-21 16:56:26 -08001880 int target;
1881 long timeo;
Mat Martineauf870fa02020-01-21 16:56:15 -08001882
1883 if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT))
1884 return -EOPNOTSUPP;
1885
Paolo Abeni87952602020-11-27 11:10:24 +01001886 mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk));
Paolo Abenifd897672020-11-24 22:51:24 +01001887 if (unlikely(sk->sk_state == TCP_LISTEN)) {
1888 copied = -ENOTCONN;
1889 goto out_err;
1890 }
1891
Paolo Abeni7a6a6cb2020-01-21 16:56:26 -08001892 timeo = sock_rcvtimeo(sk, nonblock);
1893
1894 len = min_t(size_t, len, INT_MAX);
1895 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1896
Paolo Abeniea4ca582020-11-19 11:46:03 -08001897 for (;;) {
1898 int bytes_read, old_space;
Paolo Abeni7a6a6cb2020-01-21 16:56:26 -08001899
Florian Westphal6771bfd2020-02-26 10:14:48 +01001900 bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
1901 if (unlikely(bytes_read < 0)) {
1902 if (!copied)
1903 copied = bytes_read;
1904 goto out_err;
1905 }
Paolo Abeni7a6a6cb2020-01-21 16:56:26 -08001906
Florian Westphal6771bfd2020-02-26 10:14:48 +01001907 copied += bytes_read;
Paolo Abeni7a6a6cb2020-01-21 16:56:26 -08001908
Paolo Abeni87952602020-11-27 11:10:24 +01001909 if (skb_queue_empty(&msk->receive_queue) &&
Paolo Abeniea4ca582020-11-19 11:46:03 -08001910 __mptcp_move_skbs(msk, len - copied))
Florian Westphal6771bfd2020-02-26 10:14:48 +01001911 continue;
Paolo Abeni7a6a6cb2020-01-21 16:56:26 -08001912
Paolo Abeniea4ca582020-11-19 11:46:03 -08001913 /* be sure to advertise window change */
1914 old_space = READ_ONCE(msk->old_wspace);
1915 if ((tcp_space(sk) - old_space) >= old_space)
Paolo Abenifd897672020-11-24 22:51:24 +01001916 mptcp_cleanup_rbuf(msk);
Paolo Abeniea4ca582020-11-19 11:46:03 -08001917
Paolo Abeni7a6a6cb2020-01-21 16:56:26 -08001918 /* only the master socket status is relevant here. The exit
1919 * conditions closely mirror those of tcp_recvmsg()
1920 */
1921 if (copied >= target)
1922 break;
1923
1924 if (copied) {
1925 if (sk->sk_err ||
1926 sk->sk_state == TCP_CLOSE ||
1927 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1928 !timeo ||
1929 signal_pending(current))
1930 break;
1931 } else {
1932 if (sk->sk_err) {
1933 copied = sock_error(sk);
1934 break;
1935 }
1936
Paolo Abeni59698562020-06-10 10:47:41 +02001937 if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
1938 mptcp_check_for_eof(msk);
1939
Paolo Abeni87952602020-11-27 11:10:24 +01001940 if (sk->sk_shutdown & RCV_SHUTDOWN) {
1941 /* race breaker: the shutdown could be after the
1942 * previous receive queue check
1943 */
1944 if (__mptcp_move_skbs(msk, len - copied))
1945 continue;
Paolo Abeni7a6a6cb2020-01-21 16:56:26 -08001946 break;
Paolo Abeni87952602020-11-27 11:10:24 +01001947 }
Paolo Abeni7a6a6cb2020-01-21 16:56:26 -08001948
1949 if (sk->sk_state == TCP_CLOSE) {
1950 copied = -ENOTCONN;
1951 break;
1952 }
1953
1954 if (!timeo) {
1955 copied = -EAGAIN;
1956 break;
1957 }
1958
1959 if (signal_pending(current)) {
1960 copied = sock_intr_errno(timeo);
1961 break;
1962 }
1963 }
1964
1965 pr_debug("block timeout %ld", timeo);
Paolo Abeni7a6a6cb2020-01-21 16:56:26 -08001966 mptcp_wait_data(sk, &timeo);
Peter Krystadcec37a62020-01-21 16:56:18 -08001967 }
1968
Paolo Abeni87952602020-11-27 11:10:24 +01001969 if (skb_queue_empty_lockless(&sk->sk_receive_queue) &&
1970 skb_queue_empty(&msk->receive_queue)) {
Florian Westphal6771bfd2020-02-26 10:14:48 +01001971 /* entire backlog drained, clear DATA_READY. */
Paolo Abeni7a6a6cb2020-01-21 16:56:26 -08001972 clear_bit(MPTCP_DATA_READY, &msk->flags);
1973
Florian Westphal6771bfd2020-02-26 10:14:48 +01001974 /* .. race-breaker: ssk might have gotten new data
1975 * after last __mptcp_move_skbs() returned false.
Paolo Abeni7a6a6cb2020-01-21 16:56:26 -08001976 */
Paolo Abeniea4ca582020-11-19 11:46:03 -08001977 if (unlikely(__mptcp_move_skbs(msk, 0)))
Paolo Abeni7a6a6cb2020-01-21 16:56:26 -08001978 set_bit(MPTCP_DATA_READY, &msk->flags);
Florian Westphal6771bfd2020-02-26 10:14:48 +01001979 } else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) {
1980 /* data to read but mptcp_wait_data() cleared DATA_READY */
1981 set_bit(MPTCP_DATA_READY, &msk->flags);
Paolo Abeni7a6a6cb2020-01-21 16:56:26 -08001982 }
Florian Westphal6771bfd2020-02-26 10:14:48 +01001983out_err:
Paolo Abeni67193312020-09-14 10:01:09 +02001984 pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d",
1985 msk, test_bit(MPTCP_DATA_READY, &msk->flags),
Paolo Abeni87952602020-11-27 11:10:24 +01001986 skb_queue_empty_lockless(&sk->sk_receive_queue), copied);
Florian Westphala6b118f2020-06-30 21:24:45 +02001987 mptcp_rcv_space_adjust(msk, copied);
1988
Peter Krystadcec37a62020-01-21 16:56:18 -08001989 release_sock(sk);
Peter Krystadcec37a62020-01-21 16:56:18 -08001990 return copied;
1991}
1992
Paolo Abenib51f9b82020-03-27 14:48:44 -07001993static void mptcp_retransmit_handler(struct sock *sk)
1994{
1995 struct mptcp_sock *msk = mptcp_sk(sk);
1996
Paolo Abenieaa2ffa2020-11-16 10:48:08 +01001997 if (atomic64_read(&msk->snd_una) == READ_ONCE(msk->snd_nxt)) {
Paolo Abenib51f9b82020-03-27 14:48:44 -07001998 mptcp_stop_timer(sk);
Paolo Abeni3b1d6212020-03-27 14:48:48 -07001999 } else {
2000 set_bit(MPTCP_WORK_RTX, &msk->flags);
Paolo Abeniba8f48f2020-11-16 10:48:05 +01002001 mptcp_schedule_work(sk);
Paolo Abeni3b1d6212020-03-27 14:48:48 -07002002 }
Paolo Abenib51f9b82020-03-27 14:48:44 -07002003}
2004
2005static void mptcp_retransmit_timer(struct timer_list *t)
2006{
2007 struct inet_connection_sock *icsk = from_timer(icsk, t,
2008 icsk_retransmit_timer);
2009 struct sock *sk = &icsk->icsk_inet.sk;
2010
2011 bh_lock_sock(sk);
2012 if (!sock_owned_by_user(sk)) {
2013 mptcp_retransmit_handler(sk);
2014 } else {
2015 /* delegate our work to tcp_release_cb() */
2016 if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED,
2017 &sk->sk_tsq_flags))
2018 sock_hold(sk);
2019 }
2020 bh_unlock_sock(sk);
2021 sock_put(sk);
2022}
2023
Paolo Abenie16163b2020-11-16 10:48:09 +01002024static void mptcp_timeout_timer(struct timer_list *t)
2025{
2026 struct sock *sk = from_timer(sk, t, sk_timer);
2027
2028 mptcp_schedule_work(sk);
Florian Westphalb6d69fc2020-11-24 17:24:46 +01002029 sock_put(sk);
Paolo Abenie16163b2020-11-16 10:48:09 +01002030}
2031
Paolo Abeni3b1d6212020-03-27 14:48:48 -07002032/* Find an idle subflow. Return NULL if there is unacked data at tcp
2033 * level.
2034 *
2035 * A backup subflow is returned only if that is the only kind available.
2036 */
2037static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
2038{
2039 struct mptcp_subflow_context *subflow;
2040 struct sock *backup = NULL;
2041
2042 sock_owned_by_me((const struct sock *)msk);
2043
Paolo Abenid5f49192020-09-14 10:01:17 +02002044 if (__mptcp_check_fallback(msk))
Paolo Abenid9ca1de2020-11-16 10:48:10 +01002045 return NULL;
Paolo Abenid5f49192020-09-14 10:01:17 +02002046
Paolo Abeni3b1d6212020-03-27 14:48:48 -07002047 mptcp_for_each_subflow(msk, subflow) {
2048 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
2049
Paolo Abenid5f49192020-09-14 10:01:17 +02002050 if (!mptcp_subflow_active(subflow))
2051 continue;
2052
Paolo Abeni3b1d6212020-03-27 14:48:48 -07002053 /* still data outstanding at TCP level? Don't retransmit. */
Florian Westphal860975c2020-11-19 11:45:56 -08002054 if (!tcp_write_queue_empty(ssk)) {
2055 if (inet_csk(ssk)->icsk_ca_state >= TCP_CA_Loss)
2056 continue;
Paolo Abeni3b1d6212020-03-27 14:48:48 -07002057 return NULL;
Florian Westphal860975c2020-11-19 11:45:56 -08002058 }
Paolo Abeni3b1d6212020-03-27 14:48:48 -07002059
2060 if (subflow->backup) {
2061 if (!backup)
2062 backup = ssk;
2063 continue;
2064 }
2065
2066 return ssk;
2067 }
2068
2069 return backup;
2070}
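/* Selection summary: a subflow that still has unacked data in its TCP write
 * queue vetoes retransmission entirely (NULL is returned), unless that
 * subflow is already in TCP_CA_Loss or worse, in which case it is merely
 * skipped.  Among the remaining candidates the first active non-backup
 * subflow wins; a backup is returned only when nothing else is available.
 */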
2071
Peter Krystadcec37a62020-01-21 16:56:18 -08002072/* subflow sockets can be either outgoing (connect) or incoming
2073 * (accept).
2074 *
2075 * Outgoing subflows use in-kernel sockets.
2076 * Incoming subflows do not have their own 'struct socket' allocated,
2077 * so we need to use tcp_close() after detaching them from the mptcp
2078 * parent socket.
2079 */
Geliang Tangd0876b22020-09-24 08:29:49 +08002080void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
Paolo Abenie16163b2020-11-16 10:48:09 +01002081 struct mptcp_subflow_context *subflow)
Peter Krystadcec37a62020-01-21 16:56:18 -08002082{
Paolo Abenie16163b2020-11-16 10:48:09 +01002083 bool dispose_socket = false;
2084 struct socket *sock;
Peter Krystadcec37a62020-01-21 16:56:18 -08002085
2086 list_del(&subflow->node);
2087
Paolo Abenie16163b2020-11-16 10:48:09 +01002088 lock_sock(ssk);
2089
2090 /* if we are invoked by the msk cleanup code, the subflow is
2091 * already orphaned
2092 */
2093 sock = ssk->sk_socket;
2094 if (sock) {
2095 dispose_socket = sock != sk->sk_socket;
2096 sock_orphan(ssk);
Peter Krystadcec37a62020-01-21 16:56:18 -08002097 }
Paolo Abenie16163b2020-11-16 10:48:09 +01002098
2099 /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops
2100 * and the ssk has already been destroyed; we just need to release
2101 * the reference owned by the msk
2102 */
2103 if (!inet_csk(ssk)->icsk_ulp_ops) {
2104 kfree_rcu(subflow, rcu);
2105 } else {
2106 /* otherwise ask tcp to dispose of the ssk and subflow ctx */
2107 subflow->disposable = 1;
2108 __tcp_close(ssk, 0);
2109
2110 /* close acquired an extra ref */
2111 __sock_put(ssk);
2112 }
2113 release_sock(ssk);
2114 if (dispose_socket)
2115 iput(SOCK_INODE(sock));
2116
2117 sock_put(ssk);
Mat Martineauf870fa02020-01-21 16:56:15 -08002118}
2119
Paolo Abenidc24f8b2020-02-26 12:19:03 +01002120static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
2121{
2122 return 0;
2123}
2124
Florian Westphalb4162682020-07-07 14:40:48 +02002125static void pm_work(struct mptcp_sock *msk)
2126{
2127 struct mptcp_pm_data *pm = &msk->pm;
2128
2129 spin_lock_bh(&msk->pm.lock);
2130
2131 pr_debug("msk=%p status=%x", msk, pm->status);
2132 if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
2133 pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
2134 mptcp_pm_nl_add_addr_received(msk);
2135 }
Geliang Tang84dfe362020-11-19 11:46:00 -08002136 if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) {
2137 pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK);
2138 mptcp_pm_nl_add_addr_send_ack(msk);
2139 }
Geliang Tangd0876b22020-09-24 08:29:49 +08002140 if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) {
2141 pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED);
2142 mptcp_pm_nl_rm_addr_received(msk);
2143 }
Florian Westphalb4162682020-07-07 14:40:48 +02002144 if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
2145 pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
2146 mptcp_pm_nl_fully_established(msk);
2147 }
2148 if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
2149 pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
2150 mptcp_pm_nl_subflow_established(msk);
2151 }
2152
2153 spin_unlock_bh(&msk->pm.lock);
2154}
2155
Paolo Abeni0e4f35d2020-10-09 19:00:01 +02002156static void __mptcp_close_subflow(struct mptcp_sock *msk)
2157{
2158 struct mptcp_subflow_context *subflow, *tmp;
2159
2160 list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
2161 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
2162
2163 if (inet_sk_state_load(ssk) != TCP_CLOSE)
2164 continue;
2165
Paolo Abenie16163b2020-11-16 10:48:09 +01002166 __mptcp_close_ssk((struct sock *)msk, ssk, subflow);
Paolo Abeni0e4f35d2020-10-09 19:00:01 +02002167 }
2168}
2169
Paolo Abenie16163b2020-11-16 10:48:09 +01002170static bool mptcp_check_close_timeout(const struct sock *sk)
2171{
2172 s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp;
2173 struct mptcp_subflow_context *subflow;
2174
2175 if (delta >= TCP_TIMEWAIT_LEN)
2176 return true;
2177
2178 /* if all subflows are in closed status, don't bother with an
2179 * additional timeout
2180 */
2181 mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
2182 if (inet_sk_state_load(mptcp_subflow_tcp_sock(subflow)) !=
2183 TCP_CLOSE)
2184 return false;
2185 }
2186 return true;
2187}
2188
Paolo Abeni80992012020-02-26 10:14:47 +01002189static void mptcp_worker(struct work_struct *work)
2190{
2191 struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
Paolo Abeni3b1d6212020-03-27 14:48:48 -07002192 struct sock *ssk, *sk = &msk->sk.icsk_inet.sk;
Paolo Abenicaf971d2020-11-16 10:48:06 +01002193 struct mptcp_sendmsg_info info = {};
Paolo Abeni3b1d6212020-03-27 14:48:48 -07002194 struct mptcp_data_frag *dfrag;
Paolo Abeni3b1d6212020-03-27 14:48:48 -07002195 size_t copied = 0;
Paolo Abenie16163b2020-11-16 10:48:09 +01002196 int state, ret;
Paolo Abeni80992012020-02-26 10:14:47 +01002197
2198 lock_sock(sk);
Paolo Abenie16163b2020-11-16 10:48:09 +01002199 state = sk->sk_state;
2200 if (unlikely(state == TCP_CLOSE))
2201 goto unlock;
2202
Florian Westphal95ed6902020-11-03 11:05:06 -08002203 mptcp_clean_una_wakeup(sk);
Mat Martineau43b54c62020-07-28 15:12:06 -07002204 mptcp_check_data_fin_ack(sk);
Peter Krystadec3edaa2020-03-27 14:48:40 -07002205 __mptcp_flush_join_list(msk);
Paolo Abeni0e4f35d2020-10-09 19:00:01 +02002206 if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
2207 __mptcp_close_subflow(msk);
2208
Paolo Abeni813e0a62020-11-16 10:48:11 +01002209 if (mptcp_send_head(sk))
2210 mptcp_push_pending(sk, 0);
Paolo Abeni3b1d6212020-03-27 14:48:48 -07002211
Florian Westphalb4162682020-07-07 14:40:48 +02002212 if (msk->pm.status)
2213 pm_work(msk);
2214
Florian Westphal59832e22020-04-02 13:44:52 +02002215 if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
2216 mptcp_check_for_eof(msk);
2217
Mat Martineau43b54c62020-07-28 15:12:06 -07002218 mptcp_check_data_fin(sk);
2219
Paolo Abenie16163b2020-11-16 10:48:09 +01002220 /* if the msk data is completely acked, or the socket timed out,
2221 * there is no point in keeping an orphaned sk around
2222 */
2223 if (sock_flag(sk, SOCK_DEAD) &&
2224 (mptcp_check_close_timeout(sk) ||
2225 (state != sk->sk_state &&
2226 ((1 << inet_sk_state_load(sk)) & (TCPF_CLOSE | TCPF_FIN_WAIT2))))) {
2227 inet_sk_state_store(sk, TCP_CLOSE);
2228 __mptcp_destroy_sock(sk);
2229 goto unlock;
2230 }
2231
Paolo Abeni3b1d6212020-03-27 14:48:48 -07002232 if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
2233 goto unlock;
2234
2235 dfrag = mptcp_rtx_head(sk);
2236 if (!dfrag)
2237 goto unlock;
2238
2239 ssk = mptcp_subflow_get_retrans(msk);
2240 if (!ssk)
2241 goto reset_unlock;
2242
2243 lock_sock(ssk);
2244
Paolo Abenid9ca1de2020-11-16 10:48:10 +01002245 /* limit retransmission to the bytes already sent on some subflows */
2246 info.sent = 0;
2247 info.limit = dfrag->already_sent;
2248 while (info.sent < dfrag->already_sent) {
Paolo Abeni724cfd22020-11-27 11:10:25 +01002249 if (!mptcp_alloc_tx_skb(sk, ssk))
2250 break;
2251
Paolo Abenid9ca1de2020-11-16 10:48:10 +01002252 ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
Florian Westphal6f8a6122020-11-16 10:48:13 +01002253 if (ret <= 0)
Paolo Abeni3b1d6212020-03-27 14:48:48 -07002254 break;
2255
Florian Westphalfc518952020-03-27 14:48:50 -07002256 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
Paolo Abeni3b1d6212020-03-27 14:48:48 -07002257 copied += ret;
Paolo Abenid9ca1de2020-11-16 10:48:10 +01002258 info.sent += ret;
Paolo Abeni3b1d6212020-03-27 14:48:48 -07002259 }
2260 if (copied)
Paolo Abenicaf971d2020-11-16 10:48:06 +01002261 tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
2262 info.size_goal);
Paolo Abeni3b1d6212020-03-27 14:48:48 -07002263
Paolo Abeni3b1d6212020-03-27 14:48:48 -07002264 mptcp_set_timeout(sk, ssk);
2265 release_sock(ssk);
2266
2267reset_unlock:
2268 if (!mptcp_timer_pending(sk))
2269 mptcp_reset_timer(sk);
2270
2271unlock:
Paolo Abeni80992012020-02-26 10:14:47 +01002272 release_sock(sk);
2273 sock_put(sk);
2274}
2275
Matthieu Baerts784325e2020-01-21 16:56:28 -08002276static int __mptcp_init_sock(struct sock *sk)
Mat Martineauf870fa02020-01-21 16:56:15 -08002277{
Peter Krystadcec37a62020-01-21 16:56:18 -08002278 struct mptcp_sock *msk = mptcp_sk(sk);
2279
Peter Krystadec3edaa2020-03-27 14:48:40 -07002280 spin_lock_init(&msk->join_list_lock);
2281
Peter Krystadcec37a62020-01-21 16:56:18 -08002282 INIT_LIST_HEAD(&msk->conn_list);
Peter Krystadec3edaa2020-03-27 14:48:40 -07002283 INIT_LIST_HEAD(&msk->join_list);
Paolo Abeni18b683b2020-03-27 14:48:43 -07002284 INIT_LIST_HEAD(&msk->rtx_queue);
Paolo Abeni80992012020-02-26 10:14:47 +01002285 INIT_WORK(&msk->work, mptcp_worker);
Paolo Abeni87952602020-11-27 11:10:24 +01002286 __skb_queue_head_init(&msk->receive_queue);
Paolo Abeni724cfd22020-11-27 11:10:25 +01002287 __skb_queue_head_init(&msk->skb_tx_cache);
Paolo Abeniab174ad2020-09-14 10:01:12 +02002288 msk->out_of_order_queue = RB_ROOT;
Paolo Abenif0e6a4c2020-11-16 10:48:07 +01002289 msk->first_pending = NULL;
Paolo Abenie93da922020-11-27 11:10:23 +01002290 msk->wmem_reserved = 0;
Paolo Abeni87952602020-11-27 11:10:24 +01002291 msk->rmem_released = 0;
Paolo Abeni724cfd22020-11-27 11:10:25 +01002292 msk->tx_pending_data = 0;
2293 msk->size_goal_cache = TCP_BASE_MSS;
Peter Krystadcec37a62020-01-21 16:56:18 -08002294
Paolo Abeniea4ca582020-11-19 11:46:03 -08002295 msk->ack_hint = NULL;
Paolo Abeni8ab183d2020-01-21 16:56:33 -08002296 msk->first = NULL;
Paolo Abenidc24f8b2020-02-26 12:19:03 +01002297 inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
Paolo Abeni8ab183d2020-01-21 16:56:33 -08002298
Peter Krystad1b1c7a02020-03-27 14:48:38 -07002299 mptcp_pm_data_init(msk);
2300
Paolo Abenib51f9b82020-03-27 14:48:44 -07002301 /* re-use the csk retrans timer for MPTCP-level retrans */
2302 timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
Paolo Abenie16163b2020-11-16 10:48:09 +01002303 timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0);
Mat Martineauf870fa02020-01-21 16:56:15 -08002304 return 0;
2305}
2306
Matthieu Baerts784325e2020-01-21 16:56:28 -08002307static int mptcp_init_sock(struct sock *sk)
2308{
Florian Westphalfc518952020-03-27 14:48:50 -07002309 struct net *net = sock_net(sk);
2310 int ret;
Paolo Abeni18b683b2020-03-27 14:48:43 -07002311
Geliang Tangb6c08382020-09-24 08:29:54 +08002312 ret = __mptcp_init_sock(sk);
2313 if (ret)
2314 return ret;
2315
Florian Westphalfc518952020-03-27 14:48:50 -07002316 if (!mptcp_is_enabled(net))
2317 return -ENOPROTOOPT;
2318
2319 if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
2320 return -ENOMEM;
2321
Paolo Abenifa680182020-06-29 22:26:23 +02002322 ret = __mptcp_socket_create(mptcp_sk(sk));
2323 if (ret)
2324 return ret;
2325
Paolo Abenid0272362020-03-27 14:48:45 -07002326 sk_sockets_allocated_inc(sk);
Florian Westphala6b118f2020-06-30 21:24:45 +02002327 sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
Paolo Abenida51aef2020-09-14 10:01:10 +02002328 sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
Paolo Abenid0272362020-03-27 14:48:45 -07002329
Paolo Abeni18b683b2020-03-27 14:48:43 -07002330 return 0;
2331}
2332
2333static void __mptcp_clear_xmit(struct sock *sk)
2334{
2335 struct mptcp_sock *msk = mptcp_sk(sk);
2336 struct mptcp_data_frag *dtmp, *dfrag;
Paolo Abeni724cfd22020-11-27 11:10:25 +01002337 struct sk_buff *skb;
Paolo Abeni18b683b2020-03-27 14:48:43 -07002338
Paolo Abenib51f9b82020-03-27 14:48:44 -07002339 sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
2340
Paolo Abenid9ca1de2020-11-16 10:48:10 +01002341 WRITE_ONCE(msk->first_pending, NULL);
Paolo Abeni18b683b2020-03-27 14:48:43 -07002342 list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
Paolo Abenid0272362020-03-27 14:48:45 -07002343 dfrag_clear(sk, dfrag);
Paolo Abeni724cfd22020-11-27 11:10:25 +01002344 while ((skb = __skb_dequeue(&msk->skb_tx_cache)) != NULL) {
2345 sk->sk_forward_alloc += skb->truesize;
2346 kfree_skb(skb);
2347 }
Matthieu Baerts784325e2020-01-21 16:56:28 -08002348}
2349
Paolo Abeni80992012020-02-26 10:14:47 +01002350static void mptcp_cancel_work(struct sock *sk)
2351{
2352 struct mptcp_sock *msk = mptcp_sk(sk);
2353
Paolo Abenib2771d22020-11-19 11:45:54 -08002354 if (cancel_work_sync(&msk->work))
Paolo Abenie16163b2020-11-16 10:48:09 +01002355 __sock_put(sk);
Paolo Abeni80992012020-02-26 10:14:47 +01002356}
2357
Geliang Tangd0876b22020-09-24 08:29:49 +08002358void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
Peter Krystad21498492020-01-21 16:56:21 -08002359{
2360 lock_sock(ssk);
2361
2362 switch (ssk->sk_state) {
2363 case TCP_LISTEN:
2364 if (!(how & RCV_SHUTDOWN))
2365 break;
Gustavo A. R. Silvadf561f662020-08-23 17:36:59 -05002366 fallthrough;
Peter Krystad21498492020-01-21 16:56:21 -08002367 case TCP_SYN_SENT:
2368 tcp_disconnect(ssk, O_NONBLOCK);
2369 break;
2370 default:
Mat Martineau43b54c62020-07-28 15:12:06 -07002371 if (__mptcp_check_fallback(mptcp_sk(sk))) {
2372 pr_debug("Fallback");
2373 ssk->sk_shutdown |= how;
2374 tcp_shutdown(ssk, how);
2375 } else {
2376 pr_debug("Sending DATA_FIN on subflow %p", ssk);
2377 mptcp_set_timeout(sk, ssk);
2378 tcp_send_ack(ssk);
2379 }
Peter Krystad21498492020-01-21 16:56:21 -08002380 break;
2381 }
2382
Peter Krystad21498492020-01-21 16:56:21 -08002383 release_sock(ssk);
2384}
2385
Mat Martineau6920b852020-07-28 15:12:04 -07002386static const unsigned char new_state[16] = {
2387 /* current state: new state: action: */
2388 [0 /* (Invalid) */] = TCP_CLOSE,
2389 [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2390 [TCP_SYN_SENT] = TCP_CLOSE,
2391 [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2392 [TCP_FIN_WAIT1] = TCP_FIN_WAIT1,
2393 [TCP_FIN_WAIT2] = TCP_FIN_WAIT2,
2394 [TCP_TIME_WAIT] = TCP_CLOSE, /* should not happen ! */
2395 [TCP_CLOSE] = TCP_CLOSE,
2396 [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN,
2397 [TCP_LAST_ACK] = TCP_LAST_ACK,
2398 [TCP_LISTEN] = TCP_CLOSE,
2399 [TCP_CLOSING] = TCP_CLOSING,
2400 [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */
2401};
2402
2403static int mptcp_close_state(struct sock *sk)
2404{
2405 int next = (int)new_state[sk->sk_state];
2406 int ns = next & TCP_STATE_MASK;
2407
2408 inet_sk_state_store(sk, ns);
2409
2410 return next & TCP_ACTION_FIN;
2411}
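/* Each new_state[] entry packs the target state in TCP_STATE_MASK plus an
 * optional TCP_ACTION_FIN flag.  For example, closing from TCP_ESTABLISHED
 * looks up TCP_FIN_WAIT1 | TCP_ACTION_FIN: the msk moves to FIN_WAIT1 and
 * the non-zero return value tells the caller to start the DATA_FIN
 * handshake; from TCP_CLOSE_WAIT the msk moves to LAST_ACK, again with the
 * FIN action set.
 */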
2412
Paolo Abenie16163b2020-11-16 10:48:09 +01002413static void __mptcp_check_send_data_fin(struct sock *sk)
2414{
2415 struct mptcp_subflow_context *subflow;
2416 struct mptcp_sock *msk = mptcp_sk(sk);
2417
2418 pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu",
2419 msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk),
2420 msk->snd_nxt, msk->write_seq);
2421
2422 /* if we still need to enqueue data on the subflows, or we are not
2423 * really shutting down yet, skip this
2424 */
2425 if (!msk->snd_data_fin_enable || msk->snd_nxt + 1 != msk->write_seq ||
2426 mptcp_send_head(sk))
2427 return;
2428
2429 WRITE_ONCE(msk->snd_nxt, msk->write_seq);
2430
Paolo Abeni26aa2312020-11-19 11:45:55 -08002431 /* a fallback socket will never get a data_fin/ack, so it can move
2432 * to the next state now
2433 */
2434 if (__mptcp_check_fallback(msk)) {
2435 if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) {
2436 inet_sk_state_store(sk, TCP_CLOSE);
2437 mptcp_close_wake_up(sk);
2438 } else if (sk->sk_state == TCP_FIN_WAIT1) {
2439 inet_sk_state_store(sk, TCP_FIN_WAIT2);
2440 }
Paolo Abenie16163b2020-11-16 10:48:09 +01002441 }
2442
2443 __mptcp_flush_join_list(msk);
2444 mptcp_for_each_subflow(msk, subflow) {
2445 struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
2446
2447 mptcp_subflow_shutdown(sk, tcp_sk, SEND_SHUTDOWN);
2448 }
2449}
2450
2451static void __mptcp_wr_shutdown(struct sock *sk)
2452{
2453 struct mptcp_sock *msk = mptcp_sk(sk);
2454
2455 pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d",
2456 msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state,
2457 !!mptcp_send_head(sk));
2458
2459 /* will be ignored by fallback sockets */
2460 WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
2461 WRITE_ONCE(msk->snd_data_fin_enable, 1);
2462
2463 __mptcp_check_send_data_fin(sk);
2464}
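/* DATA_FIN consumes one unit of data-level sequence space: __mptcp_wr_shutdown()
 * bumps write_seq by one, so __mptcp_check_send_data_fin() treats the stream
 * as fully flushed only when snd_nxt + 1 == write_seq.  For example
 * (hypothetical sequence numbers), if write_seq is 1000 when shutdown starts
 * it is bumped to 1001; once every queued byte is transmitted and snd_nxt
 * reaches 1000, the check passes and DATA_FIN is pushed on every subflow via
 * mptcp_subflow_shutdown().
 */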
2465
2466static void __mptcp_destroy_sock(struct sock *sk)
Mat Martineauf870fa02020-01-21 16:56:15 -08002467{
Peter Krystadcec37a62020-01-21 16:56:18 -08002468 struct mptcp_subflow_context *subflow, *tmp;
Mat Martineauf870fa02020-01-21 16:56:15 -08002469 struct mptcp_sock *msk = mptcp_sk(sk);
Florian Westphalb2c5b612020-01-29 15:54:45 +01002470 LIST_HEAD(conn_list);
Mat Martineauf870fa02020-01-21 16:56:15 -08002471
Paolo Abenie16163b2020-11-16 10:48:09 +01002472 pr_debug("msk=%p", msk);
Florian Westphal2c22c062020-02-04 18:12:30 +01002473
Paolo Abeni10f6d462020-05-29 17:43:30 +02002474 /* be sure to always acquire the join list lock, to sync vs
2475 * mptcp_finish_join().
2476 */
2477 spin_lock_bh(&msk->join_list_lock);
2478 list_splice_tail_init(&msk->join_list, &msk->conn_list);
2479 spin_unlock_bh(&msk->join_list_lock);
Florian Westphalb2c5b612020-01-29 15:54:45 +01002480 list_splice_init(&msk->conn_list, &conn_list);
2481
Paolo Abeni18b683b2020-03-27 14:48:43 -07002482 __mptcp_clear_xmit(sk);
Paolo Abenie16163b2020-11-16 10:48:09 +01002483 sk_stop_timer(sk, &sk->sk_timer);
2484 msk->pm.status = 0;
Florian Westphalb2c5b612020-01-29 15:54:45 +01002485
2486 list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
Peter Krystadcec37a62020-01-21 16:56:18 -08002487 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
Paolo Abenie16163b2020-11-16 10:48:09 +01002488 __mptcp_close_ssk(sk, ssk, subflow);
Mat Martineauf870fa02020-01-21 16:56:15 -08002489 }
2490
Paolo Abenie16163b2020-11-16 10:48:09 +01002491 sk->sk_prot->destroy(sk);
Paolo Abeni80992012020-02-26 10:14:47 +01002492
Paolo Abenie93da922020-11-27 11:10:23 +01002493 WARN_ON_ONCE(msk->wmem_reserved);
Paolo Abeni87952602020-11-27 11:10:24 +01002494 WARN_ON_ONCE(msk->rmem_released);
Paolo Abenie16163b2020-11-16 10:48:09 +01002495 sk_stream_kill_queues(sk);
2496 xfrm_sk_free_policy(sk);
2497 sk_refcnt_debug_release(sk);
2498 sock_put(sk);
2499}
Florian Westphal6771bfd2020-02-26 10:14:48 +01002500
Paolo Abenie16163b2020-11-16 10:48:09 +01002501static void mptcp_close(struct sock *sk, long timeout)
2502{
2503 struct mptcp_subflow_context *subflow;
2504 bool do_cancel_work = false;
2505
2506 lock_sock(sk);
2507 sk->sk_shutdown = SHUTDOWN_MASK;
2508
2509 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) {
2510 inet_sk_state_store(sk, TCP_CLOSE);
2511 goto cleanup;
2512 }
2513
2514 if (mptcp_close_state(sk))
2515 __mptcp_wr_shutdown(sk);
2516
2517 sk_stream_wait_close(sk, timeout);
2518
2519cleanup:
2520 /* orphan all the subflows */
2521 inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
2522 list_for_each_entry(subflow, &mptcp_sk(sk)->conn_list, node) {
2523 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
2524 bool slow, dispose_socket;
2525 struct socket *sock;
2526
2527 slow = lock_sock_fast(ssk);
2528 sock = ssk->sk_socket;
2529 dispose_socket = sock && sock != sk->sk_socket;
2530 sock_orphan(ssk);
2531 unlock_sock_fast(ssk, slow);
2532
2533 /* for the outgoing subflows we additionally need to free
2534 * the associated socket
2535 */
2536 if (dispose_socket)
2537 iput(SOCK_INODE(sock));
2538 }
2539 sock_orphan(sk);
2540
2541 sock_hold(sk);
2542 pr_debug("msk=%p state=%d", sk, sk->sk_state);
2543 if (sk->sk_state == TCP_CLOSE) {
2544 __mptcp_destroy_sock(sk);
2545 do_cancel_work = true;
2546 } else {
2547 sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN);
2548 }
2549 release_sock(sk);
2550 if (do_cancel_work)
2551 mptcp_cancel_work(sk);
2552 sock_put(sk);
Mat Martineauf870fa02020-01-21 16:56:15 -08002553}
2554
Peter Krystadcf7da0d2020-01-21 16:56:19 -08002555static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
2556{
2557#if IS_ENABLED(CONFIG_MPTCP_IPV6)
2558 const struct ipv6_pinfo *ssk6 = inet6_sk(ssk);
2559 struct ipv6_pinfo *msk6 = inet6_sk(msk);
2560
2561 msk->sk_v6_daddr = ssk->sk_v6_daddr;
2562 msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr;
2563
2564 if (msk6 && ssk6) {
2565 msk6->saddr = ssk6->saddr;
2566 msk6->flow_label = ssk6->flow_label;
2567 }
2568#endif
2569
2570 inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num;
2571 inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport;
2572 inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport;
2573 inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr;
2574 inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr;
2575 inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr;
2576}
2577
Paolo Abeni18b683b2020-03-27 14:48:43 -07002578static int mptcp_disconnect(struct sock *sk, int flags)
2579{
Florian Westphal42c556f2020-04-29 20:43:20 +02002580 /* Should never be called.
2581 * inet_stream_connect() calls ->disconnect, but that
2582 * refers to the subflow socket, not the mptcp one.
2583 */
2584 WARN_ON_ONCE(1);
2585 return 0;
Paolo Abeni18b683b2020-03-27 14:48:43 -07002586}
2587
Florian Westphalb0519de2020-02-06 00:39:37 +01002588#if IS_ENABLED(CONFIG_MPTCP_IPV6)
2589static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
2590{
2591 unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo);
2592
2593 return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
2594}
2595#endif
2596
Paolo Abenifca5c822020-04-20 16:25:06 +02002597struct sock *mptcp_sk_clone(const struct sock *sk,
Paolo Abenicfde1412020-04-30 15:01:52 +02002598 const struct mptcp_options_received *mp_opt,
Paolo Abenifca5c822020-04-20 16:25:06 +02002599 struct request_sock *req)
Florian Westphalb0519de2020-02-06 00:39:37 +01002600{
Paolo Abeni58b09912020-03-13 16:52:41 +01002601 struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
Florian Westphalb0519de2020-02-06 00:39:37 +01002602 struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC);
Paolo Abeni58b09912020-03-13 16:52:41 +01002603 struct mptcp_sock *msk;
2604 u64 ack_seq;
Florian Westphalb0519de2020-02-06 00:39:37 +01002605
2606 if (!nsk)
2607 return NULL;
2608
2609#if IS_ENABLED(CONFIG_MPTCP_IPV6)
2610 if (nsk->sk_family == AF_INET6)
2611 inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk);
2612#endif
2613
Paolo Abeni58b09912020-03-13 16:52:41 +01002614 __mptcp_init_sock(nsk);
2615
2616 msk = mptcp_sk(nsk);
2617 msk->local_key = subflow_req->local_key;
2618 msk->token = subflow_req->token;
2619 msk->subflow = NULL;
Paolo Abenib93df082020-07-23 13:02:32 +02002620 WRITE_ONCE(msk->fully_established, false);
Paolo Abeni58b09912020-03-13 16:52:41 +01002621
Paolo Abeni58b09912020-03-13 16:52:41 +01002622 msk->write_seq = subflow_req->idsn + 1;
Paolo Abenieaa2ffa2020-11-16 10:48:08 +01002623 msk->snd_nxt = msk->write_seq;
Paolo Abenicc9d2562020-03-27 14:48:42 -07002624 atomic64_set(&msk->snd_una, msk->write_seq);
Florian Westphal6f8a6122020-11-16 10:48:13 +01002625 atomic64_set(&msk->wnd_end, msk->snd_nxt + req->rsk_rcv_wnd);
2626
Paolo Abenicfde1412020-04-30 15:01:52 +02002627 if (mp_opt->mp_capable) {
Paolo Abeni58b09912020-03-13 16:52:41 +01002628 msk->can_ack = true;
Paolo Abenicfde1412020-04-30 15:01:52 +02002629 msk->remote_key = mp_opt->sndr_key;
Paolo Abeni58b09912020-03-13 16:52:41 +01002630 mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
2631 ack_seq++;
Mat Martineau917944d2020-09-29 15:08:19 -07002632 WRITE_ONCE(msk->ack_seq, ack_seq);
Florian Westphalfa3fe2b2020-11-19 11:46:02 -08002633 WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
Paolo Abeni58b09912020-03-13 16:52:41 +01002634 }
Paolo Abeni7f20d5f2020-03-17 15:53:34 +01002635
Florian Westphal5e200872020-04-20 16:25:04 +02002636 sock_reset_flag(nsk, SOCK_RCU_FREE);
Paolo Abeni7f20d5f2020-03-17 15:53:34 +01002637 /* will be fully established after successful MPC subflow creation */
2638 inet_sk_state_store(nsk, TCP_SYN_RECV);
Paolo Abeni58b09912020-03-13 16:52:41 +01002639 bh_unlock_sock(nsk);
2640
2641 /* keep a single reference */
2642 __sock_put(nsk);
Florian Westphalb0519de2020-02-06 00:39:37 +01002643 return nsk;
2644}
2645
Florian Westphala6b118f2020-06-30 21:24:45 +02002646void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
2647{
2648 const struct tcp_sock *tp = tcp_sk(ssk);
2649
2650 msk->rcvq_space.copied = 0;
2651 msk->rcvq_space.rtt_us = 0;
2652
2653 msk->rcvq_space.time = tp->tcp_mstamp;
2654
2655 /* initial rcv_space offering made to peer */
2656 msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
2657 TCP_INIT_CWND * tp->advmss);
2658 if (msk->rcvq_space.space == 0)
2659 msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
Florian Westphal6f8a6122020-11-16 10:48:13 +01002660
2661 atomic64_set(&msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd);
Florian Westphala6b118f2020-06-30 21:24:45 +02002662}
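/* Example with illustrative values: with advmss = 1460 and TCP_INIT_CWND = 10,
 * the initial rcvq_space.space is min(tp->rcv_wnd, 14600) bytes, falling back
 * to TCP_INIT_CWND * TCP_MSS_DEFAULT if that would be zero; wnd_end starts at
 * snd_nxt plus the send window currently advertised by the first subflow.
 */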
2663
Peter Krystadcf7da0d2020-01-21 16:56:19 -08002664static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
2665 bool kern)
2666{
2667 struct mptcp_sock *msk = mptcp_sk(sk);
2668 struct socket *listener;
2669 struct sock *newsk;
2670
2671 listener = __mptcp_nmpc_socket(msk);
2672 if (WARN_ON_ONCE(!listener)) {
2673 *err = -EINVAL;
2674 return NULL;
2675 }
2676
2677 pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk));
2678 newsk = inet_csk_accept(listener->sk, flags, err, kern);
2679 if (!newsk)
2680 return NULL;
2681
2682 pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk));
Peter Krystadcf7da0d2020-01-21 16:56:19 -08002683 if (sk_is_mptcp(newsk)) {
2684 struct mptcp_subflow_context *subflow;
2685 struct sock *new_mptcp_sock;
Peter Krystadcf7da0d2020-01-21 16:56:19 -08002686
2687 subflow = mptcp_subflow_ctx(newsk);
Paolo Abeni58b09912020-03-13 16:52:41 +01002688 new_mptcp_sock = subflow->conn;
2689
2690 /* is_mptcp should be false if subflow->conn is missing, see
2691 * subflow_syn_recv_sock()
2692 */
2693 if (WARN_ON_ONCE(!new_mptcp_sock)) {
2694 tcp_sk(newsk)->is_mptcp = 0;
2695 return newsk;
2696 }
2697
2698 /* acquire the 2nd reference for the owning socket */
2699 sock_hold(new_mptcp_sock);
Peter Krystadcf7da0d2020-01-21 16:56:19 -08002700 newsk = new_mptcp_sock;
Paolo Abeni0397c6d2020-11-19 11:45:58 -08002701 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
Florian Westphalfc518952020-03-27 14:48:50 -07002702 } else {
2703 MPTCP_INC_STATS(sock_net(sk),
2704 MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
Peter Krystadcf7da0d2020-01-21 16:56:19 -08002705 }
2706
2707 return newsk;
2708}
2709
Geliang Tang5c8c1642020-09-24 08:29:57 +08002710void mptcp_destroy_common(struct mptcp_sock *msk)
2711{
Paolo Abeni87952602020-11-27 11:10:24 +01002712 struct sock *sk = (struct sock *)msk;
2713
2714 /* move to sk_receive_queue, sk_stream_kill_queues will purge it */
2715 skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);
2716
Geliang Tang5c8c1642020-09-24 08:29:57 +08002717 skb_rbtree_purge(&msk->out_of_order_queue);
2718 mptcp_token_destroy(msk);
2719 mptcp_pm_free_anno_list(msk);
2720}
2721
Peter Krystad79c09492020-01-21 16:56:20 -08002722static void mptcp_destroy(struct sock *sk)
2723{
Florian Westphalc9fd9c52020-01-29 15:54:43 +01002724 struct mptcp_sock *msk = mptcp_sk(sk);
2725
Geliang Tang5c8c1642020-09-24 08:29:57 +08002726 mptcp_destroy_common(msk);
Paolo Abenid0272362020-03-27 14:48:45 -07002727 sk_sockets_allocated_dec(sk);
Peter Krystad79c09492020-01-21 16:56:20 -08002728}
2729
Florian Westphalfd1452d2020-07-05 01:30:16 +02002730static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
Christoph Hellwiga7b75c52020-07-23 08:09:07 +02002731 sockptr_t optval, unsigned int optlen)
Florian Westphalfd1452d2020-07-05 01:30:16 +02002732{
2733 struct sock *sk = (struct sock *)msk;
2734 struct socket *ssock;
2735 int ret;
2736
2737 switch (optname) {
2738 case SO_REUSEPORT:
2739 case SO_REUSEADDR:
2740 lock_sock(sk);
2741 ssock = __mptcp_nmpc_socket(msk);
2742 if (!ssock) {
2743 release_sock(sk);
2744 return -EINVAL;
2745 }
2746
Christoph Hellwiga7b75c52020-07-23 08:09:07 +02002747 ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen);
Florian Westphalfd1452d2020-07-05 01:30:16 +02002748 if (ret == 0) {
2749 if (optname == SO_REUSEPORT)
2750 sk->sk_reuseport = ssock->sk->sk_reuseport;
2751 else if (optname == SO_REUSEADDR)
2752 sk->sk_reuse = ssock->sk->sk_reuse;
2753 }
2754 release_sock(sk);
2755 return ret;
2756 }
2757
Christoph Hellwiga7b75c52020-07-23 08:09:07 +02002758 return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen);
Florian Westphalfd1452d2020-07-05 01:30:16 +02002759}
2760
Florian Westphalc9b95a12020-07-05 01:30:17 +02002761static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
Christoph Hellwiga7b75c52020-07-23 08:09:07 +02002762 sockptr_t optval, unsigned int optlen)
Florian Westphalc9b95a12020-07-05 01:30:17 +02002763{
2764 struct sock *sk = (struct sock *)msk;
2765 int ret = -EOPNOTSUPP;
2766 struct socket *ssock;
2767
2768 switch (optname) {
2769 case IPV6_V6ONLY:
2770 lock_sock(sk);
2771 ssock = __mptcp_nmpc_socket(msk);
2772 if (!ssock) {
2773 release_sock(sk);
2774 return -EINVAL;
2775 }
2776
2777 ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen);
2778 if (ret == 0)
2779 sk->sk_ipv6only = ssock->sk->sk_ipv6only;
2780
2781 release_sock(sk);
2782 break;
2783 }
2784
2785 return ret;
2786}
2787
Peter Krystad717e79c2020-01-21 16:56:22 -08002788static int mptcp_setsockopt(struct sock *sk, int level, int optname,
Christoph Hellwiga7b75c52020-07-23 08:09:07 +02002789 sockptr_t optval, unsigned int optlen)
Peter Krystad717e79c2020-01-21 16:56:22 -08002790{
2791 struct mptcp_sock *msk = mptcp_sk(sk);
Paolo Abeni76660af2020-06-29 22:26:24 +02002792 struct sock *ssk;
Peter Krystad717e79c2020-01-21 16:56:22 -08002793
2794 pr_debug("msk=%p", msk);
2795
Florian Westphal83f0c102020-07-05 01:30:15 +02002796 if (level == SOL_SOCKET)
Florian Westphalfd1452d2020-07-05 01:30:16 +02002797 return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen);
Florian Westphal83f0c102020-07-05 01:30:15 +02002798
Peter Krystad717e79c2020-01-21 16:56:22 -08002799 /* @@ the meaning of setsockopt() when the socket is connected and
Mat Martineaub6e4a1a2020-02-14 14:14:29 -08002800 * there are multiple subflows is not yet defined. It is up to the
2801 * MPTCP-level socket to configure the subflows until the subflow
2802 * is in TCP fallback, when TCP socket options are passed through
2803 * to the one remaining subflow.
Peter Krystad717e79c2020-01-21 16:56:22 -08002804 */
2805 lock_sock(sk);
Paolo Abeni76660af2020-06-29 22:26:24 +02002806 ssk = __mptcp_tcp_fallback(msk);
Florian Westphale1546592020-04-11 21:05:01 +02002807 release_sock(sk);
Paolo Abeni76660af2020-06-29 22:26:24 +02002808 if (ssk)
2809 return tcp_setsockopt(ssk, level, optname, optval, optlen);
Florian Westphal50e741b2020-01-29 15:54:44 +01002810
Florian Westphalc9b95a12020-07-05 01:30:17 +02002811 if (level == SOL_IPV6)
2812 return mptcp_setsockopt_v6(msk, optname, optval, optlen);
2813
Mat Martineaub6e4a1a2020-02-14 14:14:29 -08002814 return -EOPNOTSUPP;
Peter Krystad717e79c2020-01-21 16:56:22 -08002815}
2816
2817static int mptcp_getsockopt(struct sock *sk, int level, int optname,
Florian Westphal50e741b2020-01-29 15:54:44 +01002818 char __user *optval, int __user *option)
Peter Krystad717e79c2020-01-21 16:56:22 -08002819{
2820 struct mptcp_sock *msk = mptcp_sk(sk);
Paolo Abeni76660af2020-06-29 22:26:24 +02002821 struct sock *ssk;
Peter Krystad717e79c2020-01-21 16:56:22 -08002822
2823 pr_debug("msk=%p", msk);
2824
Mat Martineaub6e4a1a2020-02-14 14:14:29 -08002825 /* @@ the meaning of getsockopt() when the socket is connected and
2826 * there are multiple subflows is not yet defined. It is up to the
2827 * MPTCP-level socket to configure the subflows until the subflow
2828 * is in TCP fallback, when socket options are passed through
2829 * to the one remaining subflow.
Peter Krystad717e79c2020-01-21 16:56:22 -08002830 */
2831 lock_sock(sk);
Paolo Abeni76660af2020-06-29 22:26:24 +02002832 ssk = __mptcp_tcp_fallback(msk);
Florian Westphale1546592020-04-11 21:05:01 +02002833 release_sock(sk);
Paolo Abeni76660af2020-06-29 22:26:24 +02002834 if (ssk)
2835 return tcp_getsockopt(ssk, level, optname, optval, option);
Florian Westphal50e741b2020-01-29 15:54:44 +01002836
Mat Martineaub6e4a1a2020-02-14 14:14:29 -08002837 return -EOPNOTSUPP;
Peter Krystad717e79c2020-01-21 16:56:22 -08002838}
2839
Paolo Abeniea4ca582020-11-19 11:46:03 -08002840#define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED)
Paolo Abeni14c441b2020-02-26 10:14:52 +01002841
Paolo Abenie93da922020-11-27 11:10:23 +01002842/* processes deferred events and flushes wmem */
Paolo Abeni14c441b2020-02-26 10:14:52 +01002843static void mptcp_release_cb(struct sock *sk)
2844{
2845 unsigned long flags, nflags;
2846
Paolo Abenie93da922020-11-27 11:10:23 +01002847 /* clear any wmem reservation and errors */
2848 __mptcp_update_wmem(sk);
Paolo Abeni87952602020-11-27 11:10:24 +01002849 __mptcp_update_rmem(sk);
Paolo Abenie93da922020-11-27 11:10:23 +01002850
Paolo Abeni14c441b2020-02-26 10:14:52 +01002851 do {
2852 flags = sk->sk_tsq_flags;
2853 if (!(flags & MPTCP_DEFERRED_ALL))
2854 return;
2855 nflags = flags & ~MPTCP_DEFERRED_ALL;
2856 } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
2857
Paolo Abenib51f9b82020-03-27 14:48:44 -07002858 sock_release_ownership(sk);
2859
Paolo Abenib51f9b82020-03-27 14:48:44 -07002860 if (flags & TCPF_WRITE_TIMER_DEFERRED) {
2861 mptcp_retransmit_handler(sk);
2862 __sock_put(sk);
2863 }
Paolo Abeni14c441b2020-02-26 10:14:52 +01002864}
2865
Paolo Abeni2c5ebd02020-06-26 19:30:00 +02002866static int mptcp_hash(struct sock *sk)
2867{
2868 /* should never be called,
 2869	 * we hash the TCP subflows, not the master socket
2870 */
2871 WARN_ON_ONCE(1);
2872 return 0;
2873}
2874
2875static void mptcp_unhash(struct sock *sk)
2876{
2877 /* called from sk_common_release(), but nothing to do here */
2878}
2879
Peter Krystadcec37a62020-01-21 16:56:18 -08002880static int mptcp_get_port(struct sock *sk, unsigned short snum)
Mat Martineauf870fa02020-01-21 16:56:15 -08002881{
2882 struct mptcp_sock *msk = mptcp_sk(sk);
Peter Krystadcec37a62020-01-21 16:56:18 -08002883 struct socket *ssock;
Mat Martineauf870fa02020-01-21 16:56:15 -08002884
Peter Krystadcec37a62020-01-21 16:56:18 -08002885 ssock = __mptcp_nmpc_socket(msk);
2886 pr_debug("msk=%p, subflow=%p", msk, ssock);
2887 if (WARN_ON_ONCE(!ssock))
2888 return -EINVAL;
Mat Martineauf870fa02020-01-21 16:56:15 -08002889
Peter Krystadcec37a62020-01-21 16:56:18 -08002890 return inet_csk_get_port(ssock->sk, snum);
2891}
Mat Martineauf870fa02020-01-21 16:56:15 -08002892
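/* called on the initial subflow once the MP_CAPABLE handshake completes:
 * derive the initial ack_seq from the peer's key and propagate keys and
 * sequence numbers to the MPTCP-level socket
 */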
Peter Krystadcec37a62020-01-21 16:56:18 -08002893void mptcp_finish_connect(struct sock *ssk)
2894{
2895 struct mptcp_subflow_context *subflow;
2896 struct mptcp_sock *msk;
2897 struct sock *sk;
Mat Martineau6d0060f2020-01-21 16:56:23 -08002898 u64 ack_seq;
Mat Martineauf870fa02020-01-21 16:56:15 -08002899
Peter Krystadcec37a62020-01-21 16:56:18 -08002900 subflow = mptcp_subflow_ctx(ssk);
Peter Krystadcec37a62020-01-21 16:56:18 -08002901 sk = subflow->conn;
2902 msk = mptcp_sk(sk);
2903
Mat Martineau648ef4b2020-01-21 16:56:24 -08002904 pr_debug("msk=%p, token=%u", sk, subflow->token);
2905
Mat Martineau6d0060f2020-01-21 16:56:23 -08002906 mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
2907 ack_seq++;
Mat Martineau648ef4b2020-01-21 16:56:24 -08002908 subflow->map_seq = ack_seq;
2909 subflow->map_subflow_seq = 1;
Mat Martineau6d0060f2020-01-21 16:56:23 -08002910
Peter Krystadcec37a62020-01-21 16:56:18 -08002911	/* the socket is not connected yet, so no msk/subflow ops can race
 2912	 * with us while accessing the fields below
 2913	 */
2914 WRITE_ONCE(msk->remote_key, subflow->remote_key);
2915 WRITE_ONCE(msk->local_key, subflow->local_key);
Mat Martineau6d0060f2020-01-21 16:56:23 -08002916 WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
Paolo Abenieaa2ffa2020-11-16 10:48:08 +01002917 WRITE_ONCE(msk->snd_nxt, msk->write_seq);
Mat Martineau6d0060f2020-01-21 16:56:23 -08002918 WRITE_ONCE(msk->ack_seq, ack_seq);
Florian Westphalfa3fe2b2020-11-19 11:46:02 -08002919 WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
Christoph Paaschd22f4982020-01-21 16:56:32 -08002920 WRITE_ONCE(msk->can_ack, 1);
Paolo Abenicc9d2562020-03-27 14:48:42 -07002921 atomic64_set(&msk->snd_una, msk->write_seq);
Peter Krystad1b1c7a02020-03-27 14:48:38 -07002922
2923 mptcp_pm_new_connection(msk, 0);
Florian Westphala6b118f2020-06-30 21:24:45 +02002924
2925 mptcp_rcv_space_init(msk, ssk);
Mat Martineauf870fa02020-01-21 16:56:15 -08002926}
2927
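/* graft a subflow sock onto the parent MPTCP socket: share the parent's wait
 * queue and inherit the owning inode's uid
 */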
Peter Krystadcf7da0d2020-01-21 16:56:19 -08002928static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
2929{
2930 write_lock_bh(&sk->sk_callback_lock);
2931 rcu_assign_pointer(sk->sk_wq, &parent->wq);
2932 sk_set_socket(sk, parent);
2933 sk->sk_uid = SOCK_INODE(parent)->i_uid;
2934 write_unlock_bh(&sk->sk_callback_lock);
2935}
2936
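/* called when an MP_JOIN subflow completes the handshake: verify that the msk
 * can still accept it and, for passively created subflows, link it into the
 * join list and graft it onto the parent socket
 */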
Paolo Abenie16163b2020-11-16 10:48:09 +01002937bool mptcp_finish_join(struct sock *ssk)
Peter Krystadf2962342020-03-27 14:48:39 -07002938{
Paolo Abenie16163b2020-11-16 10:48:09 +01002939 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
Peter Krystadf2962342020-03-27 14:48:39 -07002940 struct mptcp_sock *msk = mptcp_sk(subflow->conn);
2941 struct sock *parent = (void *)msk;
2942 struct socket *parent_sock;
Peter Krystadec3edaa2020-03-27 14:48:40 -07002943 bool ret;
Peter Krystadf2962342020-03-27 14:48:39 -07002944
2945 pr_debug("msk=%p, subflow=%p", msk, subflow);
2946
2947 /* mptcp socket already closing? */
Paolo Abenib93df082020-07-23 13:02:32 +02002948 if (!mptcp_is_fully_established(parent))
Peter Krystadf2962342020-03-27 14:48:39 -07002949 return false;
2950
2951 if (!msk->pm.server_side)
2952 return true;
2953
Paolo Abeni10f6d462020-05-29 17:43:30 +02002954 if (!mptcp_pm_allow_new_subflow(msk))
2955 return false;
2956
2957 /* active connections are already on conn_list, and we can't acquire
2958 * msk lock here.
2959 * use the join list lock as synchronization point and double-check
Paolo Abenie16163b2020-11-16 10:48:09 +01002960 * msk status to avoid racing with __mptcp_destroy_sock()
Paolo Abeni10f6d462020-05-29 17:43:30 +02002961 */
2962 spin_lock_bh(&msk->join_list_lock);
2963 ret = inet_sk_state_load(parent) == TCP_ESTABLISHED;
Paolo Abenie16163b2020-11-16 10:48:09 +01002964 if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) {
Paolo Abeni10f6d462020-05-29 17:43:30 +02002965 list_add_tail(&subflow->node, &msk->join_list);
Paolo Abenie16163b2020-11-16 10:48:09 +01002966 sock_hold(ssk);
2967 }
Paolo Abeni10f6d462020-05-29 17:43:30 +02002968 spin_unlock_bh(&msk->join_list_lock);
2969 if (!ret)
2970 return false;
2971
 2972	/* attach to the msk socket only after we are sure it will deal with
 2973	 * us at close time
 2974	 */
Peter Krystadf2962342020-03-27 14:48:39 -07002975 parent_sock = READ_ONCE(parent->sk_socket);
Paolo Abenie16163b2020-11-16 10:48:09 +01002976 if (parent_sock && !ssk->sk_socket)
2977 mptcp_sock_graft(ssk, parent_sock);
Mat Martineau917944d2020-09-29 15:08:19 -07002978 subflow->map_seq = READ_ONCE(msk->ack_seq);
Paolo Abeni10f6d462020-05-29 17:43:30 +02002979 return true;
Peter Krystadf2962342020-03-27 14:48:39 -07002980}
2981
Mat Martineauf870fa02020-01-21 16:56:15 -08002982static struct proto mptcp_prot = {
2983 .name = "MPTCP",
2984 .owner = THIS_MODULE,
2985 .init = mptcp_init_sock,
Paolo Abeni18b683b2020-03-27 14:48:43 -07002986 .disconnect = mptcp_disconnect,
Mat Martineauf870fa02020-01-21 16:56:15 -08002987 .close = mptcp_close,
Peter Krystadcf7da0d2020-01-21 16:56:19 -08002988 .accept = mptcp_accept,
Peter Krystad717e79c2020-01-21 16:56:22 -08002989 .setsockopt = mptcp_setsockopt,
2990 .getsockopt = mptcp_getsockopt,
Mat Martineauf870fa02020-01-21 16:56:15 -08002991 .shutdown = tcp_shutdown,
Peter Krystad79c09492020-01-21 16:56:20 -08002992 .destroy = mptcp_destroy,
Mat Martineauf870fa02020-01-21 16:56:15 -08002993 .sendmsg = mptcp_sendmsg,
2994 .recvmsg = mptcp_recvmsg,
Paolo Abeni14c441b2020-02-26 10:14:52 +01002995 .release_cb = mptcp_release_cb,
Paolo Abeni2c5ebd02020-06-26 19:30:00 +02002996 .hash = mptcp_hash,
2997 .unhash = mptcp_unhash,
Peter Krystadcec37a62020-01-21 16:56:18 -08002998 .get_port = mptcp_get_port,
Paolo Abenid0272362020-03-27 14:48:45 -07002999 .sockets_allocated = &mptcp_sockets_allocated,
3000 .memory_allocated = &tcp_memory_allocated,
3001 .memory_pressure = &tcp_memory_pressure,
Paolo Abenid0272362020-03-27 14:48:45 -07003002 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
Paolo Abeni989ef492020-11-08 19:49:59 +01003003 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
Paolo Abenid0272362020-03-27 14:48:45 -07003004 .sysctl_mem = sysctl_tcp_mem,
Mat Martineauf870fa02020-01-21 16:56:15 -08003005 .obj_size = sizeof(struct mptcp_sock),
Paolo Abeni2c5ebd02020-06-26 19:30:00 +02003006 .slab_flags = SLAB_TYPESAFE_BY_RCU,
Mat Martineauf870fa02020-01-21 16:56:15 -08003007 .no_autobind = true,
3008};
3009
Peter Krystad2303f992020-01-21 16:56:17 -08003010static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3011{
3012 struct mptcp_sock *msk = mptcp_sk(sock->sk);
3013 struct socket *ssock;
Peter Krystadcf7da0d2020-01-21 16:56:19 -08003014 int err;
Peter Krystad2303f992020-01-21 16:56:17 -08003015
3016 lock_sock(sock->sk);
Paolo Abenifa680182020-06-29 22:26:23 +02003017 ssock = __mptcp_nmpc_socket(msk);
3018 if (!ssock) {
3019 err = -EINVAL;
Peter Krystad2303f992020-01-21 16:56:17 -08003020 goto unlock;
3021 }
3022
3023 err = ssock->ops->bind(ssock, uaddr, addr_len);
Peter Krystadcf7da0d2020-01-21 16:56:19 -08003024 if (!err)
3025 mptcp_copy_inaddrs(sock->sk, ssock->sk);
Peter Krystad2303f992020-01-21 16:56:17 -08003026
3027unlock:
3028 release_sock(sock->sk);
3029 return err;
3030}
3031
Paolo Abeni0235d072020-07-23 13:02:31 +02003032static void mptcp_subflow_early_fallback(struct mptcp_sock *msk,
3033 struct mptcp_subflow_context *subflow)
3034{
3035 subflow->request_mptcp = 0;
3036 __mptcp_do_fallback(msk);
3037}
3038
Peter Krystad2303f992020-01-21 16:56:17 -08003039static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
3040 int addr_len, int flags)
3041{
3042 struct mptcp_sock *msk = mptcp_sk(sock->sk);
Paolo Abeni2c5ebd02020-06-26 19:30:00 +02003043 struct mptcp_subflow_context *subflow;
Peter Krystad2303f992020-01-21 16:56:17 -08003044 struct socket *ssock;
3045 int err;
3046
3047 lock_sock(sock->sk);
Paolo Abeni41be81a2020-05-29 17:43:29 +02003048 if (sock->state != SS_UNCONNECTED && msk->subflow) {
3049 /* pending connection or invalid state, let existing subflow
3050 * cope with that
3051 */
3052 ssock = msk->subflow;
3053 goto do_connect;
3054 }
3055
Paolo Abenifa680182020-06-29 22:26:23 +02003056 ssock = __mptcp_nmpc_socket(msk);
3057 if (!ssock) {
3058 err = -EINVAL;
Peter Krystad2303f992020-01-21 16:56:17 -08003059 goto unlock;
3060 }
3061
Paolo Abenifa680182020-06-29 22:26:23 +02003062 mptcp_token_destroy(msk);
3063 inet_sk_state_store(sock->sk, TCP_SYN_SENT);
Paolo Abeni2c5ebd02020-06-26 19:30:00 +02003064 subflow = mptcp_subflow_ctx(ssock->sk);
Peter Krystadcf7da0d2020-01-21 16:56:19 -08003065#ifdef CONFIG_TCP_MD5SIG
3066 /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
3067 * TCP option space.
3068 */
3069 if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
Paolo Abeni0235d072020-07-23 13:02:31 +02003070 mptcp_subflow_early_fallback(msk, subflow);
Peter Krystadcf7da0d2020-01-21 16:56:19 -08003071#endif
Paolo Abeni2c5ebd02020-06-26 19:30:00 +02003072 if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk))
Paolo Abeni0235d072020-07-23 13:02:31 +02003073 mptcp_subflow_early_fallback(msk, subflow);
Peter Krystadcf7da0d2020-01-21 16:56:19 -08003074
Paolo Abeni41be81a2020-05-29 17:43:29 +02003075do_connect:
Peter Krystad2303f992020-01-21 16:56:17 -08003076 err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
Paolo Abeni41be81a2020-05-29 17:43:29 +02003077 sock->state = ssock->state;
3078
3079 /* on successful connect, the msk state will be moved to established by
3080 * subflow_finish_connect()
3081 */
Matthieu Baerts367fe042020-07-27 12:24:33 +02003082 if (!err || err == -EINPROGRESS)
Paolo Abeni41be81a2020-05-29 17:43:29 +02003083 mptcp_copy_inaddrs(sock->sk, ssock->sk);
3084 else
3085 inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
Peter Krystad2303f992020-01-21 16:56:17 -08003086
3087unlock:
3088 release_sock(sock->sk);
3089 return err;
3090}
3091
Peter Krystadcf7da0d2020-01-21 16:56:19 -08003092static int mptcp_listen(struct socket *sock, int backlog)
3093{
3094 struct mptcp_sock *msk = mptcp_sk(sock->sk);
3095 struct socket *ssock;
3096 int err;
3097
3098 pr_debug("msk=%p", msk);
3099
3100 lock_sock(sock->sk);
Paolo Abenifa680182020-06-29 22:26:23 +02003101 ssock = __mptcp_nmpc_socket(msk);
3102 if (!ssock) {
3103 err = -EINVAL;
Peter Krystadcf7da0d2020-01-21 16:56:19 -08003104 goto unlock;
3105 }
3106
Paolo Abenifa680182020-06-29 22:26:23 +02003107 mptcp_token_destroy(msk);
3108 inet_sk_state_store(sock->sk, TCP_LISTEN);
Florian Westphal5e200872020-04-20 16:25:04 +02003109 sock_set_flag(sock->sk, SOCK_RCU_FREE);
3110
Peter Krystadcf7da0d2020-01-21 16:56:19 -08003111 err = ssock->ops->listen(ssock, backlog);
3112 inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
3113 if (!err)
3114 mptcp_copy_inaddrs(sock->sk, ssock->sk);
3115
3116unlock:
3117 release_sock(sock->sk);
3118 return err;
3119}
3120
Peter Krystadcf7da0d2020-01-21 16:56:19 -08003121static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
3122 int flags, bool kern)
3123{
3124 struct mptcp_sock *msk = mptcp_sk(sock->sk);
3125 struct socket *ssock;
3126 int err;
3127
3128 pr_debug("msk=%p", msk);
3129
3130 lock_sock(sock->sk);
3131 if (sock->sk->sk_state != TCP_LISTEN)
3132 goto unlock_fail;
3133
3134 ssock = __mptcp_nmpc_socket(msk);
3135 if (!ssock)
3136 goto unlock_fail;
3137
Paolo Abeni8a056612020-06-29 22:26:25 +02003138 clear_bit(MPTCP_DATA_READY, &msk->flags);
Peter Krystadcf7da0d2020-01-21 16:56:19 -08003139 sock_hold(ssock->sk);
3140 release_sock(sock->sk);
3141
3142 err = ssock->ops->accept(sock, newsock, flags, kern);
Paolo Abenid2f77c52020-06-29 22:26:22 +02003143 if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) {
Peter Krystadcf7da0d2020-01-21 16:56:19 -08003144 struct mptcp_sock *msk = mptcp_sk(newsock->sk);
3145 struct mptcp_subflow_context *subflow;
Paolo Abeni0397c6d2020-11-19 11:45:58 -08003146 struct sock *newsk = newsock->sk;
3147 bool slowpath;
3148
3149 slowpath = lock_sock_fast(newsk);
3150 mptcp_copy_inaddrs(newsk, msk->first);
3151 mptcp_rcv_space_init(msk, msk->first);
Peter Krystadcf7da0d2020-01-21 16:56:19 -08003152
 3153	/* set ssk->sk_socket of accept()ed flows to the mptcp socket.
 3154	 * This is needed so the NOSPACE flag can be set from the tcp stack.
 3155	 */
Peter Krystadec3edaa2020-03-27 14:48:40 -07003156 __mptcp_flush_join_list(msk);
Geliang Tang190f8b02020-08-03 21:00:44 +08003157 mptcp_for_each_subflow(msk, subflow) {
Peter Krystadcf7da0d2020-01-21 16:56:19 -08003158 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
3159
3160 if (!ssk->sk_socket)
3161 mptcp_sock_graft(ssk, newsock);
3162 }
Paolo Abeni0397c6d2020-11-19 11:45:58 -08003163 unlock_sock_fast(newsk, slowpath);
Peter Krystadcf7da0d2020-01-21 16:56:19 -08003164 }
3165
Paolo Abeni8a056612020-06-29 22:26:25 +02003166 if (inet_csk_listen_poll(ssock->sk))
3167 set_bit(MPTCP_DATA_READY, &msk->flags);
Peter Krystadcf7da0d2020-01-21 16:56:19 -08003168 sock_put(ssock->sk);
3169 return err;
3170
3171unlock_fail:
3172 release_sock(sock->sk);
3173 return -EINVAL;
3174}
3175
Paolo Abeni8a056612020-06-29 22:26:25 +02003176static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
3177{
3178 return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM :
3179 0;
3180}
3181
Florian Westphal8edf0862020-11-16 10:48:12 +01003182static bool __mptcp_check_writeable(struct mptcp_sock *msk)
3183{
3184 struct sock *sk = (struct sock *)msk;
3185 bool mptcp_writable;
3186
3187 mptcp_clean_una(sk);
3188 mptcp_writable = sk_stream_is_writeable(sk);
3189 if (!mptcp_writable)
3190 mptcp_nospace(msk);
3191
3192 return mptcp_writable;
3193}
3194
3195static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
3196{
3197 struct sock *sk = (struct sock *)msk;
3198 __poll_t ret = 0;
3199 bool slow;
3200
3201 if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN))
3202 return 0;
3203
3204 if (sk_stream_is_writeable(sk))
3205 return EPOLLOUT | EPOLLWRNORM;
3206
3207 slow = lock_sock_fast(sk);
3208 if (__mptcp_check_writeable(msk))
3209 ret = EPOLLOUT | EPOLLWRNORM;
3210
3211 unlock_sock_fast(sk, slow);
3212 return ret;
3213}
3214
Peter Krystad2303f992020-01-21 16:56:17 -08003215static __poll_t mptcp_poll(struct file *file, struct socket *sock,
3216 struct poll_table_struct *wait)
3217{
Florian Westphal1891c4a2020-01-21 16:56:25 -08003218 struct sock *sk = sock->sk;
Paolo Abeni8ab183d2020-01-21 16:56:33 -08003219 struct mptcp_sock *msk;
Peter Krystad2303f992020-01-21 16:56:17 -08003220 __poll_t mask = 0;
Paolo Abeni8a056612020-06-29 22:26:25 +02003221 int state;
Peter Krystad2303f992020-01-21 16:56:17 -08003222
Florian Westphal1891c4a2020-01-21 16:56:25 -08003223 msk = mptcp_sk(sk);
Florian Westphal1891c4a2020-01-21 16:56:25 -08003224 sock_poll_wait(file, sock, wait);
Florian Westphal1891c4a2020-01-21 16:56:25 -08003225
Paolo Abeni8a056612020-06-29 22:26:25 +02003226 state = inet_sk_state_load(sk);
Paolo Abeni67193312020-09-14 10:01:09 +02003227 pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags);
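	/* a listening msk only reports readability, i.e. incoming connections
	 * ready to be accept()ed on the first subflow
	 */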
Paolo Abeni8a056612020-06-29 22:26:25 +02003228 if (state == TCP_LISTEN)
3229 return mptcp_check_readable(msk);
3230
3231 if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
3232 mask |= mptcp_check_readable(msk);
Florian Westphal8edf0862020-11-16 10:48:12 +01003233 mask |= mptcp_check_writeable(msk);
Paolo Abeni8a056612020-06-29 22:26:25 +02003234 }
Florian Westphal1891c4a2020-01-21 16:56:25 -08003235 if (sk->sk_shutdown & RCV_SHUTDOWN)
3236 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
3237
Peter Krystad2303f992020-01-21 16:56:17 -08003238 return mask;
3239}
3240
Peter Krystad21498492020-01-21 16:56:21 -08003241static int mptcp_shutdown(struct socket *sock, int how)
3242{
3243 struct mptcp_sock *msk = mptcp_sk(sock->sk);
Paolo Abenie16163b2020-11-16 10:48:09 +01003244 struct sock *sk = sock->sk;
Peter Krystad21498492020-01-21 16:56:21 -08003245 int ret = 0;
3246
 3247	pr_debug("msk=%p, how=%d", msk, how);
3248
Paolo Abenie16163b2020-11-16 10:48:09 +01003249 lock_sock(sk);
Peter Krystad21498492020-01-21 16:56:21 -08003250
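	/* userspace SHUT_RD(0)/SHUT_WR(1)/SHUT_RDWR(2) map to
	 * RCV_SHUTDOWN/SEND_SHUTDOWN/SHUTDOWN_MASK after the increment
	 */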
3251 how++;
Peter Krystad21498492020-01-21 16:56:21 -08003252 if ((how & ~SHUTDOWN_MASK) || !how) {
3253 ret = -EINVAL;
3254 goto out_unlock;
3255 }
3256
3257 if (sock->state == SS_CONNECTING) {
Paolo Abenie16163b2020-11-16 10:48:09 +01003258 if ((1 << sk->sk_state) &
Peter Krystad21498492020-01-21 16:56:21 -08003259 (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
3260 sock->state = SS_DISCONNECTING;
3261 else
3262 sock->state = SS_CONNECTED;
3263 }
3264
Paolo Abenie16163b2020-11-16 10:48:09 +01003265 sk->sk_shutdown |= how;
3266 if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk))
3267 __mptcp_wr_shutdown(sk);
Peter Krystad21498492020-01-21 16:56:21 -08003268
Davide Carattie1ff9e82020-06-29 22:26:20 +02003269 /* Wake up anyone sleeping in poll. */
Paolo Abenie16163b2020-11-16 10:48:09 +01003270 sk->sk_state_change(sk);
Davide Carattie1ff9e82020-06-29 22:26:20 +02003271
Peter Krystad21498492020-01-21 16:56:21 -08003272out_unlock:
Paolo Abenie16163b2020-11-16 10:48:09 +01003273 release_sock(sk);
Peter Krystad21498492020-01-21 16:56:21 -08003274
3275 return ret;
3276}
3277
Florian Westphale42f1ac2020-01-24 16:04:02 -08003278static const struct proto_ops mptcp_stream_ops = {
3279 .family = PF_INET,
3280 .owner = THIS_MODULE,
3281 .release = inet_release,
3282 .bind = mptcp_bind,
3283 .connect = mptcp_stream_connect,
3284 .socketpair = sock_no_socketpair,
3285 .accept = mptcp_stream_accept,
Paolo Abenid2f77c52020-06-29 22:26:22 +02003286 .getname = inet_getname,
Florian Westphale42f1ac2020-01-24 16:04:02 -08003287 .poll = mptcp_poll,
3288 .ioctl = inet_ioctl,
3289 .gettstamp = sock_gettstamp,
3290 .listen = mptcp_listen,
3291 .shutdown = mptcp_shutdown,
3292 .setsockopt = sock_common_setsockopt,
3293 .getsockopt = sock_common_getsockopt,
3294 .sendmsg = inet_sendmsg,
3295 .recvmsg = inet_recvmsg,
3296 .mmap = sock_no_mmap,
3297 .sendpage = inet_sendpage,
Florian Westphale42f1ac2020-01-24 16:04:02 -08003298};
Peter Krystad2303f992020-01-21 16:56:17 -08003299
Mat Martineauf870fa02020-01-21 16:56:15 -08003300static struct inet_protosw mptcp_protosw = {
3301 .type = SOCK_STREAM,
3302 .protocol = IPPROTO_MPTCP,
3303 .prot = &mptcp_prot,
Peter Krystad2303f992020-01-21 16:56:17 -08003304 .ops = &mptcp_stream_ops,
3305 .flags = INET_PROTOSW_ICSK,
Mat Martineauf870fa02020-01-21 16:56:15 -08003306};
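
/* With this protosw registered, userspace obtains MPTCP sockets through the
 * regular socket(2) API. A minimal sketch, assuming IPPROTO_MPTCP (262) is
 * visible in the libc headers (define it manually otherwise):
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);
 *
 * bind()/connect()/listen()/accept() on the returned fd then go through the
 * mptcp_stream_ops defined above.
 */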
3307
Paolo Abenid39dcec2020-06-26 19:29:59 +02003308void __init mptcp_proto_init(void)
Mat Martineauf870fa02020-01-21 16:56:15 -08003309{
Peter Krystad2303f992020-01-21 16:56:17 -08003310 mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;
Peter Krystad2303f992020-01-21 16:56:17 -08003311
Paolo Abenid0272362020-03-27 14:48:45 -07003312 if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
3313 panic("Failed to allocate MPTCP pcpu counter\n");
3314
Peter Krystad2303f992020-01-21 16:56:17 -08003315 mptcp_subflow_init();
Peter Krystad1b1c7a02020-03-27 14:48:38 -07003316 mptcp_pm_init();
Paolo Abeni2c5ebd02020-06-26 19:30:00 +02003317 mptcp_token_init();
Peter Krystad2303f992020-01-21 16:56:17 -08003318
Mat Martineauf870fa02020-01-21 16:56:15 -08003319 if (proto_register(&mptcp_prot, 1) != 0)
3320 panic("Failed to register MPTCP proto.\n");
3321
3322 inet_register_protosw(&mptcp_protosw);
Florian Westphal6771bfd2020-02-26 10:14:48 +01003323
3324 BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb));
Mat Martineauf870fa02020-01-21 16:56:15 -08003325}
3326
3327#if IS_ENABLED(CONFIG_MPTCP_IPV6)
Florian Westphale42f1ac2020-01-24 16:04:02 -08003328static const struct proto_ops mptcp_v6_stream_ops = {
3329 .family = PF_INET6,
3330 .owner = THIS_MODULE,
3331 .release = inet6_release,
3332 .bind = mptcp_bind,
3333 .connect = mptcp_stream_connect,
3334 .socketpair = sock_no_socketpair,
3335 .accept = mptcp_stream_accept,
Paolo Abenid2f77c52020-06-29 22:26:22 +02003336 .getname = inet6_getname,
Florian Westphale42f1ac2020-01-24 16:04:02 -08003337 .poll = mptcp_poll,
3338 .ioctl = inet6_ioctl,
3339 .gettstamp = sock_gettstamp,
3340 .listen = mptcp_listen,
3341 .shutdown = mptcp_shutdown,
3342 .setsockopt = sock_common_setsockopt,
3343 .getsockopt = sock_common_getsockopt,
3344 .sendmsg = inet6_sendmsg,
3345 .recvmsg = inet6_recvmsg,
3346 .mmap = sock_no_mmap,
3347 .sendpage = inet_sendpage,
3348#ifdef CONFIG_COMPAT
Christoph Hellwig39869122020-05-18 08:28:06 +02003349 .compat_ioctl = inet6_compat_ioctl,
Florian Westphale42f1ac2020-01-24 16:04:02 -08003350#endif
3351};
3352
Mat Martineauf870fa02020-01-21 16:56:15 -08003353static struct proto mptcp_v6_prot;
3354
Peter Krystad79c09492020-01-21 16:56:20 -08003355static void mptcp_v6_destroy(struct sock *sk)
3356{
3357 mptcp_destroy(sk);
3358 inet6_destroy_sock(sk);
3359}
3360
Mat Martineauf870fa02020-01-21 16:56:15 -08003361static struct inet_protosw mptcp_v6_protosw = {
3362 .type = SOCK_STREAM,
3363 .protocol = IPPROTO_MPTCP,
3364 .prot = &mptcp_v6_prot,
Peter Krystad2303f992020-01-21 16:56:17 -08003365 .ops = &mptcp_v6_stream_ops,
Mat Martineauf870fa02020-01-21 16:56:15 -08003366 .flags = INET_PROTOSW_ICSK,
3367};
3368
Paolo Abenid39dcec2020-06-26 19:29:59 +02003369int __init mptcp_proto_v6_init(void)
Mat Martineauf870fa02020-01-21 16:56:15 -08003370{
3371 int err;
3372
3373 mptcp_v6_prot = mptcp_prot;
3374 strcpy(mptcp_v6_prot.name, "MPTCPv6");
3375 mptcp_v6_prot.slab = NULL;
Peter Krystad79c09492020-01-21 16:56:20 -08003376 mptcp_v6_prot.destroy = mptcp_v6_destroy;
Florian Westphalb0519de2020-02-06 00:39:37 +01003377 mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock);
Mat Martineauf870fa02020-01-21 16:56:15 -08003378
3379 err = proto_register(&mptcp_v6_prot, 1);
3380 if (err)
3381 return err;
3382
Mat Martineauf870fa02020-01-21 16:56:15 -08003383 err = inet6_register_protosw(&mptcp_v6_protosw);
3384 if (err)
3385 proto_unregister(&mptcp_v6_prot);
3386
3387 return err;
3388}
3389#endif